In [1]:
import csv
import numpy as np
with open('titanic.csv','r') as csvfile:
    titanic_reader = csv.reader(csvfile,delimiter=',',quotechar='"')

    #特徴量の名前が書かれたHeaderを読み取る
    row = next(titanic_reader)
    feature_names = np.array(row)

    #データと正解ラベルを読み取る
    titanic_x, titanic_y = [],[]
    for row in titanic_reader:
        titanic_x.append(row)
        titanic_y.append(row[2]) #正解ラベルは3列目の"survived"

    titanic_x = np.array(titanic_x) #型をリストからnumpy.ndarrayにする
    titanic_y = np.array(titanic_y) #型をリストからnumpy.ndarrayにする

print(feature_names)
print(titanic_x[0],titanic_y[0])

# class(1),age(4),sex(10)を残す
titanic_x = titanic_x[:,[1, 4, 10]]
feature_names = feature_names[[1, 4, 10]]

print(feature_names)
print(titanic_x[12],titanic_y[12])

['row.names' 'pclass' 'survived' 'name' 'age' 'embarked' 'home.dest'
 'room' 'ticket' 'boat' 'sex']
['1' '1st' '1' 'Allen, Miss Elisabeth Walton' '29.0000' 'Southampton'
 'St Louis, MO' 'B-5' '24160 L221' '2' 'female'] 1
['pclass' 'age' 'sex']
['1st' 'NA' 'female'] 1


In [24]:
import pandas as pd
from io import StringIO

# サンプルデータの作成
data = """A,B,C,D
1.0,2.0,3.0,
10.0,20.0,,30.0"""
df = pd.read_csv(StringIO(data))
print(df)

      A     B    C     D
0   1.0   2.0  3.0   NaN
1  10.0  20.0  NaN  30.0


In [25]:
# 各特徴量の欠損値をカウント
print(df.isnull().sum())

A    0
B    0
C    1
D    1
dtype: int64


In [26]:
# 欠測値を含む行を削除
df.dropna()
# 同様に、axis引数を1に設定することで、NaNを含んでいる行が1つでもある列を削除することが出来ます。

Unnamed: 0,A,B,C,D


In [27]:
# 欠測値を含む列を削除
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,10.0,20.0


In [28]:
# 全ての列がNaNの行のみを削除
df.dropna(how="all")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,10.0,20.0,,30.0


In [29]:
# 非NaN値が5つ未満の行のみを削除
df.dropna(thresh=5)

Unnamed: 0,A,B,C,D


In [30]:
# 特定の列（ここでは"A")にNaNが含まれている行のみを削除
df.dropna(subset=["A"])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,10.0,20.0,,30.0


# 補間法

In [31]:
from sklearn.impute import SimpleImputer
# 平均値で欠損値を補完するためのインスタンスを作成
imp = SimpleImputer(strategy = 'mean')
# 欠損値を補完
imp.fit(df)
imp.transform(df)

array([[ 1.,  2.,  3., 30.],
       [10., 20.,  3., 30.]])

In [32]:
df.head()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,10.0,20.0,,30.0


In [33]:
# 年齢の欠損値を平均値で埋める
ages = titanic_x[:,1]

In [34]:
# NA以外のageの平均値を計算する
mean_age = np.mean(titanic_x[ages != 'NA',1].astype(float))

In [35]:
# ageがNAのものを平均値に置き換える
titanic_x[titanic_x[:, 1] == 'NA',1] = mean_age

# エンコーディング

In [36]:
import pandas as pd
df = pd.DataFrame()

In [37]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
label_encoder = enc.fit(titanic_x[:, 2])
print('Categorical classes:',label_encoder.classes_)

integer_classes = label_encoder.transform(label_encoder.classes_)
print('Integer classes:',integer_classes)

t = label_encoder.transform(titanic_x[:, 2])
titanic_x[:,2] = t

print(feature_names)
print(titanic_x[12],titanic_y[12])

Categorical classes: ['female' 'male']
Integer classes: [0 1]
['pclass' 'age' 'sex']
['1st' '31.19418104265403' '0'] 1


In [40]:
from sklearn.preprocessing import OneHotEncoder
enc = LabelEncoder()
label_encoder = enc.fit(titanic_x [:, 0])
print("Categorical classes:", label_encoder.classes_)
integer_classes = label_encoder.transform(label_encoder.classes_).reshape(3, 1)
print("Integer classes:", integer_classes)
enc = OneHotEncoder()
one_hot_encoder = enc.fit(integer_classes)

#最初に、Label Encoderを使ってpclassを0-2に直す
num_of_rows = titanic_x.shape[0]
t = label_encoder.transform(titanic_x[:, 0]).reshape(num_of_rows, 1)
#次に、OneHotEncoderを使ってデータを1, 0に変換
new_features = one_hot_encoder.transform(t)
#1,0になおしてデータを統合する
titanic_x = np.concatenate([titanic_x, new_features.toarray()], axis = 1)
#OnehotEncoderをする前のpclassのデータを削除する
titanic_x = np.delete(titanic_x, [0], 1)
#特徴量の名前を更新する
feature_names = ['age', 'sex', 'first class', 'second class', 'third class']

# Convert to numerical values
titanic_x = titanic_x.astype (float)
titanic_y = titanic_y.astype (float)

print(feature_names)
print(titanic_x[0],titanic_y[0])

Categorical classes: [ 0.1667      0.3333      0.8333      0.9167      1.          2.
  3.          4.          5.          6.          7.          8.
  9.         10.         11.         12.         13.         14.
 15.         16.         17.         18.         19.         20.
 21.         22.         23.         24.         25.         26.
 27.         28.         29.         30.         31.         31.19418104
 32.         33.         34.         35.         36.         37.
 38.         39.         40.         41.         42.         43.
 44.         45.         46.         47.         48.         49.
 50.         51.         52.         53.         54.         55.
 56.         57.         58.         59.         60.         61.
 62.         63.         64.         65.         67.         69.
 70.         71.        ]


ValueError: cannot reshape array of size 74 into shape (3,1)

# 特徴量のスケーリング

# 標準化

In [45]:
import numpy as np
def zscore(x, axis = None):
    x_mean = x.mean(axis=axis, keepdims=True)
    x_std  = np.std(x, axis=axis, keepdims=True)
    z_score = (x-x_mean)/x_std
    return z_score

a = np.random.randint(10, size=(2,5))
print(a) # 実行結果は毎回異なります。

b = zscore(a)
print(b)


[[8 6 2 8 2]
 [1 3 4 2 2]]
[[ 1.69222822  0.88640526 -0.72524067  1.69222822 -0.72524067]
 [-1.12815215 -0.32232919  0.0805823  -0.72524067 -0.72524067]]


# 正規化

In [46]:
import numpy as np
def min_max(x, axis=None):
    x_min = x.min(axis=axis, keepdims=True)
    x_max = x.max(axis=axis, keepdims=True)
    result = (x - x_min) / (x_max - x_min)
    return result

a = np.random.randint(10, size=(2,5))
print(a)

b = min_max(a)
print(b)

[[2 8 0 9 5]
 [1 1 0 5 1]]
[[0.22222222 0.88888889 0.         1.         0.55555556]
 [0.11111111 0.11111111 0.         0.55555556 0.11111111]]
