# データ 前処理

In [1]:
import pandas as pd
import numpy as np
 
# Load the three CSV files from the ./data directory.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
gendersb = pd.read_csv('./data/gender_submission.csv')

# Keep a copy of each frame as loaded, before the cells below modify
# `train` and `test`.
train_org = train.copy()
test_org = test.copy()
gendersb_org = gendersb.copy()

# Cabin type extraction
# Take the leading letter of the Cabin value ('T' is treated as invalid).
def extract_cabin_type(x):
  """Return the leading letter of a row's Cabin value, or NaN if unusable.

  Parameters:
    x: a row-like mapping (e.g. a DataFrame row passed by ``apply(axis=1)``)
       that provides a 'Cabin' entry.

  Returns:
    The first character of the cabin string after stripping surrounding
    whitespace. ``np.nan`` is returned when the value is not a string
    (e.g. a missing value), is empty/whitespace-only, or starts with 'T'.
  """
  cabin = x['Cabin']
  # Missing values arrive as float NaN, not as strings.
  if not isinstance(cabin, str):
    return np.nan
  # Strip first so that '' or '   ' cannot raise IndexError or yield ' '.
  cabin = cabin.strip()
  if not cabin or cabin[0] == 'T':
    return np.nan
  return cabin[0]

# Add a CabinType column (leading letter of Cabin, via extract_cabin_type)
# to both the training and the test data.
for frame in (train, test):
  frame['CabinType'] = frame.apply(extract_cabin_type, axis=1)

# Sorted list of the distinct non-missing cabin types seen in the training data.
sortedCabinTypes = sorted(train['CabinType'].dropna().unique())

# PersonType classification
# Split passengers into child / male / female.
def male_female_child(x):
  """Classify a row as 'child' or by its Sex value.

  Parameters:
    x: a row-like mapping providing 'Age' and 'Sex' entries.

  Returns:
    'child' when Age is 15 or less; otherwise the row's Sex value
    unchanged. A NaN age compares False against 15, so such rows
    also fall through to the Sex value.
  """
  passenger_age, passenger_sex = x['Age'], x['Sex']
  return 'child' if passenger_age <= 15 else passenger_sex

# Add a PersonType column (male / female / child, via male_female_child)
# to both the training and the test data.
for frame in (train, test):
  frame['PersonType'] = frame.apply(male_female_child, axis=1)

In [2]:
# Show the distinct values of each categorical column, training data first
# and test data second, so the two sets can be compared before encoding.
for column in ("Sex", "Embarked", "PersonType", "CabinType"):
    print(train[column].unique())
    print(test[column].unique())

['male' 'female']
['male' 'female']
['S' 'C' 'Q' nan]
['Q' 'S' 'C']
['male' 'female' 'child']
['male' 'female' 'child']
[nan 'C' 'E' 'G' 'D' 'A' 'B' 'F']
[nan 'B' 'E' 'A' 'C' 'D' 'F' 'G']


In [3]:
# Data shaping: replace each categorical label with a numeric code.
# One mapping table per column keeps the train and test encodings in sync.
category_codes = {
    # Embarked: (C, Q, S) = (0, 1, 2)
    "Embarked": {"C": 0, "Q": 1, "S": 2},
    # CabinType: (A, B, C, D, E, F, G) = (0, 1, 2, 3, 4, 5, 6)
    "CabinType": {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5, "G": 6},
    # Sex: (male, female) = (0, 1)
    "Sex": {"male": 0, "female": 1},
    # PersonType: (male, female, child) = (0, 1, 2)
    "PersonType": {"male": 0, "female": 1, "child": 2},
}

for column, codes in category_codes.items():
    train[column] = train[column].replace(codes)
    test[column] = test[column].replace(codes)

    # Print the encoded values of this column (train first, then test).
    print(train[column].unique())
    print(test[column].unique())

[ 2.  0.  1. nan]
[1 2 0]
[nan  2.  4.  6.  3.  0.  1.  5.]
[nan  1.  4.  0.  2.  3.  5.  6.]
[0 1]
[0 1]
[0 1 2]
[0 1 2]


In [4]:
# Print the per-column count of missing values for the training data and
# the test data, separated by a 40-character dashed line.
missing_reports = (
    ('訓練データの欠損値の個数\n', train),
    ('テストデータの欠損値の個数\n', test),
)
for position, (heading, frame) in enumerate(missing_reports):
    if position > 0:
        print('-' * 40)
    print(heading, frame.isna().sum())

訓練データの欠損値の個数
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
CabinType      688
PersonType       0
dtype: int64
----------------------------------------
テストデータの欠損値の個数
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
CabinType      327
PersonType       0
dtype: int64


In [5]:
# Training data, missing-value handling: inspect how often each Embarked
# code occurs (missing values are excluded from the counts).
train["Embarked"].value_counts(dropna=True)

2.0    644
0.0    168
1.0     77
Name: Embarked, dtype: int64

In [6]:
# Fill missing Embarked values in the training data with the most frequent code.
# The fill value is derived from the column itself instead of being hard-coded,
# so it stays correct if the encoding or the data changes. mode() ignores NaN
# and returns the modes in sorted order, so .iloc[0] picks the smallest on a tie.
most_frequent_embarked = train["Embarked"].mode().iloc[0]
train["Embarked"] = train["Embarked"].fillna(most_frequent_embarked)
print(train["Embarked"].unique())

[2. 0. 1.]


In [7]:
# Simple imputation: replace missing CabinType and Cabin values with the
# sentinel -1 in both frames, then print the resulting distinct values
# (train first, then test) for each column.
for column in ("CabinType", "Cabin"):
    train[column] = train[column].fillna(-1)
    test[column] = test[column].fillna(-1)

    print(train[column].unique())
    print(test[column].unique())

[-1.  2.  4.  6.  3.  0.  1.  5.]
[-1.  1.  4.  0.  2.  3.  5.  6.]
[-1 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90'
 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A2

In [8]:
# Mean imputation (not median): the fill values are the means of Age and Fare
# computed over the training and test data combined.
age_mean, fare_mean = (
    pd.concat([train[column], test[column]]).mean() for column in ("Age", "Fare")
)

print(age_mean, fare_mean)

# Apply the same fill values to both frames.
for frame in (train, test):
    frame["Age"] = frame["Age"].fillna(age_mean)
    frame["Fare"] = frame["Fare"].fillna(fare_mean)

29.881137667304014 33.29547928134557


In [9]:
# Print the per-column count of missing values again for the training data
# and the test data, separated by a 40-character dashed line.
missing_reports = (
    ('訓練データの欠損値の個数\n', train),
    ('テストデータの欠損値の個数\n', test),
)
for position, (heading, frame) in enumerate(missing_reports):
    if position > 0:
        print('-' * 40)
    print(heading, frame.isna().sum())

訓練データの欠損値の個数
 PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
CabinType      0
PersonType     0
dtype: int64
----------------------------------------
テストデータの欠損値の個数
 PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
CabinType      0
PersonType     0
dtype: int64


In [10]:
# Random forest
# Labels
y_train = train["Survived"].values
# Features (only four columns are used); the same list selects train and test.
feature_columns = ["Pclass", "Sex", "Age", "Fare"]
X_train = train[feature_columns].values
X_test = test[feature_columns].values

# Train a random forest classifier.
from sklearn import ensemble
model = ensemble.RandomForestClassifier(n_estimators=10, random_state=0)
model.fit(X_train, y_train)

# Try predicting on the training data.
y_pred = model.predict(X_train)

# Show the accuracy. This is measured on the same data the model was fitted
# on, so it is not an estimate of accuracy on unseen data.
print("訓練データでの正解率:", model.score(X_train, y_train))

# Predict on the test data (y_test holds predictions, not true labels).
y_test = model.predict(X_test)

# Write the test predictions to CSV, indexed by PassengerId.
result = pd.DataFrame(data=y_test, index=test["PassengerId"], columns=["Survived"])
result.to_csv("result.csv", index_label=["PassengerId"])

訓練データでの正解率: 0.9595959595959596


In [11]:
# When running on Google Colab, uncomment the two lines below to download result.csv.
#from google.colab import files
#files.download("result.csv")