In [3]:
!python -V
!tree ..
!pwd

Python 3.7.12
[01;34m..[00m
├── [01;34minput[00m
│   └── [01;34mtitanic[00m
│       ├── gender_submission.csv
│       ├── test.csv
│       └── train.csv
├── [01;34mlib[00m
│   └── [01;34mkaggle[00m
│       └── gcp.py
└── [01;34mworking[00m
    └── __notebook_source__.ipynb

5 directories, 5 files
/kaggle/working


In [4]:
# パッケージ読み込み
import numpy as np
import pandas as pd

In [7]:
## データ読み込み

# 機械学習の学習用データ. タイタニック号の乗客の性別や年齢などの情報と,生存したかの情報が格納されている.
train = pd.read_csv('../input/titanic/train.csv')
# 予測を実施するデータ. 乗客の属性のみが格納されているので学習データを元に生存可否の予測値を算出する.
test = pd.read_csv('../input/titanic/test.csv')
# 提出用のcsvファイルサンプル. 仮に女性のみが生存というデータが格納されている.
gender_submission = pd.read_csv('../input/titanic/gender_submission.csv')

train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [9]:
gender_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [22]:
# 文字列を機械学習アルゴリズムが扱える数値の形式にソースを変換する.
# またNaN(欠損値)は平均値や中央値などの代表的な値で穴埋めする場合もある.

## 特徴量エンジニアリングとは
# - 読み込んだデータを機械学習アルゴリズムが扱える形に変換する
# - 既存のデータから機械学習アルゴリズムが予測する上で有用な新しい特徴量を作成する。

# trainとtestデータを縦方向に連結. (pros. trainとtestの共通処理を1度にできる. testの情報も考慮した標準化等の処理を実行できる)
data = pd.concat([train, test], sort=False)
print('train:',len(train),', test:',len(test), ', data:',len(data))

# 性別を文字列から0,1に変換
data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
data.head()



train: 891 , test: 418 , data: 1309


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [28]:
data['Embarked'].fillna(('S'), inplace=True)
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0


In [30]:
# 欠損値の穴埋め. Fareを平均値で補完
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0


In [37]:
# 年齢
age_avg = data['Age'].mean()
age_std = data['Age'].std()
data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1.0,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,0
4,5,0.0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,0


In [40]:
## 必要なデータのみ取り出し
# 不要カラム削除
delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0.0,3,0,22.0,7.25,0
1,1.0,1,1,38.0,71.2833,1
2,1.0,3,1,26.0,7.925,0
3,1.0,1,1,35.0,53.1,0
4,0.0,3,0,35.0,8.05,0


In [41]:
# dataからtrainとtestに分ける
train = data[:len(train)]
test = data[len(train):]

In [42]:
# 機械学習用のデータ取り出し
y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)

In [45]:
y_train.head()

0    0.0
1    1.0
2    1.0
3    1.0
4    0.0
Name: Survived, dtype: float64

In [46]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
0,3,0,22.0,7.25,0
1,1,1,38.0,71.2833,1
2,3,1,26.0,7.925,0
3,1,1,35.0,53.1,0
4,3,0,35.0,8.05,0


In [48]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
0,3,0,34.5,7.8292,2
1,3,1,47.0,7.0,0
2,2,0,62.0,9.6875,2
3,3,0,27.0,8.6625,0
4,3,1,22.0,12.2875,0


In [49]:
# 用意した特徴量と予測のペアから機械学習アルゴリズムを用いて学習させる
# ロジスティック回帰を使う
from sklearn.linear_model import LogisticRegression

# max_iterを十分大きい値にしないと右エラーになることがある (onvergenceWarning: The max_iter was reached which means the coef_ did not converge)
clf = LogisticRegression(penalty='l2', solver='sag', random_state=0, max_iter=10000)
clf.fit(X_train, y_train)

LogisticRegression(max_iter=10000, random_state=0, solver='sag')

In [51]:
# 学習を終えるとtestを与えて予測をさせることができる
y_pred = clf.predict(X_test)
y_pred[:20]

array([0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1.])

In [52]:
# 提出用ファイル作成
sub = pd.read_csv('../input/titanic/gender_submission.csv')
sub['Survived'] = list(map(int, y_pred))
sub.to_csv('submission.csv', index=False)

In [53]:
!head submission.csv

PassengerId,Survived
892,0
893,0
894,0
895,0
896,1
897,0
898,1
899,0
900,1
