In [None]:
# -----------------------------------------------
# タイタニックデータビューイング
# -----------------------------------------------

In [None]:
# -------------------------------------------------------------------
# ライブラリの読込
# -------------------------------------------------------------------
import glob
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import os
import pandas as pd
from pandas import Series, DataFrame

import scipy as sp
import seaborn as sns
sns.set()

import sklearn
import unicodedata

%matplotlib inline
%precision 3

In [None]:
plt.rcParams['font.family'] = 'IPAexGothic'

In [None]:
print(mpl.get_configdir())
print(mpl.matplotlib_fname())

In [None]:
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 300)

In [None]:
pd.options.display.float_format = '{:.6f}'.format

In [None]:
# -----------------------------------------------
# タイタニックデータ 読み込み
# -----------------------------------------------

In [None]:
raw_path = '../data/raw/'

In [None]:
# -------------------------------------------------------------------
# 学習データを読み込み
# -------------------------------------------------------------------
train = pd.read_csv(raw_path + 'train.csv')
train.info()

In [None]:
train.isnull().sum()

In [None]:
train.head()

In [None]:
# --------------------------------------------------------------------------------------
# 学習データ情報
# （欠損なし） PassengerId, Survived, Pclass, Name, Sex, SibSp, Parch, Ticket, Fare 
# （欠損あり） Age,Cabin,Embarked  
# --------------------------------------------------------------------------------------

In [None]:
# --------------------------------------------------------------------------------------
# 単独データの可視化
# --------------------------------------------------------------------------------------

In [None]:
survived = train['Survived'].value_counts().rename({0:'死亡', 1:'生存'})
print(survived)

In [None]:
plt.figure(figsize=(5, 3))
plt.title('生存者・死亡者数')
plt.bar(survived.index, survived)


In [None]:
sex = train['Sex'].value_counts().rename({'male':'男性', 'female':'女性'})
print(sex)

In [None]:
plt.figure(figsize=(5, 3))
plt.title('性別毎の乗客数')
plt.bar(sex.index, sex)


In [None]:
pclass = train['Pclass'].value_counts().sort_index().rename({3:'三等', 2:'二等', 1:'一等'})
print(pclass)

In [None]:
plt.figure(figsize=(5, 3))
plt.title('客室等級毎の乗客数')
plt.bar(pclass.index, pclass)

In [None]:
age = train['Age'].value_counts().sort_index()
# print(age)

In [None]:
plt.figure(figsize=(5, 3))
plt.title('乗客の年齢分布')
plt.hist(train['Age'])

In [None]:
# なぜか年齢が小数点の乗客が存在する

In [None]:
plt.figure(figsize=(12, 3))
plt.title('年齢毎の乗客数')
plt.bar(age.index, age)

In [None]:
plt.figure(figsize=(8, 3))
plt.title('乗客の運賃分布')
plt.hist(train['Fare'], bins=20)

In [None]:
# 料金が0の謎の乗客が15名いる
train.loc[train['Fare']==0]

In [None]:
# Histogram of fares on a log10 scale.
# Some passengers have a fare of 0, so 1 is added before taking the logarithm
# (log10(0) is undefined).  The transformed values are stored in a new
# 'Fare_log10' column that later cells reuse.
plt.figure(figsize=(5, 3))
# Title added so this figure is labelled like every other plot in the notebook.
plt.title('乗客の運賃(log10)分布')
train['Fare_log10'] = np.log10(train['Fare'] + 1)
plt.hist(train['Fare_log10'], bins=20)

In [None]:
embarked = train['Embarked'].value_counts().rename({'C':'Cherbourg', 'Q':'Queenstown', 'S':'Southamptom'})
print(embarked)

In [None]:
plt.figure(figsize=(5, 3))
plt.title('乗客の出港地')
plt.bar(embarked.index, embarked)

In [None]:
sibsp = train['SibSp'].value_counts().sort_index()
print(sibsp)

In [None]:
plt.figure(figsize=(5, 3))
plt.title('乗客の兄弟・配偶者数')
plt.bar(sibsp.index, sibsp)

In [None]:
parch = train['Parch'].value_counts().sort_index()
print(parch)

In [None]:
plt.figure(figsize=(5, 3))
plt.title('乗客の親・子供の数')
plt.bar(parch.index, parch)

In [None]:
# --------------------------------------------------------------------------------------
# 複数データの可視化
# --------------------------------------------------------------------------------------

In [None]:
# --------------------------------------------------------------------------------------
# 生死と各種変数の関係
# --------------------------------------------------------------------------------------

In [None]:
plt.figure(figsize=(5, 3))
plt.title('性別と生死')
sns.countplot(data=train, x='Sex', hue='Survived')
plt.show()

In [None]:
plt.figure(figsize=(5, 3))
plt.title('乗船地と生死')
sns.countplot(data=train, x='Embarked', hue='Survived')
plt.show()

In [None]:
# 乗船地と生存に関係が見える。Cherbourgで乗船した人は生存率が高い
# 因果関係があるとは思えないので、男女比や船室の等級に依存関係があると思われる
# Southamptonで乗船した乗客は低収入の人が多い?

In [None]:
cherbourg = train.loc[train['Embarked']=='C']
cherbourg['Pclass'].value_counts().sort_index()
# やはりCherbourgから乗船した客は1等の客が多い

In [None]:
southampton = train.loc[train['Embarked']=='S']
southampton['Pclass'].value_counts().sort_index()
# Southamptomは3等の乗客数が多い

In [None]:
# Ticket-class breakdown of passengers who embarked at Queenstown ('Q').
# The original variable (and comment) called this port "Queensland"; the
# embarkation label used earlier in this notebook for 'Q' is 'Queenstown'.
queenstown = train.loc[train['Embarked'] == 'Q']
queensland = queenstown  # backward-compatible alias for the original, misnamed variable
queenstown['Pclass'].value_counts().sort_index()
# Author's observation: Queenstown also has many third-class passengers.

In [None]:
# 年齢と生存の関係

In [None]:
# Age histogram, coloured by the Survived flag.
fig, ax = plt.subplots(figsize=(8, 3))
ax.set_title('生存者死亡者の年齢分布')
sns.histplot(data=train, x='Age', hue='Survived', ax=ax)
plt.show()

`10歳以下の生存率は高い`

In [None]:
# Histogram of the log10-transformed fare, coloured by the Survived flag.
# Relies on the 'Fare_log10' column created in an earlier cell.
fig, ax = plt.subplots(figsize=(8, 3))
ax.set_title('生存者死亡者の運賃(log10)分布')
sns.histplot(data=train, x='Fare_log10', hue='Survived', ax=ax)
plt.show()
# Author's observation: the cheaper the fare, the higher the death rate.

`運賃が安いほど死亡率が高い`

In [None]:
plt.figure(figsize=(5, 8))
plt.title('兄弟・配偶者の数と生死')
sns.countplot(data=train, x='SibSp', hue='Survived')
plt.show()

In [None]:
plt.figure(figsize=(5, 8))
plt.title('親・子供の数と生死')
sns.countplot(data=train, x='Parch', hue='Survived')
plt.show()
# 子供がいるほど生き残りやすい

In [None]:
# 性別を限定してグラフ化

In [None]:
women = train.loc[train['Sex']=='female']
men = train.loc[train['Sex']=='male']

In [None]:
plt.figure(figsize=(5, 3))
plt.title('親・子供の数と生死(女性)')
sns.countplot(data=women, x='Parch', hue='Survived')
plt.show()

In [None]:
plt.figure(figsize=(5, 3))
plt.title('親・子供の数と生死(男性)')
sns.countplot(data=men, x='Parch', hue='Survived')
plt.show()

In [None]:
# -------------------------------------------------------------------
# 特徴量生成
# -------------------------------------------------------------------

`同乗者数である'Fellow'を生成`

In [None]:
# 同乗者(Fellow)を生成
train['Fellow'] = train['SibSp'] + train['Parch']

In [None]:
plt.figure(figsize=(5, 8))
plt.title('同乗者数と生死')
sns.countplot(data=train, x='Fellow', hue='Survived')
plt.show()
# 同乗者数が1〜3だと生存率が高い

`同乗者数が1〜3人だと生存率が高い`

In [None]:
# Create the 'HasFellow' flag: 1 when the passenger has at least one
# companion (Fellow > 0), otherwise 0.
travels_with_someone = train['Fellow'] > 0
train['HasFellow'] = travels_with_someone.astype('int64')

In [None]:
# Child/adult flag: 1 when the passenger is older than 10, otherwise 0.
#
# Age still contains missing values at this point (the imputation cells come
# later in the notebook) and `NaN > 10` evaluates to False, so a plain
# comparison silently labels every passenger of unknown age as a child.
# To avoid that, missing ages are filled with the mean age of the passenger's
# sex -- the same rule the later imputation cells apply -- for the purpose of
# this flag only.  The Age column itself is not modified here.
age_for_flag = train['Age'].fillna(train.groupby('Sex')['Age'].transform('mean'))
train['Adult'] = (age_for_flag > 10).astype('int64')

In [None]:
# ------------------------------------------------------------------------
# 欠損値補完を行う
# ------------------------------------------------------------------------

In [None]:
# The port of embarkation cannot be reasoned about further, so missing values
# are filled with the most frequent port (the mode).
embarked_mode = train['Embarked'].mode()
print(embarked_mode)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated in recent pandas
# and does not update `train` when Copy-on-Write is enabled.
train['Embarked'] = train['Embarked'].fillna(embarked_mode[0])

In [None]:
# 女性客の平均年齢
women_age_mean = train.loc[train['Sex']=='female']['Age'].dropna().mean()
print(women_age_mean)

In [None]:
# 女性客の平均年齢で女性客の欠損値を補完
train.loc[(train['Sex']=='female') & (train['Age'].isnull()), 'Age'] = train.loc[train['Sex']=='female']['Age'].dropna().mean()

In [None]:
# 男性客の平均年齢
men_age_mean = train.loc[train['Sex']=='male']['Age'].dropna().mean()
print(men_age_mean)

In [None]:
# 男性客の平均年齢で男性客の欠損値を補完
train.loc[(train['Sex']=='male') & (train['Age'].isnull()), 'Age'] = train.loc[train['Sex']=='male']['Age'].dropna().mean()

In [None]:
# ------------------------------------------------------------------------
# ここからカテゴリ変数のエンコーディングを行う
# ------------------------------------------------------------------------

In [None]:
import category_encoders as ce

In [None]:
# Ordinal-encode the categorical columns 'Embarked' and 'Sex'.
# `train` is rebound to the encoded frame, so cells that filter on the
# original string labels (e.g. 'female', 'C') must run before this one.
categorical_cols = ['Embarked', 'Sex']
ce_ore = ce.OrdinalEncoder(cols=categorical_cols)
train = ce_ore.fit_transform(train)
train

In [None]:
# 相関行列を作成
train_corr = train[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fellow', 'Fare', 'Fare_log10', 'Embarked', 'HasFellow', 'Adult']].corr()
train_corr

In [None]:
plt.figure(figsize=(10, 8))
plt.title('各変数の相関')
sns.heatmap(data=train_corr, vmax=1.0, vmin=-1.0, annot=True)

In [None]:
# from sklearn.metrics import confusion_matrix
# confusion_matrix(train['Sex'].values, train['Survived'].values)

In [None]:
# -------------------------------------------------------------------
# 評価データを読み込み
# -------------------------------------------------------------------
test = pd.read_csv(raw_path + 'test.csv')
test.info()

In [None]:
test.isnull().sum()

In [None]:
# --------------------------------------------------------------------------------------
# 評価データ情報
# （欠損なし） PassengerId, Pclass, Name, Sex, SibSp, Parch, Ticket, Embarked
# （欠損あり） Age, Fare, Cabin
# --------------------------------------------------------------------------------------