In [None]:
# https://medium.com/@yehjames/%E8%B3%87%E6%96%99%E5%88%86%E6%9E%90-%E6%A9%9F%E5%99%A8%E5%AD%B8%E7%BF%92-%E7%AC%AC4-1%E8%AC%9B-kaggle%E7%AB%B6%E8%B3%BD-%E9%90%B5%E9%81%94%E5%B0%BC%E8%99%9F%E7%94%9F%E5%AD%98%E9%A0%90%E6%B8%AC-%E5%89%8D16-%E6%8E%92%E5%90%8D-a8842fea7077
# https://chtseng.wordpress.com/2017/12/24/kaggle-titanic%E5%80%96%E5%AD%98%E9%A0%90%E6%B8%AC-1/

from sklearn import preprocessing 
from sklearn.model_selection import GridSearchCV 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import RandomForestRegressor

import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this also hides genuine chained-assignment mistakes.
pd.options.mode.chained_assignment = None

In [None]:
# Load the three Titanic CSV files; paths are relative to the notebook's working directory.
train, test, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Titanic/train.csv",
        "Titanic/test.csv",
        "Titanic/gender_submission.csv",
    )
)

In [None]:
# Both frames are 2-D: train is expected to be 891 rows x 12 columns,
# test 418 rows x 11 columns.
print(train.shape, test.shape, sep="\n")

In [None]:
# First five rows of the training frame (5 is head()'s default).
train.head()

### 有空值需要處理

In [None]:
# Column dtypes and non-null counts for the training frame (shows which columns have gaps).
train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Summary statistics of the training frame's numeric columns.
train.describe()

In [None]:
# Summary statistics of the test frame's numeric columns.
test.describe()

### Combine Train and Test Data

In [None]:
# Stack train on top of test so feature engineering is applied to both at once.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Row labels are kept as-is here, matching what append() produced.
data = pd.concat([train, test], sort=False)
data.head(5)

In [None]:
# (rows, columns) of the combined frame.
print(data.shape)

In [None]:
# Replace the row labels with a fresh 0..n-1 index; the old labels are dropped,
# not kept as a column.
data = data.reset_index(drop=True)
data.head()

In [None]:
# Counts per Survived value.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(x='Survived', data=data);

In [None]:
# Counts per Pclass, split by Survived.
# x/hue are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, and a positional Series combined with `hue` raises.
sns.countplot(x='Pclass', hue='Survived', data=data);

In [None]:
# Counts per Sex, split by Survived.
# x/hue are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, and a positional Series combined with `hue` raises.
sns.countplot(x='Sex', hue='Survived', data=data);

In [None]:
# Counts per Embarked value, split by Survived.
# x/hue are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, and a positional Series combined with `hue` raises.
sns.countplot(x='Embarked', hue='Survived', data=data);

### Feature Engineering

In [None]:
# Preview the pieces obtained by splitting each Name on ", ".
name_parts = data['Name'].str.split(", ", expand=True)
name_parts.head(3)

In [None]:
# Title0 = the piece of Name before the first ", "
# (presumably the surname — confirm against the data).
name_parts = data['Name'].str.split(", ", expand=True)
data['Title0'] = name_parts[0]
data['Title0'].head(10)

In [None]:
# Title1 starts out as the second piece of Name when split on ", ";
# it is trimmed down further in the next cell.
name_parts = data['Name'].str.split(", ", expand=True)
data['Title1'] = name_parts[1]
data['Title1'].head(10)

In [None]:
# Keep only what precedes the first "." in Title1 (the honorific such as 'Mr' or 'Mrs').
title_pieces = data['Title1'].str.split(".", expand=True)
data['Title1'] = title_pieces[0]
data['Title1'].head(10)

In [None]:
# Distinct Title1 values, in order of first appearance.
data['Title1'].unique()

In [None]:
# Number of distinct Title1 values, shown as a 1-tuple shape.
data['Title1'].unique().shape

In [None]:
# crosstab's first argument supplies the row index, the second the columns.
pd.crosstab(index=data['Title1'], columns=data['Sex'])

In [None]:
# Frequency table: one row per Title1 value, one column per Survived value.
pd.crosstab(index=data['Title1'], columns=data['Survived'])

In [None]:
# Per-title mean of every numeric column.
# numeric_only=True is required on pandas >= 2.0, where the default no longer drops
# the string columns (Name, Ticket, ...) and mean() raises TypeError on them.
data.groupby(['Title1']).mean(numeric_only=True)

In [None]:
# Mean Age for each Title1 value.
data.groupby('Title1')['Age'].mean()

In [None]:
# Mean of every numeric column for each (Title1, Pclass) pair.
# numeric_only=True is required on pandas >= 2.0, where the default no longer drops
# the string columns and mean() raises TypeError on them.
data.groupby(['Title1', 'Pclass']).mean(numeric_only=True)

In [None]:
# Length of the Title1 column as a 1-tuple (one entry per row of data).
data['Title1'].shape

In [None]:
# value_counts() tallies how many rows carry each distinct value.
# The Series method replaces the top-level pd.value_counts, deprecated since pandas 2.1.
data['Title1'].value_counts()

In [None]:
# Fold the less common honorifics into 'Mr', 'Mrs' or 'Miss'; any title not listed
# here is left unchanged.
# NOTE(review): the original author noted they did not know the rationale behind this
# particular grouping — confirm before relying on it.
TITLE_GROUPS = {
    'Mlle': 'Miss',
    'Mme': 'Mrs',
    'Ms': 'Miss',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'the Countess': 'Mrs',
    'Jonkheer': 'Mr',
    'Col': 'Mr',
    'Rev': 'Mr',
    'Capt': 'Mr',
    'Sir': 'Mr',
    'Don': 'Mr',
    'Dona': 'Mrs',
}
data['Title2'] = data['Title1'].replace(TITLE_GROUPS)

In [None]:
# Distinct Title2 values remaining after the grouping.
data['Title2'].unique()

In [None]:
# Mean Age for each grouped title.
data['Age'].groupby(data['Title2']).mean()

In [None]:
# Frequency table: one row per Title2 value, one column per Sex value.
pd.crosstab(index=data['Title2'], columns=data['Sex'])

In [None]:
# Frequency table: one row per Title2 value, one column per Survived value.
pd.crosstab(index=data['Title2'], columns=data['Survived'])

In [None]:
# Column dtypes and non-null counts of the combined frame, now including the title columns.
data.info()

In [None]:
def _ticket_prefix(ticket):
    """Return the leading token of a ticket string, or 'X' for all-digit tickets.

    For tickets that are not purely digits, every "." and "/" is removed, the
    result is stripped, and the text before the first space is returned.
    """
    if ticket.isdigit():
        return 'X'
    cleaned = ticket.replace(".", "").replace("/", "").strip()
    return cleaned.split(' ')[0]


data['Ticket_info'] = data['Ticket'].apply(_ticket_prefix)

In [None]:
# Distinct ticket prefixes ('X' marks tickets that were all digits).
data['Ticket_info'].unique()

In [None]:
# Counts per ticket prefix, split by Survived.
# x/hue are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, and a positional Series combined with `hue` raises.
sns.countplot(x='Ticket_info', hue='Survived', data=data);

### Missing Value

In [None]:
# Non-null counts per column: any column with fewer non-null entries than rows has gaps to fill.
data.info()

In [None]:
# Row count per Embarked value (missing values are not counted).
# The Series method replaces the top-level pd.value_counts, deprecated since pandas 2.1.
data['Embarked'].value_counts()

In [None]:
# Fill missing Embarked values with the hard-coded 'S'.
# NOTE(review): presumably chosen as the most frequent value — confirm against the counts.
embarked_filled = data['Embarked'].fillna(value='S')
data['Embarked'] = embarked_filled

In [None]:
# Fill missing Fare values with the mean Fare of the whole `data` frame.
fare_mean = data['Fare'].mean()
data['Fare'] = data['Fare'].fillna(fare_mean)

In [None]:
# Re-check the non-null counts after filling Embarked and Fare.
data.info()

In [None]:
# First ten Cabin values as currently stored.
data['Cabin'].head(10)

In [None]:
def _cabin_deck(cabin):
    """Reduce a Cabin value to its first character, or 'NoCabin' when missing.

    The 'NoCabin' placeholder is passed through unchanged so that re-running
    this cell does not shorten it to 'N'.
    """
    if pd.isnull(cabin):
        return 'NoCabin'
    cabin = str(cabin)
    if cabin == 'NoCabin':
        return cabin
    return cabin[0]


data["Cabin"] = data['Cabin'].apply(_cabin_deck)

In [None]:
# First ten Cabin values as currently stored (single letters or 'NoCabin' once the cell above has run).
data['Cabin'].head(10)

In [None]:
# Distinct Cabin values, in order of first appearance.
data["Cabin"].unique()

In [None]:
# Counts per Cabin value, split by Survived.
# x/hue are passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, and a positional Series combined with `hue` raises.
sns.countplot(x='Cabin', hue='Survived', data=data);

In [None]:
# Replace Sex with integer category codes; codes follow the sorted order of the
# distinct values, and a missing value would be coded -1.
sex_as_category = data['Sex'].astype('category')
data['Sex'] = sex_as_category.cat.codes