In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
def process_titanic_train():
    """Load ``titanic_train.csv`` and return a feature frame plus the fill values used.

    Processing order (later steps rely on earlier ones):

    1. Drop ``PassengerId``.
    2. Fill missing values: column mode for ``Pclass``, ``SibSp``, ``Parch`` and
       ``Embarked``; a placeholder string for ``Name``; ``"9"`` for ``Ticket``;
       a title-based guess for ``Sex``; the mean of the passenger's
       (``Pclass``, ``Sex``) group for ``Age`` and ``Fare``.
    3. Derive ``title`` (from ``Name``), ``age_group``, ``family_size``,
       ``ticket_feat`` (first character of ``Ticket``) and ``fare_tier``.
       ``Cabin`` becomes a 0/1 "value present" flag and ``Fare`` is replaced by
       ``round(log(Fare + 1), 2)``.
    4. One-hot encode the categorical columns with the first level dropped and
       remove the source columns.

    Every CSV column not mentioned above is passed through unchanged.

    Returns
    -------
    df : pandas.DataFrame
        The processed frame.
    fill_dict : dict
        ``'Pclass'``, ``'Name'``, ``'SibSp'``, ``'Parch'`` and ``'Embarked'`` map
        to the scalar used for filling; ``'Age'`` and ``'Fare'`` map to a Series
        of group means indexed by (``Pclass``, ``Sex``).

    NOTE(review): the dummy columns are derived from the values present in this
    file, so a frame built from another file only has the same columns if it
    contains the same categories.
    """
    df = pd.read_csv('titanic_train.csv').drop(columns=['PassengerId'])
    fill_dict = {}
    group_cols = ['Pclass', 'Sex']

    def fill_with_mode(column):
        # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form does not update `df` under pandas Copy-on-Write.
        value = df[column].mode()[0]
        df[column] = df[column].fillna(value)
        fill_dict[column] = value

    def fill_with_group_mean(column):
        # Select the column *before* aggregating; averaging the whole frame
        # fails on the string columns with pandas >= 2.0.
        grouped = df.groupby(group_cols)[column]
        fill_dict[column] = grouped.mean()
        return df[column].fillna(grouped.transform('mean'))

    fill_with_mode('Pclass')

    Name_fill = "no_name. is missing value"
    df['Name'] = df['Name'].fillna(Name_fill)
    fill_dict['Name'] = Name_fill

    # The title is the last word before the first '.', e.g. "Braund, Mr. Owen" -> "mr".
    common_titles = ['mr', 'mrs', 'miss', 'master']
    title = df['Name'].apply(lambda name: name.split('.')[0].split()[-1].strip().lower())
    df['title'] = title.where(title.isin(common_titles), 'special_title')
    df = df.drop(columns=['Name'])

    # Missing Sex: 'mrs'/'miss' imply female, every other title is treated as male.
    guessed_sex = np.where(df['title'].isin(['mrs', 'miss']), 'female', 'male')
    df['Sex'] = np.where(df['Sex'].isna(), guessed_sex, df['Sex'])

    # Pclass and Sex are complete at this point, so every row belongs to a group.
    df['Age'] = fill_with_group_mean('Age').round(2)

    fill_with_mode('SibSp')
    fill_with_mode('Parch')

    df['Ticket'] = df['Ticket'].fillna("9")

    df['Fare'] = fill_with_group_mean('Fare')

    df['Cabin'] = df['Cabin'].notna().astype(int)

    fill_with_mode('Embarked')

    # pd.cut intervals are right-closed: (0, 5], (5, 14], (14, 60], (60, 100].
    df['age_group'] = pd.cut(df['Age'], bins=[0, 5, 14, 60, 100],
                             labels=['infant', 'child', 'adult', 'old'])
    df['family_size'] = df['SibSp'] + df['Parch']

    # Tickets starting with 4-9 are pooled into one bucket; any other first
    # character (1-3 or a letter) is kept as its own level.
    first_char = df['Ticket'].astype(str).str[0]
    df['ticket_feat'] = first_char.where(~first_char.isin(list('456789')), 'other_num')
    df = df.drop(columns=['Ticket'])

    # fare_tier is cut from the raw fare, so it must precede the log transform.
    df['fare_tier'] = pd.cut(df['Fare'], bins=[0, 50, 1000],
                             labels=['economical', 'expensive'], include_lowest=True)
    df['Fare'] = np.log(df['Fare'] + 1).round(2)

    # (source column, dummy prefix) in the order the dummy columns are appended.
    encodings = [
        ('Sex', 'sex'),
        ('Embarked', 'embarked'),
        ('title', 'title'),
        ('age_group', 'age_group'),
        ('ticket_feat', 'ticket'),
        ('fare_tier', 'fare'),
    ]
    dummies = [pd.get_dummies(df[column], prefix=prefix, drop_first=True)
               for column, prefix in encodings]
    df = pd.concat([df] + dummies, axis=1)
    df = df.drop(columns=[column for column, _ in encodings])

    return df, fill_dict

In [5]:
# Build the processed training frame and keep the fill values derived from it.
df, fill_dict = process_titanic_train()

In [6]:
def process_titanic_test(fill_dict):
    """Load ``titanic_test.csv`` and build its feature frame using given fill values.

    Missing values are filled from ``fill_dict`` rather than from statistics of
    this file. The feature derivation afterwards is: ``title`` from ``Name``,
    title-based guess for a missing ``Sex``, ``Cabin`` as a 0/1 presence flag,
    ``age_group``, ``family_size``, ``ticket_feat``, ``fare_tier``,
    ``Fare -> round(log(Fare + 1), 2)`` and one-hot encoding with the first
    level dropped.

    Parameters
    ----------
    fill_dict : dict
        Scalar fill values under ``'Pclass'``, ``'Name'``, ``'SibSp'``,
        ``'Parch'`` and ``'Embarked'``; under ``'Age'`` and ``'Fare'`` an object
        that is indexed first by Pclass and then by Sex (for example a Series
        with a (``Pclass``, ``Sex``) MultiIndex).

    Returns
    -------
    df : pandas.DataFrame
        The processed frame, without ``PassengerId``.
    pass_id_array : numpy.ndarray
        The ``PassengerId`` values in file order.

    NOTE(review): the dummy columns are derived from the values present in this
    file, so they match another processed frame only if both files contain the
    same categories.
    """
    df = pd.read_csv('titanic_test.csv')

    pass_id_array = df['PassengerId'].values
    df = df.drop(columns=['PassengerId'])

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form does not update `df` under pandas Copy-on-Write,
    # which would leave NaN names for the title extraction below to choke on.
    for feat in ['Pclass', 'Name', 'SibSp', 'Parch', 'Embarked']:
        df[feat] = df[feat].fillna(fill_dict[feat])

    # The title is the last word before the first '.', e.g. "Kelly, Mr. James" -> "mr".
    common_titles = ['mr', 'mrs', 'miss', 'master']
    title = df['Name'].apply(lambda name: name.split('.')[0].split()[-1].strip().lower())
    df['title'] = title.where(title.isin(common_titles), 'special_title')
    df = df.drop(columns=['Name'])

    # Missing Sex: 'mrs'/'miss' imply female, every other title is treated as male.
    guessed_sex = np.where(df['title'].isin(['mrs', 'miss']), 'female', 'male')
    df['Sex'] = np.where(df['Sex'].isna(), guessed_sex, df['Sex'])

    def fill_from_lookup(column):
        # Only rows with a missing value consult the lookup, so a
        # (Pclass, Sex) pair absent from it matters only for those rows.
        table = fill_dict[column]
        return df.apply(
            lambda row: table[row['Pclass']][row['Sex']] if pd.isnull(row[column]) else row[column],
            axis=1,
        )

    df['Age'] = fill_from_lookup('Age').round(2)
    df['Ticket'] = df['Ticket'].fillna("9")
    df['Fare'] = fill_from_lookup('Fare')
    df['Cabin'] = df['Cabin'].notna().astype(int)

    # pd.cut intervals are right-closed: (0, 5], (5, 14], (14, 60], (60, 100].
    df['age_group'] = pd.cut(df['Age'], bins=[0, 5, 14, 60, 100],
                             labels=['infant', 'child', 'adult', 'old'])
    df['family_size'] = df['SibSp'] + df['Parch']

    # Tickets starting with 4-9 are pooled into one bucket; any other first
    # character (1-3 or a letter) is kept as its own level.
    first_char = df['Ticket'].astype(str).str[0]
    df['ticket_feat'] = first_char.where(~first_char.isin(list('456789')), 'other_num')
    df = df.drop(columns=['Ticket'])

    # fare_tier is cut from the raw fare, so it must precede the log transform.
    df['fare_tier'] = pd.cut(df['Fare'], bins=[0, 50, 1000],
                             labels=['economical', 'expensive'], include_lowest=True)
    df['Fare'] = np.log(df['Fare'] + 1).round(2)

    # (source column, dummy prefix) in the order the dummy columns are appended.
    encodings = [
        ('Sex', 'sex'),
        ('Embarked', 'embarked'),
        ('title', 'title'),
        ('age_group', 'age_group'),
        ('ticket_feat', 'ticket'),
        ('fare_tier', 'fare'),
    ]
    dummies = [pd.get_dummies(df[column], prefix=prefix, drop_first=True)
               for column, prefix in encodings]
    df = pd.concat([df] + dummies, axis=1)
    df = df.drop(columns=[column for column, _ in encodings])

    return df, pass_id_array

In [7]:
# Process the test file with the stored fill values; the PassengerId values
# come back as a separate array because the column is dropped from the frame.
testdf, pass_id_array = process_titanic_test(fill_dict)

In [8]:
# Preview the first rows of the processed test frame.
testdf.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin,family_size,sex_male,embarked_Q,embarked_S,...,ticket_3,ticket_A,ticket_C,ticket_F,ticket_L,ticket_P,ticket_S,ticket_W,ticket_other_num,fare_expensive
0,3,34.5,0,0,2.18,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
1,3,47.0,1,0,2.08,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
2,2,62.0,0,0,2.37,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,27.0,0,0,2.27,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0
4,3,22.0,1,1,2.59,0,2,0,0,1,...,1,0,0,0,0,0,0,0,0,0


In [None]:
# Preview the processed training columns with 'Survived' left out.
df.drop(columns=['Survived']).head()

In [9]:
# Number of columns in the processed training frame that still hold a missing value.
df.isnull().any().sum()

0

In [None]:
# Correlation heatmap of the processed training columns. Drawing on an explicit
# Axes lets the figure carry a title; the trailing ';' keeps the Axes repr out
# of the cell output.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(), ax=ax)
ax.set_title('Pairwise correlation of processed training columns');

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [20]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
# Classifiers compared in the cross-validation cells. The tree-based models
# draw random numbers while fitting, so they are seeded to make the reported
# scores repeatable from one run to the next.
RANDOM_STATE = 42
algos = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    SVC(),
]

In [22]:
# Cross-validate every candidate behind a StandardScaler on all features.
# The feature matrix and target are built once instead of on every iteration.
X_all_features = df.drop(columns=['Survived'])
y_survived = df['Survived']

for algo in algos:
    model = make_pipeline(StandardScaler(), algo)
    score = cross_val_score(model, X_all_features, y_survived)
    # type(...).__name__ gives the class name without parsing the estimator repr.
    print("{}:\n{}\n\n".format(type(algo).__name__, score))

LogisticRegression:
[ 0.81481481  0.81481481  0.82828283]


DecisionTreeClassifier:
[ 0.77441077  0.79461279  0.8013468 ]


RandomForestClassifier:
[ 0.79124579  0.83501684  0.81144781]


SVC:
[ 0.82154882  0.81144781  0.83501684]




In [18]:
# Cross-validate every candidate without scaling and without the two continuous
# columns ('Age', 'Fare'). The feature matrix and target are built once instead
# of on every iteration.
X_without_age_fare = df.drop(columns=['Survived', 'Age', 'Fare'])
y_survived = df['Survived']

for algo in algos:
    # Single-step pipeline: no preprocessing in front of the estimator here.
    model = make_pipeline(algo)
    score = cross_val_score(model, X_without_age_fare, y_survived)
    # type(...).__name__ gives the class name without parsing the estimator repr.
    print("{}:\n{}\n\n".format(type(algo).__name__, score))

LogisticRegression:
[ 0.79124579  0.83164983  0.81818182]


DecisionTreeClassifier:
[ 0.77104377  0.81818182  0.80808081]


RandomForestClassifier:
[ 0.76767677  0.8013468   0.82154882]


SVC:
[ 0.81481481  0.84511785  0.83501684]




In [30]:
# 20-tree random forest on the unscaled features; seeded so the fold scores
# are the same every time the cell is run.
cross_val_score(RandomForestClassifier(n_estimators=20, random_state=42), df.drop(columns = ['Survived']), df['Survived'])

array([ 0.76767677,  0.83501684,  0.82491582])

In [31]:
# SVC with default parameters on the features as-is (no scaler in front).
cross_val_score(SVC(), df.drop(columns = ['Survived']), df['Survived'])

array([ 0.79461279,  0.82154882,  0.81481481])