# Import Packages / Load Dataset

In [893]:
%cd /content/drive/My Drive/Kaggle/titanic

/content/drive/My Drive/Kaggle/titanic


In [894]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re
import lightgbm
import xgboost
import os

In [895]:
df_train = pd.read_csv(os.getcwd()+'/train.csv')

# Data Exploration

In [896]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [897]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [898]:
df_train.isna().sum()
# Age, Cabin have lots of missing values.

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [899]:
df_train.isna().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [900]:
df_train.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [902]:
# Cabin Types
df_train.Cabin.dropna().map(lambda x: x[0]).unique()

array(['C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

# Preprocessing

In [903]:
df_clean = df_train.copy()
# Make Sex a binary attribute
df_clean.Sex = df_clean.Sex.apply(lambda x: (x=='male') * 1)
# Cabin: Keep the initial
df_clean.Cabin = df_clean.Cabin.map(lambda x: re.findall("^[a-zA-Z]", x)[0] if not (x is np.nan) else x)
# Title from Name
df_clean["Title"] = df_clean.Name.map(lambda x: re.findall(pattern = "([A-Z][a-zA-Z]+)\.", string = x)[0])
# Make rare titles "Other"
df_clean.Title = df_clean.Title.map(lambda x: "Other" if (df_clean.Title.value_counts()[x] < 10) else x)
# Impute Cabin simply by set "Unk" group
df_clean.Cabin = df_clean.Cabin.fillna(value="Unk")
# Impute Age by the median of same Pclass&Title
byPclassAndTitle = df_clean.groupby(["Pclass", "Title"]).agg(np.median)["Age"].reset_index()
df_clean = pd.concat(
    [
     df_clean[df_clean.Age.notna()], 
     pd.merge(df_clean[df_clean.Age.isna()], byPclassAndTitle, how='left', on=["Pclass", "Title"]).drop(columns="Age_x").rename(columns={"Age_y":"Age"})[df_clean.columns]
    ],
    axis=0
)
# Embarked
df_clean.Embarked = df_clean.Embarked.fillna("unk")
df_clean.sort_values("PassengerId", inplace=True)

In [904]:
df_clean.Title.value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
Other      27
Name: Title, dtype: int64

In [905]:
df_clean.Embarked.value_counts()

S      644
C      168
Q       77
unk      2
Name: Embarked, dtype: int64

# Model

In [935]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [954]:
df_clean.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title'],
      dtype='object')

In [955]:
# kept_cols = ['Pclass', 'Sex', 'Age', 'SibSp',
#        'Parch', 'Fare', 'Cabin', 'Embarked', 'Title']

# cat_cols = ['Cabin', 'Embarked', 'Title']

kept_cols = ['Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare' , 'Title']
cat_cols = ['Title']

num_cols = [i for i in kept_cols if i not in cat_cols]

In [956]:
X = pd.concat([df_clean[num_cols], pd.get_dummies(df_clean[cat_cols], drop_first=True)], axis=1)

In [957]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,3,1,22.0,1,0,7.2500,0,1,0,0
1,1,0,38.0,1,0,71.2833,0,0,1,0
2,3,0,26.0,0,0,7.9250,1,0,0,0
3,1,0,35.0,1,0,53.1000,0,0,1,0
4,3,1,35.0,0,0,8.0500,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,13.0000,0,0,0,1
887,1,0,19.0,0,0,30.0000,1,0,0,0
176,3,0,18.0,1,2,23.4500,1,0,0,0
889,1,1,26.0,0,0,30.0000,0,1,0,0


In [958]:
y = df_clean['Survived']

In [1104]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
clf = lightgbm.LGBMClassifier(
    max_depth=5,
    min_child_weight=0.1,
    n_jobs=-1, num_leaves=15, 
)
# clf = xgboost.XGBClassifier()
clf.fit(X=X_train, y=y_train)
clf.score(X_val, y_val)

0.8715083798882681

# Do the same thing for our test data

In [945]:
df_test = pd.read_csv(os.getcwd()+'/test.csv')

In [1106]:
df_test_clean = df_test.copy()
# Make Sex a binary attribute
df_test_clean.Sex = df_test_clean.Sex.apply(lambda x: (x=='male') * 1)
# Cabin: Keep the initial
df_test_clean.Cabin = df_test_clean.Cabin.map(lambda x: re.findall("^[a-zA-Z]", x)[0] if not (x is np.nan) else x)
# Title from Name
df_test_clean["Title"] = df_test_clean.Name.map(lambda x: re.findall(pattern = "([A-Z][a-zA-Z]+)\.", string = x)[0])
# Make rare titles "Other"
df_test_clean.Title = df_test_clean.Title.map(lambda x: "Other" if x not in ["Mr", "Miss", "Mrs", "Master"] else x)
# Impute Cabin simply by set "Unk" group
df_test_clean.Cabin = df_test_clean.Cabin.fillna(value="Unk")

# Impute Age by the median of same Pclass&Title
# Use the result from train


df_test_clean = pd.concat(
    [
     df_test_clean[df_test_clean.Age.notna()], 
     pd.merge(df_test_clean[df_test_clean.Age.isna()], byPclassAndTitle, how='left', on=["Pclass", "Title"]).drop(columns="Age_x").rename(columns={"Age_y":"Age"})[df_test_clean.columns]
    ],
    axis=0
)
# Embarked
df_test_clean.Embarked = df_test_clean.Embarked.fillna("unk")
df_test_clean.sort_values("PassengerId", inplace=True)

In [1107]:
X_test = pd.concat([df_test_clean[num_cols], pd.get_dummies(df_test_clean[cat_cols], drop_first=True)], axis=1)

In [1108]:
X_test = X_test.assign(Cabin_T = 0).assign(Embarked_unk=0)[X_train.columns]

In [1109]:
clf.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [1110]:
y_test_truth = pd.read_csv(os.getcwd()+'/gender_submission.csv')

In [1111]:
clf.score(X_test, y_test_truth.Survived)

0.8827751196172249

# Export Prediction


In [1112]:
y_submission = y_test_truth.copy()
y_submission["Survived"] = clf.predict(X_test)
y_submission.set_index("PassengerId", inplace=True)
y_submission.to_csv(os.getcwd()+'/Submission.csv')

In [1118]:
!jupyter nbconvert --to markdown Titanic.ipynb

[NbConvertApp] Converting notebook Titanic.ipynb to markdown
[NbConvertApp] Writing 20516 bytes to Titanic.md
