In [1]:
import zipfile
with zipfile.ZipFile('./titanic.zip', 'r') as zip_ref:
    zip_ref.extractall('./')

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow import keras
from keras import optimizers, losses
from keras.layers import Dense, Dropout, Input

In [2]:
train_df = pd.read_csv('train.csv')
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test_X = pd.read_csv('test.csv')
test_X.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
len(test_X)

418

In [5]:
submission_df = pd.read_csv('gender_submission.csv')
submission_df.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## Preprocessing

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
for column in train_df.columns:
    print(column,len(train_df[column].unique()))

PassengerId 891
Survived 2
Pclass 3
Name 891
Sex 2
Age 89
SibSp 7
Parch 7
Ticket 681
Fare 248
Cabin 148
Embarked 4


In [8]:
train_X = train_df.drop('Survived', axis=1)
train_Y = train_df['Survived']

In [9]:
train_size = len(train_X)

In [10]:
merged_df = pd.concat([train_X, test_X])
merged_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [11]:
def preprocessing_df(df):
    df = df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
    
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    
    df['Cabin'].fillna('N/A', inplace=True)

    df['Age'].fillna(df['Age'].mean(), inplace=True)
    
    df['Fare'].fillna(df['Fare'].median(), inplace=True)
    
    return df
    

In [12]:
merged_df = preprocessing_df(merged_df)

In [13]:
merged_df.info()
merged_df.head(4)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Sex       1309 non-null   object 
 2   Age       1309 non-null   float64
 3   SibSp     1309 non-null   int64  
 4   Parch     1309 non-null   int64  
 5   Fare      1309 non-null   float64
 6   Cabin     1309 non-null   object 
 7   Embarked  1309 non-null   object 
dtypes: float64(2), int64(3), object(3)
memory usage: 92.0+ KB


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.925,,S
3,1,female,35.0,1,0,53.1,C123,S


In [14]:
def EDA(df):
    #df['Cabin'] = df['Cabin'].apply(lambda cabin : 'cabin' if cabin=='N/A' else 'not cabin')
    df.drop(['Cabin'], axis=1, inplace=True)
    df['Family'] = df.apply(lambda row : 'single' if row['SibSp'] + row['Parch'] == 0 
                            else ('medium' if row['SibSp'] + row['Parch'] < 4 else 'large'), axis=1)
    
#     def trans_age(x):
#         if(x<1): x = int(str(x).split('.')[1])
#         if(x<20): return 'young'
#         if(x<60): return 'adult'
#         return 'old'

#     df['Age'] = df['Age'].apply(trans_age)
#     df['Fare'] = df['Fare'].apply(lambda x : 'low' if x<df['Fare'].quantile(0.25) else('medium' if x<df['Fare'].quantile(0.5) 
#                                                                                    else ('high' if x<df['Fare'].quantile(0.75) 
#                                                                                          else 'very high')))
    
    df = df.drop(['SibSp', 'Parch'], axis=1)
    
    return df

In [15]:
merged_df = EDA(merged_df)

In [16]:
merged_df

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Family
0,3,male,22.000000,7.2500,S,medium
1,1,female,38.000000,71.2833,C,medium
2,3,female,26.000000,7.9250,S,single
3,1,female,35.000000,53.1000,S,medium
4,3,male,35.000000,8.0500,S,single
...,...,...,...,...,...,...
413,3,male,29.881138,8.0500,S,single
414,1,female,39.000000,108.9000,C,single
415,3,male,38.500000,7.2500,S,single
416,3,male,29.881138,8.0500,S,single


In [17]:
merged_df = pd.get_dummies(merged_df)
merged_df

Unnamed: 0,Pclass,Age,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Family_large,Family_medium,Family_single
0,3,22.000000,7.2500,0,1,0,0,1,0,1,0
1,1,38.000000,71.2833,1,0,1,0,0,0,1,0
2,3,26.000000,7.9250,1,0,0,0,1,0,0,1
3,1,35.000000,53.1000,1,0,0,0,1,0,1,0
4,3,35.000000,8.0500,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
413,3,29.881138,8.0500,0,1,0,0,1,0,0,1
414,1,39.000000,108.9000,1,0,1,0,0,0,0,1
415,3,38.500000,7.2500,0,1,0,0,1,0,0,1
416,3,29.881138,8.0500,0,1,0,0,1,0,0,1


In [19]:
scaler = StandardScaler()
merged_df[['Age', 'Fare']] = scaler.fit_transform(merged_df[['Age', 'Fare']])

In [20]:
train_X = merged_df[:train_size]
test_X = merged_df[train_size:]
test_X.fillna(test_X['Fare'].mode(), inplace=True)

In [21]:
pd.merge(pd.DataFrame(train_X),
         train_Y,
         right_index=True, left_index=True).corr()['Survived']

Pclass          -0.338481
Age             -0.070323
Fare             0.257307
Sex_female       0.543351
Sex_male        -0.543351
Embarked_C       0.168240
Embarked_Q       0.003650
Embarked_S      -0.149683
Family_large    -0.125147
Family_medium    0.279855
Family_single   -0.203367
Survived         1.000000
Name: Survived, dtype: float64

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    train_X, train_Y, test_size=0.3, random_state=42)

In [30]:
def get_answer(pred):
    return np.where(pred>0.5, 1, 0).squeeze()

def compare_test(answer, y):
    compare = (answer==y)
    print(compare.value_counts()[True] / len(compare))
    
def submission_output(answer):
    submission_df['Survived'] = answer
    submission_df.to_csv('submission.csv', index=False)
    !kaggle competitions submit -c titanic -f submission.csv -m ""

0.5335820895522388


### Logistic Regression

In [32]:
lr = LogisticRegression()
lr.fit(X_train, Y_train)

In [33]:
answer_train=lr.predict(X_test)
compare_test(answer_train,Y_test)

0.8097014925373134


In [46]:
answer_test = lr.predict(test_X)
submission_output(answer_test)

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 22.7kB/s]
100%|##########| 3.18k/3.18k [00:02<00:00, 1.19kB/s]


### Decision Tree Classifier

In [34]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, Y_train)

In [35]:
answer_train=dtc.predict(X_test)
compare_test(answer_train,Y_test)

0.7873134328358209


In [37]:
answer_test = dtc.predict(test_X)
submission_output(answer_test)

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 22.4kB/s]
100%|##########| 3.18k/3.18k [00:03<00:00, 1.07kB/s]


### Random Forest Classifier

In [36]:
rd_clf = RandomForestClassifier(n_estimators=100, max_depth=5)
rd_clf.fit(X_train, Y_train)

In [37]:
answer_train=rd_clf.predict(X_test)
compare_test(answer_train,Y_test)

0.8097014925373134


In [40]:
answer_test = rd_clf.predict(test_X)
submission_output(answer_test)

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 22.2kB/s]
100%|##########| 3.18k/3.18k [00:03<00:00, 862B/s]  


### Random Forest Classifier with GridSearchCV

In [38]:
rd_clf2 = RandomForestClassifier()
hyper_parameter = {
    'n_estimators':[100, 150, 200, 250, 300, 350, 400],
    'max_depth':[5,8,10,12,15,20]
}

hyper_parameter_tuner = GridSearchCV(estimator=rd_clf2, param_grid=hyper_parameter, cv= 5)
hyper_parameter_tuner.fit(X_train, Y_train)
params = hyper_parameter_tuner.best_params_

In [39]:
rd_clf2 = RandomForestClassifier(n_estimators=params['n_estimators'], max_depth=params['max_depth'])
rd_clf2.fit(X_train, Y_train)

In [40]:
answer_train=rd_clf.predict(X_test)
compare_test(answer_train,Y_test)

0.8097014925373134


In [47]:
answer_test = rd_clf.predict(test_X)
submission_output(answer_test)

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 22.9kB/s]
100%|##########| 3.18k/3.18k [00:03<00:00, 871B/s]  


### Gradient Boosting Classifier

In [41]:
gb = GradientBoostingClassifier()
gb.fit(X_train, Y_train)

In [42]:
answer_train=gb.predict(X_test)
compare_test(answer_train,Y_test)

0.8134328358208955


In [45]:
answer_test = gb.predict(test_X)
submission_output(answer_test)

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 21.9kB/s]
100%|##########| 3.18k/3.18k [00:03<00:00, 1.08kB/s]


### XGB Classifier

In [48]:
xgb = XGBClassifier(learning_rate=0.001,n_estimators=2500,
                                max_depth=4, min_child_weight=0,
                                gamma=0, subsample=0.7,
                                colsample_bytree=0.7,
                                scale_pos_weight=1, seed=27,
                                reg_alpha=0.00006)
xgb.fit(X_train, Y_train)

In [49]:
answer_train=xgb.predict(X_test)
compare_test(answer_train,Y_test)

0.8097014925373134


In [46]:
answer_test = xgb.predict(test_X)
submission_output(answer_test)

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 22.5kB/s]
100%|##########| 3.18k/3.18k [00:03<00:00, 827B/s]  


### Ensemble with previous Models

In [61]:
ensemble = pd.DataFrame([lr.predict(X_test), dtc.predict(X_test), rd_clf.predict(X_test), gb.predict(X_test), xgb.predict(X_test)]).T
ensemble.columns = ['lr', 'dtc', 'rd_clf', 'gb', 'xgb']
ensemble

ensemble['result'] = ensemble.sum(axis=1) > 2
to_binary = {False : 0, True : 1}
ensemble['result'].replace(to_binary, inplace=True)
ensemble.reset_index()

compare_test(ensemble['result'],Y_test.reset_index(drop=True))

0.8134328358208955


In [62]:
ensemble = pd.DataFrame([lr.predict(test_X), dtc.predict(test_X), rd_clf.predict(test_X), gb.predict(test_X), xgb.predict(test_X)]).T
ensemble.columns = ['lr', 'dtc', 'rd_clf', 'gb', 'xgb']
ensemble

ensemble['result'] = ensemble.sum(axis=1) > 2
to_binary = {False : 0, True : 1}
ensemble['result'].replace(to_binary, inplace=True)
ensemble

Unnamed: 0,lr,dtc,rd_clf,gb,xgb,result
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,1,0,0,0,0
3,0,0,0,0,0,0
4,1,0,1,0,1,1
...,...,...,...,...,...,...
413,0,0,0,0,0,0
414,1,1,1,1,1,1
415,0,0,0,0,0,0
416,0,0,0,0,0,0


In [63]:
submission_output(ensemble['result'])

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 21.0kB/s]
100%|##########| 3.18k/3.18k [00:02<00:00, 1.09kB/s]
