In [237]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import  KFold
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

seed = np.random.seed(100)

Load data

In [206]:
train_data = pd.read_csv('train.csv')
train_data = train_data.fillna(train_data['Age'].mean())

test_data = pd.read_csv('test.csv')
test_data = test_data.fillna(test_data['Age'].mean())

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        891 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [207]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Data preparation and preprocessing

In [208]:
colns = ['PassengerId', 'Pclass', 'Sex', 'Age', 'Survived']
train_set = train_data[colns].set_index('PassengerId') #.fillna(train_data.mean())


test_set = test_data[colns[:-1]].set_index('PassengerId') #.fillna(test_data.mean())

# add a new column category for the age group. Groups are chosen based on the classification from
# https://ieeexplore.ieee.org/document/6416855

# Train
category1 = pd.cut(train_set['Age'], bins=[0,12,18,59,80], 
                      labels=['child','adolescent','adult','elderly'])
category2 = pd.cut(test_set['Age'], bins=[0,12,18,59,80], 
                      labels=['child','adolescent','adult','elderly'])
train_set['Age group']  = category1
test_set['Age group']  = category2
update_train = train_set.drop('Age', axis=1)

# Test
test_set['Age group']  = category2
X_test = test_set.drop('Age', axis=1)


In [209]:
encoder = LabelEncoder()
update_train['Sex'] = encoder.fit_transform(update_train['Sex'])
update_train['Age group'] = encoder.fit_transform(update_train['Age group'])

test_set['Sex'] = encoder.fit_transform(test_set['Sex'])
test_set['Age group'] = encoder.fit_transform(test_set['Age group'])


X_train = update_train.drop('Survived', axis=1)
y_train = update_train[['Survived']]

test_set = test_set.drop('Age', axis=1)
# update_train.isna().sum()

Model selection

In [233]:
# create the search space
hyperparams={ 
'Logistic Regression':
    {'model':LogisticRegression(),
    'params':{
    'solver' : ['lbfgs', 'liblinear']
    }
  },
'Gradient Boost':
    {'model':GradientBoostingClassifier(),
    'params':{'n_estimators':[20,50,100],
    'criterion' : ['friedman_mse', 'mse', 'mae'],
    'loss' : ['deviance', 'exponential']
    }
  },
'Random forest':
    {'model':RandomForestClassifier(),
    'params':{
      'n_estimators':[20,50,100],
      'criterion':['gini','entropy'],
      'max_depth':[2, 3, 5, 10],
      'bootstrap':[True, False]
      }
  }
}

In [234]:
# create k-folds for cross validation
kfold = KFold(n_splits=5, random_state=0, shuffle=True)
accuracy_scores = list()
iteration = 1
keys=hyperparams.keys()

# Parameter tuning for Model selection using grid search method
for key in hyperparams.keys():
    g_search=GridSearchCV(hyperparams[key]['model'], hyperparams[key]['params'],
                          cv=kfold, return_train_score=True, verbose=1)
    g_search.fit(X_train,y_train)

    print(f'Iteration: {iteration}')
    accuracy_scores.append({'model':key,
                            'best_score':g_search.best_score_,
                            'best_params':g_search.best_params_
                           })
    iteration += 1

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Iteration: 1
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Iteration: 2
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Iteration: 3


In [235]:
# Display search results in a dataframe
pd.set_option('display.max_colwidth', None)
accuracy_df = pd.DataFrame(accuracy_scores,columns=['model','best_params','best_score'])
accuracy_df

Unnamed: 0,model,best_params,best_score
0,Logistic Regression,{'solver': 'lbfgs'},0.781137
1,Gradient Boost,"{'criterion': 'friedman_mse', 'loss': 'exponential', 'n_estimators': 20}",0.783359
2,Random forest,"{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 2, 'n_estimators': 20}",0.797991


In [236]:
rfc= RandomForestClassifier(n_estimators=20, 
                            criterion='entropy', 
                            max_depth=2, 
                            bootstrap=False,
                            random_state=0).fit(X_train, y_train)
predictions = rfc.predict(test_set)

predicted = pd.DataFrame(list(predictions), index=list(range(892, 1310)), 
                              columns=y_train.columns).rename_axis('PassengerId')
predicted.to_csv('titanic_predictions.csv')
predicted

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,1
894,0
895,0
896,1
...,...
1305,0
1306,1
1307,0
1308,0
