## **Titanic - Machine Learning from Disaster**<br>
The "Getting Started" Kaggle Competition, with an objective to predict the 418 passengers' survival on the Titanic.

In [414]:
#!pip install -q kaggle

In [415]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, recall_score
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, plot_roc_curve, precision_score
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance

In [416]:
#connect to google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [417]:
#importing the kaggle dataset
#import os
#os.environ['KAGGLE_CONFIG_DIR'] = '/content/drive/MyDrive/Colab Notebooks/Kaggle/Titanic'

#changing to the working directory
#%cd /content/drive/MyDrive/Colab Notebooks/Kaggle/Titanic

#Kaggle API command for the dataset
#!kaggle competitions download -c titanic

In [418]:
# specifying the filepaths
submission = '/content/drive/MyDrive/Colab Notebooks/Kaggle/Titanic/gender_submission.csv'
test = '/content/drive/MyDrive/Colab Notebooks/Kaggle/Titanic/test.csv'
train = '/content/drive/MyDrive/Colab Notebooks/Kaggle/Titanic/train.csv'

**1) Loading the Data**

In [419]:
testraw = pd.read_csv(test)
testraw.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [420]:
trainraw = pd.read_csv(train)
trainraw.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


**2) Data Cleaning/ Feature Extraction**

Items to check for:<br>


*   Duplicated records
*   Missing data
*   Inconsistent presentations (e.g. upper/ lower case)




In [421]:
print('Row/ column of the dataset:', trainraw.shape)

Row/ column of the dataset: (891, 12)


In [422]:
# Check for duplciated records
trainraw.duplicated().value_counts()

False    891
dtype: int64

In [423]:
# Check for missing values in the dataset
trainraw.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [424]:
# dropping Cabin, as majority of the data is null
trainraw = trainraw.drop('Cabin', axis=1)

In [425]:
# Clean up Embarked
trainraw[trainraw['Embarked'].isnull()==True]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,


In [426]:
trainraw['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [427]:
# fill Embarked with most frequent value
trainraw['Embarked'] = trainraw['Embarked'].fillna('S')
trainraw['Embarked'].isnull().any()

False

In [428]:
# replacing Male/ Female to 0 & 1
trainraw['Sex'] = trainraw['Sex'].replace({'male':0, 'female':1})

In [429]:
# Cleaning up inconsistent format in Name
trainraw['Name'] = trainraw['Name'].str.upper()
trainraw['Name'].head(10)

0                              BRAUND, MR. OWEN HARRIS
1    CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
2                               HEIKKINEN, MISS. LAINA
3         FUTRELLE, MRS. JACQUES HEATH (LILY MAY PEEL)
4                             ALLEN, MR. WILLIAM HENRY
5                                     MORAN, MR. JAMES
6                              MCCARTHY, MR. TIMOTHY J
7                       PALSSON, MASTER. GOSTA LEONARD
8    JOHNSON, MRS. OSCAR W (ELISABETH VILHELMINA BERG)
9                  NASSER, MRS. NICHOLAS (ADELE ACHEM)
Name: Name, dtype: object

Format of the name is organized by last name, followed by title. Further features can be extracted for analysis.

In [430]:
# split columns accordingly
Name = trainraw['Name'].str.split('[,.]', expand=True)
Name.rename(columns={0: 'Last Name', 1: 'Title', 2: 'First Name'}, inplace=True)

In [431]:
# Merge Name with original dataframe
trainraw = trainraw.merge(Name, how='left', left_index=True, right_index=True)

In [432]:
trainraw = trainraw.drop(['Name', 3], axis=1)
trainraw.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Last Name,Title,First Name
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,S,BRAUND,MR,OWEN HARRIS
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C,CUMINGS,MRS,JOHN BRADLEY (FLORENCE BRIGGS THAYER)


In [433]:
#check the titles
trainraw['Title'].value_counts()

 MR              517
 MISS            182
 MRS             125
 MASTER           40
 DR                7
 REV               6
 MLLE              2
 MAJOR             2
 COL               2
 SIR               1
 LADY              1
 MME               1
 CAPT              1
 MS                1
 DON               1
 JONKHEER          1
 THE COUNTESS      1
Name: Title, dtype: int64

Reducing/ Grouping the titles - Mr, Sir, Don/ MRS, Miss, MS, Lady, MLLE, MME/ ranked - Major, Col, CAPT, DR, The Countess/ master

In [434]:
# cleaning up titles 
trainraw['Title'] = trainraw['Title'].replace(dict.fromkeys(['DON', 'SIR'], 'MR'), regex=True).str. strip(' ')
trainraw['Title'] = trainraw['Title'].replace(dict.fromkeys(['MISS', 'MLLE', 'LADY', 'MME'], 'MS'), regex=True).str. strip(' ')
trainraw['Title'] = trainraw['Title'].replace(dict.fromkeys(['DR', 'REV', 'MAJOR', 'COL', 'THE COUNTESS', 'JONKHEER', 'CAPT'], 'RANKED'), regex=True).str. strip(' ')

In [435]:
# mean age by age group
master = round(trainraw[trainraw['Title']=='MASTER']['Age'].mean(),2)
mr = round(trainraw[trainraw['Title']=='MR']['Age'].mean(),2)
mrs = round(trainraw[trainraw['Title']=='MRS']['Age'].mean(),2)
ms = round(trainraw[trainraw['Title']=='MS']['Age'].mean(),2)
ranked = round(trainraw[trainraw['Title']=='RANKED']['Age'].mean(),2)
print('mean age of (master/ mr/ mrs/ ms/ ranked):', master, mr, mrs, ms, ranked)

mean age of (master/ mr/ mrs/ ms/ ranked): 4.57 32.43 35.9 22.03 45.53


In [436]:
# fillna by group's mean age
trainraw.loc[trainraw['Title'] == 'MASTER', 'Age'] = trainraw.loc[trainraw['Title'] == 'MASTER', 'Age'].fillna(master)
trainraw.loc[trainraw['Title'] == 'MR', 'Age'] = trainraw.loc[trainraw['Title'] == 'MR', 'Age'].fillna(mr)
trainraw.loc[trainraw['Title'] == 'MS', 'Age'] = trainraw.loc[trainraw['Title'] == 'MS', 'Age'].fillna(ms)
trainraw.loc[trainraw['Title'] == 'MRS', 'Age'] = trainraw.loc[trainraw['Title'] == 'MRS', 'Age'].fillna(mrs)
trainraw.loc[trainraw['Title'] == 'RANKED', 'Age'] = trainraw.loc[trainraw['Title'] == 'RANKED', 'Age'].fillna(ranked)

In [437]:
# examining the data type of the train set
trainraw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Embarked     891 non-null    object 
 10  Last Name    891 non-null    object 
 11  Title        891 non-null    object 
 12  First Name   891 non-null    object 
dtypes: float64(2), int64(6), object(5)
memory usage: 90.6+ KB


**3) Pre-Processing & Feature Engineering**

Further steps is required before model building: <br>
*   Remove unique identifiers (passenger id, first name, last name, ticket)
*   One hot code important categorical data
*   Additional feature engineering
*   Train/ Test Split







In [438]:
trainraw.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Last Name,Title,First Name
0,1,0,3,0,22.0,1,0,A/5 21171,7.25,S,BRAUND,MR,OWEN HARRIS
1,2,1,1,1,38.0,1,0,PC 17599,71.2833,C,CUMINGS,MRS,JOHN BRADLEY (FLORENCE BRIGGS THAYER)


In [439]:
# removing unique identifier columns
dropcolumns = ['PassengerId', 'Ticket', 'Last Name', 'First Name']
trainraw = trainraw.drop(dropcolumns, axis=1)

In [440]:
# one-hot-coding categorical data which will facilitate modelling
codedcolumns = ['Pclass', 'Embarked', 'Title']
df = pd.get_dummies(trainraw, columns = codedcolumns)
df.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_MASTER,Title_MR,Title_MRS,Title_MS,Title_RANKED
0,0,0,22.0,1,0,7.25,0,0,1,0,0,1,0,1,0,0,0
1,1,1,38.0,1,0,71.2833,1,0,0,1,0,0,0,0,1,0,0
2,1,1,26.0,0,0,7.925,0,0,1,0,0,1,0,0,0,1,0
3,1,1,35.0,1,0,53.1,1,0,0,0,0,1,0,0,1,0,0
4,0,0,35.0,0,0,8.05,0,0,1,0,0,1,0,1,0,0,0


In [441]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

In [442]:
def iterate_vif(df, vif_threshold=5, max_vif=6):
  count = 0
  while max_vif > vif_threshold:
    count += 1
    print("Iteration # "+str(count))
    vif = pd.DataFrame()
    vif["VIFactor"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    vif["features"] = df.columns

    if vif['VIFactor'].max() > vif_threshold:
      print('Removing %s with VIF of %f' % (vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], vif['VIFactor'].max()))
      df = df.drop(vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], axis=1)
      max_vif = vif['VIFactor'].max()
    else:
      print('Complete')
      return df, vif.sort_values('VIFactor')

final_df, final_vif = iterate_vif(X_train)

Iteration # 1
Removing Age with VIF of 5.216249
Iteration # 2
Complete


In [443]:
final_df.head()

Unnamed: 0,SibSp,Parch,Fare,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,Title_MASTER,Title_MRS,Title_MS,Title_RANKED
324,8,2,69.55,0,1,0,1,0,0,0,0
518,1,0,26.0,1,0,0,1,0,1,0,0
131,0,0,7.05,0,1,0,1,0,0,0,0
178,0,0,13.0,1,0,0,1,0,0,0,0
354,0,0,7.225,0,1,0,0,0,0,0,0


In [444]:
final_vif.sort_values(by='VIFactor', ascending=False)

Unnamed: 0,VIFactor,features
6,3.806781,Embarked_S
4,3.234653,Pclass_3
3,1.840784,Pclass_2
0,1.698009,SibSp
1,1.67384,Parch
5,1.539756,Embarked_Q
2,1.469724,Fare
9,1.443519,Title_MS
7,1.375477,Title_MASTER
8,1.359715,Title_MRS


Building a Train Test Split

In [445]:
final_features = final_df.columns.tolist()

In [446]:
X = df[final_features]
y = df['Survived']

In [447]:
#building a train test split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.25)

**4) Modelling**

4.1) Logistic Regression

In [448]:
# build the model
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr_model = lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [449]:
# getting the scores
lr_acc = accuracy_score(y_test, y_pred)
lr_roc = roc_auc_score(y_test, y_pred)
lr_per = precision_score(y_test, y_pred, average='binary')

print('Accuracy score:', lr_acc)
print('ROC score:', lr_roc)
print('Precision score:', lr_per)

Accuracy score: 0.7937219730941704
ROC score: 0.7821637426900584
Precision score: 0.7558139534883721


4.2) XGBoost Regressor

In [450]:
# build the model
xgb = XGBClassifier()
xgb_model = xgb.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

In [451]:
# getting the scores
xgbacc = accuracy_score(y_test, y_pred)
xgb_roc = roc_auc_score(y_test, y_pred)
xgb_per = precision_score(y_test, y_pred, average='binary')

print('Accuracy score:', xgbacc)
print('ROC score:', xgb_roc)
print('Precision score:', xgb_per)

Accuracy score: 0.7802690582959642
ROC score: 0.7637009189640769
Precision score: 0.7530864197530864


**5) Model Evaluation & Optimization**

In [452]:
model_comp = pd.DataFrame({'Metrics/ Model': ['Accuracy Score (Test data)', 'ROC Score', 'Precision Score'],
                           'Logistic Regression': [lr_acc, lr_roc, lr_per],
                           'XGBoost': [xgbacc, xgb_roc, xgb_per],
                          }).round(3)
model_comp

Unnamed: 0,Metrics/ Model,Logistic Regression,XGBoost
0,Accuracy Score (Test data),0.794,0.78
1,ROC Score,0.782,0.764
2,Precision Score,0.756,0.753


In [453]:
#cross validation
cross_val_accuracy = cross_val_score(lr_model, X, y, scoring='accuracy', cv=5)
print('Accuracy of the model with cross-validation is: ', (np.mean(cross_val_accuracy)*100).round(2),'%')

Accuracy of the model with cross-validation is:  81.48 %


In [454]:
lr_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [455]:
# the same parameters as for xgboost model
params_lr1 = {'C': 1.0, 
            'verbose': 0,
            'max_iter': 1000,
              }     
lr1 = LogisticRegression(**params_lr1)
lr1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [456]:
params_grid = {  
    'penalty' : ['l1', 'l2'],
    'intercept_scaling': [1, 5, 10],
              }

search_lr1 = GridSearchCV(lr1, params_grid, cv = 3, verbose=True, n_jobs=-1) 
search_lr1.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    1.4s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'intercept_scaling': [1, 5, 10],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=True)

In [457]:
# build the model
lr1 = LogisticRegression(solver='lbfgs', max_iter=1000, penalty='l2')
lr1_model = lr1.fit(X_train, y_train)
y_pred = lr1.predict(X_test)

In [458]:
# getting the scores
lr_acc1 = accuracy_score(y_test, y_pred)
lr_roc1 = roc_auc_score(y_test, y_pred)
lr_per1 = precision_score(y_test, y_pred, average='binary')

print('Accuracy score:', lr_acc1)
print('ROC score:', lr_roc1)
print('Precision score:', lr_per1)

Accuracy score: 0.7937219730941704
ROC score: 0.7821637426900584
Precision score: 0.7558139534883721


In [459]:
#cross validation
cross_val_accuracy = cross_val_score(lr1_model, X, y, scoring='accuracy', cv=5)
print('Accuracy of the optimized model with cross-validation is: ', (np.mean(cross_val_accuracy)*100).round(2),'%')

Accuracy of the optimized model with cross-validation is:  81.48 %


**6) Running the Model on Test Set**

Clean up is still required on the test set in similar procedures:


*   Drop columns with unique identifiers or too much missing data (Passenger id, ticket, cabin)
*   Fill in missing data (Age) by group
*   Hot Code important categorical data





In [460]:
print(testraw.shape)
testraw.head(2)

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [461]:
#check for missing data
testraw.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [462]:
# replacing Male/ Female to 0 & 1
testraw['Sex'] = testraw['Sex'].replace({'male':0, 'female':1})

In [463]:
# Cleaning up inconsistent format in Name
testraw['Name'] = testraw['Name'].str.upper()
testraw['Name'].head(3)

0                    KELLY, MR. JAMES
1    WILKES, MRS. JAMES (ELLEN NEEDS)
2           MYLES, MR. THOMAS FRANCIS
Name: Name, dtype: object

In [464]:
# split columns accordingly
Name1 = testraw['Name'].str.split('[,.]', expand=True)
Name1.rename(columns={0: 'Last Name', 1: 'Title', 2: 'First Name'}, inplace=True)

In [465]:
Name1

Unnamed: 0,Last Name,Title,First Name
0,KELLY,MR,JAMES
1,WILKES,MRS,JAMES (ELLEN NEEDS)
2,MYLES,MR,THOMAS FRANCIS
3,WIRZ,MR,ALBERT
4,HIRVONEN,MRS,ALEXANDER (HELGA E LINDQVIST)
...,...,...,...
413,SPECTOR,MR,WOOLF
414,OLIVA Y OCANA,DONA,FERMINA
415,SAETHER,MR,SIMON SIVERTSEN
416,WARE,MR,FREDERICK


In [466]:
# Merge Name with original dataframe
testraw = testraw.merge(Name1, how='left', left_index=True, right_index=True)

testraw = testraw.drop(['Name'], axis=1)
testraw.head(2)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Last Name,Title,First Name
0,892,3,0,34.5,0,0,330911,7.8292,,Q,KELLY,MR,JAMES
1,893,3,1,47.0,1,0,363272,7.0,,S,WILKES,MRS,JAMES (ELLEN NEEDS)


In [467]:
testraw['Title'].unique()

array([' MR', ' MRS', ' MISS', ' MASTER', ' MS', ' COL', ' REV', ' DR',
       ' DONA'], dtype=object)

In [468]:
# cleaning up titles 
testraw['Title'] = testraw['Title'].replace(dict.fromkeys(['MISS', 'DONA'], 'MS'), regex=True).str. strip(' ')
testraw['Title'] = testraw['Title'].replace(dict.fromkeys(['DR', 'REV', 'COL'], 'RANKED'), regex=True).str. strip(' ')

In [469]:
# mean age by age group
master1 = round(testraw[testraw['Title']=='MASTER']['Age'].mean(),2)
mr1 = round(testraw[testraw['Title']=='MR']['Age'].mean(),2)
mrs1 = round(testraw[testraw['Title']=='MRS']['Age'].mean(),2)
ms1 = round(testraw[testraw['Title']=='MS']['Age'].mean(),2)
ranked1 = round(testraw[testraw['Title']=='RANKED']['Age'].mean(),2)
print('mean age of (master/ mr/ mrs/ ms/ ranked):', master1, mr1, mrs1, ms1, ranked1)

mean age of (master/ mr/ mrs/ ms/ ranked): 7.41 32.0 38.9 22.04 44.8


In [470]:
# fillna by group's mean age
testraw.loc[testraw['Title'] == 'MASTER', 'Age'] = testraw.loc[testraw['Title'] == 'MASTER', 'Age'].fillna(master1)
testraw.loc[testraw['Title'] == 'MR', 'Age'] = testraw.loc[testraw['Title'] == 'MR', 'Age'].fillna(mr1)
testraw.loc[testraw['Title'] == 'MS', 'Age'] = testraw.loc[testraw['Title'] == 'MS', 'Age'].fillna(ms1)
testraw.loc[testraw['Title'] == 'MRS', 'Age'] = testraw.loc[testraw['Title'] == 'MRS', 'Age'].fillna(mrs1)
testraw.loc[testraw['Title'] == 'RANKED', 'Age'] = testraw.loc[testraw['Title'] == 'RANKED', 'Age'].fillna(ranked1)

In [471]:
# removing unique identifier columns
dropcolumns1 = ['Ticket', 'Last Name', 'First Name', 'Cabin']
testraw = testraw.drop(dropcolumns1, axis=1)

In [472]:
# one-hot-coding categorical data which will facilitate modelling
codedcolumns1 = ['Pclass', 'Embarked', 'Title']
dftest = pd.get_dummies(testraw, columns = codedcolumns1)
dftest.head(3)

Unnamed: 0,PassengerId,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_MASTER,Title_MR,Title_MRS,Title_MS,Title_RANKED
0,892,0,34.5,0,0,7.8292,0,0,1,0,1,0,0,1,0,0,0
1,893,1,47.0,1,0,7.0,0,0,1,0,0,1,0,0,1,0,0
2,894,0,62.0,0,0,9.6875,0,1,0,0,1,0,0,1,0,0,0


In [473]:
dftest['Fare'].mean()

35.6271884892086

In [474]:
dftest['Fare'].fillna(35, inplace=True)

In [475]:
columns = final_df.columns.tolist()

In [476]:
dftest[columns].isnull().sum()

SibSp           0
Parch           0
Fare            0
Pclass_2        0
Pclass_3        0
Embarked_Q      0
Embarked_S      0
Title_MASTER    0
Title_MRS       0
Title_MS        0
Title_RANKED    0
dtype: int64

In [477]:
# build the model
lrf = LogisticRegression(solver='lbfgs', max_iter=1000, penalty='l2')
lrf_model = lrf.fit(X_train, y_train)
test_predictions = lrf.predict(dftest[columns])
test_predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

Creating the Submission file

In [478]:
pids = testraw['PassengerId']
submit_df = {'PassengerId': pids,
             "Survived": test_predictions}
submission = pd.DataFrame.from_dict(submit_df)

In [479]:
submission.head(3)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0


In [480]:
submission.to_csv('titanic_submission1.csv', index=False)