## Process Titanic Test Data
Submit predictions to Kaggle.com for an estimate of accuracy

In [1]:
# Import libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Workshop Functions
import sys
sys.path.append('..')
from Wksp722_functions import * 

In [2]:
# first read in the data
df = pd.read_csv("titanic_test_cleaned.csv")
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Salutation
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,Mr.
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,Mrs.
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,Mr.
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,Mr.
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,Mrs.


In [3]:
df.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
Salutation     0
dtype: int64

I don't have a 'Dona' salutation in my training set, so we need to change those values to "Mrs.".  
There is only one in the test set and her age is 39, which is close to the median age of "Mrs." (35 years)

In [4]:
df['Salutation'].unique()
print(df[df['Salutation']=='Dona.'])
df.loc[df.loc[:,'Salutation']=='Dona.','Salutation'] = 'Mrs.'

     PassengerId  Pclass                          Name     Sex   Age  SibSp  \
414         1306       1  Oliva-y-Ocana, Dona. Fermina  female  39.0      0   

     Parch    Ticket   Fare Embarked Salutation  
414      0  PC 17758  108.9        C      Dona.  


In [5]:
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Salutation
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q,Mr.
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S,Mrs.
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q,Mr.
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S,Mr.
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S,Mrs.


In [6]:
PID = df.loc[:,'PassengerId']
x_test =titanicNumericalConverter(df)

In [7]:
# Load the trained classifier
# Read in the classifier trained in the previous lesson
import pickle
RF_Final = pickle.load(open('RF_Final.pkl', 'rb'))

In [8]:
x_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Salutation_Col.,Salutation_Dr.,Salutation_Master.,Salutation_Miss.,Salutation_Mr.,Salutation_Mrs.,Salutation_Ms.,Salutation_Rev.
0,3,0,34.5,0,0,7.8292,False,True,False,False,False,False,False,True,False,False,False
1,3,1,47.0,1,0,7.0,False,False,True,False,False,False,False,False,True,False,False
2,2,0,62.0,0,0,9.6875,False,True,False,False,False,False,False,True,False,False,False
3,3,0,27.0,0,0,8.6625,False,False,True,False,False,False,False,True,False,False,False
4,3,1,22.0,1,1,12.2875,False,False,True,False,False,False,False,False,True,False,False


In [9]:
temp = pd.read_csv('titanic_train_columns.csv')
print(temp)

Empty DataFrame
Columns: [Unnamed: 0, Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked_C, Embarked_Q, Embarked_S, Salutation_Capt., Salutation_Col., Salutation_Countess., Salutation_Don., Salutation_Dr., Salutation_Jonkheer., Salutation_Lady., Salutation_Major., Salutation_Master., Salutation_Miss., Salutation_Mlle., Salutation_Mme., Salutation_Mr., Salutation_Mrs., Salutation_Ms., Salutation_Rev., Salutation_Sir.]
Index: []

[0 rows x 28 columns]


In [10]:
x_test = M3L3_titanicTest_colInsert(x_test)
x_test.columns

Salutation_Capt. 9
Salutation_Countess. 11
Salutation_Don. 12
Salutation_Jonkheer. 14
Salutation_Lady. 15
Salutation_Major. 16
Salutation_Mlle. 19
Salutation_Mme. 20
Salutation_Sir. 25


Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C',
       'Embarked_Q', 'Embarked_S', 'Salutation_Capt.', 'Salutation_Col.',
       'Salutation_Countess.', 'Salutation_Don.', 'Salutation_Dr.',
       'Salutation_Jonkheer.', 'Salutation_Lady.', 'Salutation_Major.',
       'Salutation_Master.', 'Salutation_Miss.', 'Salutation_Mlle.',
       'Salutation_Mme.', 'Salutation_Mr.', 'Salutation_Mrs.',
       'Salutation_Ms.', 'Salutation_Rev.', 'Salutation_Sir.'],
      dtype='object')

In [11]:
# Next use the model to predict the survival of the passengers in this new test data
y_pred = RF_Final.predict(x_test)

In [12]:
x_test.isnull().sum()

Pclass                  0
Sex                     0
Age                     0
SibSp                   0
Parch                   0
Fare                    0
Embarked_C              0
Embarked_Q              0
Embarked_S              0
Salutation_Capt.        0
Salutation_Col.         0
Salutation_Countess.    0
Salutation_Don.         0
Salutation_Dr.          0
Salutation_Jonkheer.    0
Salutation_Lady.        0
Salutation_Major.       0
Salutation_Master.      0
Salutation_Miss.        0
Salutation_Mlle.        0
Salutation_Mme.         0
Salutation_Mr.          0
Salutation_Mrs.         0
Salutation_Ms.          0
Salutation_Rev.         0
Salutation_Sir.         0
dtype: int64

In [13]:
# Next use the model to predict the survival of the passengers in this new test data
y_pred = RF_Final.predict(x_test)

# format the file for submitting to Kaggle.com
output = pd.DataFrame({'PassengerId': PID, 'Survived': y_pred})
output.to_csv('Patel_submission.csv', index=False)

#### When submitted to Kaggle.com competition, results in 0.76555 score, which the website indicates is accuracy.  

This is a Kaggle notebook that scored 100%.  https://www.kaggle.com/code/soham1024/titanic-data-science-eda-with-meme-solution
***Curiosity Points (10 points)*** Review the notebook and determine the differences in feature engineering that led to the improvement in accuracy