### Objective of this exercise is to compare K Nearest Neighbor and Random Forest prediction models on this Titanic dataset. I want to use both models to predict the survival rate of each passenger onboard. Survived is prepresented by 1.

In [115]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [116]:
df = pd.read_csv('titanic_dataset.csv')

In [117]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


##### Age could be an important factor in the prediction. Even though it's missing some values, I can keep this column and fill the missing values with mean age in the dataset

In [119]:
# fill the missing age with mean
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)

##### Cabin locations could possibily affect passenger survival rate. Because the number of values are less than ideal, I decided to take it off.  PassengerID column can be dropped. It doesn't add any real value.

In [120]:
# drop Cabin & PassengerID columns
df = df.drop(['PassengerId', 'Cabin'], axis = 1)

##### Embarked column tells you the port where the passengers are picked up from. I will populate missing values with mode.

In [121]:
# add s to Embarked column
embarked_port = 'S'
df['Embarked'] = df['Embarked'].fillna(embarked_port)

##### Convert strings to integers by adding a new column for each value and assigning [0,1] on each column based on whether the column is represented by the sex and embarked columns. This will change gender (Sex) and Embarked columns from strings to integers. Then they added back into dataframe as male/female and CQS (letter stands for the embarked port of orgin)

In [122]:
# convert categorical variable to dummy variable then add back to dataframe
df_sex = pd.get_dummies(df['Sex'])
df_em = pd.get_dummies(df['Embarked'])
df2 = pd.concat([df, df_em, df_sex], axis = 1)

In [123]:
df2.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,C,Q,S,female,male
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S,0,0,1,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,1,0,0,1,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S,0,0,1,1,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S,0,0,1,1,0
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S,0,0,1,0,1


##### Start of model evaluation using K Nearest Neighbor and Random Forest 

In [124]:
# defining X and y
X = df2[['Pclass', 'female', 'SibSp','Age','Parch','Fare']]
y = df2['Survived']

###### Start of K Nearest Neighbor - https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm

In [125]:
# instantiating a K Nearest Neighbor model with the default arguments
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors = 5) 

In [126]:
# import classes and functions from the library to be used in cross validation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score


ss = StandardScaler() 

# packages scaler and model together
pipeline = Pipeline([('transformer', ss), ('estimator', clf)])

# how we want to split up the data
skf = StratifiedKFold(n_splits=5,random_state=42)

# using cross_val_score to train 80% of data and test remaining 20%. Generate 5 scores as it tests 5 different times
scores = cross_val_score(pipeline, X, y, cv = skf) 

In [127]:
scores


array([0.79888268, 0.78212291, 0.82022472, 0.84269663, 0.83050847])

###### Start of Random Forest - https://en.wikipedia.org/wiki/Random_forest


In [128]:
# instantiating a K Random Forest model with the default arguments
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()

In [129]:
from sklearn.model_selection import cross_val_score, StratifiedKFold  

# how we want to split up the data 
skf_rf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

# using cross_val_score to train 80% of data and test remaining 20%. Generate 5 scores as it tests 5 different times
score_rf = cross_val_score(clf_rf, X, y, cv = skf_rf)

In [130]:
score_rf

array([0.81005587, 0.77094972, 0.80898876, 0.85393258, 0.82485876])

In [133]:
df2['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [134]:
#baseline
549/ (549 + 342)

0.6161616161616161

#### Conclusion is that both models performed equally well and significantly better than the baseline at 62%