In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#for random forests
from sklearn.ensemble import RandomForestClassifier

#hyperparameter tuning, changing part of model to make it more accurate
from sklearn.model_selection import RandomizedSearchCV

#can download data from websites! heres one
# pip install quandl in python command window
import quandl

In [176]:
titanic = sns.load_dataset('titanic')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [177]:
#Drop any variables that are redundant and will add to multicollinearity.
titanic= titanic.drop(['class','embark_town','alive','adult_male'], axis=1)
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,deck,alone
0,0,3,male,22.0,1,0,7.2500,S,man,,False
1,1,1,female,38.0,1,0,71.2833,C,woman,C,False
2,1,3,female,26.0,0,0,7.9250,S,woman,,True
3,1,1,female,35.0,1,0,53.1000,S,woman,C,False
4,0,3,male,35.0,0,0,8.0500,S,man,,True
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,man,,True
887,1,1,female,19.0,0,0,30.0000,S,woman,B,True
888,0,3,female,,1,2,23.4500,S,woman,,False
889,1,1,male,26.0,0,0,30.0000,C,man,C,True


In [178]:
#Remove missing data
titanic= titanic.dropna(inplace=False)
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,deck,alone
1,1,1,female,38.0,1,0,71.2833,C,woman,C,False
3,1,1,female,35.0,1,0,53.1000,S,woman,C,False
6,0,1,male,54.0,0,0,51.8625,S,man,E,True
10,1,3,female,4.0,1,1,16.7000,S,child,G,False
11,1,1,female,58.0,0,0,26.5500,S,woman,C,True
...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,woman,D,False
872,0,1,male,33.0,0,0,5.0000,S,man,B,True
879,1,1,female,56.0,0,1,83.1583,C,woman,C,False
887,1,1,female,19.0,0,0,30.0000,S,woman,B,True


In [179]:
#Recode string data (use dummy coding to make temp chart with all variables)
cleanup= {"sex":{"female": 0, "male":1},
         "embarked":{"C":0, "S":1, "Q":2}, 
         "deck":{"A":0, "B":1, "C":2, "D":3, "E":4, "F":5, "G":6},
         "who":{"man":0, "woman":1, "child":2}}

titanic1= titanic.replace(cleanup, inplace=False)
titanic1

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,deck,alone
1,1,1,0,38.0,1,0,71.2833,0,1,2,False
3,1,1,0,35.0,1,0,53.1000,1,1,2,False
6,0,1,1,54.0,0,0,51.8625,1,0,4,True
10,1,3,0,4.0,1,1,16.7000,1,2,6,False
11,1,1,0,58.0,0,0,26.5500,1,1,2,True
...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,0,47.0,1,1,52.5542,1,1,3,False
872,0,1,1,33.0,0,0,5.0000,1,0,1,True
879,1,1,0,56.0,0,1,83.1583,0,1,2,False
887,1,1,0,19.0,0,0,30.0000,1,1,1,True


In [181]:
#Convert float data to integers 
titanic1.info()
titanic1.age = titanic1.age.astype(int)
titanic1.fare = titanic1.fare.astype(int)
titanic1.deck = titanic1.deck.astype(int)
titanic1.embarked = titanic1.embarked.astype(int)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182 entries, 1 to 889
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   survived  182 non-null    int64
 1   pclass    182 non-null    int64
 2   sex       182 non-null    int64
 3   age       182 non-null    int32
 4   sibsp     182 non-null    int64
 5   parch     182 non-null    int64
 6   fare      182 non-null    int32
 7   embarked  182 non-null    int32
 8   who       182 non-null    int64
 9   deck      182 non-null    int32
 10  alone     182 non-null    bool 
dtypes: bool(1), int32(4), int64(6)
memory usage: 13.0 KB


In [184]:
#confusion matrix
x= titanic1[['sex', 'pclass', 'who', 'sibsp', 'parch', 'age','deck', 'fare','alone']]
y = titanic1['survived']
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = .4)

Dtree = DecisionTreeClassifier()
Dtree.fit(x_train, y_train)
pred= Dtree.predict(x_test)
print(confusion_matrix(y_test, pred))

[[17  4]
 [ 7 45]]


In [185]:
#classification report
print(classification_report(y_test, pred))
#assuming "0" is did not survive and "1" is survived, 
#it seems the model did okay predicting the people that didn't survive, but did much better at predicting those that did!
#over all the model can predict with 86% accuracy, which is pretty high in my eyes.

              precision    recall  f1-score   support

           0       0.71      0.81      0.76        21
           1       0.92      0.87      0.89        52

    accuracy                           0.85        73
   macro avg       0.81      0.84      0.82        73
weighted avg       0.86      0.85      0.85        73



In [188]:
###random forest
forest = RandomForestClassifier(n_estimators=75)
forest.fit(x_train, y_train)
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))
#running the random forest made the model a bit more accurate on average and for predicted non survivors. 
#i think the created model is efficient at predicting survival and non survival.

[[17  4]
 [ 6 46]]
              precision    recall  f1-score   support

           0       0.74      0.81      0.77        21
           1       0.92      0.88      0.90        52

    accuracy                           0.86        73
   macro avg       0.83      0.85      0.84        73
weighted avg       0.87      0.86      0.86        73

