In [187]:
%matplotlib notebook

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

In [188]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')
gender_data = pd.read_csv('gender_submission.csv')

In [189]:
train_data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Cunningham, Mr. Alfred Fleming",male,,,,1601.0,,C23 C25 C27,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [190]:
gender_data.describe(include='all')

Unnamed: 0,PassengerId,Survived
count,418.0,418.0
mean,1100.5,0.363636
std,120.810458,0.481622
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,1.0
max,1309.0,1.0


In [191]:
# print data info, note significant gaps in the cabin then age NaN entries.
print('\nTrain Data\n')
train_data.info()

# print data info, note significant gaps in the cabin then age NaN entries.
print('\nTest Data\n')
test_data.info()


Train Data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

Test Data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare    

In [192]:
dataset =  pd.concat(objs=[train_data, test_data], axis=0).reset_index(drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [193]:
dataset = dataset.fillna(np.nan)
dataset.isnull().sum()

Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
Ticket            0
dtype: int64

In [219]:
dataset['Fare'] = dataset['Fare'].fillna(dataset["Fare"].median())
dataset.drop(['Cabin'], axis=1, inplace=True)
dataset['Fare'].fillna(method ='ffill', inplace = True)
dataset['Embarked'] = dataset['Embarked'].fillna('S')
dataset['Age'].fillna(dataset['Age'].median(), inplace = True)
dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

KeyError: "['Cabin'] not found in axis"

In [200]:
dataset.columns

Index(['Age', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId', 'Pclass',
       'Sex', 'SibSp', 'Survived', 'Ticket'],
      dtype='object')

In [201]:
# Set dummy Values with prefix to referance back to the features.
dataset = pd.get_dummies(dataset, columns = ["Ticket"], prefix="T")
dataset = pd.get_dummies(dataset, columns = ['Name'])
dataset = pd.get_dummies(dataset, columns = ['Embarked'], prefix='Em')

In [202]:
# Create categorical values
dataset["Pclass"] = dataset["Pclass"].astype("category")
dataset = pd.get_dummies(dataset, columns = ["Pclass"],prefix="Pc")

In [205]:
y_train = pd.DataFrame(dataset['Survived'], columns=['Survived'])
X_train = dataset
X_train.drop('Survived', axis=1, inplace=True)

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [207]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1047, 2248)
(262, 2248)
(1047, 1)
(262, 1)


True

In [208]:
acc_scores = pd.DataFrame(columns=['Model','Accuracy'])

In [209]:
# perform random forest with tuned paramters. 
rfparm_grid = {'min_samples_split': 2,
                     'min_samples_leaf': 5,
                     'max_leaf_nodes': None,
                     'max_depth': 5,
                     'criterion': 'gini'}

In [210]:
# perform random forest  for best hyperparameters
rf_clf = RandomForestClassifier(**rfparm_grid)
rf_clf.fit(X_train, y_train)
pred_rf = rf_clf.predict(X_test)
acc_rf = accuracy_score(y_test, pred_rf)
acc_scores = acc_scores.append({'Model': 'Random Forest(Tune)', 'Accuracy':acc_rf}, ignore_index=True)



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [50]:
# Decision Tree Grid with tuned paramters.
dtr_grid = {'min_samples_split': 2,
             'min_samples_leaf': 5,
             'max_leaf_nodes': None,
             'max_depth': 5,
             'criterion': 'gini'}

In [51]:
# Decision Tree to search for best hyperparameters
dt_clf = DecisionTreeClassifier(**dtr_grid)
dt_clf.fit(X_train, y_train)
pred_dt = dt_clf.predict(X_test)
acc_dt = accuracy_score(y_test, pred_dt)
acc_scores = acc_scores.append({'Model': 'Decision Tree(Tune)', 'Accuracy':acc_dt}, ignore_index=True)

In [52]:
acc_scores.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model,Accuracy
1,Decision Tree(Tune),0.815642
0,Random Forest(Tune),0.586592


NameError: name 'dtr_clf' is not defined