# Prediction of Survival on the Titanic
## There are 2 classes (survived / did not survive) -> binary classification problem ##

In [4]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): plot_confusion_matrix is never called in the cells shown here, and it
# was removed from scikit-learn in release 1.2, so this import raises ImportError on
# newer installs. Confirm the installed version (ConfusionMatrixDisplay replaces it).
from sklearn.metrics import accuracy_score, plot_confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Mount Google Drive at /content/drive so the dataset path used below resolves.
# This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Location of the Titanic CSV inside the mounted Drive; adjust to your own Drive layout.
my_path = '/content/drive/MyDrive/Colab Notebooks/CS512/titanic.csv'

In [13]:
# Load the raw passenger table into a DataFrame.
df = pd.read_csv(my_path)

## Examine Data ##


In [14]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### <font color=green>Eliminate Some Features: Name, PassengerId, Ticket, Cabin</font> ###

In [15]:
# Identifier-like and free-text columns are not used as model features.
df = df.drop(
    columns=[
        'PassengerId',
        'Name',
        'Ticket',
        'Cabin',
    ]
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


### <font color=red>Check Null values in dataframe</font> ###

- <font color=green>*If a row contains any null value, delete the entire row*</font> 



In [16]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

True

In [17]:
# Drop every row that has a missing value in any of the remaining columns.
df = df.dropna()

### <font color=green>Represent String Features as Numerical: Sex and Embarked</font> ###
- *We know that there are 3 kinds of Embarked: C: Cherbourg, Q: Queenstown, S: Southampton*
- *We can map them: C=0, Q=1, S=2*
- *We know that there are 2 kinds of Sex: male, female*
- *We can map them: female=0, male=1*

In [18]:
# Encode the two string columns as integers.
# Sex: female -> 0, male -> 1. Embarked: C -> 0, Q -> 1, S -> 2.
# .astype(int) fails loudly if a value is missing from the mapping (it would map to NaN).
df['Sex'] = (
    df['Sex']
    .map({'female': 0, 'male': 1})
    .astype(int)
)
df['Embarked'] = (
    df['Embarked']
    .map({'C': 0, 'Q': 1, 'S': 2})
    .astype(int)
)

In [19]:
# Check that Sex and Embarked now hold integer codes.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


## <font color=green>Creating Feature Vectors</font> ##

 - <font color=red>*We will use all features except survived*</font>
 - <font color=red>*Survived is our label vector*</font>
 - <font color=red>*After vectorization shuffle the data*</font>


In [20]:
# Feature matrix: every remaining column except the label, as a NumPy array.
X = df.drop(columns='Survived').to_numpy()
# Label vector: the Survived column, kept as a pandas Series.
y = df.loc[:, 'Survived']

## <font color=green>Split Data to Extract Test Samples</font> ##

In [22]:
from sklearn.utils import shuffle
# Permute features and labels together (same order for both); the fixed seed makes
# the permutation repeatable.
X, y = shuffle(X,y,random_state=5)

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test split; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)

## <font color=green>Create Decision Tree</font> ##
- *You can find detailed explanation of model and parameters in: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html*

In [29]:
# Baseline: a tree with default hyperparameters, fitted on the training split.
classifier = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Accuracy of the baseline on the held-out test split.
y_pred = classifier.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
test_acc

0.7202797202797203

In [30]:
from sklearn.model_selection import cross_val_score

# Score the default tree with 5-fold cross-validation on the entire (shuffled) data
# set instead of relying on a single train/test split.
dt = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(dt, X, y, cv=5, scoring='accuracy')

# Mean accuracy over the five folds.
scores.mean()

0.7542204274598641

### <font color=green>Fine-Tuning</font> ###
- *max_depth: default=None -> If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples*
- *min_samples_split: default=2 -> The minimum number of samples required to split an internal node*
- *min_samples_leaf: default=1 -> The minimum number of samples required to be at a leaf node*
- *max_features: default None -> if None, then max_features=n_features*
- *criterion: {“gini”, “entropy”}, default=”gini” -> The function to measure the quality of a split*

### <font color=green>By using Grid-Search</font> ###            

In [31]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are tuned; fixed seed for repeatable results.
dt = DecisionTreeClassifier(random_state=0)

# Search space: 7 * 6 * 10 * 6 * 2 = 5040 combinations, each scored with 5-fold CV.
tuning_parameters = {
    'max_depth': list(range(1, 8)),                      # 1..7
    'max_features': list(range(1, 7)),                   # 1..6
    'min_samples_split': np.linspace(0.1, 1.0, num=10),  # floats are fractions of the samples
    'min_samples_leaf': list(range(1, 7)),               # 1..6
    'criterion': ['gini', 'entropy'],
}

grid = GridSearchCV(
    estimator=dt,
    param_grid=tuning_parameters,
    scoring='accuracy',
    cv=5,
)



In [32]:
# Run the exhaustive search on the training split only; X_test is not touched here.
grid.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7],
                         'max_features': [1, 2, 3, 4, 5, 6],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6],
                         'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])},
             scoring='accuracy')

In [33]:
# Winning hyperparameter combination and its mean cross-validated accuracy
# (a validation score computed inside the training split, not a test score).
print(grid.best_params_)
print('Vall acc:', grid.best_score_)

{'criterion': 'gini', 'max_depth': 3, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 0.2}
Vall acc: 0.8048439683278994


In [35]:
# Predict with the best estimator found by the search and score it on the
# held-out test split.
y_pred = grid.predict(X_test)
test_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
test_acc

0.8251748251748252

## <font color=green>Let's Calculate Accuracy by 5-fold CV</font> ##
- *We will use hyperparameters that we found*
- *We can use entire data now*
- *Then we can report mean accuracy and observe std*

In [34]:
# Re-create a tree configured with the hyperparameters chosen by the grid search.
dt = DecisionTreeClassifier(random_state=0)
dt.set_params(**grid.best_params_)

# 5-fold CV on the entire data set.
# NOTE: the hyperparameters were selected on X_train, which is a subset of X, so
# these folds are not fully unseen data and the estimate may be slightly optimistic.
scores = cross_val_score(dt, X, y, cv=5, scoring='accuracy')

# Report the spread across folds as well as the mean; the displayed cell value
# stays the mean accuracy.
print('CV accuracy std:', scores.std())
scores.mean()

0.7851472471190781