In [217]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

## Reading the preprocessed data

We read in the Titanic CSV file that we created earlier.

In [218]:
# Load the preprocessed, all-numeric Titanic training data (path is relative to
# the notebook's working directory).
# NOTE(review): the file carries an 'Unnamed: 0' column that looks like a saved
# row index (it is dropped later); consider writing the CSV with index=False.
df = pd.read_csv('Titanic_train_numeric.csv')

In [219]:
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,1,0,3,1,22.0,1,0,7.25,2
1,1,2,1,1,2,38.0,1,0,71.2833,1
2,2,3,1,3,2,26.0,0,0,7.925,2
3,3,4,1,1,2,35.0,1,0,53.1,2
4,4,5,0,3,1,35.0,0,0,8.05,2


In [220]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Unnamed: 0     891 non-null int64
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       891 non-null int64
dtypes: float64(2), int64(8)
memory usage: 69.7 KB


In [221]:
# Feature matrix: every column except the target ('Survived') and the leftover
# index column ('Unnamed: 0').
# NOTE(review): 'PassengerId' remains as a feature; it looks like a row
# identifier rather than a predictor -- confirm that keeping it is intended.
train_df = df.drop(columns=['Survived', 'Unnamed: 0'])

In [222]:
train_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,1,22.0,1,0,7.25,2
1,2,1,2,38.0,1,0,71.2833,1
2,3,3,2,26.0,0,0,7.925,2
3,4,1,2,35.0,1,0,53.1,2
4,5,3,1,35.0,0,0,8.05,2


In [223]:
# Target labels for every row, as a 1-D Series. Despite the `test_df` name this
# is not a test split. Use df[['Survived']] instead if a DataFrame is required.
test_df = df['Survived']

In [224]:
test_df.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

## Splitting the dataset into training, validation and testing data sets

In [225]:
# train_test_split is already imported in the first cell, so it is not
# re-imported here.

# Hold out 10% of all rows as the final test set; it is only used for the
# performance report in section C.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_df, test_df, test_size=0.10, random_state=10
)

# Carve 10% of the remaining rows off as a validation set, used in section B to
# compare parameter settings. X_train / Y_train are rebound to the smaller
# remainder that the models are actually fitted on.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.10, random_state=10
)

## Initializing the Decision Tree Classifier

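`random_state` seeds the classifier's random number generator, so that fitting the tree gives the same result on every run.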

In [226]:
# Baseline tree with default hyperparameters; random_state is fixed so that
# repeated fits produce the same tree.
decision_tree = DecisionTreeClassifier(random_state=4)

# A. Training the model using training set 

In [227]:
decision_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=4,
            splitter='best')

# B. Finding the right parameters using validation set

In [228]:
Y_val_pred = decision_tree.predict(X_val)

In [229]:
Y_val_pred

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0])

#### Measuring the performance (Using accuracy)

In [230]:
decision_tree.score(X_val, Y_val)

0.7407407407407407

#### Changing the parameters to increase the performance

In [231]:
# Regularised variant: every leaf must hold at least 10 training samples.
# NOTE(review): with min_samples_leaf=10 a node needs >= 20 samples before any
# split is valid, so min_samples_split=4 should never be the binding constraint.
decision_tree_param = DecisionTreeClassifier(min_samples_split=4, min_samples_leaf=10, random_state=4)

In [232]:
decision_tree_param.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=4,
            min_weight_fraction_leaf=0.0, presort=False, random_state=4,
            splitter='best')

In [233]:
decision_tree_param.score(X_val, Y_val)

0.7530864197530864

In [234]:
Y_pred = decision_tree_param.predict(X_test)

In [235]:
Y_pred

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1])

In [236]:
from sklearn.tree import export_graphviz

In [237]:
# Request the DOT source as a string (out_file=None). When out_file is a path,
# export_graphviz writes the file and returns None, which left dot_data empty.
dot_data = export_graphviz(
    decision_tree_param,
    out_file=None,
    # Label split nodes with the column names instead of positional X[i].
    feature_names=list(X_train.columns),
)

# Still persist the graph to decision.dot so it can be rendered with Graphviz.
with open("decision.dot", "w", encoding="utf-8") as dot_file:
    dot_file.write(dot_data)

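![Decision Tree](tree.png)

*Note: `tree.png` is presumably rendered from the exported `decision.dot` with Graphviz, e.g. `dot -Tpng decision.dot -o tree.png`; that step is not part of this notebook.*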

# C. Performance reporting using test set

In [239]:
# Accuracy on the held-out test split, which was not used for fitting or for
# choosing between the two trees.
decision_tree_param.score(X_test, Y_test)

0.8333333333333334