# Example Model Validation Strategy and Evaluation Metrics

In [1]:


import pandas as pd
import numpy as np

In [45]:
# Data collection: download the Titanic passenger table (first CSV column is the
# row index) and keep only the rows that have no missing values.

TITANIC_CSV_URL = "https://vincentarelbundock.github.io/Rdatasets/csv/Stat2Data/Titanic.csv"
titanic_raw = pd.read_csv(TITANIC_CSV_URL, index_col=0)
train = titanic_raw.dropna()

In [46]:
# Preview the first five rows of the loaded frame.
train.head()

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
1,"Allen, Miss Elisabeth Walton",1st,29.0,female,1,1
2,"Allison, Miss Helen Loraine",1st,2.0,female,0,1
3,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0,0
4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0,1
5,"Allison, Master Hudson Trevor",1st,0.92,male,1,0


In [None]:
#Goal: predict whether a passenger survived (Survived = 1) or did not survive (Survived = 0)

In [47]:
# (rows, columns) of the frame after incomplete rows were dropped at load time.
train.shape

(756, 6)

In [None]:
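#Data cleaning: rows containing NaNs were already removed by .dropna() when the data was loaded; the info() summary below should show no missing values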

In [48]:
# Summarise column dtypes and non-null counts to verify no missing values remain.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 756 entries, 1 to 1313
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      756 non-null    object 
 1   PClass    756 non-null    object 
 2   Age       756 non-null    float64
 3   Sex       756 non-null    object 
 4   Survived  756 non-null    int64  
 5   SexCode   756 non-null    int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 41.3+ KB


In [None]:
#Drop unnecessary variables

In [50]:
# Remove the Name and SexCode columns so they are not part of the feature set.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# for columns that were already removed.
train = train.drop(columns=['Name', 'SexCode'], errors='ignore')

In [51]:
# Preview the frame again to confirm the Name and SexCode columns were removed.
train.head()

Unnamed: 0,PClass,Age,Sex,Survived
1,1st,29.0,female,1
2,1st,2.0,female,0
3,1st,30.0,male,0
4,1st,25.0,female,0
5,1st,0.92,male,1


In [52]:
# Label encoding: create the LabelEncoder used to turn string categories
# into integer codes.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()


In [53]:
# Encode PClass: fit the encoder on the passenger-class labels and replace the
# column with the resulting integer codes.

pclass_codes = le.fit_transform(train['PClass'])
train['PClass'] = pclass_codes


In [54]:
# Encode Sex as integer codes.
# A dedicated encoder is used here: refitting the shared `le` on Sex would
# overwrite the classes it learned for PClass, making it impossible to map the
# PClass codes back to their original labels later. The produced codes are the
# same as a fresh fit of `le` would give.

sex_le = LabelEncoder()
train['Sex'] = sex_le.fit_transform(train['Sex'])

In [55]:
# Check that the PClass and Sex features now hold integer codes instead of strings.

train.head()

Unnamed: 0,PClass,Age,Sex,Survived
1,0,29.0,0,1
2,0,2.0,0,0
3,0,30.0,1,0
4,0,25.0,0,0
5,0,0.92,1,1


In [56]:
# Feature matrix: every column except the target 'Survived', as a NumPy array.
X_train = train.drop(columns='Survived').to_numpy()

In [57]:
# Target vector: the 'Survived' column, kept as a pandas Series.
y_train = train['Survived']

In [58]:
# Inspect the target Series.
y_train

1       1
2       0
3       0
4       0
5       1
       ..
1309    0
1310    0
1311    0
1312    0
1313    0
Name: Survived, Length: 756, dtype: int64

In [59]:
from sklearn.model_selection import train_test_split

In [60]:
# Hold out 20% of the rows as a test set, with a fixed seed for a repeatable split.
# The split inputs are rebuilt from `train` instead of being read from
# X_train / y_train: this cell also assigns X_train / y_train, so feeding them
# back in would split an already-split set on every re-run (shrinking the
# training data and moving previously trained-on rows into X_test).
features = train.loc[:, train.columns != 'Survived'].values
target = train.loc[:, 'Survived']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state = 42)

In [61]:
from sklearn.tree import DecisionTreeClassifier

In [62]:
# Fit a decision tree on the training split.
# random_state is fixed because the tree builder permutes features randomly at
# each split; when several candidate splits are equally good, the chosen one
# (and therefore the predictions and accuracy) can otherwise change between runs.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

DecisionTreeClassifier()

In [63]:
# NOTE(review): this refits the same estimator on the same data as the previous
# cell; it is redundant and the cell can be deleted.
dtc.fit(X_train, y_train)

DecisionTreeClassifier()

In [65]:
# Model prediction: use the fitted decision tree to predict labels for the
# held-out test features.

y_pred = dtc.predict(X_test)


In [66]:
# Inspect the predicted labels for the test set.
y_pred

array([0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0],
      dtype=int64)

In [67]:
from sklearn.metrics import accuracy_score

In [70]:
# Accuracy on the test set: the fraction of predictions in y_pred that match y_test.

accuracy_score(y_test, y_pred)

0.7828947368421053

In [71]:
# The same test-set accuracy, expressed as a percentage.

accuracy_score(y_test, y_pred)*100

78.28947368421053

In [None]:
#Next step: try to improve this accuracy through hyperparameter tuning of the decision tree