### Assignment 18: Decision Tree Classifier

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

##### Load the dataset

In [3]:
Url='https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv'

In [4]:
data = pd.read_csv(Url)

##### About the Dataset

In [5]:
print('Dimension of the dataset:')
data.shape

Dimension of the dataset:


(891, 12)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# Conversion from int type to object type

data['Survived'] = data['Survived'].astype('str')
data['Pclass'] = data['Pclass'].astype('str')

In [10]:
# Columns needed for analysis

cols = ['Pclass','Sex','Age','SibSp','Parch','Fare']

##### Missing Values in dataset

In [11]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
# Extracting the title of the passenger from name

f = lambda x: x.partition(',')[-1].split()[0]
data['Initial'] = data['Name'].apply(f).str[:-1]

In [13]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Initial
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [14]:
data.groupby('Initial')['Age'].mean()

Initial
Capt        70.000000
Col         58.000000
Don         40.000000
Dr          42.000000
Jonkheer    38.000000
Lady        48.000000
Major       48.500000
Master       4.574167
Miss        21.773973
Mlle        24.000000
Mme         24.000000
Mr          32.368090
Mrs         35.898148
Ms          28.000000
Rev         43.166667
Sir         49.000000
th          33.000000
Name: Age, dtype: float64

In [15]:
# Imputing Age based on the title

data['Age'] = data.groupby('Initial')['Age'].transform(lambda x: x.fillna(x.mean()))

In [16]:
data['Age'].isnull().sum()

0

In [17]:
# Data after imputation

df = data[cols +['Survived']]

In [18]:
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,male,22.0,1,0,7.25,0
1,1,female,38.0,1,0,71.2833,1
2,3,female,26.0,0,0,7.925,1
3,1,female,35.0,1,0,53.1,1
4,3,male,35.0,0,0,8.05,0


##### Scalaing of numeric features Age and Fare

In [19]:
scale_cols = ['Age','Fare']

scaler = StandardScaler()

scaler.fit(df[scale_cols])
df[scale_cols] = scaler.transform(df[scale_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [20]:
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Survived
0,3,male,-0.584387,1,0,-0.502445,0
1,1,female,0.621365,1,0,0.786845,1
2,3,female,-0.282949,0,0,-0.488854,1
3,1,female,0.395286,1,0,0.42073,1
4,3,male,0.395286,0,0,-0.486337,0


##### Encoding of Categorical Features

In [21]:
encode_cols = ['Pclass','Sex','SibSp','Parch']
df_dummies = pd.get_dummies(df, columns=encode_cols,drop_first=True)
df = df_dummies

In [22]:
df.head()

Unnamed: 0,Age,Fare,Survived,Pclass_2,Pclass_3,Sex_male,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6
0,-0.584387,-0.502445,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0
1,0.621365,0.786845,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,-0.282949,-0.488854,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.395286,0.42073,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0.395286,-0.486337,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0


##### Model Building: - Decision Tree Classifier

In [23]:
# Partitioning of Predictor and Target features

X = df.loc[:,~df.columns.isin(['Survived'])]
y = df.loc[:,'Survived']

In [24]:
# Splitting dataset into train and test dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.20)

In [25]:
# Decision Tree Classifier

model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [26]:
# Prediction using test dataset

pred = model.predict(X_test)

In [27]:
##### Classification Report:- Performance Metrics

In [28]:
target_names = ['Not Survived', 'Survived']
print(classification_report(y_test, pred, target_names = target_names))

              precision    recall  f1-score   support

Not Survived       0.80      0.87      0.83       110
    Survived       0.76      0.65      0.70        69

    accuracy                           0.79       179
   macro avg       0.78      0.76      0.77       179
weighted avg       0.79      0.79      0.78       179

