### Decision Tree

##### Importing necessary packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

%matplotlib inline

##### Reading the data

In [4]:
sn_data = pd.read_csv("./Datasets/Social_Network_Ads.csv")

In [5]:
sn_data

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


##### Separating the independant variables and dependant variable

In [6]:
X = sn_data.iloc[:,1:-1].values   # independant variables/estimators/predictors
y = sn_data.iloc[:,-1].values     # dependant variable

In [7]:
X

array([['Male', 19, 19000],
       ['Male', 35, 20000],
       ['Female', 26, 43000],
       ...,
       ['Female', 50, 20000],
       ['Male', 36, 33000],
       ['Female', 49, 36000]], dtype=object)

In [8]:
y

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,

##### Encoding categorical values

In [9]:
labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [10]:
X

array([[0.0e+00, 1.0e+00, 1.9e+01, 1.9e+04],
       [0.0e+00, 1.0e+00, 3.5e+01, 2.0e+04],
       [1.0e+00, 0.0e+00, 2.6e+01, 4.3e+04],
       ...,
       [1.0e+00, 0.0e+00, 5.0e+01, 2.0e+04],
       [0.0e+00, 1.0e+00, 3.6e+01, 3.3e+04],
       [1.0e+00, 0.0e+00, 4.9e+01, 3.6e+04]])

##### Avoiding the dummy variable trap

In [11]:
X = X[:,1:]
X

array([[1.0e+00, 1.9e+01, 1.9e+04],
       [1.0e+00, 3.5e+01, 2.0e+04],
       [0.0e+00, 2.6e+01, 4.3e+04],
       ...,
       [0.0e+00, 5.0e+01, 2.0e+04],
       [1.0e+00, 3.6e+01, 3.3e+04],
       [0.0e+00, 4.9e+01, 3.6e+04]])

##### Splitting the data into train and test sets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [13]:
print("X_train:");print(X_train)
print("\n\nX_test:");print(X_test)
print("\n\ny_train:");print(y_train)
print("\n\ny_test:");print(y_test)

X_train:
[[0.00e+00 2.70e+01 5.70e+04]
 [1.00e+00 4.60e+01 2.80e+04]
 [0.00e+00 3.90e+01 1.34e+05]
 [0.00e+00 4.40e+01 3.90e+04]
 [0.00e+00 5.70e+01 2.60e+04]
 [1.00e+00 3.20e+01 1.20e+05]
 [1.00e+00 4.10e+01 5.20e+04]
 [1.00e+00 4.80e+01 7.40e+04]
 [1.00e+00 2.60e+01 8.60e+04]
 [1.00e+00 2.20e+01 8.10e+04]
 [1.00e+00 4.90e+01 8.60e+04]
 [0.00e+00 3.60e+01 5.40e+04]
 [1.00e+00 4.00e+01 5.90e+04]
 [0.00e+00 4.10e+01 8.00e+04]
 [1.00e+00 2.60e+01 1.60e+04]
 [0.00e+00 3.90e+01 7.90e+04]
 [1.00e+00 5.90e+01 1.30e+05]
 [1.00e+00 4.20e+01 6.40e+04]
 [0.00e+00 5.30e+01 1.43e+05]
 [1.00e+00 3.40e+01 1.12e+05]
 [0.00e+00 5.70e+01 1.22e+05]
 [0.00e+00 3.90e+01 7.10e+04]
 [1.00e+00 4.70e+01 2.50e+04]
 [1.00e+00 2.40e+01 1.90e+04]
 [0.00e+00 3.60e+01 5.00e+04]
 [0.00e+00 3.20e+01 1.50e+05]
 [0.00e+00 4.80e+01 2.90e+04]
 [1.00e+00 3.00e+01 1.07e+05]
 [1.00e+00 6.00e+01 3.40e+04]
 [1.00e+00 3.80e+01 6.10e+04]
 [1.00e+00 3.30e+01 3.10e+04]
 [1.00e+00 3.90e+01 7.10e+04]
 [1.00e+00 5.50e+01 3.90e+04]
 

##### Feature Scaling

In [14]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

##### Modeling our classifier

In [15]:
tree_classifier = DecisionTreeClassifier(criterion = 'gini', max_depth = 4, splitter = 'best', random_state = 42)
tree_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

##### Predicting target values

In [16]:
y_pred = tree_classifier.predict(X_test)
y_pred

array([1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0], dtype=int64)

##### Checking the performance of our model

In [17]:
print("Classification Report:");print(metrics.classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93        52
           1       0.86      0.89      0.88        28

    accuracy                           0.91        80
   macro avg       0.90      0.91      0.90        80
weighted avg       0.91      0.91      0.91        80



In [18]:
print("Confusion Matrix:");print(metrics.confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[48  4]
 [ 3 25]]


In [19]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9125


In [21]:
from IPython.display import Image
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
import pydot

In [25]:
features = list(sn_data.columns[1:])
features

['Gender', 'Age', 'EstimatedSalary', 'Purchased']

In [24]:
dot_data = StringIO()
export_graphviz(tree_classifier, out_file = dot_data, feature_names = features, filled = True, rounded = True)

ValueError: Length of feature_names, 4 does not match number of features, 3