## decision_tree classifier with GridSearchCV

In [37]:
# read the dataset
import pandas as pd 
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [2]:
# convert categorical into numerical 
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset = dataset.drop(["User ID"],axis=1)
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [3]:
# split input and output 
independent = dataset[[ "Age",	"EstimatedSalary", "Gender_Male"]]
dependent = dataset[[ "Purchased" ]]

In [60]:
# Check dataset is balanced or Imbalanced
dataset["Purchased"].value_counts()    # imbalanced 

Purchased
0    257
1    143
Name: count, dtype: int64

In [4]:
# split train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(independent,dependent, test_size=0.20,random_state=0)
print(x_train,x_test,y_train,y_test )

     Age  EstimatedSalary  Gender_Male
336   58           144000            1
64    59            83000            0
55    24            55000            0
106   26            35000            0
300   58            38000            0
..   ...              ...          ...
323   48            30000            0
192   29            43000            1
117   36            52000            1
47    27            54000            0
172   26           118000            0

[320 rows x 3 columns]      Age  EstimatedSalary  Gender_Male
132   30            87000            1
309   38            50000            0
341   35            75000            1
196   30            79000            0
246   35            50000            0
..   ...              ...          ...
14    18            82000            1
363   42            79000            0
304   40            60000            0
361   53            34000            0
329   47           107000            0

[80 rows x 3 columns]      Purchased
33

In [6]:
y_train = y_train.values.ravel()

In [14]:
# model creation + grid cv
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier 

param_grid = {
"criterion": ["gini", "entropy", "log_loss"],
"splitter" : ["best", "random"],
"max_features" : ["sqrt", "log2"], 
}
model = DecisionTreeClassifier()

# verbose=3 - detailed info
# n_jobs = -1 - all core  OR  1 - one cpu core
grid_model = GridSearchCV(model, param_grid, refit=True, verbose=3  ,  n_jobs =1,scoring='f1_weighted') 
grid_model.fit(x_train,y_train)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END criterion=gini, max_features=sqrt, splitter=best;, score=0.860 total time=   0.0s
[CV 2/5] END criterion=gini, max_features=sqrt, splitter=best;, score=0.773 total time=   0.0s
[CV 3/5] END criterion=gini, max_features=sqrt, splitter=best;, score=0.876 total time=   0.0s
[CV 4/5] END criterion=gini, max_features=sqrt, splitter=best;, score=0.892 total time=   0.0s
[CV 5/5] END criterion=gini, max_features=sqrt, splitter=best;, score=0.904 total time=   0.0s
[CV 1/5] END criterion=gini, max_features=sqrt, splitter=random;, score=0.809 total time=   0.0s
[CV 2/5] END criterion=gini, max_features=sqrt, splitter=random;, score=0.842 total time=   0.0s
[CV 3/5] END criterion=gini, max_features=sqrt, splitter=random;, score=0.860 total time=   0.0s
[CV 4/5] END criterion=gini, max_features=sqrt, splitter=random;, score=0.874 total time=   0.0s
[CV 5/5] END criterion=gini, max_features=sqrt, splitter=random;, score=0.89

In [15]:
# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
grid_prediction

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0])

In [18]:
# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print(cm)
print(clf_report)

"""in scikit learn
[[TN  FP
  FN  TP]]
"""

[[55  3]
 [ 4 18]]
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        58
           1       0.86      0.82      0.84        22

    accuracy                           0.91        80
   macro avg       0.89      0.88      0.89        80
weighted avg       0.91      0.91      0.91        80



'in scikit learn\n[[TN  FP\n  FN  TP]]\n'

## ROC AUC 

In [19]:
print(grid_model.predict_proba(x_test)[:,1])

[0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1.
 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0.
 0. 1. 0. 0. 1. 0. 1. 0.]


In [20]:
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
roc_auc_score

np.float64(0.8832288401253919)

## f1 score 

In [21]:
from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
f1_score

0.9118564897634664

## GridSearchCV results 

In [22]:
grid_results = grid_model.cv_results_
print(grid_results)

{'mean_fit_time': array([0.00737786, 0.00703855, 0.00761924, 0.01099277, 0.00843797,
       0.00654731, 0.00681701, 0.0073029 , 0.00871477, 0.01023445,
       0.00782232, 0.00747552]), 'std_fit_time': array([0.00175318, 0.00104639, 0.00246017, 0.00485435, 0.00173311,
       0.00030157, 0.00060309, 0.00131873, 0.00211859, 0.00300648,
       0.0008646 , 0.00077895]), 'mean_score_time': array([0.03152237, 0.02734585, 0.03068714, 0.03324509, 0.02454543,
       0.02089448, 0.02237873, 0.02343588, 0.02636242, 0.03702264,
       0.02656355, 0.02282157]), 'std_score_time': array([0.01248979, 0.0106505 , 0.00563247, 0.00722809, 0.00274929,
       0.00091322, 0.00320255, 0.00383286, 0.00510582, 0.00779986,
       0.00614282, 0.00202962]), 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'gini', 'entropy', 'entropy',
                   'entropy', 'entropy', 'log_loss', 'log_loss',
                   'log_loss', 'log_loss'],
             mask=[False, False, False, False, False, False,

In [23]:
# table
table = pd.DataFrame.from_dict(grid_results)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007378,0.001753,0.031522,0.01249,gini,sqrt,best,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.859841,0.773256,0.875897,0.892138,0.904221,0.861071,0.04639,1
1,0.007039,0.001046,0.027346,0.01065,gini,sqrt,random,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.808921,0.842259,0.85992,0.873807,0.891049,0.855191,0.028153,5
2,0.007619,0.00246,0.030687,0.005632,gini,log2,best,"{'criterion': 'gini', 'max_features': 'log2', ...",0.828694,0.84375,0.799686,0.861321,0.823187,0.831328,0.020632,10
3,0.010993,0.004854,0.033245,0.007228,gini,log2,random,"{'criterion': 'gini', 'max_features': 'log2', ...",0.798561,0.79104,0.766533,0.81071,0.922178,0.817804,0.054147,12
4,0.008438,0.001733,0.024545,0.002749,entropy,sqrt,best,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.844699,0.8125,0.828791,0.922651,0.873807,0.85649,0.038778,2
5,0.006547,0.000302,0.020894,0.000913,entropy,sqrt,random,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.8125,0.840368,0.829832,0.827357,0.891049,0.840221,0.026927,8
6,0.006817,0.000603,0.022379,0.003203,entropy,log2,best,"{'criterion': 'entropy', 'max_features': 'log2...",0.797547,0.825502,0.876877,0.891049,0.890137,0.856222,0.037884,3
7,0.007303,0.001319,0.023436,0.003833,entropy,log2,random,"{'criterion': 'entropy', 'max_features': 'log2...",0.828694,0.840368,0.844872,0.798892,0.840368,0.830639,0.016754,11
8,0.008715,0.002119,0.026362,0.005106,log_loss,sqrt,best,"{'criterion': 'log_loss', 'max_features': 'sqr...",0.813639,0.805648,0.891711,0.892138,0.873807,0.855389,0.038017,4
9,0.010234,0.003006,0.037023,0.0078,log_loss,sqrt,random,"{'criterion': 'log_loss', 'max_features': 'sqr...",0.78344,0.808442,0.845648,0.876518,0.873807,0.837571,0.036535,9


In [24]:
grid_model.best_params_

{'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'best'}

In [25]:
grid_model.best_estimator_

In [26]:
""" Mean cross-validation score on the TRAINING data """
grid_model.best_score_

np.float64(0.8610705841943032)

## model prediction with real time data

In [58]:
## user input 
Gender=input("Enter your gender (0-female/1-male): ")
Age	= int(input("Enter your age: "))
EstimatedSalary = int(input("Enter your estimated salary: "))


""" test input data's
Male	19	19000	- 0
Female	46	41000	- 1
Female	50	20000	- 1
Male	36	33000	- 0
"""

Enter your gender (0-female/1-male):  0
Enter your age:  52
Enter your estimated salary:  60000


" test input data's\nMale\t19\t19000\t- 0\nFemale\t46\t41000\t- 1\nFemale\t50\t20000\t- 1\nMale\t36\t33000\t- 0\n"

In [59]:
## user input predction 
user_input_prediction = grid_model.predict([[Gender,Age,EstimatedSalary ]])
if user_input_prediction==1:
    print("user will purchase")
else:
    print("user will not purchase")

print(user_input_prediction)

user will purchase
[1]
