In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [23]:
# Load the advertising dataset from the shared datasets folder and preview it.
csv_path = '../../datasets/advertising.csv'
adv = pd.read_csv(csv_path)
adv.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [3]:
# Summarize each column's dtype and non-null count for the freshly loaded frame.
adv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.3+ KB


In [11]:
# Count missing values per column.
adv.isna().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64

In [24]:
# Remove the timestamp and ad headline columns before modelling.
# `errors='ignore'` keeps this cell idempotent: because it rebinds `adv` to its
# own result, a second run would otherwise raise KeyError on the columns that
# were already dropped.
adv = (
    adv
    .drop(columns = ['Timestamp', 'Ad Topic Line'], errors = 'ignore')
)

In [26]:
# Re-check the column summary after 'Timestamp' and 'Ad Topic Line' were dropped.
adv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   City                      1000 non-null   object 
 5   Male                      1000 non-null   int64  
 6   Country                   1000 non-null   object 
 7   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 62.6+ KB


In [28]:
# One-hot encode the remaining categorical columns.
# Only columns still present are encoded, so re-running this cell after `adv`
# has already been expanded is a no-op instead of a KeyError on the consumed
# 'Country' / 'City' columns.
categorical_cols = [col for col in ['Country', 'City'] if col in adv.columns]
if categorical_cols:
    adv = pd.get_dummies(adv, columns = categorical_cols)

In [31]:
# Preview the frame after one-hot encoding 'Country' and 'City'.
adv.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad,Country_Afghanistan,Country_Albania,Country_Algeria,Country_American Samoa,...,City_Wintersfort,City_Wongland,City_Wrightburgh,City_Wrightview,City_Yangside,City_Youngburgh,City_Youngfort,City_Yuton,City_Zacharystad,City_Zacharyton
0,68.95,35,61833.9,256.09,0,0,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,80.23,31,68441.85,193.77,1,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,69.47,26,59785.94,236.5,0,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,74.15,29,54806.18,245.89,1,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,68.37,35,73889.99,225.58,0,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [32]:
# Separate the label column from the predictor columns.
target_col = 'Clicked on Ad'
y = adv[target_col]
X = adv.drop(target_col, axis = 1)

In [33]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the shuffled
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 10,
    shuffle = True,
)

In [34]:
# Baseline decision tree with default hyperparameters.
# The tree is seeded (same value as the train/test split) because scikit-learn
# permutes candidate features at every split; without a fixed `random_state`
# the fitted tree, and therefore the metrics reported below, can differ from
# one run of the notebook to the next.
model = DecisionTreeClassifier(random_state = 10)
model.fit(X_train, y_train)

In [35]:
# Score the baseline tree on the held-out rows: confusion matrix first, then
# the per-class precision / recall / F1 report.
model_predict = model.predict(X_test)
for summary in (
    confusion_matrix(y_test, model_predict),
    classification_report(y_test, model_predict),
):
    print(summary)

[[138   8]
 [  9 145]]
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       146
           1       0.95      0.94      0.94       154

    accuracy                           0.94       300
   macro avg       0.94      0.94      0.94       300
weighted avg       0.94      0.94      0.94       300



In [73]:
# Exhaustive search over tree depth and cost-complexity pruning strength,
# using GridSearchCV's default cross-validation and scoring settings.
# Grid entries left disabled:
#   'min_samples_leaf': [3, 5, 10]
#   'min_samples_split': [5, 10, 20]
param_grid = dict(
    max_depth = [10, 12, 15],
    ccp_alpha = [1e-6, 1e-5, 1e-4],
)
search_model = GridSearchCV(estimator = model, param_grid = param_grid)
search_model.fit(X_train, y_train)

# Show the winning hyperparameter combination.
search_model.best_params_

{'ccp_alpha': 1e-06, 'max_depth': 10}

In [74]:
# Score the best estimator found by the grid search on the held-out rows.
best_tree = search_model.best_estimator_
model_predict = best_tree.predict(X_test)
print(
    confusion_matrix(y_test, model_predict),
    classification_report(y_test, model_predict),
    sep = '\n',
)

[[138   8]
 [  8 146]]
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       146
           1       0.95      0.95      0.95       154

    accuracy                           0.95       300
   macro avg       0.95      0.95      0.95       300
weighted avg       0.95      0.95      0.95       300

