In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Read the advertising dataset from its relative CSV path, then print the
# frame summary (column names, non-null counts, dtypes, memory usage).
adv = pd.read_csv(filepath_or_buffer='../../datasets/advertising.csv')
adv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Daily Time Spent on Site  1000 non-null   float64
 1   Age                       1000 non-null   int64  
 2   Area Income               1000 non-null   float64
 3   Daily Internet Usage      1000 non-null   float64
 4   Ad Topic Line             1000 non-null   object 
 5   City                      1000 non-null   object 
 6   Male                      1000 non-null   int64  
 7   Country                   1000 non-null   object 
 8   Timestamp                 1000 non-null   object 
 9   Clicked on Ad             1000 non-null   int64  
dtypes: float64(3), int64(3), object(4)
memory usage: 78.3+ KB


In [3]:
# Missing-value count per column (isnull is the pandas alias of isna).
adv.isnull().sum(axis=0)

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Male                        0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64

In [5]:
# Discard the timestamp and the ad headline columns before modelling.
# The result is bound back to `adv`, so running this cell a second time
# raises KeyError because the columns are already gone.
adv = adv.drop(['Timestamp', 'Ad Topic Line'], axis='columns')

In [7]:
# One-hot encode the two remaining text (object-dtype) columns; every other
# column passes through unchanged.
# NOTE(review): City may be close to unique per row, which would add a very
# wide block of indicator columns — confirm this is intended.
adv = adv.pipe(pd.get_dummies, columns=['Country', 'City'])

In [8]:
# Separate the target (click indicator) from the feature matrix
# (every other column of the encoded frame).
y = adv.loc[:, 'Clicked on Ad']
X = adv.drop('Clicked on Ad', axis=1)

In [9]:
# Hold out 30% of the rows for testing; the fixed seed makes the shuffled
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=13,
)

In [11]:
# Baseline gradient-boosting classifier with hand-picked hyperparameters.
# max_features=0.6 makes every split consider a random 60% subset of the
# columns, so the fit is stochastic; random_state pins that randomness so a
# Restart & Run All reproduces the same trees and the same metrics below.
model = GradientBoostingClassifier(
    n_estimators=250,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features=0.6,
    loss='log_loss',
    random_state=13,  # same seed value as the train/test split
)
model.fit(X_train, y_train)

In [12]:
# Class predictions of the baseline model on the held-out rows.
y_predict = model.predict(X=X_test)

In [13]:
# Confusion matrix for the baseline model
# (rows: true class, columns: predicted class).
print(confusion_matrix(y_true=y_test, y_pred=y_predict))

[[142   2]
 [  9 147]]


In [14]:
# Per-class precision / recall / F1 for the baseline model.
print(classification_report(y_true=y_test, y_pred=y_predict))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96       144
           1       0.99      0.94      0.96       156

    accuracy                           0.96       300
   macro avg       0.96      0.96      0.96       300
weighted avg       0.96      0.96      0.96       300



## With GridSearchCV

In [38]:
# Cross-validated search over the boosting hyperparameters.
# The grid spans the hand-tuned settings used for `model` (250 trees,
# learning rate 0.1) and the slower-learning alternative (200 trees,
# learning rate 0.01), so the search compares real candidates instead of
# scoring a single fixed combination.
param_grid = {
    'n_estimators': [200, 250],
    'learning_rate': [0.01, 0.1],
    'max_depth': [5],
    'min_samples_split': [4],
    'min_samples_leaf': [6],
    'max_features': [0.6],
}

# A fresh, seeded estimator keeps the search independent of the already
# fitted `model` and makes the random feature subsampling
# (max_features < 1) repeatable across runs.
boost_search = GridSearchCV(
    GradientBoostingClassifier(loss='log_loss', random_state=13),
    param_grid=param_grid,
    cv=5,       # explicit 5-fold CV (the library default)
    n_jobs=-1,  # candidate/fold fits are independent; run them in parallel
)
boost_search.fit(X_train, y_train)
print(boost_search.best_params_)

{'learning_rate': 0.01, 'max_depth': 5, 'max_features': 0.6, 'min_samples_leaf': 6, 'min_samples_split': 4, 'n_estimators': 200}


In [39]:
# Score the grid search's refit best estimator on the held-out rows.
# GridSearchCV.predict delegates to best_estimator_ (refit is left at its
# default, True). Note that this rebinds `y_predict`, replacing the
# baseline model's predictions.
y_predict = boost_search.predict(X_test)
print(
    confusion_matrix(y_test, y_predict),
    classification_report(y_test, y_predict),
    sep='\n',
)

[[140   4]
 [  9 147]]
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       144
           1       0.97      0.94      0.96       156

    accuracy                           0.96       300
   macro avg       0.96      0.96      0.96       300
weighted avg       0.96      0.96      0.96       300

