# Prepping data

In [87]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

addata = pd.read_csv('ad_click_dataset.csv')
addata = addata.drop(['id', 'full_name'], axis=1)

####### Imputing numerical feature ######
# Edit: because feature importance seems to be REALLY high for the age feature, going to try out different imputations 

# # Simple imputer
# num_imputer = SimpleImputer(strategy = 'median')
# addata['age'] = num_imputer.fit_transform(addata[['age']])

# # Aribtrary value imputation 
# addata.fillna({'age': -999}, inplace=True)

# Multiple Imputation (miceforest)
import miceforest as mf 
kernel = mf.ImputationKernel(
    data = addata, 
    save_all_iterations_data=True, 
    random_state=42
)
kernel.mice(iterations=2)
addata = kernel.complete_data()

# End Tail Imputation 
from feature_engine.imputation import EndTailImputer
end_tail_imputer = EndTailImputer(imputation_method='gaussian', tail='right', fold=3, variables=['age'])
end_tail_imputer.fit(addata)
addata = end_tail_imputer.transform(addata)

###### Imputing categorical features ######
cat_cols = ['gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']
cat_imputer = SimpleImputer(strategy = 'most_frequent')
addata[cat_cols] = cat_imputer.fit_transform(addata[cat_cols])

# Encoding categorical variables 
addata = pd.get_dummies(addata, columns = cat_cols, drop_first= True)
display(addata.head())

# Splitting data 
X = addata.drop('click', axis=1)
y = addata['click']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2) 

AssertionError: convert object dtypes to something else

In [88]:
addata.dtypes

age                 float64
gender               object
device_type          object
ad_position          object
browsing_history     object
time_of_day          object
click                 int64
dtype: object

## Random Forest

In [80]:
from sklearn.ensemble import RandomForestClassifier

############## Random forest model 
rf = RandomForestClassifier(random_state=2)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report (Random Forest):\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nAUC-ROC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

Random Forest Accuracy: 0.723

Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       0.66      0.45      0.53       707
           1       0.74      0.87      0.80      1293

    accuracy                           0.72      2000
   macro avg       0.70      0.66      0.67      2000
weighted avg       0.71      0.72      0.71      2000

Confusion Matrix:
 [[ 315  392]
 [ 162 1131]]

AUC-ROC: 0.7524697779688476


## XGBoost

In [81]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

############### XGBoost algorithm 
XGBoost_model = XGBClassifier(
    object='binary:logistic', 
    random_state=2, 
    n_estimators=200, 
    max_depth=7,
    learning_rate=0.2,
    subsample=0.8, 
    colample_bytree=0.8,
    scale_pos_weight= float(len(y_train[y_train == 0]) / len(y_train[y_train == 1]))
)

XGBoost_model.fit(X_train, y_train)
y_pred_xgboost = XGBoost_model.predict(X_test)

print("\nXGBoost Accuracy:", accuracy_score(y_test, y_pred_xgboost))
print("\nClassification Report (XGBoost):\n", classification_report(y_test, y_pred_xgboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgboost))
print("\nAUC-ROC:", roc_auc_score(y_test, XGBoost_model.predict_proba(X_test)[:, 1]))

#### Using gridsearchcv for best xgboost parameters 
# param_grid = {
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'n_estimators': [50, 100, 200]
# }

# grid = GridSearchCV(XGBClassifier(objective='binary:logistic', random_state=2), param_grid, cv=5, n_jobs=-1)
# grid.fit(X_train, y_train)
# print("Best parameters:", grid.best_params_)

Parameters: { "colample_bytree", "object" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost Accuracy: 0.718

Classification Report (XGBoost):
               precision    recall  f1-score   support

           0       0.60      0.62      0.61       707
           1       0.79      0.77      0.78      1293

    accuracy                           0.72      2000
   macro avg       0.69      0.70      0.69      2000
weighted avg       0.72      0.72      0.72      2000

Confusion Matrix:
 [[440 267]
 [297 996]]

AUC-ROC: 0.7744349675272466


# Decision Tree Classifier

In [82]:
######## Decison Tree Classifier 
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {classes[i]: weights[i] for i in range(len(classes))}

DTC_model = DecisionTreeClassifier(
    criterion = 'entropy', 
    splitter = 'best', 
    max_depth = None, 
    min_samples_split=2, 
    min_samples_leaf=1, 
    class_weight=None, 
    random_state=42
)

DTC_model.fit(X_train, y_train)
y_pred_DTC = DTC_model.predict(X_test)

print("\nDecision Tree Classifier Accuracy:", accuracy_score(y_test, y_pred_DTC))
print("\nClassification Report (Decision Tree Classifier):\n", classification_report(y_test, y_pred_DTC))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_DTC))
print("\nAUC-ROC:", roc_auc_score(y_test, DTC_model.predict_proba(X_test)[:, 1]))

# ########## Again, using gridsearchcv for parameter tuning 
# param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [3, 5, 7, 10, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'class_weight': ['balanced', class_weights, None]
# }

# grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# grid.fit(X_train, y_train)

# # print(grid.best_params_, grid.best_estimator_)
# best_clf = grid.best_estimator_
# print(accuracy_score(y_test, best_clf.predict(X_test)))

# ########## Checking for feature importance 
# importances = DTC_model.feature_importances_
# feature_importance = pd.DataFrame({'feature': X.columns, 'importance': importances})
# print(feature_importance.sort_values('importance', ascending=False))


Decision Tree Classifier Accuracy: 0.741

Classification Report (Decision Tree Classifier):
               precision    recall  f1-score   support

           0       0.69      0.49      0.57       707
           1       0.76      0.88      0.81      1293

    accuracy                           0.74      2000
   macro avg       0.72      0.68      0.69      2000
weighted avg       0.73      0.74      0.73      2000

Confusion Matrix:
 [[ 344  363]
 [ 155 1138]]

AUC-ROC: 0.7331414613121903


In [83]:
# from sklearn.model_selection import train_test_split, KFold, cross_val_score
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC
# from sklearn.naive_bayes import GaussianNB
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# # Assuming X and y are already defined
# models = [
#     ('LR', LogisticRegression()),
#     ('LDA', LinearDiscriminantAnalysis()),
#     ('KNN', KNeighborsClassifier()),
#     ('CART', DecisionTreeClassifier()),
#     ('RF', RandomForestClassifier()),
#     ('NB', GaussianNB()),
#     ('SVM', SVC())
# ]

# results = []
# names = []
# for name, model in models:
#     kfold = KFold(n_splits=10, shuffle=True, random_state=2)
#     cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
#     results.append(cv_results)
#     names.append(name)
#     print(f"{name}: {cv_results.mean():.3f} ({cv_results.std():.3f})")
