In [55]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

%pip install imblearn
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.utils import resample


Note: you may need to restart the kernel to use updated packages.


In [56]:
# Notebook configuration: where the input CSV lives and which column is the label.
DATASET_PATH = './dataset_t/'    # directory; the trailing slash matters because the file name is appended directly
DATASET_FILE = 'dataset_t.csv'   # CSV file inside DATASET_PATH
TARGET = 'is_click'              # name of the 0/1 label column

## Import dataset

In [57]:
# Read the whole CSV into a DataFrame; the location comes from the config constants.
dataset = pd.read_csv(f"{DATASET_PATH}{DATASET_FILE}")

# Keep the frame's dimensions (rows, columns) and report the row count.
dataset_rows, dataset_columns = dataset.shape
print(f"dataset_rows = {dataset_rows}")

dataset_rows = 463291


## Model

In [58]:
# Separate the label column from the feature columns.
y = dataset[TARGET]
X = dataset.drop(columns=TARGET)

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Class counts in the training part.
print(Counter(y_train))

Counter({0: 345567, 1: 25065})


In [59]:
# Rebalance the training split only; the test split is not resampled.
# sampling_strategy=0.6 requests a minority/majority ratio of 0.6 from the
# over-sampling step; the Tomek-link cleaning that follows then drops some
# samples from the result.
smote = SMOTETomek(sampling_strategy=0.6, random_state=42)
X_train_b, y_train_b = smote.fit_resample(X_train, y_train)

# Class counts after resampling.
print(Counter(y_train_b))

Counter({0: 324102, 1: 185875})


In [60]:
# Base estimator handed to the hyper-parameter search; seeded for repeatability.
model = DecisionTreeClassifier(
    random_state=42,
)

In [61]:
# Alternative search space, currently disabled:
#param_grid = {
#    'criterion': ['entropy'],  # Criterion to measure the quality of a split
#    'max_depth': [None, 5, 15, 30],  # Maximum depth of the tree
#    'min_samples_split': [5, 10, 15],  # Minimum number of samples required to split an internal node
#    'min_samples_leaf': [5, 10, 15],  # Minimum number of samples required to be at a leaf node
#    'max_features': [None, 'sqrt', 'log2']  # Number of features to consider when looking for the best split
#}

# Active search space: 1 x 1 x 3 x 2 x 1 = 6 candidate combinations.
param_grid = dict(
    criterion=['entropy'],            # split-quality measure
    max_depth=[None],                 # maximum tree depth
    min_samples_split=[15, 20, 25],   # minimum samples needed to split an internal node
    min_samples_leaf=[3, 4],          # minimum samples required at a leaf
    max_features=[None],              # features considered when looking for the best split
)

In [62]:
# 5-fold cross-validated search over param_grid, using all CPU cores.
# Candidates are ranked by F1 on the positive class instead of the
# GridSearchCV default (the estimator's accuracy): the target is heavily
# imbalanced, and F1 is the metric this notebook reports at the end.
# NOTE(review): the search is fitted on already-resampled data, so synthetic
# samples can land on both sides of a CV split and inflate the fold scores;
# consider resampling inside an imblearn Pipeline instead — TODO confirm.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [63]:
# Run the cross-validated search on the rebalanced training data.
grid_search.fit(X=X_train_b, y=y_train_b)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [64]:
# Report the hyper-parameter combination that won the search.
print(f"Best parameters found: {grid_search.best_params_}")

# Predict on the held-out test split with the best estimator from the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

Best parameters found: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 15}
              precision    recall  f1-score   support

           0       0.93      0.88      0.91     86393
           1       0.08      0.14      0.10      6266

    accuracy                           0.83     92659
   macro avg       0.51      0.51      0.50     92659
weighted avg       0.88      0.83      0.85     92659



In [65]:
# Confusion matrix on the test split: rows are true classes, columns are predictions.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

Confusion Matrix:
[[76435  9958]
 [ 5419   847]]


In [66]:
# F1 for the positive class (label 1) on the test split.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f'F1 Score: {f1}')

F1 Score: 0.09923261671841134
