In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

%pip install imblearn
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report
from sklearn.utils import resample


Note: you may need to restart the kernel to use updated packages.


In [2]:
DATASET_PATH = "./dataset_t/"
DATASET_FILE = "dataset_t.csv"
TARGET = "is_click"

## Import dataset

In [3]:
# Import CSV
dataset = pd.read_csv(DATASET_PATH+DATASET_FILE)

# Get shape of dataset
dataset_rows    = dataset.shape[0]
dataset_columns = dataset.shape[1]
print(f"dataset_rows = {dataset_rows}")

dataset_rows = 463291


## Model

In [4]:
X = dataset.drop(TARGET, axis=1)
y = dataset[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42, stratify=y)
print(Counter(y_train))

Counter({0: 345567, 1: 25065})


In [6]:
smote = SMOTE(random_state=42, sampling_strategy=0.6)
X_train_b, y_train_b = smote.fit_resample(X_train, y_train)
print(Counter(y_train_b))

Counter({0: 345567, 1: 207340})


In [7]:
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

model = RandomForestClassifier(random_state=42)

In [9]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

In [10]:
grid_search.fit(X_train_b, y_train_b)

Fitting 3 folds for each of 1620 candidates, totalling 4860 fits


In [None]:
print(f"Best parameters found: {grid_search.best_params_}")

# Make predictions with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))

Best parameters found: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 15}
              precision    recall  f1-score   support

           0       0.93      0.92      0.93     86393
           1       0.08      0.10      0.09      6266

    accuracy                           0.86     92659
   macro avg       0.51      0.51      0.51     92659
weighted avg       0.88      0.86      0.87     92659



In [None]:
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
[[79537  6856]
 [ 5654   612]]


In [None]:
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

F1 Score: 0.0891218872870249
