In [56]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

%pip install imblearn
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report, recall_score
from sklearn.utils import resample


Note: you may need to restart the kernel to use updated packages.


In [57]:
# Notebook configuration
#=================================================#
# Mode flag. NOTE(review): no cell visible in this notebook reads it — confirm
# whether it is still needed.
IS_TRAIN_NOT_TEST = True

# Identifiers of the execution environments; ENVIRONMENT selects the active one.
ENV_LOCAL_MACHINE   = 1
ENV_GOOGLE_COLLABS  = 2
ENV_KAGGLE          = 3
ENVIRONMENT = ENV_LOCAL_MACHINE

#=================================================#
# CSV file names, relative to PATH_DATASET_INPUT: one test file plus several
# training files (presumably one per resampling method, judging by the names).
CSV_DATASET_INPUT_TEST          = "dataset_balanced_test.csv"
CSV_DATASET_INPUT_SMOTE_TRAIN   = "dataset_balanced_smote_train.csv"
CSV_DATASET_INPUT_ADASYN_TRAIN  = "dataset_balanced_adasyn_train.csv"
CSV_DATASET_INPUT_TOMEK_TRAIN   = "dataset_balanced_tomek_train.csv"
CSV_DATASET_INPUT_ENN_TRAIN     = "dataset_balanced_enn_train.csv"

# Directory prefix of the CSV files for the selected environment.
if ENVIRONMENT == ENV_LOCAL_MACHINE:
    PATH_DATASET_INPUT  = "./dataset_t/"
elif ENVIRONMENT == ENV_GOOGLE_COLLABS:
    PATH_DATASET_INPUT  = "drive/MyDrive/UTN_Finales/[F] Aprendizaje Automatico/Repositorio/dataset_t/"
else:
    # ENV_KAGGLE (or any other value) has no path configured. Fail here with a
    # clear message instead of a NameError at the first pd.read_csv call.
    raise ValueError(
        f"No dataset path is configured for ENVIRONMENT={ENVIRONMENT!r}; "
        "add a PATH_DATASET_INPUT branch for it."
    )

# Name of the label column in every CSV.
FEATURE_TARGET = "is_click"

## Import dataset

In [58]:
# Load the training CSV (on Colab, Google Drive is mounted first so the path resolves)
if ENVIRONMENT == ENV_GOOGLE_COLLABS:
    from google.colab import drive
    drive.mount('/content/drive')
dataset_train = pd.read_csv(f"{PATH_DATASET_INPUT}{CSV_DATASET_INPUT_TOMEK_TRAIN}")

# Separate the label column from the feature columns
y_train = dataset_train[FEATURE_TARGET]
X_train = dataset_train.drop(columns=[FEATURE_TARGET])

# Show the class distribution of the training labels
print(Counter(y_train))

Counter({0: 329045, 1: 87148})


In [59]:
# Load the test CSV from the same directory as the training data
dataset_test = pd.read_csv(f"{PATH_DATASET_INPUT}{CSV_DATASET_INPUT_TEST}")

# Separate the label column from the feature columns
y_test = dataset_test[FEATURE_TARGET]
X_test = dataset_test.drop(columns=[FEATURE_TARGET])

# Show the class distribution of the test labels
print(Counter(y_test))

Counter({0: 86393, 1: 6266})


## Model

In [60]:
# Base estimator handed to the hyper-parameter search; the fixed seed makes
# tree construction repeatable between runs.
model = DecisionTreeClassifier(
    random_state=42,
)

In [61]:
# DISABLED: exhaustive GridSearchCV over a small decision-tree grid. Every line
# in this cell is commented out, so nothing here runs; the notebook's active
# search is the RandomizedSearchCV cell.
# NOTE(review): if re-enabled, no `scoring` argument is passed, so the grid
# search would rank candidates by the estimator's default score instead of the
# 'f1_weighted' metric used by the randomized search.
#param_grid = {
#    'criterion': ['entropy'],  # Criterion to measure the quality of a split
#    'max_depth': [None, 5, 15, 30],  # Maximum depth of the tree
#    'min_samples_split': [5, 10, 15],  # Minimum number of samples required to split an internal node
#    'min_samples_leaf': [5, 10, 15],  # Minimum number of samples required to be at a leaf node
#    'max_features': [None, 'sqrt', 'log2']  # Number of features to consider when looking for the best split
#}
#
# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
# 
# grid_search.fit(X_train, y_train)
# 
# print(f"Best parameters found: {grid_search.best_params_}")
# 
# # Make predictions with the best model
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)

In [64]:
# Search space for the decision tree: every sampled candidate takes one value
# per key. np.arange excludes its upper bound, so the integer ranges end at 19.
param_dist = {
    'criterion': ['gini', 'entropy'],  # criteria for splitting
    'max_depth': np.arange(1, 20),  # maximum depth of the tree (1..19)
    'min_samples_split': np.arange(2, 20),  # minimum samples required to split an internal node (2..19)
    'min_samples_leaf': np.arange(1, 20)  # minimum samples required to be at a leaf node (1..19)
}

# NOTE(review): weighted F1 is dominated by the majority class. If detecting the
# positive class is the goal, scoring='f1' would target it directly — confirm
# the intended metric before changing it.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,  # number of parameter settings sampled
    scoring='f1_weighted',  # evaluation metric
    cv=5,  # 5-fold cross-validation
    n_jobs=-1,  # the 50 x 5 fits are independent, so run them on all cores
    random_state=42  # for reproducibility
)

random_search.fit(X_train, y_train)

print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score (weighted F1): {:.2f}".format(random_search.best_score_))

# Evaluate the best estimator (refit on the full training set) on the test set
best_model = random_search.best_estimator_

# For a classifier, .score() returns mean accuracy, not F1
test_score = best_model.score(X_test, y_test)
print("Test set score: {:.2f}".format(test_score))

# Test-set predictions; y_pred is reused by the evaluation cell
y_pred = best_model.predict(X_test)
test_f1_score = f1_score(y_test, y_pred, average='weighted')
print("Test set F1 score (weighted): {:.2f}".format(test_f1_score))

Best parameters found:  {'min_samples_split': np.int64(3), 'min_samples_leaf': np.int64(7), 'max_depth': np.int64(19), 'criterion': 'gini'}
Best cross-validation score (weighted F1): 0.84
Test set score: 0.92
Test set F1 score (weighted): 0.89


In [66]:
# Compute the test-set metrics first, then print them in a fixed order
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)  # default binary average: F1 of the positive label only
test_recall_score = recall_score(y_test, y_pred, average='weighted')

# Per-class precision / recall / F1 table
print(classification_report(y_test, y_pred))

# Rows are true labels, columns are predicted labels
print('Confusion Matrix:', conf_matrix, sep='\n')

print(f'F1 Score: {f1}')

print("Test set recall score (weighted): {:.2f}".format(test_recall_score))

              precision    recall  f1-score   support

           0       0.93      0.98      0.96     86393
           1       0.08      0.02      0.03      6266

    accuracy                           0.92     92659
   macro avg       0.50      0.50      0.49     92659
weighted avg       0.87      0.92      0.89     92659

Confusion Matrix:
[[85018  1375]
 [ 6153   113]]
F1 Score: 0.02914624709827186
Test set recall score (weighted): 0.92
