In [72]:
# Defines
#=================================================#
# Run-mode flag. NOTE(review): not referenced by any cell visible in this notebook.
IS_TRAIN_NOT_TEST = True

# Supported execution environments; ENVIRONMENT selects where the dataset lives.
ENV_LOCAL_MACHINE   = 1
ENV_GOOGLE_COLLABS  = 2
ENV_KAGGLE          = 3
ENVIRONMENT = ENV_LOCAL_MACHINE

#=================================================#
# File names (joined onto PATH_DATASET_INPUT / PATH_DATASET_OUTPUT below).
CSV_DATASET_INPUT           = "dataset_t.csv"
CSV_DATASET_OUTPUT_TEST     = "dataset_balanced_test.csv"
CSV_DATASET_OUTPUT_SMOTE_TRAIN      = "dataset_balanced_smote_train.csv"
CSV_DATASET_OUTPUT_ADASYN_TRAIN     = "dataset_balanced_adasyn_train.csv"
CSV_DATASET_OUTPUT_TOMEK_TRAIN      = "dataset_balanced_tomek_train.csv"
CSV_DATASET_OUTPUT_ENN_TRAIN        = "dataset_balanced_enn_train.csv"


# Directory prefixes are concatenated directly with the file names, so they
# must keep their trailing slash.
if ENVIRONMENT == ENV_LOCAL_MACHINE:
    PATH_DATASET_INPUT  = "./dataset_t/"
    PATH_DATASET_OUTPUT = "./dataset_t/"
elif ENVIRONMENT == ENV_GOOGLE_COLLABS:
    PATH_DATASET_INPUT  = "drive/MyDrive/UTN_Finales/[F] Aprendizaje Automatico/Repositorio/dataset_t/"
    PATH_DATASET_OUTPUT = "drive/MyDrive/UTN_Finales/[F] Aprendizaje Automatico/Repositorio/dataset_t/"
else:
    # Fail here rather than with a NameError several cells later: no paths are
    # configured for ENV_KAGGLE (or any other value) yet.
    raise ValueError(
        f"No dataset paths configured for ENVIRONMENT={ENVIRONMENT!r}; "
        "add PATH_DATASET_INPUT / PATH_DATASET_OUTPUT for it."
    )

# Name of the label column that is separated from the features.
FEATURE_TARGET = "is_click"

# Seed shared by the train/test split and every resampler.
RANDOM_STATE = 214
# Passed as `sampling_strategy` to the over-sampling / combined resamplers.
OVERSAMPLING_RATIO = 0.3
# Fraction of class-0 training rows kept by the random under-sampler.
UNDERSAMPLING_RATIO = 0.8
# Worker count handed to estimators as `n_jobs` (-1 = all cores, scikit-learn convention).
N_JOBS = -1

In [73]:
# Import libraries
import pandas as pd
import numpy as np

if ENVIRONMENT == ENV_LOCAL_MACHINE:
    %pip install imblearn
else:
    !pip install imblearn
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from collections import Counter

Note: you may need to restart the kernel to use updated packages.


In [74]:
# Load the raw dataset from the configured input location.
dataset = pd.read_csv(f"{PATH_DATASET_INPUT}{CSV_DATASET_INPUT}")
# DataFrame.size is the total cell count (rows x columns), not the row count.
dataset_size = dataset.size

In [75]:
# Separate the label column from the predictors.
y = dataset[FEATURE_TARGET]
X = dataset.drop(columns=FEATURE_TARGET)

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=RANDOM_STATE,
)
# Class distribution of the training split before any resampling.
print(Counter(y_train))

Counter({0: 345567, 1: 25065})


In [76]:
# Write the untouched test split (features + label) to its own CSV so that
# resampling is only ever applied to the training data.
(
    pd.concat([X_test, y_test], axis="columns")
    .to_csv(f"{PATH_DATASET_OUTPUT}{CSV_DATASET_OUTPUT_TEST}", index=False)
)


# RANDOM UNDER SAMPLING

In [77]:
# Per-class row counts of the training split.
# Look the counts up by label, not by position: a Counter iterates in
# first-seen order, so after the shuffled split the first value is the count
# of whichever class happens to appear first in y_train.
class_counts = Counter(y_train)
counter_0 = class_counts[0]  # rows with label 0
counter_1 = class_counts[1]  # rows with label 1
# Kept for backward compatibility with the original list, now in label order.
counter = [counter_0, counter_1]

In [78]:
# Randomly under-sample class 0 down to UNDERSAMPLING_RATIO of its training
# rows while leaving class 1 untouched. Sampling is without replacement.
# NOTE(review): X_train_rus / y_train_rus are not consumed by the later cells
# visible here, which resample the original X_train / y_train.
rus_targets = {
    0: int(counter_0 * UNDERSAMPLING_RATIO),
    1: counter_1,
}
rus = RandomUnderSampler(
    sampling_strategy=rus_targets,
    replacement=False,
    random_state=RANDOM_STATE,
)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
print(Counter(y_train_rus))

Counter({0: 276453, 1: 25065})


# SMOTE

In [79]:
# SMOTE over-sampling of the training split, with OVERSAMPLING_RATIO as the
# sampling strategy.
# SMOTE's own `n_jobs` argument is deprecated (imbalanced-learn >= 0.10), so
# the worker count is set on the neighbour-search estimator instead.
# n_neighbors is 5 + 1 because the estimator's query result includes the
# sample itself; this matches the previous k_neighbors=5.
smote = SMOTE(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_STATE,
    k_neighbors=NearestNeighbors(n_neighbors=5 + 1, n_jobs=N_JOBS),
)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print(Counter(y_train_res))




Counter({0: 345567, 1: 103670})


In [80]:
# Persist the SMOTE-resampled training set (features + label).
# X_train_res / y_train_res are reused by every resampler in this notebook,
# so run this right after the SMOTE cell.
(
    pd.concat([X_train_res, y_train_res], axis="columns")
    .to_csv(f"{PATH_DATASET_OUTPUT}{CSV_DATASET_OUTPUT_SMOTE_TRAIN}", index=False)
)

# ADASYN

In [81]:
# ADASYN over-sampling of the training split, with OVERSAMPLING_RATIO as the
# sampling strategy.
# ADASYN's own `n_jobs` argument is deprecated (imbalanced-learn >= 0.10), so
# the worker count is set on the neighbour-search estimator instead.
# n_neighbors is 5 + 1 because the estimator's query result includes the
# sample itself; this matches the previous n_neighbors=5.
adasyn = ADASYN(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_STATE,
    n_neighbors=NearestNeighbors(n_neighbors=5 + 1, n_jobs=N_JOBS),
)
X_train_res, y_train_res = adasyn.fit_resample(X_train, y_train)
print(Counter(y_train_res))



Counter({0: 345567, 1: 110760})


In [82]:
# Persist the ADASYN-resampled training set (features + label).
# X_train_res / y_train_res are reused by every resampler in this notebook,
# so run this right after the ADASYN cell.
(
    pd.concat([X_train_res, y_train_res], axis="columns")
    .to_csv(f"{PATH_DATASET_OUTPUT}{CSV_DATASET_OUTPUT_ADASYN_TRAIN}", index=False)
)

# SMOTE + Tomek Links

In [83]:
# Combined resampling with SMOTETomek on the training split.
# smote=None / tomek=None leave the choice of the inner SMOTE and TomekLinks
# objects to the library.
smote_tomek = SMOTETomek(
    smote=None,
    tomek=None,
    sampling_strategy=OVERSAMPLING_RATIO,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)
X_train_res, y_train_res = smote_tomek.fit_resample(X_train, y_train)
# Class distribution after resampling.
print(Counter(y_train_res))



Counter({0: 329045, 1: 87148})


In [84]:
# Persist the SMOTETomek-resampled training set (features + label).
# X_train_res / y_train_res are reused by every resampler in this notebook,
# so run this right after the SMOTETomek cell.
(
    pd.concat([X_train_res, y_train_res], axis="columns")
    .to_csv(f"{PATH_DATASET_OUTPUT}{CSV_DATASET_OUTPUT_TOMEK_TRAIN}", index=False)
)

# SMOTE + ENN

In [85]:
# Combined resampling with SMOTEENN on the training split.
# smote=None / enn=None leave the choice of the inner SMOTE and
# EditedNearestNeighbours objects to the library.
smote_enn = SMOTEENN(
    smote=None,
    enn=None,
    sampling_strategy=OVERSAMPLING_RATIO,
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)
X_train_res, y_train_res = smote_enn.fit_resample(X_train, y_train)
# Class distribution after resampling.
print(Counter(y_train_res))



Counter({0: 243721, 1: 17357})


In [86]:
# Persist the SMOTEENN-resampled training set (features + label).
# X_train_res / y_train_res are reused by every resampler in this notebook,
# so run this right after the SMOTEENN cell.
(
    pd.concat([X_train_res, y_train_res], axis="columns")
    .to_csv(f"{PATH_DATASET_OUTPUT}{CSV_DATASET_OUTPUT_ENN_TRAIN}", index=False)
)