In [36]:
# Defines
#=================================================#
IS_TRAIN_NOT_TEST = True

ENV_LOCAL_MACHINE   = 1
ENV_GOOGLE_COLLABS  = 2
ENV_KAGGLE          = 3
ENVIRONMENT = ENV_LOCAL_MACHINE

#=================================================#
RANDOM_SEED = 214
OVERSAMPLING_RATIO = 0.3
UNDERSAMPLING_RATIO = 0.8
N_JOBS = -1

#=================================================#
CSV_DATASET_INPUT                   = "dataset_transformed.csv"
CSV_DATASET_OUTPUT_TEST             = "dataset_balanced_test.csv"
CSV_DATASET_OUTPUT_TRAIN            = "dataset_balanced_train.csv"
CSV_DATASET_OUTPUT_RUS_TRAIN        = "dataset_balanced_train_rus.csv"
CSV_DATASET_OUTPUT_NM_TRAIN         = "dataset_balanced_train_nm.csv"
CSV_DATASET_OUTPUT_SMOTE_TRAIN      = "dataset_balanced_train_smote.csv"
CSV_DATASET_OUTPUT_ADASYN_TRAIN     = "dataset_balanced_train_adasyn.csv"
CSV_DATASET_OUTPUT_TOMEK_TRAIN      = "dataset_balanced_train_tomek.csv"
CSV_DATASET_OUTPUT_ENN_TRAIN        = "dataset_balanced_train_enn.csv"

if ENVIRONMENT == ENV_LOCAL_MACHINE:
    PATH_DATASET_INPUT  = "./outputs/"
    PATH_DATASET_OUTPUT = "./outputs/"
if ENVIRONMENT == ENV_GOOGLE_COLLABS:
    PATH_DATASET_INPUT  = "drive/MyDrive/UTN_Finales/[F] Aprendizaje Automatico/Repositorio/outputs/"
    PATH_DATASET_OUTPUT = "drive/MyDrive/UTN_Finales/[F] Aprendizaje Automatico/Repositorio/outputs/"

FEATURE_TARGET = "is_click"

In [18]:
# Import libraries
import pandas as pd
import numpy as np

%pip install imblearn
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import RandomUnderSampler, NearMiss

from sklearn.model_selection import train_test_split
from collections import Counter

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
# Import dataset
if ENVIRONMENT == ENV_GOOGLE_COLLABS:
    from google.colab import drive
    drive.mount('/content/drive')
    
dataset = pd.read_csv(PATH_DATASET_INPUT+CSV_DATASET_INPUT)
dataset_size = dataset.size

In [20]:
X = dataset.drop(FEATURE_TARGET, axis=1)
y = dataset[FEATURE_TARGET]

# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=RANDOM_SEED, stratify=y)
print(Counter(y_train))

Counter({0: 345567, 1: 25065})


In [37]:
# Save test dataset apart to another file
pd.concat([X_test, y_test], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_TEST, index=False)

# Save imbalanced train dataset apart to another file
pd.concat([X_train, y_train], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_TRAIN, index=False)


# RANDOM UNDER SAMPLING

In [22]:
counter = []
for i in Counter(y_train).values():
    counter.append(i)
counter_0 = counter[0]
counter_1 = counter[1]

In [23]:
rus = RandomUnderSampler(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO), 1: counter_1},
    random_state=RANDOM_SEED,
    replacement=False
)
X_train_us_rus, y_train_us_rus = rus.fit_resample(X_train, y_train)
print(Counter(y_train_us_rus))

Counter({0: 276453, 1: 25065})


In [24]:
pd.concat([X_train_us_rus, y_train_us_rus], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_RUS_TRAIN, index=False)

# NEAR MISS

In [25]:
nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))


Counter({0: 276453, 1: 25065})


In [26]:
pd.concat([X_train_us_rus, y_train_us_rus], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_NM_TRAIN, index=False)

# SMOTE

In [27]:
smote = SMOTE(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    k_neighbors=5,
    n_jobs=N_JOBS
)
X_train_os_smote, y_train_os_smote = smote.fit_resample(X_train, y_train)
print(Counter(y_train_os_smote))




Counter({0: 345567, 1: 103670})


In [28]:
pd.concat([X_train_os_smote, y_train_os_smote], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_SMOTE_TRAIN, index=False)

# ADASYN

In [29]:
adasyn = ADASYN(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    n_neighbors=5,
    n_jobs=N_JOBS
)
X_train_os_ada, y_train_os_ada = adasyn.fit_resample(X_train, y_train)
print(Counter(y_train_os_ada))



Counter({0: 345567, 1: 110760})


In [30]:
pd.concat([X_train_os_ada, y_train_os_ada], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_ADASYN_TRAIN, index=False)

# SMOTE + Tomek Links

In [31]:
smote_tomek = SMOTETomek(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    tomek=None,
    n_jobs=N_JOBS
)
X_train_os_tom, y_train_os_tom = smote_tomek.fit_resample(X_train, y_train)
print(Counter(y_train_os_tom))



Counter({0: 329045, 1: 87148})


In [32]:
pd.concat([X_train_os_tom, y_train_os_tom], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_TOMEK_TRAIN, index=False)

# SMOTE + ENN

In [33]:
smote_enn = SMOTEENN(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    enn=None,
    n_jobs=N_JOBS
)
X_train_os_enn, y_train_os_enn = smote_enn.fit_resample(X_train, y_train)
print(Counter(y_train_os_enn))



Counter({0: 243721, 1: 17357})


In [34]:
pd.concat([X_train_os_enn, y_train_os_enn], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_ENN_TRAIN, index=False)

# SMOTE Tomek Links + Near Miss

In [43]:
CSV_DATASET_OUTPUT_NM80_TOMEK30_TRAIN      = "dataset_balanced_train_nm80_tomek30.csv"
CSV_DATASET_OUTPUT_NM65_TOMEK30_TRAIN      = "dataset_balanced_train_nm65_tomek30.csv"
CSV_DATASET_OUTPUT_NM80_TOMEK50_TRAIN      = "dataset_balanced_train_nm80_tomek50.csv"
CSV_DATASET_OUTPUT_NM90_TOMEK15_TRAIN      = "dataset_balanced_train_nm90_tomek15.csv"

## 0.80 / 0.30

In [38]:
UNDERSAMPLING_RATIO_T = 0.8
OVERSAMPLING_RATIO_T = 0.3

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_tomek = SMOTETomek(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    tomek=None,
    n_jobs=N_JOBS
)
X_train_us80_os30, y_train_us80_os30 = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us80_os30))

pd.concat([X_train_us80_os30, y_train_us80_os30], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_NM80_TOMEK30_TRAIN, index=False)


Counter({0: 276453, 1: 25065})




Counter({0: 262602, 1: 69084})


NameError: name 'CSV_DATASET_OUTPUT_NM80_TOMEK30_TRAIN' is not defined

## 0.65 / 0.30

In [41]:
UNDERSAMPLING_RATIO_T = 0.65
OVERSAMPLING_RATIO_T = 0.30

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_tomek = SMOTETomek(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    tomek=None,
    n_jobs=N_JOBS
)
X_train_us65_os30, y_train_us65_os30 = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us65_os30))

pd.concat([X_train_us65_os30, y_train_us65_os30], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_NM65_TOMEK30_TRAIN, index=False)

Counter({0: 224618, 1: 25065})




Counter({0: 213883, 1: 56650})


## 0.80 / 0.50

In [42]:
UNDERSAMPLING_RATIO_T = 0.8
OVERSAMPLING_RATIO_T = 0.5

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_tomek = SMOTETomek(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    tomek=None,
    n_jobs=N_JOBS
)
X_train_us80_os50, y_train_us80_os50 = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us80_os50))

pd.concat([X_train_us80_os50, y_train_us80_os50], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_NM80_TOMEK50_TRAIN, index=False)

Counter({0: 276453, 1: 25065})




Counter({0: 262602, 1: 69084})


## 0.90 / 0.15

In [44]:
UNDERSAMPLING_RATIO_T = 0.9
OVERSAMPLING_RATIO_T = 0.15

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_tomek = SMOTETomek(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    tomek=None,
    n_jobs=N_JOBS
)
X_train_us90_os15, y_train_us90_os15 = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us90_os15))

pd.concat([X_train_us90_os15, y_train_us90_os15], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_NM90_TOMEK15_TRAIN, index=False)

Counter({0: 311010, 1: 25065})




Counter({0: 295558, 1: 77851})


# SMOTE ENN + Near Miss

In [48]:
CSV_DATASET_OUTPUT_NM90_ENN15_TRAIN = "dataset_balanced_train_nm90_enn15.csv"
CSV_DATASET_OUTPUT_NM90_ENN30_TRAIN = "dataset_balanced_train_nm90_enn30.csv"

## 0.90 / 0.15

In [47]:
UNDERSAMPLING_RATIO_T = 0.9
OVERSAMPLING_RATIO_T = 0.15

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_enn = SMOTEENN(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    enn=None,
    n_jobs=N_JOBS
)
X_train_us90_os15_enn, y_train_us90_os15_enn = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us90_os15_enn))

pd.concat([X_train_us90_os15_enn, y_train_us90_os15_enn], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_NM90_ENN15_TRAIN, index=False)

Counter({0: 311010, 1: 25065})




Counter({0: 295558, 1: 77851})


## 0.90 / 0.30

In [49]:
UNDERSAMPLING_RATIO_T = 0.9
OVERSAMPLING_RATIO_T = 0.30

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_enn = SMOTEENN(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    enn=None,
    n_jobs=N_JOBS
)
X_train_us90_os30_enn, y_train_us90_os30_enn = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us90_os30_enn))

pd.concat([X_train_us90_os30_enn, y_train_us90_os30_enn], axis=1).to_csv(PATH_DATASET_OUTPUT+CSV_DATASET_OUTPUT_NM90_ENN30_TRAIN, index=False)

Counter({0: 311010, 1: 25065})




Counter({0: 295558, 1: 77851})
