In [32]:
# Defines
#=================================================#
IS_TRAIN_NOT_TEST = True

ENV_LOCAL_MACHINE   = 1
ENV_GOOGLE_COLLABS  = 2
ENV_KAGGLE          = 3
ENVIRONMENT = ENV_LOCAL_MACHINE

#=================================================#
RANDOM_SEED = 214
OVERSAMPLING_RATIO = 0.3
UNDERSAMPLING_RATIO = 0.8
N_JOBS = -1

#=================================================#
PATH_INPUT              = "dataset/creditcard/"
PATH_OUTPUT             = "outputs/creditcard/"

CSV_DATASET_RAW                     = "creditcard.csv"
CSV_DATASET_TRAIN                   = "dataset_train.csv"
CSV_DATASET_TEST                    = "dataset_test.csv"
CSV_DATASET_TRAIN_TRANSFORMED       = "dataset_train_t.csv"
CSV_DATASET_TEST_TRANSFORMED        = "dataset_test_t.csv"
CSV_DATASET_VAL_TRANSFORMED         = "dataset_val_t.csv"

CSV_DATASET_BALANCED_RUS_TRAIN      = "dataset_train_t_b_rus.csv"
CSV_DATASET_BALANCED_NM_TRAIN       = "dataset_train_t_b_nm.csv"
CSV_DATASET_BALANCED_SMOTE_TRAIN    = "dataset_train_t_b_smote.csv"
CSV_DATASET_BALANCED_ADASYN_TRAIN   = "dataset_train_t_b_adasyn.csv"
CSV_DATASET_BALANCED_TOMEK_TRAIN    = "dataset_train_t_b_tomek.csv"
CSV_DATASET_BALANCED_ENN_TRAIN      = "dataset_train_t_b_enn.csv"

CSV_DATASET_BALANCED_NM80_TOMEK30_TRAIN      = "dataset_train_t_b_nm80_tomek30.csv"
CSV_DATASET_BALANCED_NM65_TOMEK30_TRAIN      = "dataset_train_t_b_nm65_tomek30.csv"
CSV_DATASET_BALANCED_NM80_TOMEK50_TRAIN      = "dataset_train_t_b_nm80_tomek50.csv"
CSV_DATASET_BALANCED_NM90_TOMEK15_TRAIN      = "dataset_train_t_b_nm90_tomek15.csv"

if ENVIRONMENT == ENV_LOCAL_MACHINE:
    PATH_DATASET_BASE  = "../"
if ENVIRONMENT == ENV_GOOGLE_COLLABS:
    PATH_DATASET_BASE  = "drive/MyDrive/UTN_Finales/[F] Aprendizaje Automatico/Repositorio/"

FEATURE_TARGET = "Class"

In [33]:
# Import libraries
import pandas as pd
import numpy as np

%pip install imblearn
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import RandomUnderSampler, NearMiss

from sklearn.model_selection import train_test_split
from collections import Counter

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
# Import dataset
if ENVIRONMENT == ENV_GOOGLE_COLLABS:
    from google.colab import drive
    drive.mount('/content/drive')
    
dataset = pd.read_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_TRAIN_TRANSFORMED)
dataset_size = dataset.size

In [35]:
X = dataset.drop(FEATURE_TARGET, axis=1)
y = dataset[FEATURE_TARGET]

# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=RANDOM_SEED, stratify=y)
print(Counter(y_train))

Counter({0: 68236, 1: 118})


In [36]:
# Save test validation dataset apart to another file
dataset_val = pd.concat([X_test, y_test], axis=1)
dataset_val.to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_VAL_TRANSFORMED, index=False)

# Save train dataset apart to another file
dataset = pd.concat([X_train, y_train], axis=1)
dataset.to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_TRAIN_TRANSFORMED, index=False)


# RANDOM UNDER SAMPLING

In [37]:
counter = []
for i in Counter(y_train).values():
    counter.append(i)
counter_0 = counter[0]
counter_1 = counter[1]

In [38]:
rus = RandomUnderSampler(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO), 1: counter_1},
    random_state=RANDOM_SEED,
    replacement=False
)
X_train_us_rus, y_train_us_rus = rus.fit_resample(X_train, y_train)
print(Counter(y_train_us_rus))

Counter({0: 54588, 1: 118})


In [39]:
pd.concat([X_train_us_rus, y_train_us_rus], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_RUS_TRAIN, index=False)

# NEAR MISS

In [40]:
for parameter in dataset.columns:
    print(f"NaN counts for '{parameter}': ", dataset[parameter].isna().sum()+np.isinf(dataset[parameter]).sum())

NaN counts for 'Time':  0
NaN counts for 'V1':  0
NaN counts for 'V2':  0
NaN counts for 'V3':  0
NaN counts for 'V4':  0
NaN counts for 'V5':  0
NaN counts for 'V6':  0
NaN counts for 'V7':  0
NaN counts for 'V8':  0
NaN counts for 'V9':  0
NaN counts for 'V10':  0
NaN counts for 'V11':  0
NaN counts for 'V12':  0
NaN counts for 'V14':  0
NaN counts for 'V16':  0
NaN counts for 'V17':  0
NaN counts for 'V18':  0
NaN counts for 'V19':  0
NaN counts for 'V20':  0
NaN counts for 'V21':  0
NaN counts for 'V22':  0
NaN counts for 'V27':  0
NaN counts for 'V28':  0
NaN counts for 'Class':  0


In [41]:
nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))


Counter({0: 54588, 1: 118})


In [42]:
pd.concat([X_train_us_rus, y_train_us_rus], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_NM_TRAIN, index=False)

# SMOTE

In [43]:
smote = SMOTE(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    k_neighbors=5,
    n_jobs=N_JOBS
)
X_train_os_smote, y_train_os_smote = smote.fit_resample(X_train, y_train)
print(Counter(y_train_os_smote))




Counter({0: 68236, 1: 20470})


In [44]:
pd.concat([X_train_os_smote, y_train_os_smote], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_SMOTE_TRAIN, index=False)

# ADASYN

In [45]:
adasyn = ADASYN(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    n_neighbors=5,
    n_jobs=N_JOBS
)
X_train_os_ada, y_train_os_ada = adasyn.fit_resample(X_train, y_train)
print(Counter(y_train_os_ada))



Counter({0: 68236, 1: 20489})


In [46]:
pd.concat([X_train_os_ada, y_train_os_ada], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_ADASYN_TRAIN, index=False)

# SMOTE + Tomek Links

In [47]:
smote_tomek = SMOTETomek(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    tomek=None,
    n_jobs=N_JOBS
)
X_train_os_tom, y_train_os_tom = smote_tomek.fit_resample(X_train, y_train)
print(Counter(y_train_os_tom))



Counter({0: 68214, 1: 20448})


In [48]:
pd.concat([X_train_os_tom, y_train_os_tom], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_TOMEK_TRAIN, index=False)

# SMOTE + ENN

In [49]:
smote_enn = SMOTEENN(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    enn=None,
    n_jobs=N_JOBS
)
X_train_os_enn, y_train_os_enn = smote_enn.fit_resample(X_train, y_train)
print(Counter(y_train_os_enn))



Counter({0: 67738, 1: 20362})


In [50]:
pd.concat([X_train_os_enn, y_train_os_enn], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_ENN_TRAIN, index=False)

# SMOTE Tomek Links + Near Miss

## 0.80 / 0.30

In [51]:
UNDERSAMPLING_RATIO_T = 0.8
OVERSAMPLING_RATIO_T = 0.3

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_tomek = SMOTETomek(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    tomek=None,
    n_jobs=N_JOBS
)
X_train_us80_os30, y_train_us80_os30 = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us80_os30))

pd.concat([X_train_us80_os30, y_train_us80_os30], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_NM80_TOMEK30_TRAIN, index=False)


Counter({0: 54588, 1: 118})




Counter({0: 54588, 1: 16376})


## 0.65 / 0.30

In [52]:
UNDERSAMPLING_RATIO_T = 0.65
OVERSAMPLING_RATIO_T = 0.30

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_tomek = SMOTETomek(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    tomek=None,
    n_jobs=N_JOBS
)
X_train_us65_os30, y_train_us65_os30 = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us65_os30))

pd.concat([X_train_us65_os30, y_train_us65_os30], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_NM65_TOMEK30_TRAIN, index=False)

Counter({0: 44353, 1: 118})




Counter({0: 44353, 1: 13305})


## 0.80 / 0.50

In [53]:
UNDERSAMPLING_RATIO_T = 0.8
OVERSAMPLING_RATIO_T = 0.5

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_tomek = SMOTETomek(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    tomek=None,
    n_jobs=N_JOBS
)
X_train_us80_os50, y_train_us80_os50 = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us80_os50))

pd.concat([X_train_us80_os50, y_train_us80_os50], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_NM80_TOMEK50_TRAIN, index=False)

Counter({0: 54588, 1: 118})




Counter({0: 54588, 1: 16376})


## 0.90 / 0.15

In [54]:
UNDERSAMPLING_RATIO_T = 0.9
OVERSAMPLING_RATIO_T = 0.15

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_tomek = SMOTETomek(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    tomek=None,
    n_jobs=N_JOBS
)
X_train_us90_os15, y_train_us90_os15 = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us90_os15))

pd.concat([X_train_us90_os15, y_train_us90_os15], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_NM90_TOMEK15_TRAIN, index=False)

Counter({0: 61412, 1: 118})




Counter({0: 61412, 1: 18423})


# SMOTE ENN + Near Miss

In [55]:
CSV_DATASET_BALANCED_NM90_ENN15_TRAIN = "dataset_balanced_train_nm90_enn15.csv"
CSV_DATASET_BALANCED_NM90_ENN30_TRAIN = "dataset_balanced_train_nm90_enn30.csv"

## 0.90 / 0.15

In [56]:
UNDERSAMPLING_RATIO_T = 0.9
OVERSAMPLING_RATIO_T = 0.15

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_enn = SMOTEENN(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    enn=None,
    n_jobs=N_JOBS
)
X_train_us90_os15_enn, y_train_us90_os15_enn = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us90_os15_enn))

pd.concat([X_train_us90_os15_enn, y_train_us90_os15_enn], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_NM90_ENN15_TRAIN, index=False)

Counter({0: 61412, 1: 118})




Counter({0: 61412, 1: 18423})


## 0.90 / 0.30

In [57]:
UNDERSAMPLING_RATIO_T = 0.9
OVERSAMPLING_RATIO_T = 0.30

nm = NearMiss(
    sampling_strategy={0: int(counter_0*UNDERSAMPLING_RATIO_T), 1: counter_1},
    version=1)
X_train_us_nm, y_train_us_nm = nm.fit_resample(X_train, y_train)
print(Counter(y_train_us_nm))

smote_enn = SMOTEENN(
    sampling_strategy=OVERSAMPLING_RATIO,
    random_state=RANDOM_SEED,
    smote=None,
    enn=None,
    n_jobs=N_JOBS
)
X_train_us90_os30_enn, y_train_us90_os30_enn = smote_tomek.fit_resample(X_train_us_nm, y_train_us_nm)
print(Counter(y_train_us90_os30_enn))

pd.concat([X_train_us90_os30_enn, y_train_us90_os30_enn], axis=1).to_csv(PATH_DATASET_BASE+PATH_OUTPUT+CSV_DATASET_BALANCED_NM90_ENN30_TRAIN, index=False)

Counter({0: 61412, 1: 118})




Counter({0: 61412, 1: 18423})
