In [6]:

# This notebook is used to
# 1. Refine the classifiers to mitigate data imbalance
# 2. Create ensemble classifier
# 3. Perform feature ablation

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import copy
from joblib import dump, load

# Plot styling: start from the 'bmh' style sheet, then override a handful of
# rc settings group by group.
plt.style.use('bmh')
mpl.rc("grid", linestyle="dashed")
mpl.rc("axes", facecolor="white")
mpl.rc("axes.spines", top=False, right=False)  # hide the top/right frame lines
mpl.rc("legend", frameon=False)
mpl.rc("figure", figsize=(8, 5), dpi=300)

# suppress sklearn deprecated warnings
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accept any arguments, do nothing."""
    return None


# NOTE: rebinding warnings.warn discards every warning emitted through
# `warnings.warn(...)` from this point on, not only sklearn deprecations.
warnings.warn = warn

In [7]:
# TODO make test for dataset with creatina column
# Supported feature counts (each count includes the survive7y column):
#   18 -> dataset without thyroid features (default)
#   27 -> dataset with thyroid features
#   23 / 32 -> the same datasets plus the columns that have missing values
n_features = 27

# Any count other than 18 or 27 is stored one directory deeper (see mean_key).
extra_path = n_features not in (18, 27)
dropped_na_key = "dropped_na/"
mean_key = "mean/"

# All four locations share the layout "<root>/<n>features/[mean/]".
path, path_models, path_images, output_models = (
    f"{root}/{n_features}features/{mean_key if extra_path else ''}"
    for root in ("data", "models", "figures", "models_output")
)

print(path_models)
print(path)

models/27features/
data/27features/


### Sampling
Oversampling and undersampling methods used to mitigate class imbalance in the training data.

In [8]:
# Load the train / validation / test splits used by the sampling experiments
df_train, df_valid, df_test = (
    pd.read_csv(f"{path}{split}.csv", index_col=0)
    for split in ("train", "valid", "test")
)
# Sanity check: total number of rows across the three splits
print(sum(len(frame) for frame in (df_train, df_valid, df_test)))
feat_names = [*df_train.columns]

# Work on plain arrays: the last column goes to y_*, all preceding columns to X_*
train, valid, test = (frame.to_numpy() for frame in (df_train, df_valid, df_test))
X_train, X_valid, X_test = train[:, :-1], valid[:, :-1], test[:, :-1]
y_train, y_valid, y_test = train[:, -1], valid[:, -1], test[:, -1]

from utils import get_preprocess_std_num
preprocess_std = get_preprocess_std_num(feat_names)


# Preprocessed ready-to-use train and valid set.
# The preprocessor is fitted on the training split only and then applied to both.
process_tmp = preprocess_std.fit(X_train)
X_train_std, X_valid_std = (
    process_tmp.transform(split) for split in (X_train, X_valid)
)


6667


In [9]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
from collections import Counter
from train import evaluate
from utils import datasetSampler
from functools import partial
from train import evaluate
from contextlib import redirect_stdout
# Oversamplers under comparison. sampling_strategy=1.0 requests a 1:1
# minority/majority ratio after resampling. k_neighbors=1 is only a placeholder:
# the search loop below overrides it through set_params().
overs = [
    ("smote", SMOTE(sampling_strategy=1.0, k_neighbors=1)),
    ("bordersmote", BorderlineSMOTE(sampling_strategy=1.0, k_neighbors=1)),
    ("svmsmote", SVMSMOTE(sampling_strategy=1.0, k_neighbors=1)),
    # ADASYN(sampling_strategy=1.0, n_neighbors=1)
]

# Pre-bind the data that never changes between trials.
sampler = partial(
    datasetSampler,
    # missing parameters: model, overSampler, sampling_strategy
    X_train=X_train,
    y_train=y_train,
    X_valid=X_valid,
    y_valid=y_valid,
    useUnderSampler=False,  # default; overridden per trial in the loop below
)
random_ratio = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
name_models = ["lr", "svc", "knn", "rf", "adaboost", "nn", "gb", "xgb"]

# For every saved classifier, search over
#   (sampling ratio) x (sampling mode) x (oversampler) x (k_neighbors)
# and keep the configuration with the highest score returned by datasetSampler.
for name in name_models:
    # Best configuration found so far for this classifier
    best_k = 0
    best_over = ""
    best_score = 0
    best_random = 0
    best_model = None
    file_name = ""
    sampling_method = ""
    best_X, best_y = None, None

    for r in random_ratio:
        print("-------------------------------------------------")
        print(f"{'Oversample':<20}{'Score':<20}Random {r} Name {name}")
        print("-------------------------------------------------")
        # 1 -> oversampling only, 2 -> undersampling + oversampling
        for underSampler in range(1, 3):
            use_under = underSampler == 2
            print(f"########## Changing sampling mode ##########")
            for name_over, over in overs:
                for k in range(2, 5):
                    # Reload from disk so that every trial starts from the same
                    # saved estimator rather than one touched by a previous trial.
                    model = load(path_models + f"{name}.joblib")
                    over.set_params(k_neighbors=k)
                    result = sampler(
                        model_name=name,
                        model=model,
                        overSampler=over,
                        sampling_strategy=r,
                        useUnderSampler=use_under,
                    )
                    # result layout: (score, resampled X, resampled y, model, ...)
                    score = result[0]
                    X_train_sample = result[1]
                    y_train_sample = result[2]
                    model = result[3]
                    if score > best_score:
                        best_k = k
                        best_over = name_over
                        best_score = score
                        best_random = r
                        best_model = copy.deepcopy(model)
                        sampling_method = "undersampling + oversampling" if use_under else "only oversampling"
                        suffix = "_sampling" if use_under else "_only_oversampling"
                        file_name = output_models.replace("models_output/", '').replace("/", "") + suffix + ".txt"
                        best_X = X_train_sample
                        best_y = y_train_sample
                # in this case the score is the last score computed with k = 4
                print(f'{name_over:<20}{score:.3f}')

    # Nothing beat the initial score of 0: there is no dataset/model to persist.
    # Without this guard np.concatenate below would fail on best_X=None.
    if best_model is None:
        print(f"\nNo sampling configuration scored above 0 for {name}; nothing saved.\n")
        continue

    # Save the dataset and model
    combination = f"_random_{best_over}_{name}"
    tmp = np.concatenate((best_X, np.expand_dims(best_y, 1)), axis=1)
    tmp = pd.DataFrame(tmp, columns=feat_names)

    # Current model's statistics
    print("\n")
    print(f"Name model:      \t{name}")
    print(f"Sampling method: {sampling_method }")
    print(f"Best rand_ratio: \t{best_random}")
    print(f"Best k_neighbors:\t{best_k}")
    print(f"Best score:      \t{best_score}")
    print(f"Dataset size:    \t{len(best_y)}, {Counter(best_y)}")
    print(f"Combination:     \t{combination}")
    print("\n\n")

    # Evaluate the best model, save the data and the best model
    evaluate(best_model, best_X, best_y)
    evaluate(best_model, X_valid, y_valid)
    tmp.to_csv(path + f"train{combination}.csv")
    dump(best_model, path_models + f"{name}{combination}.joblib")

    # Append the same evaluation to the per-configuration text report.
    with open(f"{output_models}{file_name}", 'a+') as f, redirect_stdout(f):
        print(f"####################   {name}    #########################")
        print("Testing on training set:")
        evaluate(best_model, best_X, best_y)
        print("Testing on validation set:")
        evaluate(best_model, X_valid, y_valid)
        # Report the parameters of the model that is actually saved (best_model),
        # not of whichever model happened to be trained last.
        print("Parameters: {0}".format(best_model.get_params()))
        print(f"####################   {name}  END   #########################")

-------------------------------------------------
Oversample          Score               Random 0.2 Name lr
-------------------------------------------------
########## Changing sampling mode ##########
smote               0.656
bordersmote         0.620
svmsmote            0.687
########## Changing sampling mode ##########
smote               0.627
bordersmote         0.646
svmsmote            0.689
-------------------------------------------------
Oversample          Score               Random 0.25 Name lr
-------------------------------------------------
########## Changing sampling mode ##########
smote               0.626
bordersmote         0.635
svmsmote            0.698
########## Changing sampling mode ##########
smote               0.664
bordersmote         0.615
svmsmote            0.650
-------------------------------------------------
Oversample          Score               Random 0.3 Name lr
-------------------------------------------------
########## Changing sampling m

In [10]:
from auto_export_notebook import export_current_notebook


# Export this notebook to HTML. wait_for_disk_save=True waits for Auto Save,
# with wait_timeout_sec=8.0 as the timeout passed to the exporter.
html_path = export_current_notebook(
    globals(),
    wait_timeout_sec=8.0,
    wait_for_disk_save=True,
)
print("Exported to:", html_path)


<IPython.core.display.Javascript object>

Exported to: /home/ileniag/buzi_ml4cad_0/exported_notebooks/3.1_data_sampling_27features_20250929_232726.html
