# Integration with sklearn pipelines

In this notebook, provide some illustration for integration with sklearn pipelines.

In [46]:
import keras
import imblearn

import numpy as np
import pandas as pd 


import smote_variants as sv
import imblearn.datasets as imb_datasets

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

random_seed= 3

## Preparing the data

In [47]:
np.random.seed(random_seed)

In [48]:
# libras= imb_datasets.fetch_datasets()['libras_move']
# X, y= libras['data'], libras['target']
df = pd.read_csv('credit_dataset.csv')

In [49]:
df['GENDER'] = [0 if x == 'M' else 1 for x in df['GENDER']]
df['CAR'] = [1 if x == 'Y' else 0 for x in df['CAR']]
df['REALITY'] = [1 if x == 'Y' else 0 for x in df['REALITY']]
dummy_income_type = pd.get_dummies(df['INCOME_TYPE'], prefix='INC_TYPE', drop_first=True)
dummy_edu_type = pd.get_dummies(df['EDUCATION_TYPE'], prefix='EDU_TYPE', drop_first=True)
dummy_family_type = pd.get_dummies(df['FAMILY_TYPE'], prefix='FAM_TYPE', drop_first=True)
dummy_house_type = pd.get_dummies(df['HOUSE_TYPE'], prefix='HOUSE_TYPE', drop_first=True)
to_drop = ['Unnamed: 0', 'ID', 'FLAG_MOBIL', 'INCOME_TYPE',
           'EDUCATION_TYPE', 'FAMILY_TYPE', 'HOUSE_TYPE']
df.drop(to_drop, axis=1, inplace=True)

# Fusionar en un solo Dataframe
merged = pd.concat([df, dummy_income_type, dummy_edu_type, dummy_family_type, dummy_house_type], axis=1)
from sklearn.preprocessing import MinMaxScaler

# Escalar solo columnas que tengan valores mayores que 1
to_scale = [col for col in df.columns if df[col].max() > 1]
mms = MinMaxScaler()
scaled = mms.fit_transform(merged[to_scale])
scaled = pd.DataFrame(scaled, columns=to_scale)

# Reemplazar las columnas originales con escalas
for col in scaled:
    merged[col] = scaled[col]
    
merged.head()
data = merged

In [50]:
X, y = data.drop(['TARGET'], axis=1), data['TARGET']

In [51]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.25)

## Fitting a pipeline

In [52]:
oversampler= sv.MulticlassOversampling(sv.distance_SMOTE())
classifier= KNeighborsClassifier(n_neighbors= 5)

In [53]:
model= Pipeline([
                ('scale', StandardScaler()),
                ('clf', sv.OversamplingClassifier(oversampler, classifier))
            ])

In [54]:
model.fit(X_train, y_train)

2022-03-27 22:47:19,897:INFO:MulticlassOversampling: Running multiclass oversampling with strategy eq_1_vs_many_successive
2022-03-27 22:47:19,907:INFO:MulticlassOversampling: Sampling minority class with label: 1
2022-03-27 22:47:19,911:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


Pipeline(steps=[('scale', StandardScaler()),
                ('clf',
                 OversamplingClassifier(classifier=KNeighborsClassifier(),
                                        oversampler=<smote_variants._smote_variants.MulticlassOversampling object at 0x000001D39EF2EDA0>))])

## Grid search

In [55]:
param_grid= {'clf__oversampler':[sv.distance_SMOTE(proportion=0.5),
                                 sv.distance_SMOTE(proportion=1.0),
                                 sv.distance_SMOTE(proportion=1.5)]}

In [56]:
grid= GridSearchCV(model, param_grid= param_grid, cv= 3, n_jobs= 1, verbose= 2, scoring= 'accuracy')

In [57]:
grid.fit(X_train, y_train)

2022-03-27 22:47:29,177:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


Fitting 3 folds for each of 3 candidates, totalling 9 fits


2022-03-27 22:47:32,025:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


[CV] END clf__oversampler=('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}"); total time=   2.8s


2022-03-27 22:47:35,011:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


[CV] END clf__oversampler=('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}"); total time=   2.9s


2022-03-27 22:47:37,884:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


[CV] END clf__oversampler=('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}"); total time=   2.8s


2022-03-27 22:47:41,510:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


[CV] END clf__oversampler=('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}"); total time=   3.5s


2022-03-27 22:47:45,526:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


[CV] END clf__oversampler=('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}"); total time=   3.9s


2022-03-27 22:47:49,567:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


[CV] END clf__oversampler=('distance_SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}"); total time=   3.9s


2022-03-27 22:47:54,456:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


[CV] END clf__oversampler=('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}"); total time=   4.8s


2022-03-27 22:47:58,997:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


[CV] END clf__oversampler=('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}"); total time=   4.4s


2022-03-27 22:48:03,825:INFO:distance_SMOTE: Running sampling via ('distance_SMOTE', "{'proportion': 0.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


[CV] END clf__oversampler=('distance_SMOTE', "{'proportion': 1.5, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}"); total time=   4.7s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('clf',
                                        OversamplingClassifier(classifier=KNeighborsClassifier(),
                                                               oversampler=<smote_variants._smote_variants.MulticlassOversampling object at 0x000001D39EF2EDA0>))]),
             n_jobs=1,
             param_grid={'clf__oversampler': [<smote_variants._smote_variants.distance_SMOTE object at 0x000001D39EF2F490>,
                                              <smote_variants._smote_variants.distance_SMOTE object at 0x000001D39EF2F070>,
                                              <smote_variants._smote_variants.distance_SMOTE object at 0x000001D39EF2FAC0>]},
             scoring='accuracy', verbose=2)

In [62]:
# print(grid.best_score_)
print(grid.best_params_)
print(f'Accuracy: {grid.best_score_ * 100:.2f}%')
# print(grid.cv_results_)

{'clf__oversampler': <smote_variants._smote_variants.distance_SMOTE object at 0x000001D39EF2F490>}
Accuracy: 94.37%


In [43]:

grid.cv_results_

{'mean_fit_time': array([0.33528932, 0.63448   , 0.94450045]),
 'std_fit_time': array([0.02122631, 0.01944285, 0.03745864]),
 'mean_score_time': array([4.21421782, 5.53561544, 6.57661366]),
 'std_score_time': array([0.10633069, 0.19713152, 0.06728457]),
 'param_clf__oversampler': masked_array(data=[<smote_variants._smote_variants.distance_SMOTE object at 0x000001D39CE0E9B0>,
                    <smote_variants._smote_variants.distance_SMOTE object at 0x000001D39CE0F400>,
                    <smote_variants._smote_variants.distance_SMOTE object at 0x000001D39CE0CDF0>],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__oversampler': <smote_variants._smote_variants.distance_SMOTE at 0x1d39ce0e9b0>},
  {'clf__oversampler': <smote_variants._smote_variants.distance_SMOTE at 0x1d39ce0f400>},
  {'clf__oversampler': <smote_variants._smote_variants.distance_SMOTE at 0x1d39ce0cdf0>}],
 'split0_test_score': array([0.90212461, 0.8994986