In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline

In [5]:
# Absolute location of the raw CSV export on the author's machine.
# NOTE(review): this path is machine-specific; adjust it before running elsewhere.
path_file = 'C:/Users/JulesBoutibou/Documents/Perso/PIP/DATA/Data_G2/Donnees_v1.csv'

# Load the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path_file)

In [6]:
# Work on a 10% random subsample to keep the grid searches tractable.
# The seed is pinned so the subsample (and every score computed from it) is the
# same on each Restart & Run All.
# NOTE: this rebinds `df`, so re-running the cell without reloading the CSV
# shrinks the frame again.
df = df.sample(frac=0.1, random_state=42)

# Class balance of the target in the subsample.
df['TOP_FRAUDE'].value_counts()

0    84775
1      166
Name: TOP_FRAUDE, dtype: int64

In [7]:
# Feature matrix: everything except the three TOP_FRAUDE* columns and the two
# non-feature columns ('Unnamed: 0' is presumably a saved index, 'ID' an
# identifier -- TODO confirm).
X = df.drop(
    columns=[
        'TOP_FRAUDE_VIREMENT',
        'TOP_FRAUDE_CARTE',
        'TOP_FRAUDE',
        'Unnamed: 0',
        'ID',
    ]
)

# Target vector.
y = df['TOP_FRAUDE']

# Sampling

### Nearmiss (Only undersampling)

In [8]:
from imblearn.under_sampling import NearMiss

In [9]:
# Search space for the NearMiss step (pipeline step name 'u').
params_nearmiss = dict(
    # NearMiss variant to use -- leave this entry as is.
    u__version=[1, 2],
    # Target minority / majority ratio after undersampling.
    u__sampling_strategy=[0.005, 0.01, 0.05, 0.1],
    # Neighbourhood size used by the sampler.
    u__n_neighbors=[1, 3, 5, 7],
)

### SVMSMOTE (Only oversampling)

In [10]:
from imblearn.over_sampling import SVMSMOTE
from sklearn.svm import SVC

In [11]:
# Search space for the SVMSMOTE step (pipeline step name 'o').
params_svmsmote = dict(
    # Target minority / majority ratio after oversampling.
    o__sampling_strategy=[0.005, 0.01, 0.03, 0.1],
    o__k_neighbors=[3, 16],
    o__m_neighbors=[3, 16],
    # One SVC per kernel to try as the sampler's SVM estimator.
    o__svm_estimator=[SVC(kernel=kernel) for kernel in ('poly', 'sigmoid', 'rbf')],
)

### SMOTE and CC (Oversampling and Undersampling)

In [12]:
# combined SMOTE oversampling and ClusterCentroids undersampling for imbalanced classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import ClusterCentroids 

In [13]:
# Search space for the SMOTE ('o') + ClusterCentroids ('u') pipeline.
params_smotecc = {
    # Oversampling: target minority / majority ratio and SMOTE neighbourhood size.
    "o__sampling_strategy": [0.005, 0.01, 0.03, 0.1],
    "o__k_neighbors": [3, 10],
    # Undersampling: target minority / majority ratio after the second step.
    "u__sampling_strategy": [0.1, 0.2, 0.3],
    # Single value, so this is a fixed setting rather than a searched one.
    # NOTE(review): check that the installed ClusterCentroids still accepts n_jobs.
    "u__n_jobs": [-1],
}

## Testing models

#### General lib

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

#### Models

In [15]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

#### Initialize ML Models

In [16]:
# Create your model here with the best params
# Candidate classifiers, each created with its best known parameters.
# Previously both were assigned to `model` back to back, so the logistic
# regression was silently discarded; they are now kept under separate names and
# the active one is selected explicitly.
model_logreg = LogisticRegression(C=0.1, class_weight='balanced', max_iter=500)
model_xgb = XGBClassifier(objective='binary:hinge', use_label_encoder=False)

# Classifier used as the final step of every pipeline below.
# NOTE(review): 'binary:hinge' yields hard 0/1 predictions, which presumably
# makes the "roc_auc" scorer uninformative for this model -- confirm.
model = model_xgb

#### Initialize Pipelines

In [17]:
# Oversamplers
# Seed shared by the stochastic samplers so resampling (and therefore the
# cross-validation scores) is reproducible from run to run.
SAMPLER_SEED = 42

# Oversamplers
over_smote = SMOTE(random_state=SAMPLER_SEED)
over_svmsmote = SVMSMOTE(random_state=SAMPLER_SEED)

# Undersamplers
under_nearmiss = NearMiss()
under_enn = EditedNearestNeighbours()  # not used by any pipeline below
under_cc = ClusterCentroids(random_state=SAMPLER_SEED)

# Pipelines: step names 'o' (oversampler), 'u' (undersampler) and 'm' (model)
# are the prefixes used by the params_* grids.
pipeline_nearmiss = Pipeline([('u', under_nearmiss), ('m', model)])
pipeline_svmsmote = Pipeline([('o', over_svmsmote), ('m', model)])
pipeline_smotecc = Pipeline([('o', over_smote), ('u', under_cc), ('m', model)])

In [18]:
from sklearn.metrics import fbeta_score, make_scorer

# F2 scorer: F-beta with beta=2, i.e. recall is weighted more than precision.
# `beta` is a required argument of fbeta_score; without it the scorer cannot be
# evaluated at all.
f2 = make_scorer(fbeta_score, beta=2)

In [19]:
# Metrics reported for every grid-search candidate, keyed by the label that
# appears in the search output (and that `refit` refers to).
scoring = dict(AUC="roc_auc", F1="f1", F2=f2)

In [23]:
# Exhaustive search over the NearMiss grid with 5-fold cross-validation.
# All metrics in `scoring` are recorded; the candidate with the best F2 is the
# one refitted on the full data.
grid = GridSearchCV(
    estimator=pipeline_nearmiss,
    param_grid=params_nearmiss,
    scoring=scoring,
    refit='F2',
    cv=5,
    verbose=3,
)

# Run the search on the sampled features and target.
grid_res = grid.fit(X, y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


Traceback (most recent call last):
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 388, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **fit_params)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\base.py", line 77, in fit_resample
    X, y, binarize_y = self._check_X_y(X, y)
  File "C:\Users\JulesBo

[CV 1/5] END u__n_neighbors=1, u__sampling_strategy=0.005, u__version=1; AUC: (test=nan) F1: (test=nan) F2: (test=nan) total time=   0.7s


Traceback (most recent call last):
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 388, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **fit_params)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\base.py", line 77, in fit_resample
    X, y, binarize_y = self._check_X_y(X, y)
  File "C:\Users\JulesBo

[CV 2/5] END u__n_neighbors=1, u__sampling_strategy=0.005, u__version=1; AUC: (test=nan) F1: (test=nan) F2: (test=nan) total time=   0.7s


Traceback (most recent call last):
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 388, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **fit_params)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\base.py", line 77, in fit_resample
    X, y, binarize_y = self._check_X_y(X, y)
  File "C:\Users\JulesBo

[CV 3/5] END u__n_neighbors=1, u__sampling_strategy=0.005, u__version=1; AUC: (test=nan) F1: (test=nan) F2: (test=nan) total time=   0.8s


Traceback (most recent call last):
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 262, in fit
    Xt, yt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 220, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\pipeline.py", line 388, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **fit_params)
  File "C:\Users\JulesBoutibou\anaconda3\lib\site-packages\imblearn\base.py", line 77, in fit_resample
    X, y, binarize_y = self._check_X_y(X, y)
  File "C:\Users\JulesBo

KeyboardInterrupt: 

In [65]:
# Summarise the search: best candidate first, then mean (std) test score for
# every parameter combination.
# With multi-metric scoring, cv_results_ stores one 'mean_test_<name>' /
# 'std_test_<name>' pair per scorer and has no plain 'mean_test_score' key, so
# the column is chosen from the refit metric. For a single-metric search
# (refit is not a string) the suffix is 'score'.
metric = grid.refit if isinstance(grid.refit, str) else 'score'

print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
means = grid_res.cv_results_['mean_test_%s' % metric]
stds = grid_res.cv_results_['std_test_%s' % metric]
params = grid_res.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.004924 using {'u__n_neighbors_ver3': 1, 'u__sampling_strategy': 0.005, 'u__version': 1}
0.004924 (0.000028) with: {'u__n_neighbors_ver3': 1, 'u__sampling_strategy': 0.005, 'u__version': 1}
0.002851 (0.000339) with: {'u__n_neighbors_ver3': 1, 'u__sampling_strategy': 0.005, 'u__version': 2}
0.004166 (0.000130) with: {'u__n_neighbors_ver3': 1, 'u__sampling_strategy': 0.01, 'u__version': 1}
0.002682 (0.000158) with: {'u__n_neighbors_ver3': 1, 'u__sampling_strategy': 0.01, 'u__version': 2}
0.003739 (0.000074) with: {'u__n_neighbors_ver3': 1, 'u__sampling_strategy': 0.05, 'u__version': 1}
0.002999 (0.000223) with: {'u__n_neighbors_ver3': 1, 'u__sampling_strategy': 0.05, 'u__version': 2}
0.003734 (0.000071) with: {'u__n_neighbors_ver3': 1, 'u__sampling_strategy': 0.1, 'u__version': 1}
0.003174 (0.000086) with: {'u__n_neighbors_ver3': 1, 'u__sampling_strategy': 0.1, 'u__version': 2}
0.004924 (0.000028) with: {'u__n_neighbors_ver3': 2, 'u__sampling_strategy': 0.005, 'u__version': 1}
0.0

In [58]:
# Show the best score found by the search stored in `grid`.
# NOTE(review): this cell's execution count (58) is lower than the previous
# cell's (65), so the value shown may come from a different `grid` -- re-run in order.
grid.best_score_

0.003453805056233165