In [18]:
import numpy as np
import pandas as pd

In [19]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [20]:
path_file = 'C:/Users/JulesBoutibou/Documents/Perso/PIP/DATA/Data_G2/Donnees_v1.csv'
df = pd.read_csv(path_file)

In [21]:
df['TAUX_LECTURE_MS_6M']=df['TAUX_LECTURE_MS_6M'].str.rstrip('%').astype('float') / 100.0
df['TAUX_LECTURE_PAP_6M']=df['TAUX_LECTURE_PAP_6M'].str.rstrip('%').astype('float') / 100.0
df['TAUX_LECTURE_EMAIL_6M']=df['TAUX_LECTURE_EMAIL_6M'].str.rstrip('%').astype('float') / 100.0

In [22]:
df2 = df.sample(frac = 0.1)
df2.TOP_FRAUDE.value_counts()

0    84796
1      145
Name: TOP_FRAUDE, dtype: int64

In [23]:
X = df2.drop(columns = ['TOP_FRAUDE_VIREMENT','TOP_FRAUDE_CARTE','TOP_FRAUDE','Unnamed: 0', 'ID'])
y = df2.TOP_FRAUDE

# Sampling

In [24]:
from imblearn.under_sampling import NearMiss

In [48]:
params_nearmiss = {
    'u__version': [1, 2],
    'u__sampling_strategy': [0.01, 0.1, 0.25, 0.4],
    'u__n_neighbors_ver3': [3, 6, 9]
}

### SVMSMOTE (Only oversampling)

In [26]:
from imblearn.over_sampling import SVMSMOTE
from sklearn.svm import SVC

In [27]:
params_svmsmote = {
    'o__sampling_strategy': [0.005, 0.01, 0.03, 0.1, 0.4],
    'o__k_neighbors': [3, 16, 30],
    'o__m_neighbors': [3, 16, 30],
    'o__svm_estimator': [SVC(kernel='poly'), SVC(kernel='poly', degree=6), SVC(kernel='sigmoid'), SVC(kernel='rbf')]
}

params_svmsmote = {
    'o__sampling_strategy': [0.1, 0.01],
    'o__k_neighbors': [5, 15],
    'o__m_neighbors': [5],
    'o__svm_estimator': [SVC(kernel='sigmoid')]
}

### SMOTE and CC

In [28]:
# combined SMOTE and Edited Nearest Neighbors sampling for imbalanced classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import ClusterCentroids 

In [29]:
params_smotecc = {
    'o__sampling_strategy': [0.1],
    'o__k_neighbors': [10],
    'u__sampling_strategy': [0.2],
    'u__n_jobs': [-1]
}

## Testing models

In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [31]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

#### Initialize pipelines

In [54]:
# Model
#model = LogisticRegression(C=0.1, class_weight= 'balanced',  max_iter = 500)
model = XGBClassifier(objective='binary:hinge', use_label_encoder=False)

# Oversamplers
over_smote = SMOTE()
over_svmsmote = SVMSMOTE()

# Undersamplers
under_nearmiss = NearMiss()
under_enn = EditedNearestNeighbours()
under_cc = ClusterCentroids()

# Pipelines
pipeline_nearmiss = Pipeline([('u', under_nearmiss), ('m', model)])
pipeline_svmsmote = Pipeline([('o', over_svmsmote), ('m', model)])
pipeline_smotecc = Pipeline([('o', over_smote),('u', under_cc), ('m', model)])

In [51]:
scoring = {"AUC": "roc_auc", "F1": 'f1'}

In [52]:
Xsampled = X.iloc[:,:10]
Xsampled.shape

(84941, 10)

In [59]:
grid = GridSearchCV(pipeline_nearmiss, params_nearmiss, scoring='f1', verbose=3, refit = 'f1', cv=2)
grid_res = grid.fit(X, y)

Fitting 2 folds for each of 24 candidates, totalling 48 fits
[CV 1/2] END u__n_neighbors_ver3=3, u__sampling_strategy=0.01, u__version=1; total time=   2.6s
[CV 2/2] END u__n_neighbors_ver3=3, u__sampling_strategy=0.01, u__version=1; total time=   2.7s
[CV 1/2] END u__n_neighbors_ver3=3, u__sampling_strategy=0.01, u__version=2; total time=   4.3s
[CV 2/2] END u__n_neighbors_ver3=3, u__sampling_strategy=0.01, u__version=2; total time=   4.4s
[CV 1/2] END u__n_neighbors_ver3=3, u__sampling_strategy=0.1, u__version=1; total time=   1.3s
[CV 2/2] END u__n_neighbors_ver3=3, u__sampling_strategy=0.1, u__version=1; total time=   1.3s
[CV 1/2] END u__n_neighbors_ver3=3, u__sampling_strategy=0.1, u__version=2; total time=   1.5s
[CV 2/2] END u__n_neighbors_ver3=3, u__sampling_strategy=0.1, u__version=2; total time=   1.6s
[CV 1/2] END u__n_neighbors_ver3=3, u__sampling_strategy=0.25, u__version=1; total time=   1.3s
[CV 2/2] END u__n_neighbors_ver3=3, u__sampling_strategy=0.25, u__version=1; to

In [None]:
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
means = grid_res.cv_results_['mean_test_score']
stds = grid_res.cv_results_['std_test_score']
params = grid_res.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.003454 using {'u__n_neighbors_ver3': 3, 'u__sampling_strategy': 0.25, 'u__version': 2}
0.003196 (0.000612) with: {'u__n_neighbors_ver3': 3, 'u__sampling_strategy': 0.01, 'u__version': 1}
0.003020 (0.000129) with: {'u__n_neighbors_ver3': 3, 'u__sampling_strategy': 0.01, 'u__version': 2}
0.003360 (0.000125) with: {'u__n_neighbors_ver3': 3, 'u__sampling_strategy': 0.1, 'u__version': 1}
0.003399 (0.000053) with: {'u__n_neighbors_ver3': 3, 'u__sampling_strategy': 0.1, 'u__version': 2}
0.003365 (0.000160) with: {'u__n_neighbors_ver3': 3, 'u__sampling_strategy': 0.25, 'u__version': 1}
0.003454 (0.000023) with: {'u__n_neighbors_ver3': 3, 'u__sampling_strategy': 0.25, 'u__version': 2}
0.003348 (0.000183) with: {'u__n_neighbors_ver3': 3, 'u__sampling_strategy': 0.4, 'u__version': 1}
0.003454 (0.000023) with: {'u__n_neighbors_ver3': 3, 'u__sampling_strategy': 0.4, 'u__version': 2}
0.003196 (0.000612) with: {'u__n_neighbors_ver3': 6, 'u__sampling_strategy': 0.01, 'u__version': 1}
0.003020 

In [58]:
grid.best_score_

0.003453805056233165