In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from imblearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [3]:
path_file = 'C:/Users/JulesBoutibou/Documents/Perso/PIP/DATA/Data_G2/Donnees_v1.csv'
df = pd.read_csv(path_file)

In [4]:
df['TAUX_LECTURE_MS_6M']=df['TAUX_LECTURE_MS_6M'].str.rstrip('%').astype('float') / 100.0
df['TAUX_LECTURE_PAP_6M']=df['TAUX_LECTURE_PAP_6M'].str.rstrip('%').astype('float') / 100.0
df['TAUX_LECTURE_EMAIL_6M']=df['TAUX_LECTURE_EMAIL_6M'].str.rstrip('%').astype('float') / 100.0

In [5]:
df2 = df.sample(frac = 0.1)
df2.TOP_FRAUDE.value_counts()

0    84789
1      152
Name: TOP_FRAUDE, dtype: int64

In [14]:
X = df2.drop(columns = ['TOP_FRAUDE_VIREMENT','TOP_FRAUDE_CARTE','TOP_FRAUDE','Unnamed: 0', 'ID'])
y = df2.TOP_FRAUDE

# Sampling

In [6]:
from imblearn.under_sampling import NearMiss

In [7]:
params_nearmiss2 = {
    'u__version': [2],
    'u__sampling_strategy': [0.01, 0.03, 0.1, 0.4],
    'u__n_neighbors_ver3': [3, 9, 16]
}

### SVMSMOTE (Only oversampling)

In [8]:
from imblearn.over_sampling import SVMSMOTE
from sklearn.svm import SVC

In [9]:
params_svmsmote = {
    'o__sampling_strategy': [0.005, 0.01, 0.03, 0.1, 0.4],
    'o__k_neighbors': [3, 16, 30],
    'o__m_neighbors': [3, 16, 30],
    'o__svm_estimator': [SVC(kernel='poly'), SVC(kernel='poly', degree=6), SVC(kernel='sigmoid'), SVC(kernel='rbf')]
}

params_svmsmote = {
    'o__sampling_strategy': [0.1, 0.01],
    'o__k_neighbors': [5, 15],
    'o__m_neighbors': [5],
    'o__svm_estimator': [SVC(kernel='sigmoid')]
}

### SMOTE and CC

In [10]:
# combined SMOTE and Edited Nearest Neighbors sampling for imbalanced classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import ClusterCentroids 

In [11]:
params_smotecc = {
    'o__sampling_strategy': [0.1],
    'o__k_neighbors': [10],
    'u__sampling_strategy': [0.2],
    'u__n_jobs': [-1]
}

## Testing models

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [13]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

#### Initialize pipelines

In [20]:
# Model
#model = LogisticRegression(C=0.1, class_weight= 'balanced',  max_iter = 500)
model = XGBClassifier(max_depth=8, class_weight='balanced',objective='binary:hinge', use_label_encoder=False)

# Oversamplers
over_smote = SMOTE()
over_svmsmote = SVMSMOTE()

# Undersamplers
under_nearmiss = NearMiss()
under_enn = EditedNearestNeighbours()
under_cc = ClusterCentroids()

# Pipelines
pipeline_nearmiss3 = Pipeline([('u', under_nearmiss), ('m', model)])
pipeline_svmsmote = Pipeline([('o', over_svmsmote), ('m', model)])
pipeline_smotecc = Pipeline([('o', over_smote),('u', under_cc), ('m', model)])

In [21]:
scoring = {"AUC": "roc_auc", "F1": 'f1'}

In [22]:
Xsampled = X.iloc[:,:5]
Xsampled.shape

(84941, 50)

In [23]:
grid = GridSearchCV(pipeline_smotecc, params_smotecc, scoring='f1', verbose=2, refit = 'f1', cv=2)
grid_res = grid.fit(Xsampled, y)

Fitting 4 folds for each of 2 candidates, totalling 8 fits


In [32]:
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
means = grid_res.cv_results_['mean_test_score']
stds = grid_res.cv_results_['std_test_score']
params = grid_res.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.037135 using {'o__k_neighbors': 5, 'o__m_neighbors': 5, 'o__sampling_strategy': 0.01, 'o__svm_estimator': SVC(kernel='sigmoid')}


KeyError: 'mean_test_score'

In [34]:
grid.best_score_

0.03713527851458886