# Impayés carte bancaire

On cherche à prédire des fraudes ou impayés à la carte bancaire


## Librairies et fonctions utiles

In [None]:
# Directive pour afficher les graphiques dans Jupyter
%matplotlib inline

In [None]:
# Pandas : librairie de manipulation de données
# NumPy : librairie de calcul scientifique
# MatPlotLib : librairie de visualisation et graphiques
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score,auc, accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from IPython.core.display import HTML # permet d'afficher du code html dans jupyter

Fonction pour standardiser les données quantitatives (cont_feat est une liste des colonnes correspondant à des caractéristiques quantitatives) :

In [None]:
def scale_feat(df, cont_feat):
    """Return a copy of *df* with its quantitative columns rescaled.

    The selected columns are first passed through ``RobustScaler`` and then
    through ``StandardScaler``; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is NOT modified; a copy is scaled and returned.
    cont_feat : list
        Names of the columns holding quantitative (continuous) features.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with the ``cont_feat`` columns scaled.
    """
    # Work on a copy: a plain ``df1 = df`` would alias the caller's frame and
    # silently overwrite its columns.
    df1 = df.copy()

    # Nothing to scale: the scalers reject an input with zero columns.
    if len(cont_feat) == 0:
        return df1

    scaler = preprocessing.RobustScaler()
    df1[cont_feat] = scaler.fit_transform(df1[cont_feat])
    scaler = preprocessing.StandardScaler()
    df1[cont_feat] = scaler.fit_transform(df1[cont_feat])
    return df1

Fonction pour tracer les courbes d'apprentissage sur l'ensemble d'apprentissage et l'ensemble de validation :

In [None]:
from sklearn.model_selection import learning_curve
def plot_learning_curve(est, X_train, y_train):
    """Plot training and cross-validated scores against training-set size.

    ``learning_curve`` refits clones of *est* on 10 increasing fractions
    (10% to 100%) of the data with 5-fold cross-validation, using all
    available cores. The mean score is drawn as a line and +/- one standard
    deviation as a shaded band, for both the training and validation folds.

    Parameters
    ----------
    est : estimator
        Scikit-learn compatible estimator; scored with its default scorer.
    X_train : array-like
        Feature matrix used for the cross-validated curve.
    y_train : array-like
        Target values aligned with ``X_train``.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=est,
        X=X_train,
        y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=5,
        n_jobs=-1,
    )
    # One row per training size, one column per CV fold: aggregate over folds.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(8, 10))
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
    # Pass the flag positionally: the ``b=`` keyword was renamed to
    # ``visible=`` in newer matplotlib releases and then removed.
    plt.grid(True)
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0.6, 1.0])
    plt.show()

Fonction pour tracer la courbe ROC :

In [None]:
def plot_roc_curve(est, X_test, y_test):
    """Plot the ROC curve of a fitted binary classifier.

    The curve is built from the predicted probability of the second class
    (column 1 of ``predict_proba``), and the area under it is shown in the
    legend. Two reference curves are added: the diagonal (red, dashed) and
    the ideal top-left corner path (green, dotted).

    Parameters
    ----------
    est : estimator
        Already-fitted classifier exposing ``predict_proba``.
    X_test : array-like
        Features of the evaluation set.
    y_test : array-like
        True binary labels aligned with ``X_test``.
    """
    probas = est.predict_proba(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, probas[:, 1])
    roc_auc = auc(false_positive_rate, true_positive_rate)

    plt.figure(figsize=(8, 8))
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')          # worst-case reference curve
    plt.plot([0, 0, 1], [0, 1, 1], 'g:')     # best-case reference curve
    plt.xlim([-0.05, 1.2])
    plt.ylim([-0.05, 1.2])
    plt.ylabel('Taux de vrais positifs')
    plt.xlabel('Taux de faux positifs')
    # Must be called: a bare ``plt.show`` only references the function.
    plt.show()

Fonction pour équilibrer un dataframe *df* sur la colonne cible *target_col* avec la classe minoritaire *minority_class* :

In [None]:
def undersample(df, target_col, minority_class, random_state=None):
    """Balance *df* by randomly under-sampling the non-minority rows.

    All rows whose ``target_col`` equals ``minority_class`` are kept; the
    remaining rows are randomly down-sampled to the same count. The result
    is returned in shuffled order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    target_col : str
        Name of the column holding the class label.
    minority_class : scalar
        Label value of the minority class.
    random_state : int, numpy RandomState or None, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible draws.
        ``None`` (the default) keeps the draw non-deterministic.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame holding every minority row plus an equal number of
        other rows (or all of them if there are fewer).
    """
    # Split with a boolean mask rather than ``drop(index)``: dropping by label
    # would also remove majority rows that share an index label with a
    # minority row when the index is not unique.
    is_minority = df[target_col] == minority_class
    df_minority = df[is_minority]
    df_majority = df[~is_minority]

    # Ask for an explicit row count instead of a fraction: this avoids a
    # division by zero on an empty majority and an invalid frac > 1 when the
    # "minority" class is in fact the larger one.
    n_keep = min(len(df_minority), len(df_majority))
    df_majority = df_majority.sample(n=n_keep, random_state=random_state)

    df1 = pd.concat((df_majority, df_minority), axis=0)
    # frac=1 returns every row in random order.
    return df1.sample(frac=1, random_state=random_state)

## Traitement du dataset

In [None]:
# Load the credit-card dataset; the CSV file uses ';' as its field separator.
df = pd.read_csv("../input/creditcard_uci.csv", sep=';')

In [None]:
# Preview the first rows of the loaded DataFrame.
df.head()

Ce dataset contient des informations sur les défauts de paiement, les facteurs démographiques, les données de crédit, l'historique des paiements et les relevés de factures des clients de cartes de crédit à Taiwan d'avril 2005 à septembre 2005.

- ID: ID of each client
- LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
- SEX: Gender (1=male, 2=female)
- EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
- MARRIAGE: Marital status (1=married, 2=single, 3=others)
- AGE: Age in years
- PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)
- PAY_2: Repayment status in August, 2005 (scale same as above)
- PAY_3: Repayment status in July, 2005 (scale same as above)
- PAY_4: Repayment status in June, 2005 (scale same as above)
- PAY_5: Repayment status in May, 2005 (scale same as above)
- PAY_6: Repayment status in April, 2005 (scale same as above)
- BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
- BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
- BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
- BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
- BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
- BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
- PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
- PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
- PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
- PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
- PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
- PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
- **CLASS: Default payment next month (1=yes, 0=no)**

In [None]:
# Summarise the columns: dtypes and non-null counts.
df.info()

## Exercice : prédire le défaut de paiement

In [None]:
# List the column names.
df.columns

In [None]:
# Remove the per-client ID column so it is not used as a feature.
# Re-running this cell without reloading df fails, since ID is already gone.
df = df.drop('ID', axis=1)

In [None]:
# Re-check the schema now that ID has been dropped.
df.info()

In [None]:
# Descriptive statistics of the columns.
df.describe()

In [None]:
#sns.pairplot(df, hue="CLASS")

In [None]:
# Distribution of PAY_AMT1, bucketed into 20 bins.
plt.hist(df["PAY_AMT1"], bins=20)

In [None]:
# Distribution of PAY_AMT2, bucketed into 20 bins.
plt.hist(df["PAY_AMT2"], bins=20)

In [None]:
# Features are every column except the CLASS target.
X = df.drop(['CLASS'], axis=1)
y = df['CLASS']
# Hold out 10% of the rows for testing; the fixed random_state makes the split reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [None]:
def runClassifier(est, X_train, y_train, X_test, y_test):
    """Fit *est*, evaluate it on the test set, print and plot the results.

    The estimator is fitted in place on the training data. Accuracy, the
    classification report, the confusion matrix and the ROC AUC (computed
    from the probability of the second class) are printed, then the ROC
    curve and the learning curve are plotted.

    Returns
    -------
    tuple
        ``(est, predictions, accuracy, classification report,
        confusion matrix, predicted probabilities, ROC AUC)``.
    """
    est.fit(X_train, y_train)
    predictions = est.predict(X_test)

    # Label-based metrics.
    accuracy = metrics.accuracy_score(y_test, predictions)
    print("Accuracy score :", accuracy)

    report = metrics.classification_report(y_test, predictions)
    print("Classification report :", report)

    conf_matrix = metrics.confusion_matrix(y_test, predictions)
    print("Confusion matrix :", conf_matrix)

    # Probability-based metric: area under the ROC curve.
    class_probas = est.predict_proba(X_test)
    fpr, tpr, _ = roc_curve(y_test, class_probas[:, 1])
    area_under_curve = auc(fpr, tpr)
    print("Roc :", area_under_curve)

    # Diagnostic plots.
    plot_roc_curve(est, X_test, y_test)
    plot_learning_curve(est, X_train, y_train)

    results = (est, predictions, accuracy, report, conf_matrix, class_probas, area_under_curve)
    return results

## Logistic Regression

In [None]:
# Baseline: logistic regression with default hyper-parameters.
# NOTE(review): X_train is passed unscaled (scale_feat is never called in the
# visible code) — the solver may struggle to converge; confirm whether scaling is intended.
lr = LogisticRegression()

# Fit, print the metrics and draw the ROC / learning curves.
runClassifier(lr, X_train, y_train, X_test, y_test)

## Random Forest

In [None]:
from sklearn import ensemble
# Random forest with default hyper-parameters; no random_state is set,
# so results can differ from one run to the next.
rf = ensemble.RandomForestClassifier()

# Fit, print the metrics and draw the ROC / learning curves.
runClassifier(rf, X_train, y_train, X_test, y_test)

In [None]:
# Feature importances of the fitted random forest, sorted in ascending order
# so that the most important feature ends up at the top of the bar chart.
importances = rf.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(8,5))
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
# Label each bar with the matching column name.
plt.yticks(range(len(indices)), X_train.columns[indices])
plt.title('Importance des caracteristiques')

## Under sampling -> Logistic Regression

In [None]:
from imblearn.under_sampling import TomekLinks

# Under-sample the majority class by removing Tomek links.
# Uses the current imbalanced-learn API: `sampling_strategy` replaces the
# removed `ratio` argument, `fit_resample` replaces `fit_sample`, and the
# retained row indices are exposed as `sample_indices_` rather than through
# the removed `return_indices=True` flag.
tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(X_train, y_train)
i_tl = tl.sample_indices_

# Train on the resampled data, evaluate on the untouched test set.
lrus = LogisticRegression()
lrus.fit(X_tl, y_tl)
y_lrus = lrus.predict(X_test)

print(metrics.classification_report(y_test, y_lrus))
cm = metrics.confusion_matrix(y_test, y_lrus)
print(cm)

# ROC AUC from the predicted probability of the second class.
probas = lrus.predict_proba(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, probas[:, 1])
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

In [None]:
# NOTE(review): the learning curve is computed from X_train / y_train, i.e. the
# non-resampled training data, so it does not reflect the under-sampling step — confirm intent.
plot_learning_curve(lrus, X_train, y_train)
# ROC curve of the model trained on the under-sampled data.
plot_roc_curve(lrus,X_test,y_test)

## Over sampling -> Logistic Regression

In [None]:
from imblearn.over_sampling import SMOTE

# Over-sample the minority class with synthetic SMOTE samples.
# Uses the current imbalanced-learn API: `sampling_strategy` replaces the
# removed `ratio` argument and `fit_resample` replaces `fit_sample`.
smote = SMOTE(sampling_strategy='minority')
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Train on the resampled data, evaluate on the untouched test set.
lros = LogisticRegression()
lros.fit(X_smote, y_smote)
y_lros = lros.predict(X_test)

print(metrics.classification_report(y_test, y_lros))
cm = metrics.confusion_matrix(y_test, y_lros)
print(cm)

# ROC AUC from the predicted probability of the second class.
probas = lros.predict_proba(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, probas[:, 1])
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)


In [None]:
# NOTE(review): the learning curve is computed from X_train / y_train, i.e. the
# non-resampled training data, so it does not reflect the over-sampling step — confirm intent.
plot_learning_curve(lros, X_train, y_train)
# ROC curve of the model trained on the over-sampled data.
plot_roc_curve(lros,X_test,y_test)

## Under and Over Sampling -> Logistic Regression

In [None]:
from imblearn.combine import SMOTETomek

# Combine SMOTE over-sampling with Tomek-link cleaning.
# Uses the current imbalanced-learn API: `sampling_strategy` replaces the
# removed `ratio` argument and `fit_resample` replaces `fit_sample`.
smt = SMOTETomek(sampling_strategy='auto')
X_smt, y_smt = smt.fit_resample(X_train, y_train)

# Train on the resampled data, evaluate on the untouched test set.
lrboth = LogisticRegression()
lrboth.fit(X_smt, y_smt)
y_lrboth = lrboth.predict(X_test)

print(metrics.classification_report(y_test, y_lrboth))
cm = metrics.confusion_matrix(y_test, y_lrboth)
print(cm)

# ROC AUC from the predicted probability of the second class.
probas = lrboth.predict_proba(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, probas[:, 1])
roc_auc = auc(false_positive_rate, true_positive_rate)
print(roc_auc)

In [None]:
# NOTE(review): the learning curve is computed from X_train / y_train, i.e. the
# non-resampled training data, so it does not reflect the resampling step — confirm intent.
plot_learning_curve(lrboth, X_train, y_train)
# ROC curve of the model trained on the combined over/under-sampled data.
plot_roc_curve(lrboth,X_test,y_test)

## XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with default hyper-parameters, trained on the
# original (non-resampled) training set.
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
# Score of the fitted model on the held-out test set.
print(xgb.score(X_test,y_test))

In [None]:
# Test-set predictions, confusion matrix and accuracy for the XGBoost model.
y_xgb = xgb.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_xgb)
print(cm)
xgb_score = metrics.accuracy_score(y_test, y_xgb)
print(xgb_score)

In [None]:
# Learning curve (cross-validated on the training set) and test-set ROC curve for XGBoost.
plot_learning_curve(xgb, X_train, y_train)
plot_roc_curve(xgb,X_test,y_test)

In [None]:
# Per-class precision / recall / F1 for the XGBoost predictions.
print(classification_report(y_test, y_xgb))

## Grid Search

### Random Forest

In [None]:
# Hyper-parameter grid: 3 x 3 = 9 candidate combinations.
param_grid = {
              'n_estimators': [10, 100, 500],
              'min_samples_leaf': [1, 20, 50]
             }
estimator = ensemble.RandomForestClassifier()
# Exhaustive search over the grid with GridSearchCV's default CV and scoring settings.
rf_gs = model_selection.GridSearchCV(estimator, param_grid)

rf_gs.fit(X_train, y_train)

print(rf_gs.best_params_)

# Estimator built with the best parameter combination found.
rfbest = rf_gs.best_estimator_

# runClassifier fits the estimator again on X_train before evaluating it.
runClassifier(rfbest, X_train, y_train, X_test, y_test)