In [2]:
# Main libraries
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import loguniform, randint

plt.rcParams['figure.figsize'] = [10,6]

# Other graph library
# Heatmaps
import seaborn as sns
# Decision-tree rendering
import graphviz

# Preprocessing
# Data Augmentation
from imblearn.over_sampling import SMOTE

# Scores
from scikitplot.metrics import plot_roc as auc_roc
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, f1_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Models
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz

AttributeError: module 'matplotlib' has no attribute 'get_data_path'

In [None]:
### Data Exploration
# Name of the target column and the display names used for its two classes
target = 'spam'
labels = ['Ham','Spam']

# Load the raw dataset
df = pd.read_csv('spam.csv')

# Every column except the target is treated as a feature
features = [column for column in df.columns.values if column != target]

# Independent copy of the raw data, kept for later size comparisons
original_df = df.copy()
display(df.head())

print(f'\n\033[1mInference:\033[0m Le dataset contient {df.shape[1]} features et {df.shape[0]} samples')


In [None]:
# Column overview (dtypes and non-null counts)
df.info()
# Number of distinct values per column, smallest first (displayed as the cell output)
df.nunique().sort_values(ascending=True)

In [None]:
# Number of distinct values of each feature (the target column is excluded),
# sorted in ascending order
nu = df[features].nunique().sort_values()

# Numerical features (nf) and categorical features (cf): a feature with at most
# 7 distinct values is considered categorical, any other one numerical
cf = [name for name, n_unique in nu.items() if n_unique <= 7]
nf = [name for name, n_unique in nu.items() if n_unique > 7]
print(f'\n\033[1mInference:\033[0m Le dataset contient {len(nf)} features numériques et {len(cf)} features catégoriques.')

# One-hot encode the categorical features (first level dropped)
df = pd.get_dummies(df, drop_first=True, columns=cf)

In [None]:
# Descriptive statistics of every column
summary_statistics = df.describe()
display(summary_statistics)
print("\n\033[1mInference:\033[0m Les statistiques ont l'air corrects.")

In [None]:
# Target distribution analysis
# The class names follow the convention used elsewhere in this notebook
# (labels = ['Ham','Spam'] and class_names=['Ham', 'Spam'] for the decision
# tree): class 0 is "Ham" and class 1 is "Spam". The previous mapping was
# inverted, so the pie chart showed the two shares under the wrong names.
df1 = df.copy()
df1[target] = df1[target].map({0: "Ham", 1: "Spam"})

# Count each class once and reuse the result for both the sizes and the labels
target_counts = df1[target].value_counts()

print('\033[1mDistribution variable de la cible'.center(55))
plt.pie(
    target_counts,
    labels=target_counts.index,
    counterclock=False,
    explode=[0, .1],  # one offset per class: assumes exactly two classes
    autopct='%1.1f%%',
    radius=1,
    startangle=0
)
plt.show()
print("\033[1mInference\033[0m: La distribution de la cible semble in-équilibré. On va donc essayer de faire de la data augmentation.")

In [None]:
# Sparsity pattern of a window of 100 consecutive samples whose starting row
# is drawn at random
starting_sample = random.randint(0, len(df) - 100)
ending_sample = starting_sample + 100

plt.figure(figsize=(10, 10))
plt.title(f'Matrice creuse des 100 premiers échantillons (index {starting_sample})')
# Entries whose absolute value exceeds `precision` are drawn as markers
plt.spy(
    df[starting_sample:ending_sample].values,
    precision=.1,
    markersize=5,
)
plt.show()

In [None]:
# Duplicate removal
counter = 0  # not used by this cell; kept so the name stays defined

# New dataset without duplicated rows (df1), re-indexed from 0
df1 = df.drop_duplicates().reset_index(drop=True)

# Compare row counts against `df`, the frame the duplicates were removed from.
# Comparing full shapes against `original_df` is unreliable: the one-hot
# encoding step may have changed the number of columns, which would report
# "0 duplicates removed" instead of "no duplicates".
n_removed = df.shape[0] - df1.shape[0]
if n_removed == 0:
    print('\n\033[1mInference:\033[0m Le dataset ne contient pas de duplicatas')
else:
    print(f'\n\033[1mInference:\033[0m {n_removed} duplicatas ont été supprimés')

In [None]:
# Missing-value detection: number and percentage of nulls per column
null_counts = df1.isnull().sum().sort_values()
nvc = pd.DataFrame(null_counts, columns=['Nombre de null'])
nvc['Percentage'] = (nvc['Nombre de null'] / len(df1)).round(3) * 100
display(nvc)
print("\n\033[1mInference:\033[0m Il n'a aucun sample vide")

In [None]:
# Fix the target imbalance with SMOTE oversampling
# NOTE(review): the resampling is applied to the whole dataset, before the
# train/test split performed later in the notebook, so synthetic samples may
# end up in the test set — confirm this is intended.

# New dataset, augmented and without duplicates (df5)
df5 = df1.copy()
print('Distribution de la cible original')
print(df5[target].value_counts())

xf = df5.columns  # original column order, target included
X = df5.drop([target],axis=1)
Y = df5[target]

# A fixed seed makes the synthetic samples (and every score computed from
# them) reproducible from one run to the next
smote = SMOTE(random_state=0)
X, Y = smote.fit_resample(X, Y)

# Re-attach the resampled target and restore the original column order
df5 = X.assign(**{target: Y}).reindex(columns=xf)

print('\nDistribution de la cible post-SMOTE')
print(Y.value_counts())


In [None]:
# Size of the final dataset after preprocessing
# Slice sizes, in the same order as the slice labels below
slice_sizes = [
    df.shape[0],
    original_df.shape[0] - df1.shape[0],
    df5.shape[0] - df1.shape[0],
]
slice_labels = ['Originaux','Supprimés','Augmentés (SMOTE)']

plt.title('Échantillons finaux du dataset')
plt.pie(
    slice_sizes,
    labels=slice_labels,
    radius=1,
    explode=[0,0,0],
    counterclock=False,
    autopct='%1.1f%%',
    pctdistance=0.9,
)
# Smaller single-slice pie drawn on top of the first one
plt.pie([df.shape[0]], radius=0.78, labeldistance=-0, colors=['powderblue'])
plt.show()

print(f'\n\033[1mInference:\033[0m Le dataset final après nettoyage contient {df5.shape[0]} samples.')

In [None]:
# Split the data into a training set (80%) and a test set (20%)

df = df5.copy()

Y = df[target]
X = df.drop(columns=[target])
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    X, Y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
print(f'Dataset original | X {X.shape} | Y {Y.shape}')
print(f'Training dataset | X {Train_X.shape} | Y {Train_Y.shape}')
print(f'Testing dataset | X {Test_X.shape} | Y {Test_Y.shape}')

In [None]:
# Feature scaling (standardization)
# The scaler is fitted on the training set only and then applied to both sets
std = StandardScaler()
Train_X_std = pd.DataFrame(std.fit_transform(Train_X), columns=X.columns)
Test_X_std = pd.DataFrame(std.transform(Test_X), columns=X.columns)

# Training set, before and after scaling
print('\033[1mTraining set original'.center(100))
display(Train_X.describe())
print('\033[1mTraining set normalisée'.center(100))
display(Train_X_std.describe())

# Test set, before and after scaling
print('\033[1mTesting set original'.center(100))
display(Test_X.describe())
print('\n','\033[1mTesting set normalisée'.center(100))
display(Test_X_std.describe())

In [None]:
# Correlation heatmap of the features
# The feature columns are taken from the current frame rather than from the
# `features` list built right after loading: that list still holds the
# pre-encoding names, so it raises a KeyError as soon as one column has been
# replaced by its one-hot encoded dummies.
feature_correlations = df.drop(columns=[target]).corr()

plt.figure(figsize=[8,8])
plt.title('Corrélation des caractéristiques')
sns.heatmap(feature_correlations, vmin=-1, vmax=1, center=0)
plt.show()

In [None]:
# Trial of VIF-based elimination to reduce multicollinearity

# DROP holds the features removed so far.
# scores1, scores2 and scores3 hold the weighted F1 scores (in %) of the
# logistic regression, random forest and XGBoost classifiers respectively.
DROP=[]; scores1=[]; scores2=[]; scores3=[]


def _append_vif_f1_scores(dropped):
    """Fit the three classifiers without the `dropped` columns and record their F1 scores.

    Each model is trained on the standardized training set and scored on the
    standardized test set; the weighted F1 score (in %) is appended to the
    matching list among scores1, scores2 and scores3.
    """
    train_subset = Train_X_std.drop(dropped, axis=1)
    test_subset = Test_X_std.drop(dropped, axis=1)
    models = (
        (scores1, LogisticRegression()),
        (scores2, RandomForestClassifier()),
        (scores3, XGBClassifier(eval_metric='logloss')),
    )
    for scores, model in models:
        predictions = model.fit(train_subset, Train_Y).predict(test_subset)
        scores.append(f1_score(Test_Y, predictions, average='weighted')*100)


# Baseline: every feature kept
_append_vif_f1_scores(DROP)

for _ in range(len(X.columns.values)-1):
    # VIF of each remaining feature, highest first
    Xs = X.drop(DROP,axis=1)
    vif = pd.DataFrame()
    vif['Features'] = Xs.columns
    vif['VIF'] = [variance_inflation_factor(Xs.values, column_index) for column_index in range(Xs.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    vif = vif.sort_values(by="VIF", ascending=False)
    vif.reset_index(drop=True, inplace=True)

    # Stop BEFORE dropping anything once the largest VIF is at most 1, so that
    # DROP only ever contains features whose removal was actually evaluated
    if vif.VIF[0]<=1: break

    DROP.append(vif.Features[0])
    _append_vif_f1_scores(DROP)

plt.plot(scores1, label='LR')
plt.plot(scores2, label='RF')
plt.plot(scores3, label='XG')
plt.legend()
plt.title("Variation du F1 score en supprimant séquentiellement les features de VIF maximum")
plt.grid()
plt.show()

In [None]:
# Trial of Recursive Feature Elimination (RFE)
LR = LogisticRegression()
scores1, scores2, scores3 = [], [], []

# Each score list is paired with a factory building a fresh, unfitted model
# (logistic regression, random forest, XGBoost — always evaluated in that order)
score_targets = (
    (scores1, LogisticRegression),
    (scores2, RandomForestClassifier),
    (scores3, lambda: XGBClassifier(eval_metric='logloss')),
)

# Baseline: weighted F1 score (in %) of each model on the full feature set
for scores, make_model in score_targets:
    predictions = make_model().fit(Train_X_std, Train_Y).predict(Test_X_std)
    scores.append(f1_score(Test_Y, predictions, average='weighted')*100)

# Keep one feature fewer at every step, from all of them down to a single one
for n_removed in range(len(X.columns.values)):
    rfe = RFE(LR, n_features_to_select=len(Train_X_std.columns)-n_removed)
    rfe = rfe.fit(Train_X_std, Train_Y)
    kept_columns = Train_X_std.columns[rfe.support_]
    for scores, make_model in score_targets:
        predictions = make_model().fit(Train_X_std[kept_columns], Train_Y).predict(Test_X_std[kept_columns])
        scores.append(f1_score(Test_Y, predictions, average='weighted')*100)

for scores, curve_label in ((scores1, 'LR'), (scores2, 'RF'), (scores3, 'XG')):
    plt.plot(scores, label=curve_label)
plt.legend()
plt.title("Variation du F1 score avec suppression automatique des features par RFE")
plt.grid()
plt.show()

In [None]:
# Trial of Principal Components Analysis (PCA)
pca = PCA().fit(Train_X_std)

# Cumulative explained variance, and the smallest number of components whose
# cumulative share reaches the threshold. It is computed from the fitted PCA
# instead of being hard-coded, so the guide line and the printed conclusion
# stay correct when the data changes.
variance_threshold = 0.90
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components_needed = int(np.argmax(cumulative_variance >= variance_threshold)) + 1

fig, ax = plt.subplots(figsize=(14,6))
x_values = range(1, pca.n_components_+1)
ax.bar(x_values, pca.explained_variance_ratio_, lw=2, label='Variance expliquée')
ax.plot(x_values, cumulative_variance, lw=2, label='Variance expliquée cumulée', color='red')
# Dashed guides: the variance threshold and the component count reaching it
ax.plot([0,pca.n_components_+1],[variance_threshold,variance_threshold],'g--')
ax.plot([n_components_needed,n_components_needed],[0,1], 'g--')
ax.set_title('Variance expliquée des composants')
ax.set_xlabel('Composant principal')
ax.set_ylabel('Variance expliquée')
ax.grid()
ax.legend()
plt.show()
print(f'\n\033[1mInference:\033[0m On voit ici la variance expliquée (normalisée) cumulées en fonction du nombre de composants principaux')
print(f'\n\033[1mInference:\033[0m On observe que la variance expliquée cumulée atteint {variance_threshold} à partir de {n_components_needed} composants')

In [None]:
# Trial of feature extraction with PCA

scores1, scores2, scores3 = [], [], []

# Keep one principal component fewer at every step, from all of them down to one
for n_dropped in range(len(X.columns.values)):
    pca = PCA(n_components=Train_X_std.shape[1]-n_dropped)

    # The projection is fitted on the training set and re-used for the test set
    Train_X_std_pca = pd.DataFrame(pca.fit_transform(Train_X_std))
    Test_X_std_pca = pd.DataFrame(pca.transform(Test_X_std))

    # Weighted F1 score (in %) of each model on the projected data
    fresh_models = (
        (scores1, LogisticRegression()),
        (scores2, RandomForestClassifier()),
        (scores3, XGBClassifier(eval_metric='logloss')),
    )
    for scores, model in fresh_models:
        predictions = model.fit(Train_X_std_pca, Train_Y).predict(Test_X_std_pca)
        scores.append(f1_score(Test_Y, predictions, average='weighted')*100)

for scores, curve_label in ((scores1, 'LR'), (scores2, 'RF'), (scores3, 'XG')):
    plt.plot(scores, label=curve_label)
plt.legend()
plt.grid()
plt.show()

In [None]:
# Final application of RFE (with logistic regression), keeping 27 features
LR = LogisticRegression()
rfe = RFE(LR,n_features_to_select=27)

# The selector is fitted on the training set only
Train_X_std_rfe = rfe.fit_transform(Train_X_std, Train_Y)
print(f'Dimension finale du training dataset transformé : {Train_X_std_rfe.shape}')
Train_X_std_rfe = pd.DataFrame(Train_X_std_rfe)

# The test set must go through the selector fitted above. Calling
# fit_transform here would refit it on the test labels, which leaks test
# information and may keep different columns than the ones kept for training.
Test_X_std_rfe = rfe.transform(Test_X_std)
print(f'Dimension finale du test dataset transformé : {Test_X_std_rfe.shape}')
Test_X_std_rfe = pd.DataFrame(Test_X_std_rfe)

In [None]:
# Table storing the scores of the different models: one row per model, one
# column per metric, every cell initialised to zero
Evaluation_Results = pd.DataFrame(
    np.zeros((8,5)),
    index=[
        'Logistic Regression (LR)',
        'Decision Tree Classifier (DT)',
        'Random Forest Classifier (RF)',
        'Naïve Bayes Classifier (NB)',
        'Support Vector Machine (SVM)',
        'K Nearest Neighbours (KNN)',
        'Gradient Boosting (GB)',
        'Extreme Gradient Boosting (XGB)',
    ],
    columns=['Accuracy', 'Precision','Recall','F1-score','AUC-ROC score'],
)
Evaluation_Results

In [None]:
# Définition des fonctions de résumé des scores

# Fonction de résumé de classification
def Classification_Summary(pred,pred_prob,i):
    """Score a model's test-set predictions, store the scores and print a report.

    Args:
        pred: predicted class labels for the test set (compared with Test_Y).
        pred_prob: predicted class probabilities; column 1 is used as the
            score for the AUC-ROC computation.
        i: row position in Evaluation_Results where the scores are written.

    Side effects:
        Overwrites row `i` of the global Evaluation_Results table, prints the
        accuracy, F1 score, confusion matrix and classification report, and
        shows the ROC curves.
    """
    # Scale to a percentage BEFORE rounding: round(x, 3)*100 produces float
    # artefacts such as 96.39999999999999 in the printed and stored scores
    accuracy = round(accuracy_score(Test_Y, pred)*100, 1)
    precision = round(precision_score(Test_Y, pred, average='weighted')*100, 1)
    recall = round(recall_score(Test_Y, pred, average='weighted')*100, 1)
    f1 = round(f1_score(Test_Y, pred, average='weighted')*100, 1)
    auc_roc_score = round(roc_auc_score(Test_Y, pred_prob[:,1], multi_class='ovr')*100, 1)
    
    Evaluation_Results.iloc[i] = [accuracy, precision, recall, f1, auc_roc_score]
    
    print(f"{'<'*3}{'-'*35}\033[1m Evaluation du modèle {Evaluation_Results.index[i]} \033[0m{'-'*35}{'>'*3}\n")
    print(f"Accuracy {accuracy}%")
    print(f"F1 Score {f1}%")
    print('\n\033[1mMatrice de confusion\033[0m\n',confusion_matrix(Test_Y, pred))
    print('\n\033[1mClassification report\033[0m\n',classification_report(Test_Y, pred))
    
    # One ROC curve per class; the micro and macro averages are not drawn
    auc_roc(Test_Y, pred_prob, plot_macro=False, plot_micro=False)
    plt.show()

# Fonction de visualisation
def AUC_ROC_plot(Test_Y, pred):
    """Plot the ROC curve of `pred` against a constant-score reference curve.

    Args:
        Test_Y: true labels.
        pred: scores or predictions evaluated against Test_Y.
    """
    # Reference: the same constant score (0) for every sample
    constant_scores = [0] * len(Test_Y)
    reference_fpr, reference_tpr, _ = roc_curve(Test_Y, constant_scores)
    model_fpr, model_tpr, _ = roc_curve(Test_Y, pred)

    auc_percent = round(roc_auc_score(Test_Y, pred)*100,2)

    plt.plot(reference_fpr, reference_tpr, linestyle='--')
    plt.plot(model_fpr, model_tpr, marker='.', label=f'AUC = {auc_percent}')
    plt.xlabel('Taux de faux positifs')
    plt.ylabel('Taux de vrai positifs (Recall)')
    plt.legend()
    plt.show()

In [None]:
# Logistic Regression Classifier
LR_model = LogisticRegression()

# Repeated stratified cross-validation: 10 folds repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Randomized search: 50 hyper-parameter combinations are sampled and each one
# is scored by cross-validated ROC AUC; n_jobs=-1 runs the fits in parallel
RCV = RandomizedSearchCV(
    estimator=LR_model,
    param_distributions={
        'solver': ['newton-cg', 'liblinear'],
        'penalty': ['l2'],
        'C': loguniform(1e-5, 100),
    },
    n_iter=50,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=1,
)

# Run the search, then keep the estimator built with the best combination
RCV.fit(Train_X_std, Train_Y)
LR = RCV.best_estimator_
pred = LR.predict(Test_X_std)
pred_prob = LR.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,0)

print('\n\033[1mInterprétation des résultat de la Logistic Regression:\n\033[0m')
print("best estimator ", LR)
print('intercept ', LR.intercept_[0])

# Coefficients ordered by decreasing absolute value
c = pd.DataFrame({'coeff': LR.coef_[0]}, index=Train_X_std.columns)
by_magnitude = c['coeff'].abs().sort_values(ascending=False).index
c = c.reindex(by_magnitude)
display(c)

In [None]:
# Decision Tree Classifier
DT_model = DecisionTreeClassifier()

# Hyper-parameter distributions sampled by the randomized search
param_dist = {
    "max_depth": [3, None],
    "max_features": randint(1, 9),
    "min_samples_leaf": randint(1, 9),
    "criterion": ["gini", "entropy"]
}

# Repeated stratified cross-validation: 10 folds repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

RCV = RandomizedSearchCV(
    estimator=DT_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='roc_auc',
    n_jobs=-1,
    cv=cv,
    random_state=1
)

DT = RCV.fit(Train_X_std, Train_Y).best_estimator_
pred = DT.predict(Test_X_std)
pred_prob = DT.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,1)

print('\n\033[1mInterprétation des résultats de l\'arbre de décision:\n\033[0m')
print("Meilleur estimateur", DT)
print("Classes", DT.n_classes_)
print("Profondeur", DT.get_depth())
print("Feuilles", DT.get_n_leaves())

# Name the features after the columns the tree was actually trained on. The
# `features` list built right after loading holds the pre-encoding column
# names, so it can disagree (in names and in length) with the training
# columns once some columns have been one-hot encoded.
tree_feature_names = list(Train_X_std.columns)

# Export the fitted tree to Graphviz and render it to "DT.png"
dot_data = export_graphviz(DT, out_file=None, feature_names=tree_feature_names, class_names=['Ham', 'Spam'], filled=True, rounded=True, special_characters=True)
graphviz.Source(dot_data).render("DT", format="png", cleanup=True)

# Bar chart of the 10 most important features
features_importance = pd.DataFrame({'feature':tree_feature_names, 'importance':DT.feature_importances_}).sort_values(by="importance",ascending=False)[:10]
plt.figure(figsize=[10,10])
plt.barh(features_importance['feature'], features_importance['importance'], align='center')
plt.xlabel('Importance')
plt.title('Importance des fonctionnalités dans l\'arbre de décision')
plt.show()

In [None]:
# Random-Forest Classifier (RF)
RF_model = RandomForestClassifier()

# Candidate values sampled by the randomized search
param_dist = {
    'max_features': ['sqrt', 'log2', None],
    'n_estimators': [50, 100],
    'max_depth': [10, 20, 50, 100, None],
    'min_samples_leaf': [1, 2, 4, 10, 30, 100],
    'min_samples_split': [2, 5, 10, 30, 100],
}

# Repeated stratified cross-validation: 10 folds repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

RCV = RandomizedSearchCV(
    estimator=RF_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring='roc_auc',
    n_jobs=-1,
    cv=cv,
    random_state=1,
)

# RF is the fitted search object; the refitted best model is best_estimator
RF = RCV.fit(Train_X_std, Train_Y)
best_estimator = RF.best_estimator_
pred = best_estimator.predict(Test_X_std)
pred_prob = best_estimator.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,2)


In [None]:

print('\n\033[1mInterprétation des résultats de la Random Forest:\n\033[0m')
print("Meilleur estimateur", best_estimator)
print("Meilleur paramètres", RF.best_params_)

# Horizontal bar chart of the 10 largest feature importances
importances = pd.Series(best_estimator.feature_importances_, index=Train_X_std.columns)
rfi = importances.sort_values(ascending=False).head(10)
plt.barh(rfi.index, rfi.values)
plt.show()

In [None]:
# Building Naive Bayes Classifier

NB_model = BernoulliNB()

# Candidate smoothing values
params = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 10.0]
}
# Repeated stratified cross-validation: 10 folds repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# The search space only holds len(params['alpha']) candidates, so asking for
# more iterations than that only triggers a warning; request exactly one
# iteration per candidate instead.
RCV = RandomizedSearchCV(NB_model, params, n_iter=len(params['alpha']), scoring='roc_auc', n_jobs=-1, cv=cv, random_state=1)

# NB is the fitted search object; the refitted best model is best_estimator
NB = RCV.fit(Train_X_std, Train_Y)
best_estimator = NB.best_estimator_
# Heading fixed: it was copy-pasted from the Random Forest cell
print('\n\033[1mInterprétation des résultats du Naive Bayes:\n\033[0m')
print("Meilleur estimateur", best_estimator)
print("Meilleur paramètres", NB.best_params_)
pred = best_estimator.predict(Test_X_std)
pred_prob = best_estimator.predict_proba(Test_X_std)
Classification_Summary(pred,pred_prob,3)

In [None]:
print('\033[1mComparaisons des modèles ML'.center(100))

# The scores shown here are the ones measured by Classification_Summary. The
# Random Forest row is no longer overwritten with hard-coded numbers, which
# silently replaced the measured scores on every run.
# Rows still at their all-zero initial value belong to models that were not
# evaluated, so they are left out of the comparison.
evaluated_results = Evaluation_Results.loc[(Evaluation_Results != 0).any(axis=1)]

if evaluated_results.empty:
    print('No model has been evaluated yet.')
else:
    plt.figure(figsize=[12,8])
    sns.heatmap(evaluated_results, annot=True, vmin=90, vmax=100, cmap='Blues', fmt='.1f')
    plt.show()