# 0 Introductions

## 0.1 Imports

In [None]:
# Import the Colab drive module (it handles authentication) and mount the drive
# at /content/gdrive so the notebook can read files stored there.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Import des packages pour le projet
# Les packages usuelles
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from math import ceil, floor
import pickle

# Les package de machine learning
# Les packages de clustering
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelBinarizer
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer,InterclusterDistance,SilhouetteVisualizer

# Les packages de check features importance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV, LinearRegression
from sklearn.cluster import KMeans

#les packages de metrics

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,fbeta_score,roc_auc_score,roc_curve,confusion_matrix,adjusted_rand_score

# Les packages de visualisations
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
# Folder that stores the project's files (located on the mounted drive).
url_file =Path('/content/gdrive/MyDrive/Colab Notebooks/OC_projects/dan_p10/archives')
# List the folder content to check which files are available.
os.listdir(url_file)

In [None]:
# Load the test banknotes.
# NOTE(review): read with the default separator whereas billets.csv is read with sep=';' — confirm the file format.
test_data=pd.read_csv(url_file/'billets_test.csv')

In [None]:
# Load the labelled banknotes (semicolon-separated file) and drop every row
# that contains at least one missing value.
data=pd.read_csv(url_file/'billets.csv',sep=';').dropna()

In [None]:
# Summary statistics of the loaded dataset.
data.describe()

## 0.2 Fonctions customs

In [None]:
def display_circles(pcs, n_comp, pca, axis_ranks, labels=None, label_rotation=0, lims=None):
    """Draw one correlation circle per requested factorial plane.

    Parameters
    ----------
    pcs : array-like, shape (n_components, n_variables)
        Principal axes, indexed as ``pcs[component, variable]``
        (e.g. ``pca.components_``).
    n_comp : int
        Number of available components; a plane ``(d1, d2)`` is skipped
        when ``d2 >= n_comp``.
    pca : fitted PCA-like object
        Only ``explained_variance_ratio_`` is read, for the axis labels.
    axis_ranks : iterable of (int, int)
        Zero-based pairs of component indices, one pair per plane to draw.
    labels : sequence of str, optional
        Variable names written at the tip of each arrow. Names whose point
        falls outside the plot limits are not drawn.
    label_rotation : float, default 0
        Rotation (degrees) applied to the variable names.
    lims : tuple (xmin, xmax, ymin, ymax), optional
        Explicit plot limits. When omitted, the unit square is used for
        fewer than 30 variables, otherwise the data range is used.
    """
    # Imported locally so the function does not depend on the notebook's
    # import cell, which does not bring LineCollection into scope.
    from matplotlib.collections import LineCollection

    pcs = np.asarray(pcs)
    n_vars = pcs.shape[1]
    few_vars = n_vars < 30

    for d1, d2 in axis_ranks:
        # Skip planes that refer to a component that does not exist.
        if d2 >= n_comp:
            continue

        fig, ax = plt.subplots(figsize=(7, 6))

        # Plot limits
        if lims is not None:
            xmin, xmax, ymin, ymax = lims
        elif few_vars:
            xmin, xmax, ymin, ymax = -1, 1, -1, 1
        else:
            xmin, xmax = pcs[d1, :].min(), pcs[d1, :].max()
            ymin, ymax = pcs[d2, :].min(), pcs[d2, :].max()

        # Arrows: with 30 variables or more, plain translucent segments are
        # drawn instead of arrows to keep the figure readable.
        if few_vars:
            ax.quiver(np.zeros(n_vars), np.zeros(n_vars),
                      pcs[d1, :], pcs[d2, :],
                      angles='xy', scale_units='xy', scale=1, color="grey")
        else:
            lines = [[[0, 0], [x, y]] for x, y in pcs[[d1, d2]].T]
            ax.add_collection(LineCollection(lines, alpha=.1, color='black'))

        # Variable names, only for points that fall inside the plot limits
        if labels is not None:
            for i, (x, y) in enumerate(pcs[[d1, d2]].T):
                if xmin <= x <= xmax and ymin <= y <= ymax:
                    ax.text(x, y, labels[i], fontsize=14, ha='center', va='center',
                            rotation=label_rotation, color="blue", alpha=0.5)

        # Unit circle
        ax.add_artist(plt.Circle((0, 0), 1, facecolor='none', edgecolor='b'))

        ax.set_xlim(xmin, xmax)
        ax.set_ylim(ymin, ymax)

        # Horizontal and vertical reference lines through the origin
        ax.plot([-1, 1], [0, 0], color='grey', ls='--')
        ax.plot([0, 0], [-1, 1], color='grey', ls='--')

        # Axis names with the share of explained variance
        ax.set_xlabel('F{} ({}%)'.format(d1+1, round(100*pca.explained_variance_ratio_[d1], 1)))
        ax.set_ylabel('F{} ({}%)'.format(d2+1, round(100*pca.explained_variance_ratio_[d2], 1)))

        ax.set_title("Cercle des corrélations (F{} et F{})".format(d1+1, d2+1))
        plt.show(block=False)

In [None]:
def aff_confusion_matrix(df_true, df_pred, labels=None):
    """Print and plot the confusion matrix of predictions against true labels.

    Parameters
    ----------
    df_true : array-like
        Expected labels.
    df_pred : array-like
        Predicted labels.
    labels : sequence of str, optional
        Tick labels, one per class, in the row/column order of the confusion
        matrix. Defaults to ``'Class 0'``, ``'Class 1'``, ... for as many
        classes as the matrix contains.
    """
    conf_mat = confusion_matrix(df_true, df_pred)

    print('Confusion matrix:\n', conf_mat)

    n_classes = conf_mat.shape[0]
    if labels is None:
        labels = ['Class {}'.format(i) for i in range(n_classes)]

    fig, ax = plt.subplots()
    cax = ax.matshow(conf_mat, cmap=plt.cm.Blues)
    fig.colorbar(cax)

    # Fix the tick positions before naming them: setting tick labels alone
    # relies on whatever ticks the automatic locator picked and can shift or
    # drop labels.
    ticks = list(range(n_classes))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    ax.set_xlabel('Predicted')
    ax.set_ylabel('Expected')
    plt.show()

In [None]:
def conf_mat_transform(y_true,y_pred) :
    """Relabel predicted cluster ids with the true class they overlap most.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like of int
        Predicted cluster ids. They are used directly as column positions of
        the confusion matrix, so they are assumed to be 0..k-1.

    Returns
    -------
    pandas.Series
        Series named ``'y_pred_transform'`` holding, for every sample, the
        confusion-matrix row index of the class most represented in its
        cluster. It carries the index of ``y_true`` when that is a Series,
        otherwise a default integer index.
    """
    conf_mat = confusion_matrix(y_true,y_pred)

    # Column j counts the samples predicted as j; the row where it peaks is
    # the true class that dominates that cluster.
    corresp = np.argmax(conf_mat, axis=0)
    print ("Correspondance des clusters : ", corresp)

    # Positional lookup: going through a DataFrame column would align y_pred
    # on its index and produce NaN (then an IndexError) whenever y_pred is a
    # Series whose index differs from y_true's.
    y_pred_arr = np.asarray(y_pred)
    index = y_true.index if isinstance(y_true, pd.Series) else None

    return pd.Series(corresp[y_pred_arr], index=index, name='y_pred_transform')

# 1 Modélisation

## 1.1 PCA

In [None]:
# PCA on the standardised features (every column except the label).
df_pca=data.drop('is_genuine',axis=1)
n_comp=df_pca.shape[1]  # keep as many components as there are features
columns_name=df_pca.columns.tolist()
X=df_pca.values

# Standardise so that every feature weighs the same in the PCA.
std_scale = StandardScaler().fit(X)
X_scaled = std_scale.transform(X)
pca=PCA(n_components=n_comp)
pca.fit(X_scaled)
pcs = pca.components_

print(f'pca variance explained ratio {pca.explained_variance_ratio_.cumsum()}')

# No plt.cla()/plt.clf() here: display_circles creates its own figures, and
# those calls only produced an extra empty figure in the cell output.
# Planes whose second component does not exist are skipped by display_circles.
display_circles(pcs, n_comp, pca, [(0,1),(2,3),(4,5),(6,7)], labels = np.array(columns_name))

In [None]:
# Loadings table: one row per original variable, one column per principal component.
# n_components_ is the number of rows of pca.components_; the former
# pca.n_features_ attribute is deprecated and absent from recent scikit-learn.
num_pc = pca.n_components_
pc_list = [f"PC{i}" for i in range(1, num_pc + 1)]
pcs_df = pd.DataFrame(
    pcs.T,
    index=pd.Index(df_pca.columns.values, name='variable'),
    columns=pc_list,
)
pcs_df.style.background_gradient(cmap='coolwarm', low=0.1)

In [None]:
# Coordinates of every banknote on the principal components (columns F1..Fn),
# indexed like the source data.
X_projected = pca.fit_transform(X_scaled)
df_projected = pd.DataFrame(
    X_projected,
    index=data.index,
    columns=[f"F{rank}" for rank in range(1, n_comp + 1)],
)

In [None]:
# Scatter plot of the banknotes on the first factorial plane (F1, F2),
# coloured by their true label.
tmp = df_projected.reset_index()
fig = px.scatter(
    tmp,
    tmp['F1'],
    tmp['F2'],
    color=data['is_genuine'].astype('str'),
    width=800,
    height=800,
)

# Title and axis ranges (data range padded by one unit on each side).
fig.update_layout(
    title='Visualisation de projection par ACP des billets de banque sur les 2 premiers dimensions',
    xaxis_range=[tmp['F1'].min()-1, tmp['F1'].max()+1],
    yaxis_range=[tmp['F2'].min()-1, tmp['F2'].max()+1],
)

# Grey reference lines through the origin: vertical first, then horizontal.
for xs, ys in (([0, 0], [-100, 100]), ([-100, 100], [0, 0])):
    fig.add_traces(go.Scatter(
        x=xs,
        y=ys,
        mode='lines',
        line=go.scatter.Line(color="gray"),
        showlegend=False,
    ))

fig.show()

## 1.2 LogisticRegression

In [None]:
# Target = the 'is_genuine' column, features = every other column;
# 20 % of the rows are held out for testing, with a fixed seed.
y = data['is_genuine']
X = data.drop(columns='is_genuine')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=12
)

In [None]:
# params = {'Cs' : 20,
#           'cv':5,
#           'penalty' : ['l1', 'l2','elasticnet'],
#           'solver' : ['saga'],
#           'n_jobs':[-1],
#           'verbose':[2],
#           'random_state':[12]
#           }

In [None]:
# Cross-validated logistic regression (Cs=20, cv=5, liblinear solver, fixed
# seed), fitted on the training split only.
clf = LogisticRegressionCV(Cs=20,cv=5,solver="liblinear",random_state=12).fit(X_train, y_train)

In [None]:
# Score the classifier on the held-out split. The predictions are computed
# once and shared by every metric instead of calling clf.predict per metric.
y_test_pred = clf.predict(X_test)

# All scores are expressed in percent, rounded to two decimals.
df_metrics = pd.DataFrame(
    {
        "Accuracy": round(accuracy_score(y_test, y_test_pred)*100, 2),
        "Precision": round(precision_score(y_test, y_test_pred)*100, 2),
        "Recall": round(recall_score(y_test, y_test_pred)*100, 2),
        "F1_score": round(f1_score(y_test, y_test_pred)*100, 2),
        "Fbeta_score": round(fbeta_score(y_test, y_test_pred, beta=0.5)*100, 2),
    },
    index=["Logistic regression"],
)

df_metrics

In [None]:
# Confusion matrix of the fitted classifier.
# NOTE(review): computed on the full dataset (X, y), which includes the rows
# used for training — switch to X_test / y_test if a held-out view is intended.
aff_confusion_matrix(y,clf.predict(X))

In [None]:
# Predicted class of the test banknotes; the last column of test_data is left
# out (presumably an identifier rather than a feature — verify).
clf.predict(test_data.iloc[:,:-1])

In [None]:
# Class probabilities for the same test banknotes (last column left out again).
clf.predict_proba(test_data.iloc[:,:-1])

In [None]:
# path=url_file/'lr_model.pickle'
# with open(path,'wb') as f:
#   pickle.dump(clf,f,protocol=pickle.HIGHEST_PROTOCOL)

## 1.2 Kmeans

### 1.2.1 Recherche d'un k théorique

In [None]:
# Feature matrix for the clustering: every column except the label.
X=data.drop('is_genuine',axis=1)

In [None]:
# Instantiate the clustering model and visualizer
# Elbow study with the visualizer's default metric.
# k=(2,7) is the range of cluster counts to scan (upper bound presumably
# exclusive, per yellowbrick's convention — verify).
model = KMeans(random_state=12)
visualizer = KElbowVisualizer(model, k=(2,7),random_state=12,size=(500,400))
visualizer.fit(X)        # Fit the data to the visualizer
visualizer.show() 

In [None]:
# Same scan scored with the Calinski-Harabasz metric; timings and automatic
# elbow detection are switched off.
visualizer = KElbowVisualizer(model,metric='calinski_harabasz',k=(2,7),random_state=12,timings=False,locate_elbow=False,size=(500,400))
visualizer.fit(X)        # Fit the data to the visualizer
visualizer.show() 

In [None]:
# Same scan scored with the silhouette metric; timings and automatic elbow
# detection are switched off.
visualizer = KElbowVisualizer(model,metric='silhouette',k=(2,7),random_state=12,timings=False,locate_elbow=False,size=(500,400))
visualizer.fit(X)
visualizer.show() 

### 1.2.2 K=2 est imposé par la contrainte du projet

In [None]:
# Final K-means with k=2 (imposed by the project), k-means++ initialisation,
# 100 restarts and a fixed seed.
# The former algorithm="auto" argument is omitted: that value is deprecated
# and rejected by recent scikit-learn releases, while the library default
# gives the intended behaviour on every version.
model = KMeans(n_clusters=2, init='k-means++', n_init=100, random_state=12).fit(X)

In [None]:
# Inertia of the fitted K-means model.
model.inertia_

In [None]:
# Coordinates of the two cluster centres.
model.cluster_centers_

In [None]:
# Cluster assigned to each banknote, followed by the size of every cluster.
df_result = pd.DataFrame({'y_predicted': model.labels_})
df_result.value_counts()

In [None]:
# Display the ground-truth labels, for comparison with the cluster assignments.
data['is_genuine']

In [None]:
# Remap every cluster id to the true class it overlaps most, then build the
# confusion matrix between the true labels and the remapped predictions.
cls_labels_transform = conf_mat_transform(data['is_genuine'].values,df_result['y_predicted'])
conf_mat = confusion_matrix(data['is_genuine'].values, cls_labels_transform)

In [None]:
# Annotated heatmap of the K-means confusion matrix
# (rows labelled 0/1, columns labelled "0"/"1").
df_cm = pd.DataFrame(conf_mat, index=[0, 1], columns=list("01"))
plt.figure(figsize=(6, 4))
sns.heatmap(df_cm, annot=True, fmt="d", cmap="Blues")
plt.title("Matrice de confusion des true_cat vs pred_cat")

In [None]:
# Cluster assignment of the test banknotes (last column of test_data left out).
# NOTE(review): these are raw cluster ids; they are not remapped to the true
# classes through conf_mat_transform.
model.predict(test_data.iloc[:,:-1].values)

In [None]:
# path=url_file/'kmeans_model.pickle'
# with open(path,'wb') as f:
#   pickle.dump(model,f,protocol=pickle.HIGHEST_PROTOCOL)

# 3 test_snippets

In [None]:
%%capture
!pip install jupyter_dash
import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

In [None]:
# Load Data
df = px.data.tips()
# Build App
app = JupyterDash(__name__)
app.layout = html.Div([
    html.H1("JupyterDash Demo"),
    dcc.Graph(id='graph'),
    html.Label([
        "genuine",
        dcc.Dropdown(
            id='genuine-dropdown', clearable=False,
            value='both', options=[
                {'label': c, 'value': c}
                for c in px.colors.named_colorscales()
            ])
    ]),
])
# Define callback to update graph
@app.callback(
    Output('graph', 'figure'),
    [Input("colorscale-dropdown", "value")]
)
def update_figure(colorscale):
    return px.scatter(
        df, x="total_bill", y="tip", color="size",
        color_continuous_scale=colorscale,
        render_mode="webgl", title="Tips",
    )
# Run app and display result inline in the notebook
app.run_server(mode='inline')

In [None]:
# Minimal app whose whole layout is a radio selector offering
# 'Both' / 'Genuine' / 'Counterfeit', with 'Both' selected initially.
app = JupyterDash(__name__)

app.layout = dcc.RadioItems(['Both', 'Genuine','Counterfeit'], 'Both', inline=True)

# Render the app inline in the notebook output.
app.run_server(mode='inline')