In [None]:
from time import time
from datetime import datetime
import pickle 

#Analytic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#ML
#ML.Vectorize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
#ML.Selection
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
#ML.Reduction
from sklearn.decomposition import TruncatedSVD 
from sklearn.decomposition import NMF
from sklearn.decomposition import KernelPCA
#ML.Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
#ML.Evaluate
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn import metrics
#Optimisation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#UserModule
import os
import sys
from pathlib import Path
# TODO: replace this sys.path manipulation with a proper package import.
# The shared helper modules are looked up in the 'LoveSence.ML.Communs'
# directory located in the parent of this script's directory. The path is
# joined with pathlib so the separator is valid on every OS (a hard-coded
# '\\' separator only yields a usable path on Windows).
sys.path.append(str(Path(os.path.abspath(__file__)).parent.parent / 'LoveSence.ML.Communs'))
import AccessCorpus as corpus
import PersistenceProvider as db_provider
import CleanningCorpus as cleanning

def echantilloner_donnees(corpus):
    """Split the corpus into training and test partitions.

    The 'clean_message' column is used as the features and the 'alabel'
    column as the targets. 30% of the rows are held out for testing, with a
    fixed random seed (42) so the split is reproducible.

    Returns the tuple (X_train, X_test, y_train, y_test).
    """
    messages, etiquettes = corpus['clean_message'], corpus['alabel']
    partitions = train_test_split(messages, etiquettes, test_size=0.3, random_state=42)
    return tuple(partitions)

def definir_experiences():
    """Build the candidate estimators for each pipeline stage.

    Returns a dict mapping the stage names 'vectoriser', 'selectinner',
    'reduire' and 'modeliser' to lists of ``(code, estimator)`` pairs.
    """
    # Vectorisation
    vectoriseurs = {
        'tfidf': TfidfVectorizer(stop_words='english', use_idf=True),
        'tf': CountVectorizer(stop_words='english'),
        'ngram3': CountVectorizer(analyzer='char', ngram_range=(3, 3)),
    }

    # Feature selection
    selecteurs = {
        'ki2': SelectKBest(chi2),
        'mic': SelectKBest(mutual_info_classif),
    }

    # Dimensionality reduction
    reducteurs = {
        'nmf': NMF(),           # non-negative matrix factorisation
        'lsa': TruncatedSVD(),  # latent semantic analysis
        'ipca': KernelPCA(),    # kernel principal component analysis
    }

    # Standard classifiers
    modeles = {
        'mnb': MultinomialNB(),                           # naive Bayes
        'svm': SVC(kernel='linear'),                      # support vector machine
        'rf': RandomForestClassifier(n_estimators=100),   # random forest
        'kn': KNeighborsClassifier(n_neighbors=3),        # k nearest neighbours
    }

    def choisir(candidats, *codes):
        # Pair each requested code with its estimator, in the order given.
        return [(code, candidats[code]) for code in codes]

    # Active grid. 'ngram3', 'ipca', 'rf' and 'svm' are built above but are
    # not part of the returned configuration; add their codes below to run a
    # wider grid, or keep a single code per stage for a quick run.
    return {
        'vectoriser': choisir(vectoriseurs, 'tf', 'tfidf'),
        'selectinner': choisir(selecteurs, 'mic', 'ki2'),
        'reduire': choisir(reducteurs, 'nmf', 'lsa'),
        'modeliser': choisir(modeles, 'kn', 'mnb'),
    }

def conduire_experimentation(donnees_experiences, experiences):
    """Run one experiment per combination of the configured pipeline stages.

    ``donnees_experiences`` is indexed as (X_train, X_test, y_train, y_test).
    ``experiences`` maps the stage names 'vectoriser', 'selectinner',
    'reduire' and 'modeliser' to lists of candidates.

    Returns the list of values returned by ``conduire_experience``, one per
    combination, with the last stage ('modeliser') varying fastest.
    """
    X_train, X_test, y_train, y_test = (donnees_experiences[i] for i in range(4))

    # Resolve the four candidate lists up front, in stage order.
    etapes = ('vectoriser', 'selectinner', 'reduire', 'modeliser')
    vectoriseurs, selecteurs, reducteurs, modeles = (experiences[etape] for etape in etapes)

    return [
        conduire_experience(X_train, y_train, X_test, y_test,
                            vectoriseur, selecteur, reducteur, modele)
        for vectoriseur in vectoriseurs
        for selecteur in selecteurs
        for reducteur in reducteurs
        for modele in modeles
    ]

def conduire_experience(X_train, y_train, X_test, y_test, vectoriser, selectionner, reduire,  modeliser):
    """Train and evaluate one pipeline combination and return its record.

    ``vectoriser``, ``selectionner``, ``reduire`` and ``modeliser`` are
    ``(code, estimator)`` pairs; the four codes are joined with '_' to form
    the experiment code.

    Returns a dict with the keys 'date_experience', 'type', 'methods_pickle',
    'code_experience', 'score', 'matrice_confusion_pickle',
    'autres_metriques', 'n_components', 'k', 'erreur_experience',
    'time_training', 'time_test' and 'model_pickle'.

    When building the model raises, the record is still returned: 'score' is
    -1, 'erreur_experience' holds the exception text and the pickled fields
    are None. Failed experiments therefore stay visible to the caller instead
    of collapsing into an empty dict.
    """
    # Defaults describe a failed experiment; the success path overwrites them.
    score, n_components, k, t_training, t_test = -1, 0, 0, 0, 0
    erreur_experience = ''
    autres_metriques = {}
    methods_pickle, model_pickle, matrice_confusion_pickle = None, None, None
    code_experience = vectoriser[0] + '_' + selectionner[0] + '_' + reduire[0] + '_' + modeliser[0]
    print(str(datetime.now()) + ' Expérience:{0} - {1} {2} {3} {4}'.format(code_experience, vectoriser[0], selectionner[0], reduire[0], modeliser[0]))

    try:
        t0 = time()
        model = construire_modele(X_train, y_train, vectoriser[1], selectionner[1], reduire[1], modeliser[1])
        t_training = time() - t0
    except Exception as e:
        # Some stage combinations cannot be fitted together; this is a
        # deliberate best-effort: keep the message and carry on with the
        # remaining experiments.
        erreur_experience = str(e)
    else:
        t0 = time()
        metriques_evalusation = evaluer_experience(model, X_test, y_test)
        t_test = time() - t0
        score = metriques_evalusation['accuracy_score']
        matrice_confusion_pickle = pickle.dumps(metriques_evalusation['confusion_matrix'])
        autres_metriques = metriques_evalusation['classification_report']
        # Hyper-parameters retained by the grid search.
        k = model.best_params_['sel__k']
        n_components = model.best_params_['red__n_components']
        methods_pickle = pickle.dumps({
            'vectorizer': vectoriser[1],
            'selector': selectionner[1],
            'reductor': reduire[1],
            'classifier': modeliser[1],
        })
        model_pickle = pickle.dumps(model)

    # Built on both paths so that a failure is reported with score == -1.
    ml_experience = {
        'date_experience': datetime.now(),
        'type': 'I',
        'methods_pickle': methods_pickle,
        'code_experience': code_experience,
        'score': score,
        'matrice_confusion_pickle': matrice_confusion_pickle,
        'autres_metriques': autres_metriques,
        'n_components': n_components,
        'k': k,
        'erreur_experience': erreur_experience,
        'time_training': t_training,
        'time_test': t_test,
        'model_pickle': model_pickle,
    }

    #mongo_db = db_provider.MongoDB('experiences').ajouter_document(ml_experience)
    return ml_experience

def construire_modele(X_train, y_train, vectoriser, selectionner, reduire, modeliser):
    """Assemble the four-stage pipeline and fit it through a grid search.

    The stages are registered under the names 'vec', 'sel', 'red' and 'mod',
    which is why the grid keys are prefixed with 'sel__' and 'red__'.

    Returns the value of ``GridSearchCV.fit`` (5-fold cross-validation, all
    available cores).
    """
    etapes = [
        ('vec', vectoriser),
        ('sel', selectionner),
        ('red', reduire),
        ('mod', modeliser),
    ]

    # Single-point grid; a wider search previously tried was
    # sel__k in [500, 1000, 2500] and red__n_components in [10, 50, 100].
    grille = {'sel__k': [1000], 'red__n_components': [300]}

    recherche = GridSearchCV(Pipeline(etapes), grille, cv=5, n_jobs=-1)
    return recherche.fit(X_train, y_train)

def evaluer_experience(model, X_test, y_test):
    """Score a fitted model on the held-out data.

    Predicts ``X_test`` with ``model.predict`` and compares the predictions
    with ``y_test``.

    Returns a dict with the keys 'confusion_matrix', 'classification_report'
    (requested as a dict through ``output_dict=True``) and 'accuracy_score'.
    """
    predictions = model.predict(X_test)

    matrice = confusion_matrix(y_test, predictions)
    rapport = classification_report(y_test, predictions, output_dict=True)
    exactitude = metrics.accuracy_score(y_test, predictions)

    return {
        'confusion_matrix': matrice,
        'classification_report': rapport,
        'accuracy_score': exactitude,
    }

def resultats_experience_to_panda(array):
    """Convert experiment records into a pandas DataFrame.

    ``array`` is a list of record dicts; a single dict or ``None`` is also
    accepted. Empty records are skipped.

    The returned frame always contains the reporting columns listed below,
    in that order. Any other key found in the records is kept as an extra
    column after them, and values missing from a record are NaN.
    """
    colonnes = ['date_experience', 'type', 'vectoriser', 'selectinner', 'reduire',
                'modeliser', 'code_experience', 'score', 'n_components', 'k',
                'erreur_experience', 'time_training', 'time_test']

    if array is None:
        enregistrements = []
    elif isinstance(array, dict):
        enregistrements = [array] if array else []
    else:
        enregistrements = [enregistrement for enregistrement in array if enregistrement]

    # Collect keys outside the reporting columns, in first-seen order, so no
    # field of a record is silently dropped.
    supplementaires = []
    for enregistrement in enregistrements:
        for cle in enregistrement:
            if cle not in colonnes and cle not in supplementaires:
                supplementaires.append(cle)

    # Build the frame in one go: DataFrame.append returned a new object (its
    # result was previously discarded) and no longer exists in pandas >= 2.0.
    return pd.DataFrame(enregistrements, columns=colonnes + supplementaires)

def editier_experimentation(resultats):
    """Show a horizontal bar chart of the score of each experiment.

    Rows whose 'score' equals -1 are left out. The remaining rows are sorted
    by ascending score and labelled with their 'code_experience'.
    """
    classement = resultats[resultats.score != -1].sort_values(by=['score'])
    valeurs = classement['score'].values.tolist()
    etiquettes = classement['code_experience'].values.tolist()
    positions = np.arange(len(etiquettes))

    _, axes = plt.subplots()
    axes.barh(positions, valeurs, color='blue', align='center')
    axes.set_yticks(positions)
    axes.set_yticklabels(etiquettes)
    axes.invert_yaxis()  # first row of the ranking at the top
    axes.set_xlabel('Score')
    axes.set_title("Score selon l'expérience")
    plt.show()

# Clean the corpus, split it into train/test sets and run every configured experiment.
bilan_experimentation = conduire_experimentation(echantilloner_donnees(cleanning.clean(corpus.obtenir())), definir_experiences())
#mongo_db = db_provider.MongoDB('experiences').ajouter_documents(bilan_experimentation)
# Chart the collected results (an empty dict was passed here before, which
# discarded every experiment and always produced an empty plot).
editier_experimentation(resultats_experience_to_panda(bilan_experimentation))