# Jean Philippe Vert, Franck Ferreboeuf

# Fonctions
---
Ce fichier contient toutes les fonctions utilisées et appelées dans le fichier main (pour les pré-traitements, etc...)

## Imports

In [2]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin, BiclusterMixin, ClusterMixin, ClassifierMixin
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, fbeta_score, make_scorer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.svm import NuSVC, SVC
from sklearn.svm.libsvm import cross_validation, decision_function, fit, predict
from sklearn.tree import DecisionTreeClassifier

from mlxtend.preprocessing import DenseTransformer as DT

import re

---
## Pré-traitements
---

### Stop words associés au domaine du cinéma

In [9]:
# Cinema-specific stop words. They are added to the NLTK English stop-word
# list by the preprocessing functions when their `domaine` flag is enabled.
stopwords_domaine = [
    "movie",
    "film",
    "actor",
    "director",
    "production",
    "scenario",
    "casting",
]

### Suppression des stopwords

In [11]:
def supprimer_stopwords(avis, domaine=False):
    """Strip non-letters, lowercase, and drop English stop words from a review.

    Args:
        avis: raw text of one review.
        domaine: when true, the words of `stopwords_domaine` are removed in
            addition to the NLTK English stop words.

    Returns:
        The remaining words joined by single spaces.
    """
    # Every character that is not an ASCII letter becomes a space.
    lettres_seules = re.sub("[^a-zA-Z]", " ", avis)
    mots = lettres_seules.lower().split()

    a_exclure = stopwords.words("english")
    if domaine:
        a_exclure = a_exclure + stopwords_domaine
    a_exclure = set(a_exclure)

    return " ".join(mot for mot in mots if mot not in a_exclure)

### Suppression de la ponctuation, des tags HTML...

In [14]:
def supprimer_ponctuation(dataset, to_lower=False):
    """Remove e-mail addresses, HTML tags and non-letter characters.

    Args:
        dataset: iterable of strings (one text per element).
        to_lower: when true, each cleaned text is also lowercased.

    Returns:
        A list with one cleaned string per input text, in the same order.
        Whitespace is not collapsed: every removed character or tag leaves
        a space behind.
    """
    # Trailing whitespace is optional so an address at the very end of the
    # text is removed too.
    motif_email = re.compile(r"\S*@\S*\s?")
    # `[^>]*` stops at the first closing bracket. The previous greedy `.*>`
    # ran to the LAST `>` of the line and swallowed all the text between two
    # tags. `/?` also catches closing tags such as `</div>`, which otherwise
    # leaked their name into the text as a word.
    motif_html = re.compile(r"</?(?:br|html|body|tr|td|th|table|div|span|a)[^>]*>")
    motif_non_lettre = re.compile(r"[^a-zA-Z]")

    liste_mots = []
    for texte in dataset:
        texte = motif_email.sub("", texte)
        # Replace tags by a space so the words on both sides are not glued.
        texte = motif_html.sub(" ", texte)
        texte = motif_non_lettre.sub(" ", texte)
        if to_lower:
            texte = texte.lower()
        liste_mots.append(texte)
    return liste_mots

### Combinaisons de prétraitements

In [17]:
from bs4 import BeautifulSoup
def pretraitement_combine(avis, supprime_stopwords = False, domaine = True, stem_words = False):
    """Clean one review: strip HTML, lowercase, expand contractions.

    Args:
        avis: raw review text (may contain HTML markup).
        supprime_stopwords: when true, remove NLTK English stop words.
        domaine: when stop-word removal is on, also remove the words of
            `stopwords_domaine`.
        stem_words: when true, replace every word by its Snowball stem.

    Returns:
        The cleaned text as a single string.
    """
    # Drop the HTML markup, then lowercase the remaining text.
    texte = BeautifulSoup(avis, "lxml").get_text().lower()

    # Ordered (pattern, replacement) pairs. Order matters: the specific
    # contractions must be expanded before the generic "n't" / "'s" rules.
    substitutions = (
        # Keep only letters, digits, '!', '?', apostrophes and backticks.
        (r"[^A-Za-z0-9!?\'\`]", " "),
        (r"it's", " it is"),
        (r"that's", " that is"),
        (r"\'s", " 's"),
        (r"\'ve", " have"),
        (r"won't", " will not"),
        (r"don't", " do not"),
        (r"daren't", " dare not"),
        (r"mustn't", " must not"),
        (r"weren't", " were not"),
        (r"wasn't", " was not"),
        (r"isn't", " is not"),
        (r"can't", " can not"),
        (r"cannot", " can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        # Isolate '!' and '?' as separate tokens.
        (r"!", " ! "),
        (r"\?", " ? "),
        # Collapse runs of whitespace.
        (r"\s{2,}", " "),
    )
    for motif, remplacement in substitutions:
        texte = re.sub(motif, remplacement, texte)

    # Stop-word removal
    if supprime_stopwords:
        mots_vides = stopwords.words("english")
        if domaine:
            mots_vides = mots_vides + stopwords_domaine
        mots_vides = set(mots_vides)
        texte = " ".join(mot for mot in texte.split() if mot not in mots_vides)

    # Stemming
    if stem_words:
        racinisateur = SnowballStemmer('english')
        texte = " ".join(racinisateur.stem(mot) for mot in texte.split())

    return texte

---
### Appliquer un prétraitement
---

#### Cette fonction permet d'appliquer un prétraitement à toute une liste d'avis

In [18]:
def applique_pretraitement(dataset, fonction_pretraitement):
    """Apply a preprocessing function to every review of a dataset.

    Args:
        dataset: collection supporting len() and access by integer position
            0..len-1.
        fonction_pretraitement: callable taking one review and returning its
            cleaned version.

    Returns:
        A list holding the result of the function for each review, in order.
    """
    taille = len(dataset)

    # Progress messages bracket the (possibly long) processing loop.
    print('Début prétraitement')
    resultat = [fonction_pretraitement(dataset[position]) for position in range(taille)]
    print('Fin prétraitement')
    return resultat

---
#### Prétraitement combiné avec POSTagging, lemmatisation...
Permet d'appliquer différents prétraitements choisis en paramètres en appelant la fonction `dataset_preprocessing(ds, use_pos_tagger=True, use_lem=True, use_stem=True, use_stopwords=True)` (tous les prétraitements sont activés par défaut ; passer `False` pour en désactiver un).

In [30]:
from nltk import pos_tag
#from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

import nltk

# Lookup table from the first letter of a Penn Treebank tag to the WordNet
# part-of-speech code passed to the lemmatizer. convert_tag() maps every
# letter that is absent from this table to a noun ('n').
part = dict(N='n', V='v', J='a', S='s', R='r')

# Module-level instances reused by document_preprocessing().
stemmer = SnowballStemmer('english')
wnl = WordNetLemmatizer()

def convert_tag(penn_tag):
    '''
    Translate the **first letter** of a Penn part-of-speech tag into the
    matching WordNet tag, using the `part` lookup table.

    Any letter that is not a key of `part` is treated as a noun ('n').
    '''
    return part.get(penn_tag, 'n')


# Returns the POS tag / lemma / stem annotations of a single document
def document_preprocessing(element, use_pos_tagger=True, use_lem=False, use_stem=False, use_stopwords=False):
    """Annotate the tokens of one document.

    Every token becomes a tuple that grows with the enabled options, in this
    order: (token[, pos_tag][, lemma][, stem]).

    Args:
        element: text of the document.
        use_pos_tagger: tokenize with nltk.word_tokenize and tag with
            nltk.pos_tag. When false the text is split on single spaces and
            no tag is stored.
        use_lem: append the WordNet lemma of the token. The POS used for
            lemmatization comes from the tag when available, otherwise the
            token is lemmatized as a noun.
        use_stem: append the Snowball stem of the token.
        use_stopwords: drop the tuples whose lowercased token is an NLTK
            English stop word.

    Returns:
        A list of tuples, one per kept token.
    """
    if use_pos_tagger:
        # Tagging must see the whole text: the tag depends on the context.
        preprocessed = pos_tag(nltk.word_tokenize(element))
    else:
        preprocessed = [(token,) for token in element.split(" ")]

    if use_lem:
        # Without the tagger there is no tag at index 1; indexing it used to
        # raise IndexError. Fall back to the noun POS in that case.
        preprocessed = [
            entry + (wnl.lemmatize(entry[0], convert_tag(entry[1][:1]) if use_pos_tagger else 'n'),)
            for entry in preprocessed
        ]
    if use_stem:
        preprocessed = [entry + (stemmer.stem(entry[0]),) for entry in preprocessed]
    if use_stopwords:
        stops = set(stopwords.words("english"))
        preprocessed = [entry for entry in preprocessed if entry[0].lower() not in stops]
    return preprocessed

def dataset_preprocessing(ds, use_pos_tagger=True, use_lem=True, use_stem=True, use_stopwords=True):
    """Run document_preprocessing() on every document of a dataset.

    Args:
        ds: collection of texts supporting len() and access by integer
            position 0..len-1.
        use_pos_tagger, use_lem, use_stem, use_stopwords: forwarded unchanged
            to document_preprocessing() for each document.

    Returns:
        A list with the annotated tokens of each document, in order.
    """
    print("Application de tous les prétraitements")

    # Names of the active steps, shown in the progress message.
    etapes = ["Preprocessing"]
    if use_pos_tagger:
        etapes.append("POS Tagging")
    if use_lem:
        etapes.append("Lemmatization")
    if use_stem:
        etapes.append("Stemming")
    if use_stopwords:
        etapes.append("Removing Stopwords")
    prefixe = ", ".join(etapes)

    total = len(ds)
    resultat = []
    for numero in range(1, total + 1):
        # Report progress every 500 documents.
        if numero % 500 == 0:
            print(prefixe + " de l'avis {:d} sur {:d}".format(numero, total))
        resultat.append(
            document_preprocessing(ds[numero - 1], use_pos_tagger, use_lem, use_stem, use_stopwords)
        )
    return resultat

# Builds a plain-text dataset usable by the classifiers: filters out unwanted
# POS tags and substitutes each word by its lemma and/or stem.
def create_text_dataset_without_postags(dataset_with_postags, remove_tags=[], transform_to_lem=True, transform_to_stem=True):
    """Turn annotated documents back into space-joined strings.

    Each document is a list of tuples read positionally as
    (word, pos_tag, third, fourth). The third and fourth fields are used as
    lemma and stem respectively, and only when the tuples are long enough.

    Args:
        dataset_with_postags: list of documents, each a list of tuples.
        remove_tags: POS tags whose tokens are dropped (never mutated here).
        transform_to_lem: use the third field instead of the word.
        transform_to_stem: use the fourth field instead of the word.
            When both flags are usable, the lemma is taken if it differs
            from the word, otherwise the stem.

    Returns:
        One string per document. Returns [] (after printing a warning) when
        the tuples hold fewer than two fields.
    """
    # The tuple width is read from the first token available. The first
    # document may be empty, and so may the whole dataset.
    premier = next((doc[0] for doc in dataset_with_postags if len(doc) > 0), None)
    if premier is None:
        return ["" for _ in dataset_with_postags]

    datasize = len(premier)
    if datasize < 2:
        print("Warning: Votre dataset ne possède pas les données appropriées")
        return []

    # A field is only usable when the tuples actually contain it. The
    # combined branch previously skipped this check and raised IndexError
    # on tuples without a fourth field.
    avec_lemme = transform_to_lem and datasize >= 3
    avec_racine = transform_to_stem and datasize >= 4

    dataset_without_postags = []
    for document in dataset_with_postags:
        gardes = [entree for entree in document if entree[1] not in remove_tags]
        if avec_lemme and avec_racine:
            # Take the lemma when lemmatization changed the word, else the stem.
            mots = [e[2] if e[2] != e[0] else e[3] for e in gardes]
        elif avec_lemme:
            mots = [e[2] for e in gardes]
        elif avec_racine:
            mots = [e[3] for e in gardes]
        else:
            mots = [e[0] for e in gardes]
        dataset_without_postags.append(" ".join(mots))
    return dataset_without_postags

---
### Récupérer le nom de la variable d'un classifieur (pour sauvegarder le classifieur)

In [19]:
import inspect
def retrieve_name(var):
    """Return the name of a local variable bound to the very object `var`.

    The call stack is scanned from the outermost frame to the innermost one.
    The first local name whose value *is* `var` (identity, not equality) is
    returned. None is returned implicitly when no frame holds the object.
    """
    for frame_info in inspect.stack()[::-1]:
        for nom, valeur in frame_info.frame.f_locals.items():
            if valeur is var:
                return nom

---
## Visualisation
---

### Wordcloud (nuage de mots utilisé pour les stopwords...)

In [26]:
def wordcloud_draw(data, color = 'black', maskPath="", width=2500, height=2000, filename="train"):
    """Draw a word cloud of the given texts, save it as PNG and display it.

    Args:
        data: iterable of strings, joined with spaces to form the corpus.
        color: background colour of the cloud, also used as file-name prefix.
        maskPath: path of an image used as mask; no mask when empty.
        width, height: size passed to WordCloud.
        filename: base name; the figure is saved as
            ./visualisation/<color>-<filename>.png
    """
    avec_masque = maskPath != ""
    masque = ""
    if avec_masque:
        masque = np.array(Image.open(maskPath))

    # Drop links, @mentions, #hashtags and the 'RT' marker.
    mots_gardes = []
    for mot in ' '.join(data).split():
        if 'http' in mot or mot.startswith('@') or mot.startswith('#') or mot == 'RT':
            continue
        mots_gardes.append(mot)
    cleaned_word = " ".join(mots_gardes)

    # The mask is only handed to WordCloud when one was requested.
    options = {
        'stopwords': STOPWORDS,
        'background_color': color,
        'width': width,
        'height': height,
    }
    if avec_masque:
        options['mask'] = masque
    wordcloud = WordCloud(**options).generate(cleaned_word)

    fig = plt.figure(1, figsize=(8.27, 11.69), dpi=250)
    filename = color + '-' + filename + '.png'
    plt.imshow(wordcloud)
    plt.axis('off')
    fig.tight_layout()
    fig.savefig("./visualisation/" + filename)
    plt.show()

### Affichage des Features

In [28]:
def print_feature(data_set, labels_set):
    """Print and plot the most important TF-IDF features of a dataset.

    An ExtraTreesClassifier is fitted on the TF-IDF representation of
    `data_set` against the label-encoded `labels_set`. Its feature
    importances are then ranked: the top 20 are printed and the top 100 are
    shown as a bar chart (fewer when the vocabulary is smaller).

    NOTE(review): `plt` (presumably matplotlib.pyplot) is not imported in the
    visible part of this file and must be provided by the surrounding
    namespace.

    Args:
        data_set: iterable of raw text documents.
        labels_set: labels aligned with `data_set`.

    Returns:
        An empty tuple (kept for backward compatibility).
    """
    # Neither name is imported at file level.
    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    # Tree ensemble used only for its feature importances
    tree_clf = ExtraTreesClassifier()

    # Vectorize the texts
    cv = TfidfVectorizer(analyzer='word')
    x_train = cv.fit_transform(data_set)

    # Encode the target strings as integers
    le = LabelEncoder()
    y = le.fit_transform(labels_set)

    tree_clf.fit(x_train, y)

    importances = tree_clf.feature_importances_
    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # accessor when it exists and keep the old one for older releases.
    if hasattr(cv, "get_feature_names_out"):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    # Single ranking shared by the printed index, the name and the plot, so
    # they always refer to the same feature, even on tied importances.
    indices = np.argsort(importances)[::-1]
    sorted_features = [(feature_names[i], importances[i]) for i in indices]

    # Never ask for more features than the vocabulary holds.
    nb_affiches = min(20, len(indices))
    nb_traces = min(100, len(indices))

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(nb_affiches):
        print("feature %d : %s (%f)" % (indices[f], sorted_features[f][0], sorted_features[f][1]))

    # Plot the feature importances of the forest
    plt.figure(figsize = (20,20))
    plt.title("Feature importances")
    plt.bar(range(nb_traces), importances[indices[:nb_traces]],
           color="r", align="center")
    plt.xticks(range(nb_traces), sorted_features[:nb_traces], rotation=90)
    plt.xlim([-1, nb_traces])
    plt.show()

    return()

---
## Pipelines
---

### Liste des pipelines

In [20]:
# Candidate pipelines, each given as a list of (step name, estimator) pairs.
# The step names match the `<step>__<param>` keys of the dictionary found at
# the same index in `pipeline_parameters`.
custom_pipelines = [
    # 0: TF-IDF vectorizer -> TF-IDF transformer -> multinomial naive Bayes
    [
        ('tfvect', TfidfVectorizer()),
        ('tft', TfidfTransformer()),
        ('mnb', MultinomialNB()),
    ],
    # 1: count vectorizer -> TF-IDF transformer -> multinomial naive Bayes
    [
        ('cvect', CountVectorizer()),
        ('tft', TfidfTransformer()),
        ('mnb', MultinomialNB()),
    ],
    # 2: TF-IDF vectorizer -> truncated SVD -> Gaussian naive Bayes
    [
        ('tfvect', TfidfVectorizer()),
        ('reduce_dim', TruncatedSVD()),
        ('gnb', GaussianNB()),
    ],
    # 3: TF-IDF vectorizer -> TF-IDF transformer -> truncated SVD -> Gaussian naive Bayes
    [
        ('tfvect', TfidfVectorizer()),
        ('tft', TfidfTransformer()),
        ('reduce_dim', TruncatedSVD()),
        ('gnb', GaussianNB()),
    ],
    # 4: TF-IDF vectorizer -> k-nearest neighbours
    [
        ('tfvect', TfidfVectorizer()),
        ('knc', KNeighborsClassifier()),
    ],
    # 5: TF-IDF vectorizer -> TF-IDF transformer -> SGD linear classifier
    [
        ('tfvect', TfidfVectorizer()),
        ('tft', TfidfTransformer()),
        ('sgdc', SGDClassifier()),
    ],
]

### Paramètres des pipelines 

In [21]:
# Hyper-parameter grids, one dictionary per pipeline. Keys follow the
# `<step name>__<parameter>` convention and each value lists the candidates.
pipeline_parameters = [
    # 0: grid with `tfvect` and `tft` steps
    {
        "tft__use_idf": (True, False),
        "tfvect__ngram_range": [(1, 1), (1, 2), (1, 3)],
        "tfvect__smooth_idf": (True, False),
        "tfvect__sublinear_tf": (True, False),
    },
    # 1: grid with `cvect` and `tft` steps
    {
        "tft__use_idf": (True, False),
        "cvect__ngram_range": [(1, 1), (1, 2), (1, 3)],
    },
    # 2: grid with a `reduce_dim` step (single candidate per parameter)
    {
        "reduce_dim__n_components": (5,),
        "reduce_dim__n_iter": (7,),
        "reduce_dim__random_state": (42,),
    },
    # 3: same `reduce_dim` settings plus the `tft` switch
    {
        "reduce_dim__n_components": (5,),
        "reduce_dim__n_iter": (7,),
        "reduce_dim__random_state": (42,),
        "tft__use_idf": (True, False),
    },
    # 4: grid with a `knc` step
    {
        "knc__n_neighbors": (2, 3, 5),
        "knc__weights": ("uniform", "distance"),
        "knc__metric": ("euclidean", "minkowski"),
    },
    # 5: grid with `tft` and `sgdc` steps
    {
        "tft__use_idf": (True, False),
        "sgdc__max_iter": (500, 1000),
        "sgdc__alpha": (0.001, 0.0001, 0.00001),
    },
]

---
## Résultats du modèle
---

### Afficher les résultats

In [29]:
def report_results(pred, pred_proba, y):
    """Compute a set of classification metrics for one model.

    Args:
        pred: predicted labels.
        pred_proba: predicted scores, passed to the ranking metrics
            (ROC AUC and average precision).
        y: true labels.
        NOTE(review): the metrics are called with their default averaging,
        which presumably means binary labels — confirm against the callers.

    Returns:
        A dict with the keys 'roc_auc', 'average_precision', 'f1', 'f05',
        'f2', 'acc', 'precision' and 'recall', all multiplied by 100, plus
        'confusion_matrix' (unscaled).
    """
    # These names are not imported at file level.
    from sklearn.metrics import (
        accuracy_score,
        average_precision_score,
        f1_score,
        precision_score,
        recall_score,
        roc_auc_score,
    )

    roc_auc = roc_auc_score(y, pred_proba)
    avr_prec = average_precision_score(y, pred_proba)
    acc = accuracy_score(y, pred)
    f1 = f1_score(y, pred)
    # `beta` is keyword-only in current scikit-learn releases; passing it
    # by keyword works on old and new versions alike.
    fbeta_05 = fbeta_score(y, pred, beta=0.5)
    fbeta_2 = fbeta_score(y, pred, beta=2.0)
    prec = precision_score(y, pred)
    rec = recall_score(y, pred)
    conf = confusion_matrix(y, pred)

    result = {
        'roc_auc': roc_auc*100,
        'average_precision': avr_prec*100,
        'f1': f1*100,
        'f05': fbeta_05*100,
        'f2': fbeta_2*100,
        'acc': acc*100,
        'precision': prec*100,
        'recall': rec*100,
        'confusion_matrix': conf
    }
    return result

def prettify_output_results(res):
    """Format a results dictionary as a multi-line, human-readable string.

    Args:
        res: mapping holding the keys 'acc', 'precision', 'recall', 'f1',
            'f05', 'f2', 'average_precision', 'roc_auc' and
            'confusion_matrix'.

    Returns:
        One "label<TAB>value" line per metric, followed by a blank line and
        the confusion matrix.
    """
    # (label, key) pairs in display order; labels include their tab padding.
    lignes = (
        ("Accuracy : \t", 'acc'),
        ("Precision : \t", 'precision'),
        ("Recall : \t\t", 'recall'),
        ("F1 : \t\t", 'f1'),
        ("F0.5 : \t\t", 'f05'),
        ("F2 : \t\t", 'f2'),
        ("Average precision: \t", 'average_precision'),
        ("ROC AUC score: \t\t", 'roc_auc'),
    )
    corps = "".join(etiquette + str(res[cle]) + "\n" for etiquette, cle in lignes)
    return corps + "\nConfusion Matrix : \n" + str(res['confusion_matrix']) + "\n"

### Enregistrer les résultats

In [25]:
def write_results_to_disk(model, results):
    """Interactively save a model description and its metrics to a report.

    The user is asked whether to save. If so, they are asked for a
    description of the preprocessing and for free comments. The report is
    written to ./results_report/report<N>.txt, where N is one more than the
    highest existing report number (1 when there is none).

    Args:
        model: object exposing a `steps` sequence; each step is written on
            its own line with whitespace collapsed.
        results: metrics dictionary accepted by prettify_output_results();
            its 'acc' entry is printed as a percentage.
    """
    import os

    choix = input("Voulez vous sauvegarder les résultats ? (y/n) ")
    if 'y' not in choix:
        print("Pas de sauvegarde")
        return

    dossier = './results_report/'
    # A missing directory used to make os.listdir() fail.
    os.makedirs(dossier, exist_ok=True)

    # Only files named report<N>[.ext] count. Any other file (e.g. a hidden
    # system file) used to crash the int() conversion.
    numeros = []
    for nom_fichier in os.listdir(dossier):
        correspondance = re.fullmatch(r"report(\d+)(\..*)?", nom_fichier)
        if correspondance:
            numeros.append(int(correspondance.group(1)))
    # default=0 covers the empty directory, where max() used to raise.
    count = max(numeros, default=0)

    model_txt = "\n".join([" ".join(re.split(r"\s+", str(i))) for i in model.steps])
    pretraitements = input("Quels sont les prétraitements sur les données?")
    remarques = input("Avez vous des commentaires ?")
    count += 1
    filename = dossier + "report" + str(count) + ".txt"
    # Explicit UTF-8 because the report holds accented text; `with` closes
    # the file even if a write fails.
    with open(filename, "w", encoding="utf-8") as report:
        report.write("Prétraitements: " + pretraitements + "\n\n")
        report.write("Notes : "+ remarques + "\n\n")
        report.write(model_txt+"\n\n")
        report.write(prettify_output_results(results))
    # report_results() already stores 'acc' as a percentage, so it must not
    # be multiplied by 100 again here.
    print("Report created as report" + str(count) + ".txt for model with " + str(results["acc"]) + "% accuracy")