In [43]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import matplotlib.pyplot as plt
from datetime import date
today = date.today()
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sklearn
from wordcloud import WordCloud
from nltk.stem import PorterStemmer
import string
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import pyLDAvis
import pyLDAvis.sklearn
import gensim
from itertools import combinations
from sklearn.metrics import accuracy_score, f1_score, hamming_loss,jaccard_score

In [5]:
#Lecture du Fichier
df = pd.read_csv('posts_with_tags_more_frequent.csv', sep = ',', encoding='UTF-8')

In [44]:
#Fonction de preprocessing
stop_words = set(stopwords.words('English'))
def removeStopWord(Word_list):
    filtered_Word_list = Word_list[:] #make a copy of the Word_list
    for Word in Word_list: # iterate over Word_list
        if Word.lower() in stop_words: 
            filtered_Word_list.remove(Word) # remove Word from filtered_Word_list if it is a stopword
    return filtered_Word_list

# Instantiate stemmers
porter = PorterStemmer()
#Fonction de lemmatisation
def lemmatisation(Word_list):
    Words = Word_list[:] #make a copy of the Word_list
    Words = [porter.stem(word) for word in Words]
    return Words

#Fonction qui supprime le tag si celui ci n'appartient au TOP        
def removeNotTop100(Word_list):
    filtered_Word_list = Word_list[:] #make a copy of the Word_list
    for Word in Word_list: # iterate over Word_list
        if Word not in Top100: 
            filtered_Word_list.remove(Word) # remove Word from filtered_Word_list if it is a stopword
    if len(filtered_Word_list) == 0:
        return None
    else:
        return filtered_Word_list
    
#Fonction qui supprime le mot si seulement du numérique        
def removeOnlyNumeric(Word_list):
    word_list = Word_list[:] #make a copy of the Word_list
    for Word in Word_list: # iterate over Word_list
        if Word.isnumeric(): 
            word_list.remove(Word) # remove Word from filtered_Word_list if it is a stopword
    return word_list
    
#Fonction qui traite le C#
def processCSharp(Word_list):
    word_list = Word_list[:] #make a copy of the Word_list
    for index, value in enumerate(word_list):
        if value == '#':
            word_list.remove(value) # remove Word from filtered_Word_list if it is a stopword
            word_list[index-1] = 'c#'
    return word_list

# Fonction qui ne fait rien pour exploiter le pré traitement que nous avons réalisé
def dummy(doc):
    return doc

#Fonction qui renvoi différents scores
def print_score(y_test, y_pred):
    print("Hamming loss : {}".format(hamming_loss(y_test, y_pred)))
    print('Subset Accuracy : ', accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    print('F1-score : ', f1_score(y_test, y_pred, average='micro'))
    print('Jaccard : ', jaccard_score(y_test, y_pred, average='micro'))
    #Revoir les indicateurs avec Jacard

In [8]:
#Préprocessing du champ Tittle et Body
df['TitleBody'] = df['TitleBody'].map(lambda x: BeautifulSoup(x, "html.parser").get_text())
df['TitleBody'] = df['TitleBody'].map(lambda x: x.replace('\n', ' '))
df['TitleBody'] = df['TitleBody'].map(lambda x: x.replace(':', ''))

punct = string.punctuation
for c in punct:
    if c != '#':
        df['TitleBody'] = df['TitleBody'].map(lambda x: x.replace(c, ''))

#Création des nuages de mots avec NLTK
df['TitleBody_final'] = df.apply(lambda row: nltk.word_tokenize(row['TitleBody'],language='english'), axis=1)

#Supression des StopWord
df['TitleBody_final'] = df.apply(lambda row:removeStopWord(row['TitleBody_final']), axis=1)

#Lemmatisation
#df['TitleBody_final'] = df.apply(lambda row:lemmatisation(row['TitleBody_final']), axis=1)

#Traitement du C#
df['TitleBody_final'] = df.apply(lambda row:processCSharp(row['TitleBody_final']), axis=1)

#Suppressin des numériques
df['TitleBody_final'] = df.apply(lambda row:removeOnlyNumeric(row['TitleBody_final']), axis=1)

In [9]:
df.head(5)

Unnamed: 0,Id,BODY,Title,Tags,CreationDate,TitleBody,processed_tags_final,processed_body_final,processed_title_final,TitleBody_final
0,12051,<p>if i inherit from a base class and want to ...,calling the base constructor in c#,<c#><.net><inheritance><constructor>,2008-08-15 07:39:23,calling the base constructor in c# if i inheri...,"['c#', '.net']","['inherit', 'base', 'class', 'want', 'pass', '...","['calling', 'base', 'constructor', 'c#']","[calling, base, constructor, c#, inherit, base..."
1,17319422,<p>i am using pycharm on windows and want to c...,how do i set the maximum line length in pycharm?,<python><pycharm><pep8>,2013-06-26 12:00:31,how do i set the maximum line length in pychar...,['python'],"['using', 'pycharm', 'windows', 'want', 'chang...","['set', 'maximum', 'line', 'length', 'pycharm']","[set, maximum, line, length, pycharm, using, p..."
2,32664,<p>can anyone tell me if there is a way with g...,is there a constraint that restricts my generi...,<c#><generics><constraints>,2008-08-28 16:04:49,is there a constraint that restricts my generi...,"['c#', 'generics']","['anyone', 'tell', 'way', 'generics', 'limit',...","['constraint', 'restricts', 'generic', 'method...","[constraint, restricts, generic, method, numer..."
3,8763125,<p>i would like to get the keys of a javascrip...,get array of object's keys,<javascript><ecmascript-5>,2012-01-06 19:12:38,get array of objects keys i would like to get ...,['javascript'],"['would', 'like', 'get', 'keys', 'javascript',...","['get', 'array', 'objects', 'keys']","[get, array, objects, keys, would, like, get, ..."
4,33923,"<p>whilst starting to learn lisp, i've come ac...",what is tail recursion?,<algorithm><language-agnostic><functional-prog...,2008-08-29 03:48:03,what is tail recursion whilst starting to lear...,"['algorithm', 'language-agnostic']","['whilst', 'starting', 'learn', 'lisp', 'ive',...","['tail', 'recursion']","[tail, recursion, whilst, starting, learn, lis..."


In [36]:
#Utilisation de la librairie MultilabelBinarize pour créer un matrice de Tags
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
tags_mlb = mlb.fit_transform(df['processed_tags_final'])

## Création du jeu d'entrainement et du jeu de test 

In [12]:
from sklearn.model_selection import train_test_split

X = df['TitleBody_final']
y = tags_mlb

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=.25)

## Vectorisation avec TF-IDF

In [37]:
vectorizer = TfidfVectorizer(tokenizer=dummy,preprocessor=dummy,
                            max_features=12000)
vectorizer.fit(X_train)
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)



In [38]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

nb_clf = OneVsRestClassifier(MultinomialNB(fit_prior=True, 
                                           class_prior=None))
nb_clf.fit(X_train_vec, y_train)



OneVsRestClassifier(estimator=MultinomialNB())

In [45]:
y_pred_nb = nb_clf.predict(X_test_vec)
print_score(y_test, y_pred_nb)

Hamming loss : 0.13770031565971455
Subset Accuracy :  0.10203005536514632
F1-score :  0.7489444624878208
Jaccard :  0.59865005192108
