In [48]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import matplotlib.pyplot as plt
from datetime import date
today = date.today()
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [25]:
#Lecture du Fichier
df = pd.read_csv('QueryResults.csv', sep = ',', encoding='UTF-8')

#Lecture du Fichier de tags
df_tags1 = pd.read_csv('Tags1.csv', sep = ',', encoding='UTF-8')
df_tags2 = pd.read_csv('Tags2.csv', sep = ',', encoding='UTF-8')

df_tags_full = pd.concat([df_tags1, df_tags2], axis=0)



In [26]:
df_tags_full.shape

(62193, 1)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16413 entries, 0 to 16412
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            16413 non-null  int64 
 1   BODY          16413 non-null  object
 2   Title         16413 non-null  object
 3   Tags          16413 non-null  object
 4   CreationDate  16413 non-null  object
dtypes: int64(1), object(4)
memory usage: 641.3+ KB


In [17]:
df.isna().mean()

Id              0.0
BODY            0.0
Title           0.0
Tags            0.0
CreationDate    0.0
dtype: float64

In [17]:
#Téléchargement des packages nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Traitement du champs Tags
* On détermine les 100 Tags les plus utlisés
* On supprime tous les Tags qui ne font pas partis de ce TOP 100
* Supression des lignes Tags

In [102]:
#Expression régulière pour prendre uniquement les valeurs entre <>
tokenizer = RegexpTokenizer('(?<=\<).*?(?=\>)')

#Création de la colonne avec ces valeurs
df['tags_words'] = df.apply(lambda row: tokenizer.tokenize(row['Tags']), axis=1)

In [230]:
#Comptage de l'utlisation des mots
Top_Word = nltk.FreqDist()
Top100 = []
for x in df['tags_words']:
    Top_Word += nltk.FreqDist(x)

for i in Top_Word.most_common()[0:100]:
    Top100.append(i[0])

In [242]:
#Fonction qui supprime le tag si celui ci n'appartient au TOP        
def removeNotTop100(Word_list):
    filtered_Word_list = Word_list[:] #make a copy of the Word_list
    for Word in Word_list: # iterate over Word_list
        if Word not in Top100: 
            filtered_Word_list.remove(Word) # remove Word from filtered_Word_list if it is a stopword
    if len(filtered_Word_list) == 0:
        return None
    else:
        return filtered_Word_list

In [244]:
#Suppression des Tags qui ne sont pas dans le TOP
df['processed_tags_final'] = df.apply(lambda row:removeNotTop100(row['tags_words']), axis=1)

In [254]:
#Suppression des lignes sans Tags
df = df.dropna()

# Préprocesing du champs BODY
* Utilisation de BEAUTIFULSOUP pour le traitement HTML
* Puis remplacement des retours chariots et des :
* Création du nuage de mots
* Supression des StopWord Anglais

In [49]:
#Préprocessing du champ BODY
df['processed_body'] = df['BODY'].map(lambda x: BeautifulSoup(x, "html.parser").get_text())
df['processed_body'] = df['processed_body'].map(lambda x: x.replace('\n', ' '))
df['processed_body'] = df['processed_body'].map(lambda x: x.replace(':', ''))

#Création des nuages de mots avec NLTK
df['processed_body2'] = df.apply(lambda row: nltk.word_tokenize(row['processed_body'],language='english'), axis=1)

#Supression des StopWord
df['processed_body_final'] = df.apply(lambda row:removeStopWord(row['processed_body2']), axis=1)

In [50]:
df.head(2)

Unnamed: 0,Id,BODY,Title,Tags,CreationDate,processed_body,processed_body2,processed_body_final
0,12051,<p>If I inherit from a base class and want to ...,Calling the base constructor in C#,<c#><.net><inheritance><constructor>,2008-08-15 07:39:23,If I inherit from a base class and want to pas...,"[If, I, inherit, from, a, base, class, and, wa...","[inherit, base, class, want, pass, something, ..."
1,17319422,<p>I am using PyCharm on Windows and want to c...,How do I set the maximum line length in PyCharm?,<python><pycharm><pep8>,2013-06-26 12:00:31,I am using PyCharm on Windows and want to chan...,"[I, am, using, PyCharm, on, Windows, and, want...","[using, PyCharm, Windows, want, change, settin..."


# Fonction de suppression des StopWords Anglais

In [31]:
stop_words = set(stopwords.words('English'))
def removeStopWord(Word_list):
    filtered_Word_list = Word_list[:] #make a copy of the Word_list
    for Word in Word_list: # iterate over Word_list
        if Word.lower() in stop_words: 
            filtered_Word_list.remove(Word) # remove Word from filtered_Word_list if it is a stopword
    return filtered_Word_list

# Préprocesing du champs Titre

* Remplacement des retours chariots et des :
* Création du nuage de mots
* Supression des StopWord Anglais

In [245]:
#Préprocessing du champ Title
df['processed_title'] = df['Title'].map(lambda x: x.replace('\n', ' '))
df['processed_title'] = df['processed_title'].map(lambda x: x.replace(':', ''))

#Création des nuages de mots avec NLTK
df['processed_title2'] = df.apply(lambda row: nltk.word_tokenize(row['processed_title'],language='english'), axis=1)

#Supression des StopWord
df['processed_title_final'] = df.apply(lambda row:removeStopWord(row['processed_title2']), axis=1)

In [248]:
#Suppression des colonnes inutiles
df.drop({'processed_body','processed_body2','tags_words','processed_title','processed_title2'}, axis=1, inplace=True)

In [295]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in text]

In [296]:
from nltk.stem.wordnet import WordNetLemmatizer
#lemmatisation
df['processed_title_final2'] = df.apply(lambda row:lemmatize_text(row['processed_title_final']), axis=1)


In [297]:
df.head(10)

Unnamed: 0,Id,BODY,Title,Tags,CreationDate,processed_body_final,processed_tags_final,processed_title_final,processed_title_final2,processed_body_final2
0,12051,<p>If I inherit from a base class and want to ...,Calling the base constructor in C#,<c#><.net><inheritance><constructor>,2008-08-15 07:39:23,"[inherit, base, class, want, pass, something, ...","[c#, .net]","[Calling, base, constructor, C, #]","[Calling, base, constructor, C, #]",<WordNetLemmatizer>
1,17319422,<p>I am using PyCharm on Windows and want to c...,How do I set the maximum line length in PyCharm?,<python><pycharm><pep8>,2013-06-26 12:00:31,"[using, PyCharm, Windows, want, change, settin...",[python],"[set, maximum, line, length, PyCharm, ?]","[set, maximum, line, length, PyCharm, ?]",<WordNetLemmatizer>
2,32664,<p>Can anyone tell me if there is a way with g...,Is there a constraint that restricts my generi...,<c#><generics><constraints>,2008-08-28 16:04:49,"[anyone, tell, way, generics, limit, generic, ...","[c#, generics]","[constraint, restricts, generic, method, numer...","[constraint, restricts, generic, method, numer...",<WordNetLemmatizer>
3,8763125,<p>I would like to get the keys of a JavaScrip...,Get array of object's keys,<javascript><ecmascript-5>,2012-01-06 19:12:38,"[would, like, get, keys, JavaScript, object, a...",[javascript],"[Get, array, object, 's, keys]","[Get, array, object, 's, key]",<WordNetLemmatizer>
5,33923,"<p>Whilst starting to learn lisp, I've come ac...",What is tail recursion?,<algorithm><language-agnostic><functional-prog...,2008-08-29 03:48:03,"[Whilst, starting, learn, lisp, ,, 've, come, ...","[algorithm, language-agnostic]","[tail, recursion, ?]","[tail, recursion, ?]",<WordNetLemmatizer>
6,33969,<p>We're experimenting with various ways to th...,Best way to implement request throttling in AS...,<asp.net-mvc><throttling>,2008-08-29 04:50:50,"['re, experimenting, various, ways, throttle, ...",[asp.net-mvc],"[Best, way, implement, request, throttling, AS...","[Best, way, implement, request, throttling, AS...",<WordNetLemmatizer>
7,33978,<p>How would you go about finding out how much...,Find out how much memory is being used by an o...,<python><performance><memory-profiling>,2008-08-29 04:59:31,"[would, go, finding, much, memory, used, objec...","[python, performance]","[Find, much, memory, used, object, Python]","[Find, much, memory, used, object, Python]",<WordNetLemmatizer>
8,33207,<p>What frameworks exist to unit test Objectiv...,What is the best way to unit test Objective-C ...,<objective-c><cocoa><unit-testing><xcode>,2008-08-28 19:41:30,"[frameworks, exist, unit, test, Objective-C, c...","[objective-c, unit-testing, xcode]","[best, way, unit, test, Objective-C, code, ?]","[best, way, unit, test, Objective-C, code, ?]",<WordNetLemmatizer>
9,32899,<p>I have some kind of test data and want to c...,How do you generate dynamic (parameterized) un...,<python><unit-testing><parameterized-unit-test>,2008-08-28 17:49:02,"[kind, test, data, want, create, unit, test, i...","[python, unit-testing]","[generate, dynamic, (, parameterized, ), unit,...","[generate, dynamic, (, parameterized, ), unit,...",<WordNetLemmatizer>
10,33746,<p>At work we are being asked to create XML fi...,XML attribute vs XML element,<xml><xsd>,2008-08-29 01:15:52,"[work, asked, create, XML, files, pass, data, ...",[xml],"[XML, attribute, vs, XML, element]","[XML, attribute, v, XML, element]",<WordNetLemmatizer>


# Création du jeu de test et d'entrainement

In [256]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df,test_size=0.2,random_state=42)

In [280]:
from sklearn.datasets import make_multilabel_classification
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score




mlp1 = MLPClassifier(hidden_layer_sizes=(200, ), max_iter=200, random_state=1)
mlp1.fit(df_train.loc[:,{'processed_title_final','processed_body_final'}].to_numpy(), df_train.loc[:,'processed_tags_final'].to_numpy())
y_pred1 = mlp1.predict(df_test.loc['processed_title_final','processed_body_final'].to_numpy())

#mlp2 = MLPClassifier(hidden_layer_sizes=(200, ), max_iter=200, random_state=1)
#mlp2.partial_fit(df_train.loc[:,{'processed_title_final','processed_body_final'}],df_train.loc[:,'processed_tags_final'], classes=list(range(df_train.loc[:,'processed_tags_final'].shape[1])))
#y_pred2 = mlp2.predict(x_test)

print('Using `fit`:')
print(accuracy_score(y_test, y_pred1))

#print('Using `partial_fit`:')
#print(accuracy_score(y_test, y_pred2))

ValueError: setting an array element with a sequence.

In [268]:
df_train.loc[:,'processed_tags_final'].shape[1]

IndexError: tuple index out of range

In [270]:
X, y = make_multilabel_classification(n_samples=1000, n_features=10,
                                      n_classes=5, n_labels=2,
                                      allow_unlabeled=False, random_state=1)

In [275]:
X

array([[ 2.,  6.,  2., ...,  9.,  4., 13.],
       [ 4.,  8.,  6., ...,  8.,  2.,  3.],
       [ 4.,  3.,  2., ...,  2.,  5., 11.],
       ...,
       [ 1.,  6.,  5., ..., 10.,  8., 10.],
       [ 3.,  5.,  3., ...,  3.,  3.,  5.],
       [ 7.,  6.,  1., ...,  8.,  5.,  3.]])