# Import des librairies et des données nettoyées

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
data = pd.read_csv('clean_data.csv', index_col=[0])

In [3]:
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount,Code,TitleLength,FirstTag,FullPost
0,directory listing,scan directory folder file need cross platform,c file directory cross-platform common-tasks,12489,65,135968,21,9,,2,c,directory listing scan directory folder file n...
1,return datatables wcf dotnet,wcf service want return datatable know highly ...,c# .net wcf web-services datatable,12702,51,56176,12,8,datacontract public datatable gettbl ...,4,c#,return datatables wcf dotnet wcf service want ...
2,difference struct class dotnet,difference struct class dotnet,.net class struct value-type reference-type,13049,830,453890,296,19,,4,.net,difference struct class dotnet difference stru...
3,mysqli pdo pro,place split use mysqli pdo stuff prepared stat...,php mysql pdo mysqli database-abstraction,13569,342,143841,284,13,,3,php,mysqli pdo pro place split use mysqli pdo stuf...
4,scroll overflow divs javascript,div use overflow auto content inside div resiz...,javascript jquery ajax html scroll,13362,49,67437,10,6,thediv scrolltop thediv scrollhe...,4,javascript,scroll overflow divs javascript div use overfl...


In [4]:
data.drop(['FirstTag', 'TitleLength'], axis=1, inplace=True)

In [5]:
def replace_nan_with_empty_string(text):
    """Return ``text`` unchanged unless it is a missing value.

    Strings are returned as-is. ``None`` and NaN are replaced by a single
    space (' '), which is the placeholder this function has always produced
    despite its name. Any other value is returned unchanged; the previous
    implementation silently turned such values into ``None`` and raised
    ``TypeError`` when given ``None``.
    """
    if isinstance(text, str):
        return text
    if text is None:
        return ' '
    try:
        if math.isnan(text):
            return ' '
    except TypeError:
        # Not a real number (list, dict, ...): there is nothing to replace.
        pass
    return text

In [6]:
# Replace missing values in each free-text column with the placeholder string.
for text_column in ('Title', 'Body', 'Code'):
    data[text_column] = data[text_column].apply(replace_nan_with_empty_string)

In [7]:
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount,Code,FullPost
0,directory listing,scan directory folder file need cross platform,c file directory cross-platform common-tasks,12489,65,135968,21,9,,directory listing scan directory folder file n...
1,return datatables wcf dotnet,wcf service want return datatable know highly ...,c# .net wcf web-services datatable,12702,51,56176,12,8,datacontract public datatable gettbl ...,return datatables wcf dotnet wcf service want ...
2,difference struct class dotnet,difference struct class dotnet,.net class struct value-type reference-type,13049,830,453890,296,19,,difference struct class dotnet difference stru...
3,mysqli pdo pro,place split use mysqli pdo stuff prepared stat...,php mysql pdo mysqli database-abstraction,13569,342,143841,284,13,,mysqli pdo pro place split use mysqli pdo stuf...
4,scroll overflow divs javascript,div use overflow auto content inside div resiz...,javascript jquery ajax html scroll,13362,49,67437,10,6,thediv scrolltop thediv scrollhe...,scroll overflow divs javascript div use overfl...


# Approche non supervisée : Latent Dirichlet Allocation

In [8]:
# Bag-of-words representation of the full posts; this matrix feeds the LDA model.
vectorizer = CountVectorizer()
vectorized_text = vectorizer.fit_transform(data['FullPost'])
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()



In [None]:
# Fit a 100-topic LDA model on the bag-of-words matrix, using the online
# learning method and a fixed seed.
lda_model = LatentDirichletAllocation(
    n_components=100,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    n_jobs=4,
    random_state=0,
)
lda_model.fit(vectorized_text)

In [None]:
#### REVOIR CETTE PARTIE --> UTILISATION DE COUNT_VECTORIZER ET TF-IDF ######

In [None]:
def sort_coo(coo_matrix):
    """Return the matrix entries as (column, value) pairs, largest value first.

    Entries sharing the same value are ordered by decreasing column index.
    """
    def by_value_then_column(pair):
        column, value = pair
        return value, column

    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=by_value_then_column, reverse=True)
    return pairs
  
# Extract the top n words from each topic  
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the `topn` leading items to ``{feature name: score}``.

    `sorted_items` holds (feature index, score) pairs that are already
    ordered best-first; only the first `topn` are kept and each score is
    rounded to 3 decimals.
    """
    return {
        feature_names[index]: round(score, 3)
        for index, score in sorted_items[:topn]
    }
        
def list_topics(model, feature_names, no_top_words):
    """Describe each topic of `model` by its highest-weighted words.

    Returns a dict mapping the topic index (row of ``model.components_``) to
    a space-separated string of the `no_top_words` features with the largest
    weights, heaviest first.
    """
    topic_dic = {}
    for topic_idx, weights in enumerate(model.components_):
        heaviest_first = weights.argsort()[::-1][:no_top_words]
        words = [feature_names[i] for i in heaviest_first]
        topic_dic[topic_idx] = " ".join(words)
    return topic_dic

In [None]:
# TF-IDF model of the posts: used below to weight the words of each topic so
# that the highest-scoring ones can be kept as candidate tags.
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=0.001)
tfidf_text = tfidf_vectorizer.fit_transform(data['FullPost'])
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# The LDA model was fitted on the CountVectorizer matrix, so its components
# are indexed by `feature_names` (not by the TF-IDF vocabulary).
topics = list_topics(lda_model, feature_names, 150)

tags = set()

for topic_words in topics.values():
    # Score the topic's words with the TF-IDF vectorizer. Scoring them with
    # the count vectorizer gives every word a count of 1, so the "top 20"
    # was decided by column index alone rather than by any weight.
    vector = tfidf_vectorizer.transform([topic_words])
    sorted_items = sort_coo(vector.tocoo())
    # Updating a set from a dict adds its keys, i.e. the feature names.
    tags.update(extract_topn_from_vector(tfidf_feature_names, sorted_items, 20))

In [None]:
# Vocabulary of the tokens appearing in the ground-truth `Tags` column.
tag_vectorizer = CountVectorizer()
tag_CV = tag_vectorizer.fit_transform(data['Tags'])
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
if hasattr(tag_vectorizer, 'get_feature_names_out'):
    tag_names = list(tag_vectorizer.get_feature_names_out())
else:
    tag_names = tag_vectorizer.get_feature_names()

In [None]:
# Report how many candidate tags were extracted from the topics.
print("\nNumber of Extracted Tags:", len(tags), sep="\n")

print("\nTags found in both the extracted tags and the tag column of the dataset:")

# Keep the vocabulary order of `tag_names` while filtering on membership.
cross_tags = [name for name in tag_names if name in tags]
print(len(cross_tags), sorted(cross_tags), sep="\n")

In [None]:
wordcloud = WordCloud(mode="RGBA", background_color=None, max_words=50)

# Size the grid from the fitted model instead of hard-coding 25 x 4 = 100
# cells, so that changing `n_components` can neither raise an IndexError nor
# silently leave topics out. With 100 topics the layout is unchanged.
n_topics = lda_model.components_.shape[0]
n_cols = 4
n_rows = math.ceil(n_topics / n_cols)

fig = plt.figure(figsize=(20, 4 * n_rows))
fig.subplots_adjust(hspace=0.1, wspace=0.001)

for topic, weights in enumerate(lda_model.components_):
    # One word cloud per topic, each word sized by its weight in the topic.
    freq = dict(zip(feature_names, weights))
    wordcloud.generate_from_frequencies(freq)
    # divmod gives the (row, column) cell of the topic in row-major order.
    plt.subplot2grid((n_rows, n_cols), divmod(topic, n_cols))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
plt.show()

### Test de suggestion de tags sur un Post pris au hasard

In [None]:
from random import randint

# Draw a random row position (both bounds inclusive) and display that post.
n_posts = len(data['FullPost'])
random_id = randint(0, n_posts - 1)
data.iloc[[random_id]]

In [None]:
# Topic scores of the selected post; keep the index of the highest one.
vectorized_post = vectorizer.transform(data['FullPost'].iloc[[random_id]])
array = lda_model.transform(vectorized_post)
best_topic = np.argmax(array)

# Weight of every vocabulary word in that topic.
word_freq = dict(zip(feature_names, lda_model.components_[best_topic]))

In [None]:
# Suggest the 5 highest-weighted words of the best topic as tags for the post.

words_by_weight = sorted(word_freq, key=word_freq.get, reverse=True)
suggested_tags = words_by_weight[:5]

suggested_tags

# Approche supervisée

# Preprocessing

Ici, le problème va être de construire une target exploitable à partir de nos tags. Quelle stratégie adopter ?

In [None]:
data.Tags.value_counts()

On a énormément de labels différents. On va essayer de réduire le nombre de labels en conservant les 3 tags les plus pertinents pour chaque post.

In [None]:
# Flatten the tags of every post into a single list. Joining with an explicit
# separator and splitting on any whitespace prevents the last tag of a post
# from being glued to the first tag of the next one, yields no empty-string
# "tag", and avoids quadratic string concatenation in a loop.
text = ' '.join(data['Tags'])

list_tags = text.split()

In [None]:
# Number of occurrences of each tag (value_counts orders them most frequent first).
count_tags = pd.Series(list_tags).value_counts()

# Same information as a two-column table: tag name and its count.
tags_df = pd.DataFrame({
    'Tag': list(count_tags.index),
    'Count': list(count_tags.values),
})

In [None]:
tags_df.head()

In [None]:
def select_tags(tags, tag_counts=None):
    """Return the (at most) 3 most frequent tags of a post, sorted alphabetically.

    Parameters
    ----------
    tags : str
        Whitespace-separated tags of a single post.
    tag_counts : mapping, optional
        Maps a tag to its number of occurrences in the corpus. When omitted,
        the lookup is built from the module-level ``tags_df`` table
        ('Tag' and 'Count' columns).

    Returns
    -------
    list of str
        Up to 3 tags with the highest counts, in alphabetical order.
    """
    if tag_counts is None:
        # One pass over the table instead of one DataFrame scan per tag.
        tag_counts = dict(zip(tags_df['Tag'], tags_df['Count']))

    # `split()` ignores leading, trailing and repeated whitespace, so the last
    # tag is kept whether or not the string ends with a space (the former
    # `split(' ')[:-1]` dropped a real tag when there was no trailing space).
    # `dict.fromkeys` removes duplicates while keeping first-seen order.
    unique_tags = list(dict.fromkeys(tags.split()))

    # Unknown tags count as 0 instead of raising; ties keep first-seen order
    # because the sort is stable.
    res = sorted(unique_tags, key=lambda tag: tag_counts.get(tag, 0), reverse=True)[:3]

    # Alphabetical order for readability.
    res.sort()

    return res
    

In [None]:
data['main_tags'] = data['Tags'].apply(select_tags)

In [None]:
data.head()

On va maintenant utiliser un MultiLabel Binarizer pour encoder notre variable cible.

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encoder for the multi-label target built from the `main_tags` lists.
mlb = MultiLabelBinarizer()

# Y is the encoded target matrix produced from the per-post tag lists.
Y = mlb.fit_transform(data['main_tags'])

In [None]:
Y.shape

In [None]:
# Use MultiLabelBinarizer rather than get_dummies (for its inverse_transform).

# Decode the binary matrix back into tag tuples.
res = mlb.inverse_transform(Y)

In [None]:
X = data['FullPost']

In [None]:
X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the posts for evaluation. The seed is fixed (as it is for
# the LDA model) so the split, and every score computed from it, is
# reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

In [None]:
X_train.head()

In [None]:
# Fit the TF-IDF vocabulary on the training posts only. This rebinds the
# `tfidf_vectorizer` name that the topic-modelling section also assigns.
tfidf_vectorizer = TfidfVectorizer()
preprocessed_X_train = tfidf_vectorizer.fit_transform(X_train)

In [None]:
y_train

In [None]:
preprocessed_X_train

In [None]:
# Work on the first 3000 training rows only.
y_sample = y_train[:3000]
X_sample = preprocessed_X_train[:3000]

In [None]:
#Multiclass multioutput --> Random forest classifier ?

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed (same value as the LDA model) so the fitted forest and the
# scores reported below are reproducible across runs.
rfc = RandomForestClassifier(random_state=0)

rfc.fit(X_sample, y_sample)

In [None]:
rfc.score(X_sample, y_sample)

In [None]:
preprocessed_X_test = tfidf_vectorizer.transform(X_test)

In [None]:
rfc.score(preprocessed_X_test, y_test)

In [None]:
### TO DO

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

In [None]:
"""
LGBM_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('lgbm', OneVsRestClassifier(LGBMClassifier(), n_jobs=1)),
            ])
"""

In [None]:
"""
LGBM_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('lgbm', OneVsRestClassifier(LGBMClassifier(), n_jobs=1)),
            ])
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
"""

In [None]:
#labels = mlb.inverse_transform(y_train)

In [None]:
#utiliser inverse_transform sur les prédictions

In [None]:
#TF-IDF pour vectoriser data['FullPost'] et data['main_tags'] puis LightGBM/RFC

In [None]:
# Word2VEC

In [None]:
#1 versus Rest pour sélection du modèle supervisé

# Sentence embedding

### En utilisant le texte après Feature Engineering

In [None]:
# Use the sentence-transformers library to embed text.

from sentence_transformers import util, SentenceTransformer

# Pretrained model, referenced by its identifier string.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [None]:
# Quick check of the encoder on two toy sentences.
sentences = ["This is an example sentence", "Each sentence is converted"]

# NOTE: `embeddings` is reassigned later when the full posts are encoded.
embeddings = model.encode(sentences)

In [None]:
# Encode every cleaned post; the list keeps the row order of `data`.
preprocessed_X = data['FullPost'].tolist()

embeddings = model.encode(preprocessed_X)

In [None]:
# Candidate tags: the first 3000 rows of the tag table, encoded with the same model.
top_words = tags_df['Tag'].head(3000).tolist()
word_embeddings = model.encode(top_words)

In [None]:
cosine_scores = util.cos_sim(embeddings, word_embeddings)

In [None]:
# For the first 10 posts, print the 5 candidate tags whose embeddings are the
# most similar to the post embedding, next to the post's original tags.
for i in range(10):
    print('POST :')
    print(' ')
    print(preprocessed_X[i])
    print(' ')
    print('SUGGESTED TAGS :')
    print(' ')

    # Sort the similarity row once; it was previously sorted twice per post
    # (once for the indices, once for the values).
    sorted_scores, sorted_indices = cosine_scores[i].sort(descending=True)
    best_tags = list(zip(sorted_indices[:5].tolist(), sorted_scores[:5].tolist()))

    for j, k in best_tags:
        print(top_words[j], k)

    print(' ')

    print('ORIGINAL TAGS :')
    print(' ')
    # Positional access: `preprocessed_X` is a plain list, so row i has to be
    # fetched by position rather than by index label.
    print(data['Tags'].iloc[i])

    print('""""""""""""""""""""')

### En utilisant le langage naturel

In [None]:
# On recharge les données initiales

In [None]:
raw_data = pd.read_csv('data/QueryV2.csv')

# Trim the first 3 and last 5 characters of *each* body with a vectorised
# `.str` slice (presumably the enclosing HTML paragraph markup; confirm
# against the raw export). The previous `raw_data['Body'][0][3:-5]` took the
# first row's body and appended it to every title.
raw_data['FullPost'] = raw_data['Title'] + ' ' + raw_data['Body'].str[3:-5]

In [None]:
raw_data.head()

In [None]:
data['Tags'][0]

In [None]:
# Build each post from its own title and body: `.str[3:-5]` slices every row,
# whereas `raw_data['Body'][0][3:-5]` reused the first row's body everywhere.
raw_data['FullPost'] = raw_data['Title'] + ' ' + raw_data['Body'].str[3:-5]

In [None]:
# Encode the raw posts with the same sentence-embedding model.
raw_X = raw_data['FullPost'].to_list()
raw_embeddings = model.encode(raw_X)

In [None]:
raw_cosine_scores = util.cos_sim(raw_embeddings, word_embeddings)

In [None]:
# Same report as for the cleaned text, this time on the raw posts.
for i in range(10):
    print('POST :')
    print(' ')
    print(raw_X[i])
    print(' ')
    print('SUGGESTED TAGS :')
    print(' ')

    # Sort the similarity row once; it was previously sorted twice per post
    # (once for the indices, once for the values).
    sorted_scores, sorted_indices = raw_cosine_scores[i].sort(descending=True)
    best_tags = list(zip(sorted_indices[:5].tolist(), sorted_scores[:5].tolist()))

    for j, k in best_tags:
        print(top_words[j], k)

    print(' ')

    print('ORIGINAL TAGS :')
    print(' ')
    # Positional access rather than label lookup, since `raw_X` is a list.
    # NOTE(review): the tags come from the cleaned `data` frame; this assumes
    # its rows are in the same order as `raw_data` - confirm.
    print(data['Tags'].iloc[i])

    print('""""""""""""""""""""')

In [None]:
### TO DO : TRY DIFFERENT MODELS ?

### BERT

In [None]:
raw_data['Body'][0][3:-5]

### USE

### GLOVE

# Evaluation 

In [None]:
#Méthodes d'évaluation et métriques --> Score jaccard car multi_label

In [None]:
#Enregistrement de la pipeline de transformation et du modèle pour réemployer dans le code final à déployer

# Démonstration avec API

In [None]:
#Streamlit ou Gradio