# Import des librairies et des données nettoyées

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
data = pd.read_csv('clean_data.csv', index_col=[0])

In [3]:
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount,Code,TitleLength,FirstTag,FullPost
0,directory listing,scan directory folder file need cross platform,c file directory cross-platform common-tasks,12489,65,135968,21,9,,2,c,directory listing scan directory folder file n...
1,return datatables wcf dotnet,wcf service want return datatable know highly ...,c# .net wcf web-services datatable,12702,51,56176,12,8,datacontract public datatable gettbl ...,4,c#,return datatables wcf dotnet wcf service want ...
2,difference struct class dotnet,difference struct class dotnet,.net class struct value-type reference-type,13049,830,453890,296,19,,4,.net,difference struct class dotnet difference stru...
3,mysqli pdo pro,place split use mysqli pdo stuff prepared stat...,php mysql pdo mysqli database-abstraction,13569,342,143841,284,13,,3,php,mysqli pdo pro place split use mysqli pdo stuf...
4,scroll overflow divs javascript,div use overflow auto content inside div resiz...,javascript jquery ajax html scroll,13362,49,67437,10,6,thediv scrolltop thediv scrollhe...,4,javascript,scroll overflow divs javascript div use overfl...


In [4]:
data.drop(['FullPost', 'FirstTag', 'TitleLength'], axis=1, inplace=True)

In [5]:
def replace_nan_with_empty_string(text):
    if type(text) != str:
        if math.isnan(text):
            return ' '
    else:
        return text

In [6]:
data['Title'] = data['Title'].apply(replace_nan_with_empty_string)
data['Body'] = data['Body'].apply(replace_nan_with_empty_string)
data['Code'] = data['Code'].apply(replace_nan_with_empty_string)

In [7]:
data['FullPost'] = data['Title'] + data['Body'] + data['Code']

In [8]:
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount,Code,FullPost
0,directory listing,scan directory folder file need cross platform,c file directory cross-platform common-tasks,12489,65,135968,21,9,,directory listingscan directory folder file ne...
1,return datatables wcf dotnet,wcf service want return datatable know highly ...,c# .net wcf web-services datatable,12702,51,56176,12,8,datacontract public datatable gettbl ...,return datatables wcf dotnetwcf service want r...
2,difference struct class dotnet,difference struct class dotnet,.net class struct value-type reference-type,13049,830,453890,296,19,,difference struct class dotnetdifference struc...
3,mysqli pdo pro,place split use mysqli pdo stuff prepared stat...,php mysql pdo mysqli database-abstraction,13569,342,143841,284,13,,mysqli pdo proplace split use mysqli pdo stuff...
4,scroll overflow divs javascript,div use overflow auto content inside div resiz...,javascript jquery ajax html scroll,13362,49,67437,10,6,thediv scrolltop thediv scrollhe...,scroll overflow divs javascriptdiv use overflo...


# Approche non supervisée : Latent Dirichlet Allocation

In [10]:
vectorizer = CountVectorizer()
vectorized_text = vectorizer.fit_transform(data['FullPost'])
title_feature_names = vectorizer.get_feature_names()



In [None]:
lda_model = LatentDirichletAllocation(n_components=25,
                                      max_iter=5,
                                      learning_method='online',
                                      learning_offset=50.,
                                      n_jobs=4,
                                      random_state=0).fit(vectorized_text)

In [None]:
wordcloud = WordCloud(mode="RGBA", background_color=None, max_words=50)
fig = plt.figure(figsize=(30, 30))
fig.subplots_adjust(hspace=0.01, wspace=0.1)
for k in range(0, 25):    
    freq={}
    for i,j in enumerate(lda_title.components_[k]):
        freq[title_feature_names[i]]=j

    wordcloud.generate_from_frequencies(freq)
    sp=331+k
    plt.subplot(sp)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")

In [None]:
#CountVectorizer ?

In [None]:
title_vectorizer= CountVectorizer()
title_CV=title_vectorizer.fit_transform(data['Title'])
title_feature_names=title_vectorizer.get_feature_names()

In [None]:
no_dummytags = 100

lda_title = LatentDirichletAllocation(n_components=no_dummytags, max_iter=5, learning_method='online', learning_offset=50., n_jobs=4,random_state=0).fit(title_CV)

In [None]:
wordcloud = WordCloud(mode="RGBA", background_color=None, max_words=50)
fig = plt.figure(figsize=(30, 30))
fig.subplots_adjust(hspace=0.01, wspace=0.1)
for k,topic in enumerate([6,10,21,23,24,28,39,51,58]):    
    freq={}
    for i,j in enumerate(lda_title.components_[topic]):
        freq[title_feature_names[i]]=j

    wordcloud.generate_from_frequencies(freq)
    sp=331+k
    plt.subplot(sp)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")

In [None]:
# Sort the values in the TFIDF matrix in descending order
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
  
# Extract the top n words from each topic  
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    return results

# Build a dictionary of topics' main features
        
def list_topics(model, feature_names, no_top_words):
    topic_dic={}
    for idx, topic in enumerate(model.components_):
        topic_dic[idx]=" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
    return topic_dic

In [None]:
title_vectorizer= TfidfVectorizer(sublinear_tf=True, min_df=0.001)

title_tfidf= title_vectorizer.fit_transform(data['Title'])

feature_names=title_vectorizer.get_feature_names()

title_topics= list_topics(lda_title, title_feature_names, 150)

title_tags=set()

for i in range(len(title_topics)):
  
    tf_idf_vector=title_vectorizer.transform([title_topics[i]])

    sorted_items=sort_coo(tf_idf_vector.tocoo())

    title_tags=title_tags.union(set(k for k in extract_topn_from_vector(feature_names,sorted_items,20)))

tag_vectorizer= CountVectorizer()
tag_CV=tag_vectorizer.fit_transform(data['Tags'])
tag_names=tag_vectorizer.get_feature_names()
    
    
print("\nNumber of Extracted Tags:")
print(len(title_tags))

print("\nTags found in both the extracted tags and the tag column of the dataset:")

cross_tags= [tag for tag in tag_names if tag in title_tags]
print(len(cross_tags))
print(sorted(cross_tags))

# Approche supervisée

# Preprocessing

In [None]:
#Utilisation CountVectorizer ?

In [None]:
tag_vectorizer= CountVectorizer()
tag_CV=tag_vectorizer.fit_transform(data['Tags'])
tag_names=tag_vectorizer.get_feature_names()

# Sum up the counts of each vocabulary word
tag_CV=tag_CV.toarray()
dist = np.sum(tag_CV, axis=0)
sorted_counts=dist.argsort()
tags={}
for i in sorted_counts:
  tags[tag_names[i]]=dist[i]
tags_df=pd.DataFrame(columns=['Tag', 'Count'])
tags_df['Tag']=list(tags.keys())
tags_df['Count']=list(tags.values())
tags_df.sort_values(by=['Count'],ascending=False,inplace=True)
tags_df.shape

tags_df[0:20].plot.bar(x='Tag',y='Count',rot=60,figsize=(15,10))
print("\n------------------Top 20 tags:--------------------------")

In [None]:
def wrong_label_counter(y, y_pred):
  try:
    y=y.values
  except:
    pass
  diff = y - y_pred
  diff[diff==1]=0
  diff=np.abs(diff)
  diff=np.sum(diff,axis=1)
  size=y.shape[0]*y.shape[1]-np.count_nonzero(y)
  return 100*np.sum(diff)/size

def missed_label_counter(y, y_pred):
  try:
    y=y.values
  except:
    pass
  diff = y - y_pred
  diff[diff==-1]=0
  diff=np.abs(diff)
  diff=np.sum(diff,axis=1)
  size=np.count_nonzero(y)
  return 100*np.sum(diff)/size

In [None]:
#Utiliser MLB plutôt que get_dummies (inverse_transformer)

In [None]:
tag_set=set(tags_df.Tag[tags_df['Count']>200])

def select_tags(text):
    text=text.split()
    
    res = [t for t in text if t in tag_set]
    
    if res == []:
        return text
    else:
        return res

data['main_tags']=data['Tags'].copy(deep=True)

data['main_tags']=data['main_tags'].apply(select_tags)

data['main_tags']=data['main_tags'].apply(body_join)

In [None]:
data['main_tags']=data['main_tags'].apply(lambda x: x.split(' '))

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

Y = mlb.fit_transform(data['main_tags'])

In [None]:
data.main_tags

In [None]:
from sklearn.model_selection import train_test_split

data['full_text']=data['Title']+' '+data['Body']+' '+data['Code']

X_train, X_test, y_train, y_test= train_test_split(data, Y,test_size=0.2)

In [None]:
X_train.drop(['Title', 'Body', 'Tags', 'Id', 'Score', 'ViewCount', 'FavoriteCount', 'AnswerCount', 'Code', 'main_tags'], axis=1, inplace=True)

In [None]:
X_train

In [None]:
y_train

In [None]:
#multi-label binarizer sklearn (fit_transform)

In [None]:
X_train

In [None]:
X_train = X_train['full_text'].values

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

In [None]:
X_train['full_text'] = vectorizer.fit_transform(X_train['full_text'])

In [None]:
from lightgbm import LGBMClassifier

In [None]:
X_train.shape

In [None]:
y_train

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [None]:
LGBM_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('lgbm', OneVsRestClassifier(LGBMClassifier(), n_jobs=1)),
            ])

In [None]:
LGBM_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('lgbm', OneVsRestClassifier(LGBMClassifier(), n_jobs=1)),
            ])
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

In [None]:
labels = mlb.inverse_transform(y_train)

In [None]:
l_labels = []
for labels in labels:
    for w in labels:
        if w not in l_labels:
            l_labels.append(w)
        

In [None]:
lgbm = OneVsRestClassifier(LGBMClassifier(), n_jobs=1)

In [None]:
for category in l_labels:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LGBM_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = LGBM_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

In [None]:
X_train = X_train.values

In [None]:
y_train

In [None]:
#light GBM

In [None]:
X_test['full_text'] = vectorizer.transform(X_test['full_text'])

In [None]:
X_test.drop(['Title', 'Body', 'Tags', 'Id', 'Score', 'ViewCount', 'FavoriteCount', 'AnswerCount', 'Code', 'main_tags'], axis=1, inplace=True)

In [None]:
X_test.head()

In [None]:
#utiliser inverse_transform sur les prédictions

# Bag of words

In [None]:
#TF IDF pour vectoriser data['Full_text'] et data['main_tags'] puis LightGBM/RFC

In [None]:
# Word2VEC

In [None]:
#1 versus Rest pour sélection du modèle supervisé

# Sentence embedding

In [None]:
#sentence transformer library python

### BERT

In [None]:
import tensorflow_hub as hub

module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
#module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"

bert_layer = hub.KerasLayer(module_url, trainable=True)

### USE

### GLOVE

# Evaluation 

In [None]:
#Méthodes d'évaluation et métriques --> Score jaccard car multi_label

In [None]:
#Enregistrement de la pipeline de transformation et du modèle pour réemployer dans le code final à déployer

# Démonstration avec API

In [None]:
#Stream lite ou Gradio