# Import des librairies et des données nettoyées

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('data/clean_data.csv', index_col=[0])

In [3]:
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount,Code,TitleLength,FirstTag,FullPost
0,directory listing,scan directory folder file need cross platform,c file directory cross-platform common-tasks,12489,65,135968,21,9,,2,c,directory listing scan directory folder file n...
1,return datatables wcf dotnet,wcf service want return datatable know highly ...,c# .net wcf web-services datatable,12702,51,56176,12,8,datacontract public datatable gettbl ...,4,c#,return datatables wcf dotnet wcf service want ...
2,difference struct class dotnet,difference struct class dotnet,.net class struct value-type reference-type,13049,830,453890,296,19,,4,.net,difference struct class dotnet difference stru...
3,mysqli pdo pro,place split use mysqli pdo stuff prepared stat...,php mysql pdo mysqli database-abstraction,13569,342,143841,284,13,,3,php,mysqli pdo pro place split use mysqli pdo stuf...
4,scroll overflow divs javascript,div use overflow auto content inside div resiz...,javascript jquery ajax html scroll,13362,49,67437,10,6,thediv scrolltop thediv scrollhe...,4,javascript,scroll overflow divs javascript div use overfl...


In [4]:
data.drop(['FirstTag', 'TitleLength'], axis=1, inplace=True)

In [5]:
def replace_nan_with_empty_string(text):
    if type(text) != str:
        if math.isnan(text):
            return ' '
    else:
        return text

In [6]:
data['Title'] = data['Title'].apply(replace_nan_with_empty_string)
data['Body'] = data['Body'].apply(replace_nan_with_empty_string)
data['Code'] = data['Code'].apply(replace_nan_with_empty_string)

In [7]:
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount,Code,FullPost
0,directory listing,scan directory folder file need cross platform,c file directory cross-platform common-tasks,12489,65,135968,21,9,,directory listing scan directory folder file n...
1,return datatables wcf dotnet,wcf service want return datatable know highly ...,c# .net wcf web-services datatable,12702,51,56176,12,8,datacontract public datatable gettbl ...,return datatables wcf dotnet wcf service want ...
2,difference struct class dotnet,difference struct class dotnet,.net class struct value-type reference-type,13049,830,453890,296,19,,difference struct class dotnet difference stru...
3,mysqli pdo pro,place split use mysqli pdo stuff prepared stat...,php mysql pdo mysqli database-abstraction,13569,342,143841,284,13,,mysqli pdo pro place split use mysqli pdo stuf...
4,scroll overflow divs javascript,div use overflow auto content inside div resiz...,javascript jquery ajax html scroll,13362,49,67437,10,6,thediv scrolltop thediv scrollhe...,scroll overflow divs javascript div use overfl...


# Approche non supervisée : Latent Dirichlet Allocation

In [None]:
vectorizer = CountVectorizer()
vectorized_text = vectorizer.fit_transform(data['FullPost'])
feature_names = vectorizer.get_feature_names()

In [None]:
lda_model = LatentDirichletAllocation(n_components=100,
                                      max_iter=5,
                                      learning_method='online',
                                      learning_offset=50.,
                                      n_jobs=4,
                                      random_state=0).fit(vectorized_text)

In [None]:
def sort_coo(coo_matrix):
    """Sort the values in the TFIDF matrix in descending order"""
    tuples = zip(coo_matrix.col, coo_matrix.data) # FULL POST ?????????????
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
  
# Extract the top n words from each topic  
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    return results
        
def list_topics(model, feature_names, no_top_words):
    """Build a dictionary of topics' main features"""
    topic_dic={}
    for idx, topic in enumerate(model.components_):
        topic_dic[idx]=" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]])
    return topic_dic

In [None]:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=0.001)
tfidf_text = tfidf_vectorizer.fit_transform(data['FullPost'])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
topics = list_topics(lda_model, feature_names, 150)

tags = set()

for i in range(len(topics)):
    vector=vectorizer.transform([topics[i]])
    sorted_items=sort_coo(vector.tocoo())
    tags = tags.union(set(k for k in extract_topn_from_vector(feature_names,sorted_items,20)))

In [None]:
tag_vectorizer= CountVectorizer()
tag_CV=tag_vectorizer.fit_transform(data['Tags'])
tag_names=tag_vectorizer.get_feature_names()

In [None]:
print("\nNumber of Extracted Tags:")
print(len(tags))

print("\nTags found in both the extracted tags and the tag column of the dataset:")

cross_tags= [tag for tag in tag_names if tag in tags]
print(len(cross_tags))
print(sorted(cross_tags))

In [None]:
wordcloud = WordCloud(mode="RGBA", background_color=None, max_words=50)

fig = plt.figure(figsize=(20,100))
fig.subplots_adjust(hspace=0.1, wspace=0.001)

topic = 0
for i in range(25):
    for j in range(4):
        freq={}
        for k, l in enumerate(lda_model.components_[topic]):
            freq[feature_names[k]]=l
        wordcloud.generate_from_frequencies(freq)
        plt.subplot2grid((25,4), (i,j))
        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        topic+=1
plt.show()

### Test de suggestion de tags sur un Post pris au hasard

In [None]:
from random import randint

random_id = randint(0, data['FullPost'].shape[0] - 1)
data.iloc[[random_id]]

In [None]:
vectorized_post = vectorizer.transform(data['FullPost'].iloc[[random_id]])

array = lda_model.transform(vectorized_post)

best_topic = np.argmax(array)

word_freq = {}

for k, l in enumerate(lda_model.components_[best_topic]):
    word_freq[feature_names[k]]=l

In [None]:
#Get 5 top words of best topics to suggest tags on a post

suggested_tags = sorted(word_freq, key=word_freq.get, reverse=True)[:5]

suggested_tags

# Approche supervisée

# Preprocessing

Ici, le problème va être de construire une target exploitable à partir de nos tags. Quelle stratégie adopter ?

In [8]:
data.Tags.value_counts()

javascript jquery html css twitter-bootstrap                28
java spring jpa spring-data spring-data-jpa                 22
python apache-spark dataframe pyspark apache-spark-sql      19
python image opencv image-processing computer-vision        15
java json spring spring-mvc jackson                         12
                                                            ..
algorithm sorting big-o quicksort heapsort                   1
c# .net wcf c#-4.0 channelfactory                            1
jquery html jquery-ui user-interface jquery-ui-sortable      1
python r haskell floating-point julia                        1
scala monads scalaz applicative scala-cats                   1
Name: Tags, Length: 48950, dtype: int64

On a énormément de labels différents. On va essayer de réduire le nombre de labels en conservant les 3 tags les plus pertinents pour chaque post.

In [9]:
text = ''

for e in data['Tags']:
    text += e
    
list_tags = text.split(' ')

In [10]:
count_tags = pd.Series(list_tags).value_counts()
    
tags_df=pd.DataFrame(columns=['Tag', 'Count'])
tags_df['Tag']=list(count_tags.keys())
tags_df['Count']=list(count_tags.values)

In [11]:
tags_df.head()

Unnamed: 0,Tag,Count
0,java,5799
1,python,5261
2,c#,5229
3,javascript,4833
4,ios,4245


In [12]:
def select_tags(tags):
    
    tags = tags.split(' ')[:-1]
    
    count_dic = {}
    for e in tags:
        count_dic[e] = int(tags_df[tags_df['Tag'] == e].Count)
        
    res = sorted(count_dic, key=count_dic.get, reverse=True)[:3]
    
    #On ordonne la liste par ordre alphabétique pour plus de lisibilité
    res.sort()
    
    return res
    

In [13]:
data['main_tags'] = data['Tags'].apply(select_tags)

In [14]:
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount,Code,FullPost,main_tags
0,directory listing,scan directory folder file need cross platform,c file directory cross-platform common-tasks,12489,65,135968,21,9,,directory listing scan directory folder file n...,"[c, directory, file]"
1,return datatables wcf dotnet,wcf service want return datatable know highly ...,c# .net wcf web-services datatable,12702,51,56176,12,8,datacontract public datatable gettbl ...,return datatables wcf dotnet wcf service want ...,"[.net, c#, web-services]"
2,difference struct class dotnet,difference struct class dotnet,.net class struct value-type reference-type,13049,830,453890,296,19,,difference struct class dotnet difference stru...,"[.net, class, struct]"
3,mysqli pdo pro,place split use mysqli pdo stuff prepared stat...,php mysql pdo mysqli database-abstraction,13569,342,143841,284,13,,mysqli pdo pro place split use mysqli pdo stuf...,"[mysql, pdo, php]"
4,scroll overflow divs javascript,div use overflow auto content inside div resiz...,javascript jquery ajax html scroll,13362,49,67437,10,6,thediv scrolltop thediv scrollhe...,scroll overflow divs javascript div use overfl...,"[html, javascript, jquery]"


On va maintenant utiliser un MultiLabel Binarizer pour encoder notre variable cible.

In [15]:
mlb = MultiLabelBinarizer()

Y = mlb.fit_transform(data['main_tags'])

In [16]:
X = data['FullPost']

In [17]:
X_train, X_test, y_train, y_test= train_test_split(X, Y,test_size=0.2)

In [18]:
tfidf_vectorizer = TfidfVectorizer()
preprocessed_X_train = tfidf_vectorizer.fit_transform(X_train)

### En utilisant une approche multiclass multioutput avec RandomForestClassifier

In [19]:
rfc = RandomForestClassifier(bootstrap=True,
                             max_depth=40,
                             max_features='sqrt',
                             random_state=42)

rfc.fit(preprocessed_X_train, y_train)

RandomForestClassifier(max_depth=40, max_features='sqrt', random_state=42)

In [20]:
pred_train = rfc.predict(preprocessed_X_train)

In [21]:
jaccard_score(pred_train, y_train, average='samples')

0.003183333333333333

In [22]:
preprocessed_X_test = tfidf_vectorizer.transform(X_test)

In [23]:
pred = rfc.predict(preprocessed_X_test)

In [24]:
jaccard_score(pred, y_test, average='samples')

0.0004999999999999999

In [25]:
for i in range(10):
    print(' ')
    print('POST :')
    print(' ')
    print(tfidf_vectorizer.inverse_transform(preprocessed_X_test[i]))
    print(' ')
    print('SUGGESTED TAGS :')
    print(' ')
    
    pred = rfc.predict(preprocessed_X_test[i])
    transformed_pred = mlb.inverse_transform(pred)
    print(transformed_pred)
    
    print('ORIGINAL TARGETS :')
    print(' ')
    print(mlb.inverse_transform(y_test)[i])
    print(' ')
    print('""""""""""""""""""""')

 
POST :
 
[array(['access', 'accessible', 'add', 'address', 'advance', 'app',
       'application', 'article', 'bit', 'case', 'client', 'consume',
       'consuming', 'controller', 'dig', 'documentation', 'engine',
       'expose', 'figure', 'good', 'helper', 'important', 'info',
       'isolated', 'issue', 'kind', 'know', 'method', 'pain', 'parent',
       'rail', 'rb', 'ruby', 'same', 'should', 'solution', 'use', 'want',
       'will', 'work'], dtype='<U96')]
 
SUGGESTED TAGS :
 
[()]
ORIGINAL TARGETS :
 
('rails-engines', 'ruby-on-rails', 'ruby-on-rails-3.1')
 
""""""""""""""""""""
 
POST :
 
[array(['abstract', 'array', 'background', 'car', 'class', 'cod', 'code',
       'create', 'different', 'dynamically', 'excerpt', 'follow',
       'forward', 'home', 'instance', 'instantiate', 'key', 'mean',
       'method', 'object', 'objective', 'pass', 'possible', 'problem',
       'scripting', 'see', 'set', 'should', 'simply', 'straight', 'type',
       'use', 'user', 'value', 'variables',

### En utilisant OneVSRestClassifier et LGBM

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier()
ovr = OneVsRestClassifier(lgbm)

In [None]:
ovr.fit(preprocessed_X_train, y_train)

In [None]:
pred_ovr_train = ovr.predict(preprocessed_X_train)

In [None]:
jaccard_score(pred_ovr_train, y_train, average='samples')

In [None]:
pred_ovr = ovr.predict(preprocessed_X_test)

In [None]:
jaccard_score(pred_ovr, y_test, average='samples')

# Sentence embedding

In [33]:
from sentence_transformers import util, SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

### On utilise le langage naturel

In [34]:
# On recharge les données initiales

raw_data = pd.read_csv('data/QueryV2.csv')

In [35]:
# On va tout de même appliquer un filtre pour supprimer les balises du champ Body
# puis concaténer Title et Body pour obtenir un champ FullPost

to_replace_substrings = ['<p>', '</p>',
                         '<a>', '</a>',
                         '<pre>', '</pre>',
                         '<code>', '</code>',
                         '<blockquote>', '</blockquote>',
                         '<em>', '</em>',
                         '<strong>', '</strong>',
                         '<br>', '</br>',
                         '<li>', '</li>',
                         '<ol>', '</ol>'                 
                        ]


def tag_droper(text):
    for sub in to_replace_substrings:
        text = text.replace(sub, '')
    return text
    
raw_data['Body'] = raw_data['Body'].apply(tag_droper)
raw_data['FullPost'] = raw_data['Title'] + ' ' + raw_data['Body']

In [36]:
#On récupère la colonne main_tags

raw_data['main_tags'] = data['main_tags']

In [37]:
raw_data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount,FullPost,main_tags
0,How do you get a directory listing in C?,How do you scan a directory for folders and fi...,<c><file><directory><cross-platform><common-ta...,12489,65,135968,21,9,How do you get a directory listing in C? How d...,"[c, directory, file]"
1,Returning DataTables in WCF/.NET,I have a WCF service from which I want to retu...,<c#><.net><wcf><web-services><datatable>,12702,51,56176,12,8,Returning DataTables in WCF/.NET I have a WCF ...,"[.net, c#, web-services]"
2,What's the difference between struct and class...,What's the difference between struct and class...,<.net><class><struct><value-type><reference-type>,13049,830,453890,296,19,What's the difference between struct and class...,"[.net, class, struct]"
3,mysqli or PDO - what are the pros and cons?,In our place we're split between using mysqli ...,<php><mysql><pdo><mysqli><database-abstraction>,13569,342,143841,284,13,mysqli or PDO - what are the pros and cons? In...,"[mysql, pdo, php]"
4,Scrolling Overflowed DIVs with JavaScript,I've got a div that uses overflow:auto to keep...,<javascript><jquery><ajax><html><scroll>,13362,49,67437,10,6,Scrolling Overflowed DIVs with JavaScript I've...,"[html, javascript, jquery]"


In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X = raw_data['FullPost'].tolist()

In [40]:
Y = raw_data['main_tags']

In [41]:
mlb_embeddings = MultiLabelBinarizer()

target = mlb_embeddings.fit_transform(Y)

In [42]:
all_tags = Y.apply(pd.Series).stack().drop_duplicates().reset_index(drop=True)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2)

In [44]:
embeddings = model.encode(X)

In [45]:
words_embeddings = model.encode(all_tags)

In [47]:
cosine_scores = util.cos_sim(embeddings, words_embeddings)

In [48]:
for i in range(10):
    print('POST :')
    print(' ')
    print(X[i])
    print(' ')
    print('SUGGESTED TAGS :')
    print(' ')
    
    best_tags = zip([int(e) for e in cosine_scores[i].sort(descending=True)[1][0:3]],
          [float(e) for e in cosine_scores[i].sort(descending=True)[0][0:3]])
    
    for j, k in best_tags:
        print(all_tags[j], k)

    print(' ')
    
    print('ORIGINAL TAGS :')
    print(' ')
    print(Y[i])
    print(' ')
    print('""""""""""""""""""""')

POST :
 
How do you get a directory listing in C? How do you scan a directory for folders and files in C? It needs to be cross-platform.

 
SUGGESTED TAGS :
 
ansi-c 0.4678370952606201
scanf 0.45970940589904785
directory-structure 0.4547803997993469
 
ORIGINAL TAGS :
 
['c', 'directory', 'file']
 
""""""""""""""""""""
POST :
 
Returning DataTables in WCF/.NET I have a WCF service from which I want to return a DataTable. I know that this is often a highly-debated topic, as far as whether or not returning DataTables is a good practice. Let's put that aside for a moment.

When I create a DataTable from scratch, as below, there are no problems whatsoever. The table is created, populated, and returned to the client, and all is well:

[DataContract]
public DataTable GetTbl()
{
    DataTable tbl = new DataTable("testTbl");
    for(int i=0;i&lt;100;i++)
    {
        tbl.Columns.Add(i);
        tbl.Rows.Add(new string[]{"testValue"});
    }
    return tbl;
}


However, as soon as I go out and 

In [49]:
list_pred = pd.Series([[all_tags[int(e)] for e in cosine_scores[i].sort(descending=True)[1][0:3]] for i in range(len(X))])

In [50]:
pred = mlb_embeddings.transform(list_pred)

In [51]:
jaccard_score(pred, target, average='samples')

0.077256

### En utilisant un modèle plus léger

In [None]:
model_light = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
embeddings_light = model_light.encode(X)

In [None]:
words_embeddings_light = model_light.encode(all_tags)

In [None]:
cosine_scores_light = util.cos_sim(embeddings_light, word_embeddings_light)

In [None]:
for i in range(10):
    print('POST :')
    print(' ')
    print(X[i])
    print(' ')
    print('SUGGESTED TAGS :')
    print(' ')
    
    best_tags = zip([int(e) for e in cosine_scores_light[i].sort(descending=True)[1][0:3]],
          [float(e) for e in cosine_scores_light[i].sort(descending=True)[0][0:3]])
    
    for j, k in best_tags:
        print(all_tags[j], k)

    print(' ')
    
    print('ORIGINAL TAGS :')
    print(' ')
    print(Y[i])
    print(' ')
    print('""""""""""""""""""""')

In [None]:
list_pred_light = pd.Series([[all_tags[int(e)] for e in cosine_scores_light[i].sort(descending=True)[1][0:3]] for i in range(len(X))])

In [None]:
pred_light = mlb_embeddings.transform(list_pred_light)

In [None]:
jaccard_score(pred_light, target, average='samples')

# Enregistrement du modèle en vue de la démonstration

Il va falloir enregistrer le modèle, le multilabel binarizer et les word_embeddings.

In [None]:
import pickle

In [None]:
pickle.dump(model, open('model/model', 'wb'))
pickle.dump(mlb_embeddings, open('model/mlb_embeddings', 'wb'))
pickle.dump(words_embeddings, open('model/words_embeddings', 'wb'))
pickle.dump(all_tags, open('model/all_tags', 'wb'))