# Notebook de test 

Dans ce notebook, nous allons tester les différents modèles qui pourraient être intéressants pour prédire les tags associés à une question Stack Overflow.

Dans un premier temps, nous importons, comme à notre habitude, les librairies nécessaires à nos tests de modélisation.

In [None]:
import pandas as pd
from nltk.tokenize import wordpunct_tokenize
import nltk 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
import gensim
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.python.distribute.multi_process_runner import multiprocessing
from matplotlib import pyplot as plt
from gensim.models import TfidfModel
from gensim import corpora, models
from gensim.models import CoherenceModel
from sklearn.decomposition import TruncatedSVD
import pyLDAvis.gensim_models
from gensim.corpora import Dictionary

# Download the NLTK resources needed below: the stop-word lists read through
# `stopwords.words(...)` and the WordNet data (presumably required by
# WordNetLemmatizer, together with the 'omw-1.4' package — TODO confirm).
nltk.download('stopwords') 
nltk.download('wordnet')
nltk.download('omw-1.4')

Nous allons désormais lire notre dataset et garder uniquement les colonnes qui nous intéressent à savoir Title, Body et Tags.

In [None]:
# Load the cleaned dataset, keep only the three columns used in this notebook
# and make sure the rows are indexed 0..n-1 without keeping the old index.
data = pd.read_csv("../data/data_clean.csv")[["Title", "Body", "Tags"]].reset_index(drop=True)
data.head()

Maintenant, nous allons constituer notre corpus qui sera créé à partir du titre, du corps et des différents tags pour chaque document.

In [None]:
def _compose_post(row):
    """Build the lower-cased text of one post from a DataFrame row.

    When the title is present the text is "<Title> <Body> <Tags>"; when the
    title is NaN only the body is used.
    NOTE(review): in the missing-title case the tags are not appended either —
    confirm this is intended.
    """
    title = row['Title']
    if title != title:  # NaN is the only value that is not equal to itself
        text = row['Body']
    else:
        text = title + ' ' + row['Body'] + ' ' + row['Tags']
    return text.lower()


data['Post'] = data.apply(_compose_post, axis=1)

Nous pouvons maintenant décomposer notre corpus en tokens grâce à la fonction `wordpunct_tokenize`, qui sépare les mots de la ponctuation (la ponctuation est isolée dans ses propres tokens, elle n'est pas supprimée).

In [None]:
# Tokenize every post; each cell of 'Tokens' holds the list of tokens of one post.
data['Tokens'] = data['Post'].apply(wordpunct_tokenize)

Vérifions comment le nombre d'occurrences des langages de programmation diffère dans l'ensemble des posts. Nous pourrions afficher un graphique l'illustrant afin de mieux comprendre la proportion d'utilisation des langages de programmation.

In [None]:
# Tokens treated as mentions of a programming language (names and common
# abbreviations / file extensions).
# NOTE: "csharp" and "c#" used to be fused into the single entry "csharpc#"
# because of a missing comma (implicit string concatenation).
# NOTE(review): entries containing punctuation ("c#", "c++") can presumably
# never match a token produced by `wordpunct_tokenize`, which splits word
# characters from punctuation — confirm against the tokenizer in use.
most_used_programming_language = [
    "javascript",
    "js",
    "python",
    "py",
    "go",
    "golang",
    "java",
    "kotlin",
    "php",
    "csharp",
    "c#",
    "swift",
    "net",
    "core",
    "rb",
    "ruby",
    "c",
    "c++",
    "cpp",
    "matlab",
    "typescript",
    "ts",
    "scala",
    "html",
    "css",
    "rust",
    "rs",
    "perl",
]

# Preview the first rows of the DataFrame (shown as the cell's last expression).
data.head()

In [None]:
def count_language_tokens(tokens):
    """Count how often each tracked programming-language token occurs.

    Args:
        tokens: iterable of token sequences (one sequence per post).

    Returns:
        dict mapping every tracked language token to its number of
        occurrences, in a fixed display order.
    """
    token_count = {
        "javascript": 0,
        "js": 0,
        "python": 0,
        "py": 0,
        "go": 0,
        "golang": 0,
        "java": 0,
        "kotlin": 0,
        "php": 0,
        "csharp": 0,
        "c#": 0,
        "swift": 0,
        "net": 0,
        "core": 0,
        "rb": 0,
        "ruby": 0,
        "c": 0,
        "c++": 0,
        "cpp": 0,
        "matlab": 0,
        "typescript": 0,
        "ts": 0,
        "scala": 0,
        "html": 0,
        "css": 0,
        "rust": 0,
        "rs": 0,
        "perl": 0,
    }

    for token_series in tokens:
        for token in token_series:
            # Membership is tested against the counter itself: the lookup is
            # O(1) and a matched token is guaranteed to have an entry, so no
            # KeyError is possible and no tracked token is silently skipped.
            if token in token_count:
                token_count[token] += 1

    return token_count


def token_plot(tokens):
    """Draw a bar chart of programming-language mentions.

    Args:
        tokens: iterable of token sequences (one sequence per post), e.g. the
            'Tokens' column of the DataFrame.
    """
    df = pd.DataFrame.from_dict(count_language_tokens(tokens), orient='index')
    df.plot(kind='bar', color="#f56900", title='Top des langages de programmation les plus cités')

# Plot the language mention counts computed from the tokenized posts.
token_plot(data['Tokens'])

Bien, nous avons une bonne idée des principaux éléments et du champ lexical qui pourrait être présent.

À première vue, le modèle sera très performant pour comprendre le champ lexical autour du C et du Java, et sûrement beaucoup moins sur celui de Rust par exemple.

# StopWords

Nous allons utiliser la liste de stopwords anglais de NLTK pour supprimer l'ensemble des stopwords, puis l'appliquer à l'ensemble des tokens.

In [None]:
# English stop words from NLTK, stored in a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def delete_stopwords(tokens):
    """Return the tokens that are not in `stop_words`, keeping their order."""
    kept = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept


# Replace each post's token list by its stop-word-free version.
data['Tokens'] = data['Tokens'].apply(delete_stopwords)

data.head()

Vérifions la proportion des langages de programmation pour nous assurer qu'ils n'ont pas été supprimés par le filtrage des stopwords.

In [None]:
# Plot the language mention counts again, on the current content of 'Tokens'.
token_plot(data['Tokens'])

# Lemmatize

Appliquons désormais un processus de lemmatisation sur nos tokens pour ramener chaque token à sa forme de base (lemme).

In [None]:
def lemmatize_series(token_series):
    """Lemmatize every token of an iterable and return the results as a list.

    A new WordNetLemmatizer is created on each call and `lemmatize` is invoked
    with its default arguments for every token.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in token_series]


def lemmatize_str(str):
    """Lemmatize a single string with a fresh WordNetLemmatizer.

    The whole string is handed to `lemmatize` as one unit; it is not split
    into words first. The parameter name shadows the builtin `str` and is kept
    only so existing keyword callers keep working.
    """
    return WordNetLemmatizer().lemmatize(str)


# Lemmatized version of each post's tokens, stored next to the raw tokens.
data['TokensLem'] = data['Tokens'].apply(lemmatize_series)

In [None]:
# Preview the first rows of the DataFrame, now including 'TokensLem'.
data.head()

# Bag Of Word

Appliquons maintenant un Bag of Words et un TF-IDF à nos tokens.

In [None]:
def bag_of_word(x):
    """Fit a new CountVectorizer on one row's tokens and return its count matrix.

    Args:
        x: a DataFrame row exposing a 'Tokens' entry (list of tokens).

    Returns:
        The matrix produced by `CountVectorizer.fit_transform` for that row.

    NOTE(review): the vectorizer is re-created and re-fitted for every row and
    receives the token list as its collection of documents, so each row
    presumably ends up with its own vocabulary — confirm this is intended
    rather than one vectorizer fitted on the whole corpus.
    """
    vectorizer = CountVectorizer(stop_words='english')
    return vectorizer.fit_transform(x['Tokens'])


data['BOW'] = data.apply(bag_of_word, axis=1)

# TF-IDF

In [None]:
tfidf = TfidfVectorizer()

# The same vectorizer object is re-fitted on every row's lemmatized tokens.
# NOTE(review): because `fit_transform` refits each time, the feature names
# printed below presumably reflect only the last row processed — confirm.
data['Tfidt'] = data.apply(lambda x: tfidf.fit_transform(x['TokensLem']), axis=1)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older versions.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    feature_names = tfidf.get_feature_names()
print(feature_names)
data['Tokens'].head()

Nous voyons ici les principales features trouvées par le TF-IDF.

# Word2Vec

Implémentons notre premier modèle le Word2Vec :

In [None]:
# Word2Vec hyper-parameters.
w2v_size = 300
w2v_window = 5
w2v_min_count = 1
w2v_epochs = 100
maxlen = len(data['Post'])  # number of posts in the dataset (not used below)

# Gensim-preprocessed posts; kept under this name because the LDA cells reuse it.
sentences = data['Post'].to_list()
sentences = [gensim.utils.simple_preprocess(text) for text in sentences]

# Corpus used for Word2Vec. The vocabulary and the training pass must see the
# same tokenization: previously the vocabulary came from data['Tokens'] while
# training ran on `sentences`, so tokens absent from `sentences` were never
# trained and words unknown to the vocabulary were ignored.
w2v_corpus = data['Tokens'].to_list()

print("Build & train Word2Vec model ...")
workers = multiprocessing.cpu_count()
w2v_model = gensim.models.Word2Vec(
    min_count=w2v_min_count,
    window=w2v_window,
    vector_size=w2v_size,
    seed=42,
    workers=workers,
)

w2v_model.build_vocab(w2v_corpus)
w2v_model.train(w2v_corpus, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
model_vectors = w2v_model.wv
w2v_words = model_vectors.index_to_key
print("Vocabulary size: %i" % len(w2v_words))
print("Word2Vec trained")

In [None]:
question = "I the best developer in the world but i destroy my git commit history with rebase for cpp code, can u help me ?"

# Lower-case, tokenize and drop stop words first, then lemmatize token by
# token: lemmatizing the whole sentence as one string treats it as a single
# word instead of processing its individual words.
query_tokens = delete_stopwords(wordpunct_tokenize(question.lower()))
query_tokens = [lemmatize_str(token) for token in query_tokens]

# `most_similar` raises KeyError for words missing from the model vocabulary,
# so only tokens the model knows are kept.
known_tokens = [token for token in query_tokens if token in w2v_model.wv]

# With no known token there is nothing to query; report an empty result.
s = w2v_model.wv.most_similar(positive=known_tokens) if known_tokens else []
print(s)

# Use

Implémentation du modèle USE : 

In [None]:
def cos_sim(input_vectors):
    """Return scikit-learn's `cosine_similarity` computed on `input_vectors`."""
    return cosine_similarity(input_vectors)

def get_top_similar(index, sentence_list, similarity_matrix, topN):
    """Return the `topN` sentences most similar to the one at `index`.

    Args:
        index: row of `similarity_matrix` to rank.
        sentence_list: sentences, indexable by the column positions of the matrix.
        similarity_matrix: 2-D array of pairwise similarity scores.
        topN: number of sentences to return.

    Returns:
        List of sentences ordered from most to least similar. The row's own
        sentence is not excluded from the ranking.
    """
    scores = np.array(similarity_matrix[index, :])
    ascending = np.argsort(scores)
    # Keep the topN highest-scoring positions, then walk them best-first.
    best = ascending[-topN:]
    return [sentence_list[position] for position in reversed(best)]

# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Embed every post, compare all embeddings pairwise and rank against post 0.
sentences_embeddings = embed(data['Post'])
similarity_matrix = cos_sim(np.array(sentences_embeddings))
top_similar = get_top_similar(0, data['Post'], similarity_matrix, 3)

# Print the three best matches, each preceded by a separator line.
for similar_post in top_similar:
    print("----")
    print(similar_post)

# Bert

Implémentons le modèle BERT : 

In [None]:
# TF-Hub handles of the BERT encoder and of its text preprocessing model.
BERT_MODEL = "https://tfhub.dev/google/experts/bert/wiki_books/2"
PREPROCESS_MODEL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
# NOTE(review): `tensorflow_text` is never referenced by name; it is presumably
# imported for its side effect (registering ops the preprocessing model needs),
# so it has to stay before the `hub.load` calls — confirm.
import tensorflow_text
preprocess = hub.load(PREPROCESS_MODEL)
bert = hub.load(BERT_MODEL)
# Only the first 150 posts are preprocessed and encoded.
bert_inputs = preprocess(data['Post'].head(150))

# Run the encoder in inference mode and keep its two outputs.
bert_outputs = bert(bert_inputs, training=False)
pooled_output = bert_outputs['pooled_output']
sequence_output = bert_outputs['sequence_output']

# Show the first post together with the outputs computed for it.
print('\nSentences:')
print(data['Post'][0])
print('\nPooled output:')
print(pooled_output[0])
print('\nSequence output:')
print(sequence_output[0])

# Réduction dimensionnelle

Nous allons appliquer une réduction dimensionnelle à notre TF-IDF : 

In [None]:
def make_pca(x):
    """Project one row's TF-IDF matrix onto two components.

    Despite the name, the reduction is done with `TruncatedSVD`, fitted from
    scratch on the matrix stored in the row's 'Tfidt' entry.

    NOTE(review): the decomposition is fitted separately for every row, so the
    two components are presumably not comparable across rows — confirm.
    """
    svd = TruncatedSVD(n_components=2)
    return svd.fit_transform(x['Tfidt'])


data['Tfidt_PCA'] = data.apply(make_pca, axis=1)

# LDA

Nos tests ne semblent pas concluants avec Word2Vec, BERT et USE... Nous allons essayer d'implémenter une LDA. 

In [None]:
def make_lda(posts, num_topic):
    """Train an LDA model on `posts` and return its 'c_v' coherence score.

    Args:
        posts: list of tokenized documents (each a list of tokens).
        num_topic: number of topics requested from the LDA model.

    Returns:
        The coherence value reported by gensim's CoherenceModel.
    """
    # Vocabulary of the corpus, pruned with filter_extremes(no_below=1000).
    vocabulary = corpora.Dictionary(posts)
    vocabulary.filter_extremes(no_below=1000)

    # Bag-of-words vectors, then their TF-IDF re-weighting.
    counts_per_post = [vocabulary.doc2bow(post) for post in posts]
    weighting = TfidfModel(counts_per_post)
    weighted_posts = [weighting[bow] for bow in counts_per_post]

    lda = gensim.models.ldamodel.LdaModel(
        weighted_posts,
        num_topics=num_topic,
        id2word=vocabulary,
        passes=20,
    )
    scorer = CoherenceModel(model=lda, texts=posts, dictionary=vocabulary, coherence='c_v')
    return scorer.get_coherence()
    

# Coherence score for each topic count from 1 to 15; every entry trains a
# complete LDA model through make_lda.
score_lda = [make_lda(sentences, num_topics) for num_topics in range(1, 16)]

print(score_lda)

Le tableau de score semble plus intéressant que les précédents modèles.

Analysons ces informations sur une courbe.

In [None]:
# The scores in `score_lda` were computed for topic counts starting at 1, so
# the x axis must start at 1 as well (it used to start at 2, shifting every
# point by one topic). The upper bound is derived from the number of scores so
# both sequences always have the same length.
start = 1
step = 1
limit = start + len(score_lda) * step
x = range(start, limit, step)
plt.plot(x, score_lda)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Pass a list: a bare parenthesised string is iterated character by character,
# which labelled the curve "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

Nous pouvons voir que le nombre idéal de topic est autour de 7

In [None]:
# Rebuild the vocabulary and the corpora with the same steps as make_lda.
dictionary = corpora.Dictionary(sentences)
dictionary.filter_extremes(no_below=1000)
bow_corpus = [dictionary.doc2bow(text) for text in sentences]
# NOTE: this rebinds `tfidf`, which previously referred to the TfidfVectorizer.
tfidf = TfidfModel(bow_corpus)
tfidf_corpus = [tfidf[text] for text in bow_corpus]

# Final model with 7 topics, trained on the TF-IDF weighted corpus, and its
# pyLDAvis visualisation data.
ldamodel = gensim.models.ldamulticore.LdaMulticore(tfidf_corpus, num_topics=7, id2word = dictionary, passes=20)
lda_visualization = pyLDAvis.gensim_models.prepare(ldamodel, tfidf_corpus, dictionary, sort_topics=False)

# NOTE(review): topics are queried with the raw bag-of-words corpus although
# the model was trained on `tfidf_corpus`; `topics` is not used in the visible
# code — confirm which corpus is intended.
topics = ldamodel.get_document_topics(bow_corpus) #
pyLDAvis.display(lda_visualization) 

In [None]:
def predict_lda(x):
    """Return the topics the LDA model assigns to one row's tokens.

    Args:
        x: a DataFrame row exposing a 'Tokens' entry (list of tokens).

    Returns:
        The result of `ldamodel.get_document_topics` for the row's
        bag-of-words vector; it is iterated below as (topic id, probability)
        pairs.
    """
    bow_vector = dictionary.doc2bow(x['Tokens'])
    return ldamodel.get_document_topics(bow_vector)


doc_topic = predict_lda(data.iloc[9])
alL_topic = ldamodel.get_topics()

for n, t in doc_topic:
    # Row n of the topic/term matrix: its argmax is the id of the term with the
    # highest weight in topic n (it is a term id, not a topic id).
    topic_most_pr = alL_topic[n].argmax()
    # The message used to have two placeholders for three arguments, so the
    # probability was never printed, and the values were mislabelled.
    print("topic: {} top term: {}\n prob : {}".format(n, dictionary[topic_most_pr], t))
    topic_name = ldamodel.print_topic(n, 5)
    print(topic_name)

# CBOW

Implémentation du modèle CBOW : 

In [None]:
# Flatten the tokens of the first 100 posts into one word sequence. Flattening
# directly (instead of joining into a string and splitting on spaces) avoids
# creating empty-string pseudo-words when a post has no tokens.
words = [token for tokens in data['Tokens'].head(100) for token in tokens]
print(words[0])
print(words[1])
vocab = set(words)
vocab_size = len(vocab)
embed_dim = 10
context_size = 2

# Index the vocabulary in sorted order so the word <-> index mapping is the
# same on every run (set iteration order is not stable across sessions).
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}
ix_to_word = {i: word for word, i in word_to_ix.items()}

# Training pairs: (context_size words before + context_size words after, target).
# NOTE(review): this rebinds `data`, which held the posts DataFrame until here;
# earlier cells can no longer be re-run after this one. The name is kept
# because the training loop below iterates over `data`.
data = []
for i in range(context_size, len(words) - context_size):
    context = words[i - context_size:i] + words[i + 1:i + context_size + 1]
    target = words[i]
    data.append((context, target))
print(data[:5])

# Randomly initialised word embeddings, one row per vocabulary entry.
embeddings = np.random.random_sample((vocab_size, embed_dim))

def linear(m, theta):
    """Linear layer without bias: return the dot product of `m` with the weights `theta`."""
    return m.dot(theta)

def log_softmax(x):
    """Return the log-softmax of `x`, normalised over the whole array.

    The result is computed as ``(x - max) - log(sum(exp(x - max)))`` rather
    than ``log(exp(x - max) / sum)``: when an exponential underflows to 0 the
    latter yields ``-inf``, whereas this form stays finite.

    Args:
        x: NumPy array of logits.

    Returns:
        Array of the same shape holding the log-probabilities.
    """
    shifted = x - np.max(x)
    return shifted - np.log(np.exp(shifted).sum())

def NLLLoss(logs, targets):
    """Return the mean negative log-likelihood of the target classes.

    Args:
        logs: 2-D array of log-probabilities, one row per sample.
        targets: array with the target column index of each row.

    Returns:
        Minus the sum of the selected log-probabilities, divided by their count.
    """
    rows = np.arange(len(targets))
    selected = logs[rows, targets]
    return -selected.sum() / selected.shape[0]

def log_softmax_crossentropy_with_logits(logits, target):
    """Return ``(softmax(logits) - one_hot(target)) / n_rows``.

    This is the quantity `backward` uses as the gradient with respect to the
    logits.

    The softmax is computed on logits shifted by their per-row maximum, which
    leaves the result unchanged but prevents ``exp`` from overflowing (large
    logits used to produce ``inf / inf = nan``).

    Args:
        logits: 2-D array of raw scores, one row per sample.
        target: array with the target column index of each row.

    Returns:
        Array with the same shape as `logits`.
    """
    one_hot = np.zeros_like(logits)
    one_hot[np.arange(len(logits)), target] = 1

    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    softmax = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)

    return (softmax - one_hot) / logits.shape[0]

def forward(context_idxs, theta):
    """Run the forward pass for one context window.

    Args:
        context_idxs: array of vocabulary indices of the context words.
        theta: weight matrix of the linear layer.

    Returns:
        Tuple ``(m, n, o)``: the context embeddings flattened into a single
        row, the output of the linear layer, and its log-softmax.
    """
    flat_context = embeddings[context_idxs].reshape(1, -1)
    logits = linear(flat_context, theta)
    return flat_context, logits, log_softmax(logits)

def backward(preds, theta, target_idxs):
    """Return the gradient of the loss with respect to the weights.

    Args:
        preds: the ``(m, n, o)`` tuple returned by `forward`.
        theta: weight matrix; accepted but not used by the computation.
        target_idxs: array with the vocabulary index of the target word.

    Returns:
        Array ``m.T . dlog`` where ``dlog`` comes from
        `log_softmax_crossentropy_with_logits`.
    """
    flat_context, logits, _ = preds
    grad_logits = log_softmax_crossentropy_with_logits(logits, target_idxs)
    return flat_context.T.dot(grad_logits)

def optimize(theta, grad, lr=0.03):
    """Apply one gradient-descent step to `theta` and return it.

    The update is done in place: the array passed as `theta` is modified and
    the same object is returned.

    Args:
        theta: weight array to update.
        grad: gradient with respect to `theta`.
        lr: learning rate.
    """
    update = grad * lr
    theta -= update
    return theta

# Number of passes over the training pairs; used by both the training loop and
# the plot so the two can no longer get out of sync.
n_epochs = 80

# Weights of the linear layer: one row per input feature (2 * context_size
# context words of embed_dim values each), one column per vocabulary word.
theta = np.random.uniform(-1, 1, (2 * context_size * embed_dim, vocab_size))

# Per-epoch list of the losses of every training pair.
epoch_losses = {}

for epoch in range(n_epochs):

    losses = []

    for context, target in data:
        context_idxs = np.array([word_to_ix[w] for w in context])
        preds = forward(context_idxs, theta)

        target_idxs = np.array([word_to_ix[target]])
        loss = NLLLoss(preds[-1], target_idxs)

        losses.append(loss)

        # Only `theta` is updated; `embeddings` keeps its initial values.
        grad = backward(preds, theta, target_idxs)
        theta = optimize(theta, grad, lr=0.03)

    epoch_losses[epoch] = losses

ix = np.arange(0, n_epochs)

fig = plt.figure()
fig.suptitle('Epoch/Losses', fontsize=20)
# Plot the mean loss of each epoch: the loss of the first training pair alone
# (previous behaviour) is not representative of the whole epoch.
plt.plot(ix, [np.mean(epoch_losses[i]) for i in ix])
plt.xlabel('Epochs', fontsize=12)
plt.ylabel('Losses', fontsize=12)

def predict(words):
    """Return the word the model scores highest for the given context words.

    Args:
        words: context words; each must be a key of `word_to_ix`, otherwise the
            lookup raises KeyError.

    Returns:
        The vocabulary word whose log-probability from `forward` is the largest.
    """
    context_idxs = np.array([word_to_ix[context_word] for context_word in words])
    log_probs = forward(context_idxs, theta)[-1]
    return ix_to_word[np.argmax(log_probs)]