In [31]:
import pandas as pd
from nltk.tokenize import wordpunct_tokenize
import nltk 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer 
import gensim
import tensorflow_hub as hub
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK resources used later in this notebook: the stop-word lists
# read through stopwords.words(...) and the WordNet data behind
# WordNetLemmatizer.
# NOTE(review): 'omw-1.4' is presumably needed by the WordNet lemmatizer on
# recent NLTK releases — confirm before removing it.
nltk.download('stopwords') 
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\elect\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\elect\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\elect\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [32]:
# Load the cleaned export, keep the text/tag columns and renumber the rows.
# "Id" is selected first (so a missing column still fails loudly) and then
# discarded; reset_index(drop=True) renumbers without leaving an 'index' column.
data = pd.read_csv("../data/data_clean.csv")
data = (
    data[["Id", "Title", "Body", "Tags"]]
    .drop(columns="Id")
    .reset_index(drop=True)
)
data.head()

Unnamed: 0,Title,Body,Tags
0,Unable to continue with rebase due to untracke...,I'm currently mergin two branches using git re...,git
1,Progressive Web App on iOS 12.2 stuck in offli...,I installed a custom progressive web app via s...,ios progressive-web-apps
2,__PRETTY_FUNCTION__ in constant expression,Please refer to this snippet:\n\n#include type...,c++ c++17 constant-expression
3,How to fix 'http: named cookie not present' in...,I'm building a small dinner/plan management ap...,http go cookies jwt postman
4,How can I read a file which will be upload fro...,I create a method in my .Net Core API which wi...,c# file upload asp.net-core-webapi


In [33]:
# Lower-cased "title body" text for every row, computed column-wise instead of
# with a Python-level lambda per row.
# - A missing Title makes the concatenation NaN, which fillna() replaces with
#   the Body alone (same result as the former `Title == Title` NaN check).
# - A missing Body now yields NaN for that row instead of raising on str + float.
data['Post'] = (data['Title'] + ' ' + data['Body']).fillna(data['Body']).str.lower()

In [34]:
# Tokenise each post. Series.map passes every 'Post' string straight to the
# tokenizer; the former DataFrame.apply(axis=1) materialised a full row object
# per post only to read this one column, which is much slower on a large frame.
# NOTE(review): wordpunct_tokenize presumably separates punctuation from word
# characters, so names such as "c++" or "c#" may not survive as single tokens —
# confirm before relying on them in the keyword counts below.
data['Tokens'] = data['Post'].map(wordpunct_tokenize)

KeyboardInterrupt: 

In [None]:
# Keywords (language names and common abbreviations) that are tracked when
# counting programming-language mentions in the tokenised posts.
most_used_programming_language = [
    "javascript",
    "js",
    "python",
    "py",
    "go",
    "golang",
    "java",
    "kotlin",
    "php",
    # The comma after "csharp" matters: without it Python concatenates the two
    # adjacent literals into the single, meaningless entry "csharpc#".
    "csharp",
    "c#",
    "swift",
    "net",
    "core",
    "rb",
    "ruby",
    "c",
    "c++",
    "cpp",
    "matlab",
    "typescript",
    "ts",
    "scala",
    "html",
    "css",
    "rust",
    "rs",
    "perl",
]

data.head()

In [None]:
def token_plot(tokens):
    """Draw a bar chart of how often each tracked language keyword occurs.

    Parameters
    ----------
    tokens : iterable of iterable of str
        One token sequence per post (for example the ``Tokens`` column).

    Returns
    -------
    None
        The chart is produced as a side effect of ``DataFrame.plot``.
    """
    # One counter per tracked keyword. These keys alone decide what is
    # counted, so the tally cannot drift from a separately maintained list.
    token_count = dict.fromkeys(
        (
            "javascript", "js", "python", "py", "go", "golang", "java",
            "kotlin", "php", "csharp", "c#", "swift", "net", "core", "rb",
            "ruby", "c", "c++", "cpp", "matlab", "typescript", "ts", "scala",
            "html", "css", "rust", "rs", "perl",
        ),
        0,
    )

    for token_series in tokens:
        for token in token_series:
            # Dict membership is a constant-time lookup and guarantees the
            # increment below can never raise KeyError.
            if token in token_count:
                token_count[token] += 1

    # orient='index' turns every keyword into a row label with its count in
    # the single column 0, which is what the bar plot draws.
    df = pd.DataFrame.from_dict(token_count, orient='index')
    df.plot(kind='bar', color="#f56900", title='Top des langages de programmation les plus cités')

# Baseline chart: keyword counts on the raw tokens, before stop-word removal.
token_plot(data['Tokens'])

# StopWords

In [None]:
# NLTK's English stop-word list, held in a set so every membership test in
# delete_stopwords is a constant-time lookup.
stop_words = set(stopwords.words('english'))

def delete_stopwords(tokens):
    """Return the items of ``tokens`` that are not in ``stop_words``.

    Order and duplicates of the remaining tokens are preserved.
    """
    return [token for token in tokens if token not in stop_words]

# Series.map feeds each token list directly to the function, avoiding the
# per-row overhead of DataFrame.apply(axis=1) for a single-column transform.
data['Tokens'] = data['Tokens'].map(delete_stopwords)

data.head()

In [None]:
# Same chart on the filtered tokens, to compare with the plot drawn before
# stop-word removal.
token_plot(data['Tokens'])

# Lemmatize

In [None]:
def lemmatize(token_series):
    """Lemmatize every token of one post with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    token_series : iterable of str
        Tokens of a single post.

    Returns
    -------
    list of str
        One lemma per input token, in the same order. No part-of-speech is
        passed, so the lemmatizer's default is used for every token.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in token_series]

# Column-wise map instead of row-wise DataFrame.apply: only the 'Tokens'
# column is needed, so there is no reason to build a full row per post.
data['TokensLem'] = data['Tokens'].map(lemmatize)

In [None]:
data.head()

# Bag of Words

In [None]:
def _token_documents(token_lists):
    """Turn pre-tokenised posts into whitespace-joined document strings.

    Items that already are strings are passed through unchanged, so calling
    the vectoriser helpers with a single row (whose value is a flat list of
    tokens) behaves exactly as it did before.
    """
    return [doc if isinstance(doc, str) else ' '.join(doc) for doc in token_lists]


def bag_of_word(x):
    """Return the bag-of-words count matrix of ``x['Tokens']``.

    ``x`` is meant to be the whole DataFrame: each token list becomes one
    document, so the result has one row per post and a single vocabulary
    shared by all rows (English stop words excluded).
    """
    cv = CountVectorizer(stop_words='english')
    return cv.fit_transform(_token_documents(x['Tokens']))


def bag_of_word_lem(x):
    """Same as ``bag_of_word`` but built from the lemmatised ``x['TokensLem']``."""
    cv = CountVectorizer(stop_words='english')
    return cv.fit_transform(_token_documents(x['TokensLem']))


# Fit ONCE on the corpus. Fitting a new vectoriser inside a row-wise apply gave
# every post its own private vocabulary (and treated each token as a separate
# document), so the resulting vectors were not comparable between posts.
bog_matrix = bag_of_word(data)
bog_lem_matrix = bag_of_word_lem(data)

# Keep the existing per-post columns: each cell holds that post's 1 x vocabulary
# sparse row. Use bog_matrix / bog_lem_matrix directly for any modelling.
data['BOG'] = [bog_matrix[i] for i in range(bog_matrix.shape[0])]
data['BogLem'] = [bog_lem_matrix[i] for i in range(bog_lem_matrix.shape[0])]

# TF-IDF

In [None]:
tfidf = TfidfVectorizer()

# Fit the vectoriser once on the whole corpus (one whitespace-joined document
# per post). Calling fit_transform inside a row-wise apply re-fitted `tfidf` on
# every post, so IDF weights were computed within a single post and the fitted
# vocabulary ended up being that of the last row only.
tfidf_matrix = tfidf.fit_transform(data['TokensLem'].map(' '.join))

# Keep the per-post column: each cell holds that post's 1 x vocabulary row.
data['Tfidt'] = [tfidf_matrix[i] for i in range(tfidf_matrix.shape[0])]

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

print(feature_names)
data['Tokens'].head()

# Word2Vec

In [None]:
#w2v_size=300
#w2v_window=5
#w2v_min_count=1
#w2v_epochs=100
#maxlen = len(data['Post']) # adapt to length of sentences
# One token list per post, produced by gensim's simple_preprocess from the
# lower-cased 'Post' text.
sentences = [
    gensim.utils.simple_preprocess(post)
    for post in data['Post'].to_list()
]

In [None]:
#from tensorflow.python.distribute.multi_process_runner import multiprocessing
#print("Build & train Word2Vec model ...")
#w2v_model = gensim.models.Word2Vec(min_count=w2v_min_count, window=w2v_window, vector_size=w2v_size, seed=42, workers=1)
#workers = multiprocessing.cpu_count()
#w2v_model.build_vocab(sentences)
#w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_epochs)
#model_vectors = w2v_model.wv
#w2v_words = model_vectors.index_to_key
#print("Vocabulary size: %i" % len(w2v_words))
#print("Word2Vec trained")

In [None]:
#sim_words = w2v_model.wv.most_similar('cpp')

#print(sim_words)

In [None]:
#sim_words = w2v_model.wv.most_similar(positive=["gcc", "cpp"], negative="js")
#print(sim_words)

# USE (Universal Sentence Encoder)

In [None]:
# get cosine similarity matrix
#def cos_sim(input_vectors):
#    similarity = cosine_similarity(input_vectors)
#    return similarity

# get topN similar sentences
#def get_top_similar(index, sentence_list, similarity_matrix, topN):
    # get the corresponding row in similarity matrix
#   similarity_row = np.array(similarity_matrix[index, :])
    # get the indices of top similar
#    indices = similarity_row.argsort()[-topN:][::-1]
#    return [sentence_list[i] for i in indices]

#embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

#sentences_embeddings = embed(data['Post'])

#similarity_matrix = cos_sim(np.array(sentences_embeddings))

#top_similar = get_top_similar(0, data['Post'], similarity_matrix, 3)

#for x in range(len(top_similar)):
#    print("----")
#    print(top_similar[x])

# Bert

In [None]:
#BERT_MODEL = "https://tfhub.dev/google/experts/bert/wiki_books/2"
#PREPROCESS_MODEL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
#import tensorflow_text
#preprocess = hub.load(PREPROCESS_MODEL)
#bert = hub.load(BERT_MODEL)
#bert_inputs = preprocess(data['Post'].head(150))

#bert_outputs = bert(bert_inputs, training=False)
#pooled_output = bert_outputs['pooled_output']
#sequence_output = bert_outputs['sequence_output']

#print('\nSentences:')
#print(data['Post'][0])
#print('\nPooled output:')
#print(pooled_output[0])
#print('\nSequence output:')
#print(sequence_output[0])



In [None]:
def tags_to_array(x):
    """Return the tags of one row as a list of non-empty strings.

    Parameters
    ----------
    x : mapping
        Row-like object with a ``'Tags'`` entry.

    Returns
    -------
    list of str
        - a string is split on whitespace; ``str.split()`` without arguments
          never yields empty items, which is what the former always-true
          ``tag != " " or tag != " "`` filter failed to guarantee;
        - a list/tuple (the cell was already run once) is returned as a list;
        - anything else, e.g. a missing value, gives an empty list.
    """
    tags = x['Tags']
    if isinstance(tags, str):
        return tags.split()
    if isinstance(tags, (list, tuple)):
        return list(tags)
    return []

# Replace the raw 'Tags' text with a list of tags, one row at a time.
data['Tags'] = data.apply(tags_to_array, axis=1)
# NOTE(review): axis=1 drops every COLUMN that contains a missing value, not
# the affected rows — confirm this is intended rather than axis=0.
data.dropna(axis=1, inplace=True)
data.head()

# LDA

In [None]:
from sklearn.decomposition import TruncatedSVD

def make_pca(x):
    """Reduce the matrix stored under ``x['Tfidt']`` to two components.

    Parameters
    ----------
    x : mapping
        Anything indexable by ``'Tfidt'`` whose value is a 2-D matrix with
        one row per sample.

    Returns
    -------
    The output of ``TruncatedSVD(n_components=2).fit_transform`` for that
    matrix, i.e. one 2-component row per input row.
    """
    svd = TruncatedSVD(n_components=2)
    return svd.fit_transform(x['Tfidt'])

# The decomposition has to be fitted on ONE document-term matrix covering all
# posts; fitting a separate SVD per row gives coordinates that live in
# unrelated spaces. The 'Tfidt' column stores an individual sparse matrix in
# every cell, so a corpus-level TF-IDF matrix is built here from the
# lemmatised tokens (one whitespace-joined document per post).
corpus_tfidf = TfidfVectorizer().fit_transform(data['TokensLem'].map(' '.join))

# One 2-component vector per post, in row order.
data['Tfidt_PCA'] = list(make_pca({'Tfidt': corpus_tfidf}))

In [None]:
from matplotlib import pyplot as plt
from gensim.models import TfidfModel
from gensim import corpora, models
from gensim.models import CoherenceModel

def _prepare_lda_corpus(posts):
    """Build the pruned dictionary and the TF-IDF weighted corpus of ``posts``."""
    dictionary = corpora.Dictionary(posts)
    # Prune the vocabulary with no_below=1000; filter_extremes' other
    # arguments keep their library defaults.
    dictionary.filter_extremes(no_below=1000)
    bow_corpus = [dictionary.doc2bow(text) for text in posts]
    tfidf = TfidfModel(bow_corpus)
    return dictionary, [tfidf[text] for text in bow_corpus]


def make_lda(posts, num_topic, dictionary=None, tfidf_corpus=None, random_state=None):
    """Train an LDA model with ``num_topic`` topics and return its coherence.

    Parameters
    ----------
    posts : list of list of str
        Tokenised documents.
    num_topic : int
        Number of topics to fit.
    dictionary, tfidf_corpus : optional
        Pre-built dictionary and TF-IDF corpus for ``posts``. When either is
        missing both are rebuilt from ``posts``; pass them when calling the
        function repeatedly so the corpus is prepared only once.
    random_state : optional
        Forwarded to ``LdaMulticore`` to seed the model.

    Returns
    -------
    float
        The 'c_v' coherence computed by gensim's ``CoherenceModel``.
    """
    if dictionary is None or tfidf_corpus is None:
        dictionary, tfidf_corpus = _prepare_lda_corpus(posts)

    ldamodel = gensim.models.ldamulticore.LdaMulticore(
        tfidf_corpus,
        num_topics=num_topic,
        id2word=dictionary,
        passes=20,
        random_state=random_state,
    )
    coherencemodel = CoherenceModel(model=ldamodel, texts=posts, dictionary=dictionary, coherence='c_v')
    return coherencemodel.get_coherence()


# The dictionary and TF-IDF corpus do not depend on the number of topics, so
# they are built once instead of once per candidate model.
LDA_SEED = 42
lda_dictionary, lda_tfidf_corpus = _prepare_lda_corpus(sentences)

# score_lda[i] is the coherence obtained with i + 1 topics (1 to 50).
score_lda = []
for i in range(1, 51):
    score_lda.append(
        make_lda(
            sentences,
            i,
            dictionary=lda_dictionary,
            tfidf_corpus=lda_tfidf_corpus,
            random_state=LDA_SEED,
        )
    )

print(score_lda)

In [None]:
# score_lda was filled for 1, 2, ..., len(score_lda) topics, so the x axis must
# start at 1; starting at 2 labelled every score with the wrong topic count.
start = 1
step = 1
limit = start + len(score_lda) * step
x = range(start, limit, step)
plt.plot(x, score_lda)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence: ("coherence_values") is just a string, whose
# first character would have been used as the legend entry.
plt.legend(("coherence_values",), loc='best')
plt.show()

In [None]:
# Final topic model: same preparation as the coherence search (pruned
# dictionary, bag-of-words corpus, TF-IDF weighting).
NUM_TOPICS = 7  # presumably picked from the coherence curve — TODO confirm
FINAL_LDA_SEED = 42  # seed the training so reruns start from the same state

dictionary = corpora.Dictionary(sentences)
dictionary.filter_extremes(no_below=1000)
bow_corpus = [dictionary.doc2bow(text) for text in sentences]
tfidf = TfidfModel(bow_corpus)
tfidf_corpus = [tfidf[text] for text in bow_corpus]

ldamodel = gensim.models.ldamulticore.LdaMulticore(
    tfidf_corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=20,
    random_state=FINAL_LDA_SEED,
)

def predict_unsupervised_tags(text, lda=None, vocab=None, weighting=None):
    """Suggest tags for ONE tokenised post from its most probable LDA topic.

    Parameters
    ----------
    text : list of str
        Tokens of a single post.
    lda, vocab, weighting : optional
        Topic model, dictionary and TF-IDF model to use. They default to the
        notebook-level ``ldamodel``, ``dictionary`` and ``tfidf``.

    Returns
    -------
    list of str
        The terms among the 20 strongest of the dominant topic that also
        occur in ``text``, in topic order. Empty when the model returns no
        topic for the post.
    """
    lda = ldamodel if lda is None else lda
    vocab = dictionary if vocab is None else vocab
    weighting = tfidf if weighting is None else weighting

    # The notebook trains its model on the TF-IDF corpus, so the query is
    # weighted the same way instead of being passed as raw counts.
    corpus_new = weighting[vocab.doc2bow(text)]
    topics = lda.get_document_topics(corpus_new)
    if not topics:
        # Nothing to rank; indexing topics[0] would raise IndexError here.
        return []

    # max() returns the first entry with the highest probability, matching the
    # former strict '>' scan.
    relevant_topic, _ = max(topics, key=lambda topic: topic[1])

    potential_tags = lda.get_topic_terms(topicid=relevant_topic, topn=20)

    # Set membership keeps the filter linear in the number of candidate terms.
    present = set(text)
    candidate_terms = (vocab[term_id] for term_id, _ in potential_tags)
    return [term for term in candidate_terms if term in present]

# The text column created earlier is 'Post'; 'Posts' does not exist and raised
# a KeyError.
sentences = data['Post']
sentences = [gensim.utils.simple_preprocess(text) for text in sentences]

# predict_unsupervised_tags expects the tokens of ONE post (it feeds them to
# dictionary.doc2bow), so it is called per post rather than on the whole list.
# Only a few posts are printed to keep the cell output readable.
N_PREVIEW = 5
for post_tokens in sentences[:N_PREVIEW]:
    print(predict_unsupervised_tags(post_tokens))