In [None]:
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction
!pip install gensim
!pip install pyLDAvis
!pip install nltk
import numpy as np
import json
import glob
import pandas
pandas.set_option('display.max_colwidth', None)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer



#Gensim
import gensim
import gensim.corpora as corpora
from gensim import models
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser


#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
def add_hours(minDate:str,maxDate:str)-> (str,str):
    """Expand two calendar dates into timestamps that span whole days.

    Args:
        minDate: Lower-bound date string, e.g. "2021-01-01".
        maxDate: Upper-bound date string, e.g. "2021-10-20".

    Returns:
        A ``(min_timestamp, max_timestamp)`` tuple: ``minDate`` with
        " 00:00:00" appended and ``maxDate`` with " 23:59:59" appended.
    """
    # The separating space matters: without it the result looks like
    # "2021-01-0100:00:00", which in a plain string comparison sorts AFTER
    # every "2021-01-01 HH:MM:SS" value, silently excluding the first day.
    # NOTE(review): presumably the data uses "YYYY-MM-DD HH:MM:SS" timestamps
    # -- confirm against the CSV 'date' column.
    minDate = minDate + " 00:00:00"
    maxDate = maxDate + " 23:59:59"

    return minDate,maxDate

def get_comments(originPath:str,minDate:str = None,maxDate:str = None,versions = None) -> pandas.DataFrame:
    """Load comments from a CSV file, optionally filtered by date and version.

    Args:
        originPath: Location of the CSV file, handed to ``pandas.read_csv``.
        minDate: Inclusive lower-bound date (expanded by ``add_hours``), or
            None to leave the lower bound open. Requires a 'date' column.
        maxDate: Inclusive upper-bound date (expanded by ``add_hours``), or
            None to leave the upper bound open. Requires a 'date' column.
        versions: A single version string or a collection of versions to
            keep, or None to keep every version. Requires a 'version' column.

    Returns:
        The rows that pass every active filter. Rows keep the index labels
        they had in the file (the index is not reset).
    """
    commentsFile = pandas.read_csv(originPath)

    # Start by keeping everything; each active filter narrows the mask.
    keep = pandas.Series(True, index=commentsFile.index)

    # Dates are compared as strings, exactly like the 'date' column is stored.
    if minDate is not None:
        # add_hours needs two arguments; only the lower bound is used here.
        lower, _ = add_hours(minDate, minDate)
        keep &= commentsFile['date'] >= lower
    if maxDate is not None:
        _, upper = add_hours(maxDate, maxDate)
        keep &= commentsFile['date'] <= upper

    if versions is not None:
        # A bare string must mean "exactly this version"; testing membership
        # against the string itself would be a substring match.
        if isinstance(versions, str):
            versions = [versions]
        keep &= commentsFile['version'].isin(list(versions))

    return commentsFile[keep]

def lemmatization(texts, allowed_postags):
    """Reduce each text to the lemmas of its tokens with an allowed POS tag.

    Loads the spaCy "pt_core_news_sm" pipeline, runs it over every text and
    keeps only tokens whose ``pos_`` is in ``allowed_postags``.

    Args:
        texts: Iterable of text strings.
        allowed_postags: Container of POS tags to keep (e.g. ["NOUN", "VERB"]).

    Returns:
        A list with one string per input text: the kept lemmas joined by a
        single space.
    """
    nlp = spacy.load("pt_core_news_sm")

    def kept_lemmas(text):
        # Lemmas of the tokens in `text` whose POS tag is allowed.
        return " ".join(
            token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags
        )

    return [kept_lemmas(text) for text in texts]

def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Args:
        texts: Iterable of text strings.

    Returns:
        A list holding, for each input text, the list of tokens produced by
        ``gensim.utils.simple_preprocess``.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

In [None]:
# Load the comments dated 2021-01-01 .. 2021-10-20 for app version 3.0.0.
# The version is passed as a list so that a Python `in` check against it is an
# exact match; against a bare string, `in` is a substring test and would also
# accept values such as "3.0".
database = get_comments("./General_Data.csv","2021-01-01","2021-10-20",["3.0.0"])
data = database['content']

In [None]:
# Keep only the comments whose score is 1 and take their text.
score1 = database.loc[database['score'] == 1]
data = score1.loc[:, 'content']

In [None]:
# Lemmatize the selected comments, keeping only tokens tagged NOUN or VERB.
lemmatized_texts = lemmatization(data, allowed_postags=["NOUN", "VERB"])
print(lemmatized_texts[0])

In [None]:
# Split every lemmatized comment into a list of tokens.
data_words = gen_words(texts=lemmatized_texts)
print(data_words[0])

# Extraindo tópicos a partir de palavras unitárias (1-gram)

In [None]:
# Map every distinct token to an integer id.
id2word = corpora.Dictionary(data_words)

# Bag-of-words form of each document, as produced by Dictionary.doc2bow.
corpus = [id2word.doc2bow(doc) for doc in data_words]

print(corpus[0])

# Look up the token stored under id 7 as a quick sanity check.
word = id2word[7]
print(word)

In [None]:
# Train a 10-topic LDA model on the single-word bag-of-words corpus.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=1000,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)

In [None]:
# Show the interactive pyLDAvis view of the single-word model in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=100,
)
vis

# Extraindo tópicos em n-grams

In [None]:
N = 3  # n-gram size: how many consecutive tokens are joined together
print(data_words[0])

# Slide a window of N tokens over each document and join every window with "_".
# A document with fewer than N tokens yields an empty list.
ngrams = [
    ['_'.join(words[start:start + N]) for start in range(len(words) - N + 1)]
    for words in data_words
]

In [None]:
# Display the n-grams built from the first document.
ngrams[0]

In [None]:
# Build a gensim Dictionary over the n-gram documents.
id2ngram = corpora.Dictionary(ngrams)

In [None]:
# Bag-of-words form of each n-gram document, as produced by Dictionary.doc2bow.
corpus_ngram = [id2ngram.doc2bow(doc) for doc in ngrams]

print(corpus_ngram[0])

In [None]:
# Train a 10-topic LDA model on the n-gram bag-of-words corpus.
lda_model_ngram = gensim.models.LdaModel(
    corpus=corpus_ngram,
    id2word=id2ngram,
    num_topics=10,
    random_state=1000,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha="auto",
)

In [None]:
# Show the interactive pyLDAvis view of the n-gram model in the notebook.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda_model_ngram,
    corpus_ngram,
    id2ngram,
    mds="mmds",
    R=100,
)
vis