# Modelagem de Tópicos - Parte II

## Exemplo 1 - LDA básico
    
#### 1 - Limpeza dos dados

In [None]:
import spacy
from spacy.lang.en import English

# A blank English pipeline is enough here: `tokenize` only reads tokenizer-level
# attributes (orth_, like_url, lower_), so no trained model has to be loaded.
# The former `spacy.load('en')` call threw its result away and raises on
# spaCy v3+, where the 'en' shortcut name is no longer supported.
parser = English()

def tokenize(text):
    """Split *text* into normalised tokens for topic modelling.

    Whitespace-only tokens are dropped, URL-like tokens are replaced by
    ``'URL'``, tokens starting with ``@`` are replaced by ``'SCREEN_NAME'``
    and every other token is returned in lower case.
    """
    def normalise(token):
        # Same precedence as the checks in the filter below: URL, then @-handle.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [normalise(token) for token in parser(text)
            if not token.orth_.isspace()]

import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet as wn

def get_lemma(word):
    """Return ``wn.morphy(word)``, or *word* unchanged when that yields ``None``."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer

def get_lemma2(word):
    """Lemmatize *word* with a fresh ``WordNetLemmatizer`` (default arguments)."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

# NOTE(review): presumably the download below must be run once before the
# stop-word corpus is available locally -- confirm.
#nltk.download('stopwords')
# English stop words kept in a set; prepare_text_for_lda tests membership against it.
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    """Tokenize *text* and return the lemmatized tokens kept for LDA.

    A token is kept when it is longer than 4 characters and is not in
    ``en_stop``; surviving tokens are passed through ``get_lemma2``.
    """
    return [
        get_lemma2(token)
        for token in tokenize(text)
        if len(token) > 4 and token not in en_stop
    ]

import random
# Build the training corpus from a random sample of the file: every line is
# preprocessed, but only lines whose random draw exceeds 0.8 are printed and kept.
text_data = []
with open('./CSV/comments.csv') as csv_file:
    for line in csv_file:
        tokens = prepare_text_for_lda(line)
        if random.random() <= .8:
            continue
        print(tokens)
        text_data.append(tokens)

#### 2 - Preparando os dados para o Gensim

In [None]:
from gensim import corpora
# Build the token dictionary from the sampled documents, then convert each
# document with `doc2bow`.
dictionary = corpora.Dictionary(text_data)
corpus = list(map(dictionary.doc2bow, text_data))
# Display the converted corpus as the cell output.
corpus

#### 3 - Solicitando a separação em 3 tópicos

In [None]:
import gensim
# Fit an LDA model with NUM_TOPICS topics (15 passes over the corpus) and
# print the 4-word summary of each topic.
NUM_TOPICS = 3
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

#### 4 - Classificando um novo documento

In [None]:
# Run an unseen sentence through the same preprocessing and dictionary as the
# training data, then print its bag-of-words vector and its topic distribution.
new_doc = prepare_text_for_lda('The database stopped working')
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
new_doc_topics = ldamodel.get_document_topics(new_doc_bow)
print(new_doc_topics)

#### 5 - Visualização LDA

In [None]:
import pyLDAvis
try:
    # Newer pyLDAvis releases ship the gensim helpers as `gensim_models`.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older releases still use the original module name.
    import pyLDAvis.gensim as gensimvis

# Prepare the interactive visualisation for the fitted model; sort_topics=False
# is passed so topic numbering is not reordered by pyLDAvis.
lda_display = gensimvis.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

## Exemplo 2 - LDA & LSI

#### 6 - Carregando bibliotecas e dados

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np

import pandas as pd

# Malformed rows are skipped with a warning. `on_bad_lines='warn'` replaces the
# removed `error_bad_lines=False` option (which also warned by default).
data = pd.read_csv('./CSV/abcnews-date-text.csv', on_bad_lines='warn')
# Work on an explicit copy so that adding the 'index' column below does not
# write into a slice of `data` (avoids pandas' SettingWithCopyWarning).
data_text = data[['headline_text']].copy()
data_text['index'] = data_text.index
documents = data_text

# Display the resulting frame as the cell output.
documents

#### 7 - Limpeza dos dados

In [None]:
def lemmatize_stemming(text):
    """Return the WordNet lemma of *text* treated as a verb (``pos='v'``).

    Despite the name, no stemmer is applied here.
    """
    verb_lemmatizer = WordNetLemmatizer()
    return verb_lemmatizer.lemmatize(text, pos='v')


def preprocess(text):
    """Tokenize *text* with gensim and return the lemmatized tokens that are kept.

    A token is kept when it is longer than 3 characters and is not in
    gensim's ``STOPWORDS``; kept tokens go through ``lemmatize_stemming``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]


# Pick the headline whose 'index' column equals 10 and show it before and
# after preprocessing.
sample_rows = documents[documents['index'] == 10]
doc_sample = sample_rows.values[0][0]

print('original document: ')
words = doc_sample.split(' ')
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

In [None]:
# Apply `preprocess` to every headline; each entry becomes a list of tokens.
processed_docs = documents['headline_text'].map(preprocess)
# Preview the first 10 processed headlines.
processed_docs[:10]

#### 8 - Criando um dicionário

In [None]:
# Build the dictionary from the processed headlines and preview its first
# entries; the loop stops once the counter exceeds 10 (11 pairs are printed).
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for count, (k, v) in enumerate(dictionary.iteritems(), start=1):
    print(k, v)
    if count > 10:
        break

#### 9 - Filtrando o dicionário

In [None]:
# Prune the dictionary in place with gensim's `filter_extremes`, using the
# thresholds no_below=15, no_above=0.5 and keep_n=100000.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

#### 10 - Saco de palavras (bag of words)

In [None]:
# Convert every processed headline with the filtered dictionary's `doc2bow`.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
# Show the converted form of the sample headline (position 10).
bow_corpus[10]

#### 11 - Executar o LDA usando o saco de palavras (bag of words)

In [None]:
# Train a 10-topic LDA model on the bag-of-words corpus with gensim's
# LdaMulticore (passes=2, workers=6).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=6)

In [None]:
# Print every topic returned by print_topics(-1) together with its id.
for topic_id, topic_terms in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

#### 12 - TF-IDF

In [None]:
from pprint import pprint

from gensim import corpora, models

# Fit a TF-IDF model on the bag-of-words corpus and apply it to that corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Pretty-print only the first transformed document.
for doc in corpus_tfidf:
    pprint(doc)
    break

#### 13 - Executar o LSI com TF-IDF

In [None]:
# Fit a 10-topic LSI model on the TF-IDF corpus.
# NOTE: despite the `lda_` prefix, this variable holds a gensim LsiModel.
lda_model_tfidf = gensim.models.LsiModel(corpus_tfidf, num_topics=10, id2word=dictionary)

# Print every topic returned by print_topics(-1) together with its id.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

#### 14 - Classificando uma amostra

In [None]:
# The LSI model was fitted on TF-IDF vectors, so the sample document must go
# through the same TF-IDF transformation before being projected; feeding the
# raw bag-of-words counts would score it in a differently weighted space.
sample_tfidf = tfidf[bow_corpus[10]]
# Topics are listed from the highest to the lowest score.
for index, score in sorted(lda_model_tfidf[sample_tfidf], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

## Exemplo 3 - Sklearn

#### 15 - Carregando corpus

In [None]:
from nltk.corpus import brown
 
# One string per Brown corpus file: the file's words joined by single spaces.
data = [' '.join(brown.words(fileid)) for fileid in brown.fileids()]

NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:5])

#### 16 - Preparando o saco de palavras (bag of words)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
  
# Count-based bag of words over the documents in `data`.
# The token pattern is a raw string so that `\-` reaches the regex engine
# unchanged instead of being an invalid string escape (which emits a
# DeprecationWarning/SyntaxWarning); it matches runs of at least 3 letters or hyphens.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')

data_vectorized = vectorizer.fit_transform(data)

#### 17 - Executando modelagem de tópicos com NMF

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

 
# Number of topics (components) requested from the model below.
NUM_TOPICS = 10

# Alternative kept for reference: Latent Dirichlet Allocation.
# NOTE(review): newer scikit-learn releases appear to name this argument
# `n_components` rather than `n_topics` -- confirm before re-enabling.
#lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
#lda_Z = lda_model.fit_transform(data_vectorized)
#print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Fit a Non-Negative Matrix Factorization model on the count matrix and keep
# the transformed matrix returned by fit_transform.
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Alternative kept for reference: Latent Semantic Indexing via truncated SVD.
#lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
#lsi_Z = lsi_model.fit_transform(data_vectorized)
#print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

In [None]:
# Show the first row of the NMF output (the row for the first document in `data`).
nmf_Z[0]

#### 18 - Inspecionando tópicos NMF

In [None]:
def print_topics(model, vectorizer, top_n=10):
    """Print the *top_n* highest-weighted terms of every topic in *model*.

    Args:
        model: fitted decomposition model exposing ``components_`` (one row
            of term weights per topic).
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()`` or,
            on older scikit-learn releases, ``get_feature_names()``.
        top_n: number of terms printed per topic.
    """
    # Resolve the vocabulary once instead of rebuilding it for every printed
    # term, and prefer the current scikit-learn accessor when it exists.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])

# Print the top terms of each NMF topic, followed by a separator line.
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)

#### 19 - Classificando um documento não visto

In [None]:
# Vectorize an unseen sentence with the fitted vectorizer and project it with
# the fitted NMF model; `x` is the first (only) row of the result.
text = "The economy is working better than ever"
text_vector = vectorizer.transform([text])
x = nmf_model.transform(text_vector)[0]
print(x)