In [1]:
# Load (or reload) IPython's autoreload extension. Mode 2 re-imports all modules
# before each cell executes, so edits to the project packages imported below are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

## Imports

In [2]:
from fonte_dados.fabrica import FabricaFonteDados
from repository.resultado import ResultadoRepo
from treinamento.treinamento_lda import TreinamentoLda
from util.constants import NERDS_VIAJANTES

import pandas as pd

## Carregamento de dados para treinamento

In [3]:
# LDA hyperparameters; both are handed to TreinamentoLda in the training cell.
num_topics, passes = 57, 2

# Result repositories, one per named collection.
resultado_repo_wp_full = ResultadoRepo(collection_name='resultados-wp-full')
resultado_repo_wp_get42 = ResultadoRepo(collection_name='resultados')

# Factory that hands out the data source for a given source constant.
fabrica = FabricaFonteDados()

In [4]:
# Ask the factory for the data source registered under NERDS_VIAJANTES and load it.
fonte_dados = fabrica.get_fonte_dados(NERDS_VIAJANTES)
fonte_dados.carregar_dados()
# `documentos` feeds both the LDA training and the `texts` column of the topic table.
# Presumably one list of (stemmed) tokens per document, as the rendered tables suggest
# -- TODO confirm against the data-source implementation.
documentos = fonte_dados.get_tokens()

## Executa treinamento de modelo

In [5]:
# Fit an LDA model on `documentos` with the topic count and number of passes
# chosen in the configuration cell.
treinamento_lda = TreinamentoLda(num_topics=num_topics, passes=passes)
# NOTE(review): alpha/eta are presumably forwarded as the LDA Dirichlet priors
# -- confirm in TreinamentoLda.ajustar_modelo.
resultado_lda = treinamento_lda.ajustar_modelo(documentos, alpha=0.01, eta=0.01)
# Keep direct handles on the fitted model and the corpus exposed by the result object.
lda_model = resultado_lda.modelo_lda
corpus = resultado_lda.corpus

# Uncomment to inspect the learned topics.
# lda_model.print_topics()

Ajustando modelo com 57 topicos e 2 passes


## Análise de tópicos

### Dominant topic and its percentage contribution in each document

In LDA models, each document is composed of multiple topics, but typically only one of them is dominant. The code below extracts the dominant topic for each document and shows that topic's weight and keywords in a nicely formatted table.

This way, you will know which document belongs predominantly to which topic.

In [6]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Build a table with the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel
        Fitted topic model. It must support ``ldamodel[corpus]`` (yielding one
        entry per document), expose a ``per_word_topics`` flag and provide
        ``show_topic(topic_id)`` returning ``(word, weight)`` pairs. When
        ``per_word_topics`` is true, the first element of each entry is taken
        as the list of ``(topic_id, probability)`` pairs; otherwise the entry
        itself is that list.
    corpus
        Corpus handed to ``ldamodel[corpus]``.
    texts
        Original documents; appended unchanged as the last column.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic`` (topic id),
        ``Perc_Contribution`` (probability rounded to 4 decimals) and
        ``Topic_Keywords`` (comma-separated topic words), followed by an
        unnamed column (label ``0``) holding ``texts``.

    Notes
    -----
    A document for which the model reports no topic gets a row of missing
    values instead of being dropped, so rows stay aligned with ``texts``.
    An empty corpus yields an empty frame with the expected columns.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if len(row) == 0:
            # Placeholder keeps this document's position aligned with `texts`.
            records.append((None, None, None))
            continue

        # max() returns the first maximal pair, matching a stable descending sort.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append(
            (topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

In [7]:
# Per-document table: dominant topic, its contribution, its keywords and the text.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=resultado_lda.corpus,
    texts=documentos,
)

# Turn the positional index into an explicit document-number column and
# give every column a presentation-friendly label.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,19.0,0.9949,"bonit, temp, cidad, vist, cervej, aind, pra, g...","[santiag, mus, art, pré, colombi, vár, mes, pe..."
1,1,6.0,0.9947,"congr, cidad, ambi, marin, noss, opç, cad, blo...","[santiag, bacan, dentr, opç, divers, blog, aqu..."
2,2,8.0,0.9924,"glaci, temp, pouc, lag, seward, vist, pais, ai...","[palaci, moned, soment, moned, chilen, santiag..."
3,3,55.0,0.4061,"câm, cidad, loj, estanh, cen, expos, são, peç,...","[hoj, seç, seman, câm, ent, últ, chil, argenti..."
4,4,30.0,0.9891,"parqu, ônibu, temp, montanh, expos, fiz, cent,...","[monument, valley, unid, dentr, indígen, trib,..."
5,5,1.0,0.9958,"sorvet, cidad, aind, histór, mund, uma, pouc, ...","[cristób, 880m, nível, 280m, nível, santiag, s..."
6,6,54.0,0.8293,"routeburn, track, hut, mus, pais, câm, porqu, ...","[frutill, cidad, bel, alemã, não, muse, alemán..."
7,7,4.0,0.9936,"argentin, prat, boa, bariloch, cord, cidad, ma...","[qu, país, boa, sempr, argentin, buen, air, de..."
8,8,17.0,0.9967,"lag, parqu, barc, águ, bonit, cacho, pouc, táb...","[lag, andin, petrohué, antecipad, lug, bellav,..."
9,9,8.0,0.9113,"glaci, temp, pouc, lag, seward, vist, pais, ai...","[catedr, metropolit, santiag, atr, religi, pop..."


### The most representative sentence for each topic

Sometimes you want to see the documents that best represent a given topic. The code below retrieves, for each topic, the document in which that topic has the highest contribution.

In [16]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

# Group the per-document table by its dominant topic.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For every topic keep the single row with the highest contribution. The rows
# are collected first and concatenated once: calling pd.concat inside the loop
# is quadratic, and seeding it with an empty DataFrame is deprecated in newer
# pandas. The index is reset without mutating in place.
sent_topics_sorteddf_mallet = pd.concat(
    [
        grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
        for _, grp in sent_topics_outdf_grpd
    ],
    axis=0,
).reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(20)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0.0,0.9943,"urs, iso, boa, câm, parqu, cinderel, moment, voad, voo, cordilh","[selv, intenç, denal, nation, park, parqu, alasc, aind, parqu, animal, inclusiv, urs, espéci, gr..."
1,1.0,0.9974,"sorvet, cidad, aind, histór, mund, uma, pouc, pesso, são, temp","[boa, gulose, princip, sorvet, sempr, sorvet, bel, horizont, cidad, dentr, sorvet, aless, uma, i..."
2,2.0,0.7644,"alt, geral, caraç, cidad, santu, igrej, santiag, parqu, matriz, ônibu","[rapid, santiag, vall, estaçã, esqu, montanh, passe, princip, cidad, ônibu, hop, hop, off, final..."
3,3.0,0.9978,"prai, bonit, mari, cidad, igrej, pouc, praç, rua, vist, fiz","[noss, fern, noronh, tour, apes, bacan, poi, noronh, fim, agênc, caminhon, 4x4, depois, turist, ..."
4,4.0,0.9936,"argentin, prat, boa, bariloch, cord, cidad, mas, afinal, portugu, jauj","[qu, país, boa, sempr, argentin, buen, air, depois, bariloch, cidad, boa, bariloch, jauj, cardáp..."
5,5.0,0.9974,"lago, lençol, maranh, cervej, cidad, parqu, águ, aind, temp, chei","[lençol, maranh, antig, junh, 2012, promoç, aére, bom, são, luí, maranh, lençol, maranh, lago, c..."
6,6.0,0.9947,"congr, cidad, ambi, marin, noss, opç, cad, blog, sit, bel","[santiag, bacan, dentr, opç, divers, blog, aquí, blog, destemper, diss, mes, mes, varand, ótim, ..."
7,7.0,0.9984,"glaci, temp, quilômetr, pouc, lad, barc, chuv, lag, bonit, jalap","[sul, argentin, chil, contém, dezen, glaci, uma, boa, barc, lag, argentin, nest, glaci, upsal, s..."
8,8.0,0.9978,"glaci, temp, pouc, lag, seward, vist, pais, aind, montanh, bonit","[patagôn, calafat, dest, estânc, dentr, opç, cristin, apes, rebanh, ovelh, porqu, dentr, áre, pa..."
9,9.0,0.9951,"animal, temp, vist, selv, expos, macr, long, lad, lag, yellowston","[yellowston, nation, park, parqu, nacion, unid, selv, sempr, chanc, animal, talv, bich, princip,..."
