In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
from gensim import corpora
from gensim import similarities
from string import punctuation
import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
import spacy
# you need to run python -m spacy download en
import nltk
from nltk import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import IncrementalPCA
from sklearn.manifold import TSNE 
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
import warnings, sys, os
from pathlib import Path


path = Path(os.getcwd())
sys.path.append(str(path.parent))
warnings.filterwarnings('ignore')  
%matplotlib inline
#data_directory = '.'
data_directory = '../data'


  from collections import Mapping


In [2]:
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Estefi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
summary = pd.read_csv(f'{data_directory}/dreamers_summary.csv', sep='|')

In [4]:
dream = pd.read_csv(f'{data_directory}/dreams_clean.csv', sep=';')
# Borro aquellos sueños que no tienen palabras y aquellos en aleman que son los del grupo con id 18, 26 y 27
dream = dream.dropna(axis=0, subset=['words']).drop(dream.loc[dream['group_id'].isin([18, 26, 27, 79, 80])].index)


In [5]:
df = pd.merge(dream, summary, left_on='group_id', right_on='id')
df # Imprimo las primeras rows del df mergeado.

Unnamed: 0,code,note,description,words,group_id,group,dreamer sex,dreamer age,dream years,numbers of dreams,summary,id,total_words
0,1,1957,"The one at the Meads's house, where it's bigge...",154.0,1,Alta: a detailed dreamer,female,adult,1985-1997,422,Alta is an adult woman who wrote down her drea...,1,166351.0
1,2,8/11/67,I'm at a family reunion in a large fine house ...,248.0,1,Alta: a detailed dreamer,female,adult,1985-1997,422,Alta is an adult woman who wrote down her drea...,1,166351.0
2,3,8/1/85,I watch a plane fly past and shortly realize i...,303.0,1,Alta: a detailed dreamer,female,adult,1985-1997,422,Alta is an adult woman who wrote down her drea...,1,166351.0
3,4,1985?,Me pulling the green leaves and berries off so...,468.0,1,Alta: a detailed dreamer,female,adult,1985-1997,422,Alta is an adult woman who wrote down her drea...,1,166351.0
4,5,1985?,I'm in a room that reminds me of (but definite...,561.0,1,Alta: a detailed dreamer,female,adult,1985-1997,422,Alta is an adult woman who wrote down her drea...,1,166351.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36197,85,"F, age 18",The dream was about me and my boyfriend going ...,138.0,89,West Coast teenage girls,female,11 to 18,mid-1990s,89,"These dreams, from teenage girls ages 11-18, w...",89,9820.0
36198,86,"F, age 18",Two weeks ago this guy asked me to Senior Ball...,96.0,89,West Coast teenage girls,female,11 to 18,mid-1990s,89,"These dreams, from teenage girls ages 11-18, w...",89,9820.0
36199,87,"F, age 18",My boyfriend just broke up with me so he was o...,139.0,89,West Coast teenage girls,female,11 to 18,mid-1990s,89,"These dreams, from teenage girls ages 11-18, w...",89,9820.0
36200,88,"F, age 18",I was in my backyard and I was flying. I would...,104.0,89,West Coast teenage girls,female,11 to 18,mid-1990s,89,"These dreams, from teenage girls ages 11-18, w...",89,9820.0


In [6]:
df_vietnam = df.loc[df['group'].isin(['Vietnam Vet: 1970-2008 war dreams', 'Vietnam Vet: 2015 dreams', 'Vietnam Vet: 2016-17 dreams'])]
df_phil = df.loc[df['group'].isin(['Phil 1: teens', 'Phil 2: late 20s', 'Phil 3: retirement'])]
df_pegasus = df.loc[df['group'].isin(['Pegasus: a factory worker'])]
df_norman = df.loc[df['group'].isin(['Norman: a child molester'])]

print(f"Se cuenta con {len(df_vietnam)} sueños de Vietnam. El corpus tiene {int(df_vietnam['words'].sum())} palabras.")
print(f"Se cuenta con {len(df_phil)} sueños de Phil, nuestro conjunto de control. El corpus tiene {int(df_phil['words'].sum())} palabras.")
print(f"Se cuenta con {len(df_pegasus)} sueños de Pegasus. El corpus tiene {int(df_pegasus['words'].sum())} palabras.")
print(f"Se cuenta con {len(df_norman)} sueños de Norman, nuestro conjunto de control. El corpus tiene {int(df_norman['words'].sum())} palabras.")

Se cuenta con 593 sueños de Vietnam. El corpus tiene 80684 palabras.
Se cuenta con 506 sueños de Phil, nuestro conjunto de control. El corpus tiene 85162 palabras.
Se cuenta con 1093 sueños de Pegasus. El corpus tiene 135928 palabras.
Se cuenta con 1235 sueños de Norman, nuestro conjunto de control. El corpus tiene 51340 palabras.


In [7]:
data = df['description'].values.tolist() # armamos la lista de sueños.

In [8]:
#Solo palabras en minuscula sin signos de puntuación. 
#Se eliminan stopwords y los tokens con menos de 4 caracteres.
def sent_to_words(sentences):
    for sentence in sentences:
        yield([word for word in gensim.utils.simple_preprocess(str(sentence), deacc=True) if word not in stop_words and len(word) > 3])  # deacc=True removes punctuations
data_words = list(sent_to_words(data))
print("El corpus tiene ",len(data_words), " sueños y ",sum([len(x) for x in data_words])," tokens"   )

El corpus tiene  36202  sueños y  2278397  tokens


In [9]:
#Observo cuales son las palabras más frecuentes del corpus.
flat_list = [item for sublist in data_words for item in sublist]
df = pd.DataFrame(flat_list, columns=['words'])
df['count']=1
df=df.groupby(['words']).count()
df.sort_values(by="count",ascending=False).head(40)

Unnamed: 0_level_0,count
words,Unnamed: 1_level_1
like,31946
said,24258
went,18802
back,18726
going,18163
something,16874
people,16021
room,14034
around,12161
would,12138


In [10]:
#Bi-gramas y Tri-gramas con el scoring default y se analiza el threshold si es correcto.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [11]:
#Analizo los bigramas que armo el modelo de acuerdo a los parametros scoring default y threshold 100.
df_bigramas =pd.DataFrame([x for x in bigram.export_phrases(data_words)],columns=["bigram","score"])
print("Cantidad de Bigramas con score_default ",df_bigramas.shape)
df_bigramas.drop_duplicates(inplace=True)
print("Cantidad de Bigramas sin duplicados con score_default ",df_bigramas.shape)
df_bigramas.sort_values(by="score",ascending=True).head(10)

Cantidad de Bigramas con score_default  (40297, 2)
Cantidad de Bigramas sin duplicados con score_default  (2094, 2)


Unnamed: 0,bigram,score
31142,b'melvin rich',100.281757
130,b'stay overnight',100.303272
30876,b'slope avenue',100.475725
170,b'bright sunny',100.597337
24235,b'mixer board',100.604456
2806,b'play clarinet',100.905109
1059,b'furry animal',101.013029
1428,b'sexual activity',101.160565
24511,b'lights sirens',101.357951
12202,b'killed giovanni',101.437122


In [12]:
# Analizo los scores obtenidos y vemos que el maximos es 51mil y el minimo 100. Hay que ajustar este threshold.
df_bigramas['score'].describe()

count     2094.000000
mean      2572.949817
std       6111.878233
min        100.281757
25%        179.852715
50%        421.704413
75%       1597.172312
max      51516.644628
Name: score, dtype: float64

In [13]:
#Analizo los trigramas que armo el modelo de acuerdo a los parametros scoring default y threshold 100.
#al pedir construir trigramas en base a los bigramas se ve que hay casos en donde hay cuatrogrmaas.Ej dawson_creek_dawson_joey
df_trigramas =pd.DataFrame([x for x in trigram.export_phrases(data_words)],columns=["trigram","score"])
print("Cantidad de Triigramas con score_default ",df_trigramas.shape)
df_trigramas.drop_duplicates(inplace=True)
print("Cantidad de Triigramas sin duplicados con score_default ",df_trigramas.shape)
df_trigramas.sort_values(by="score",ascending=False).head(10)

Cantidad de Triigramas con score_default  (38595, 2)
Cantidad de Triigramas sin duplicados con score_default  (1931, 2)


Unnamed: 0,trigram,score
7007,b'monty python',9695998.0
11896,b'malcolm mcdowell',2751567.0
10181,b'alma mater',2620540.0
11031,b'colin farrell',2499592.0
11338,b'angelina jolie',2096432.0
29015,b'emerald reinhold',1191155.0
14099,b'morena baccarin',1113730.0
1360,b'visually impaired',1048216.0
315,b'cocker spaniel',1048216.0
11913,b'quentin tarantino',1048216.0


In [14]:
# Analizo los scores obtenidos y vemos que el maximos es 9millones y el minimo 100. Hay que ajustar este threshold.
df_trigramas['score'].describe()

count    1.931000e+03
mean     2.713222e+04
std      2.636762e+05
min      1.000559e+02
25%      2.264458e+02
50%      6.019132e+02
75%      2.772273e+03
max      9.695998e+06
Name: score, dtype: float64

In [15]:
#Bi-gramas y Tri-gramas con el scoring npmi y se analiza el threshold si es correcto.
bigram = gensim.models.Phrases(data_words, min_count=10, threshold=0.5, scoring='npmi') # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], min_count=10, threshold=0.5, scoring='npmi')   
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [16]:
#Analizo los bigramas que armo el modelo de acuerdo a los parametros scoring npmi y threshold 0.5.
df_bigramas =pd.DataFrame([x for x in bigram.export_phrases(data_words)],columns=["bigram","score"])
print("Cantidad de Bigramas con score npmi ", df_bigramas.shape)
df_bigramas.drop_duplicates(inplace=True)
print("Cantidad de Bigramas sin duplicados con score npmi ", df_bigramas.shape)
df_bigramas.sort_values(by="score",ascending=True).head(10)

Cantidad de Bigramas con score npmi  (43182, 2)
Cantidad de Bigramas sin duplicados con score npmi  (1126, 2)


Unnamed: 0,bigram,score
5150,b'freshman year',0.500001
2884,b'telling truth',0.500155
11153,b'snake bite',0.500165
14154,b'ring tone',0.500362
8524,b'kissed lips',0.500423
8813,b'average size',0.50055
327,b'flesh blood',0.500608
5896,b'perplexed participant',0.500888
12239,b'police sirens',0.500933
7873,b'banquet hall',0.501371


In [17]:
# Analizo los scores obtenidos para score npmi
df_bigramas['score'].describe()

count    1126.000000
mean        0.665482
std         0.131949
min         0.500001
25%         0.554049
50%         0.631904
75%         0.751093
max         1.000000
Name: score, dtype: float64

In [18]:
#Analizo los trigramas que armo el modelo de acuerdo a los parametros scoring npmi y threshold 0.5.
#al pedir construir trigramas en base a los bigramas se ve que hay casos en donde hay cuatrogrmaas.Ej dawson_creek_dawson_joey
df_trigramas =pd.DataFrame([x for x in trigram.export_phrases(data_words)],columns=["trigram","score"])
print("Cantidad de Trigramas con score npmi ",df_trigramas.shape)
df_trigramas.drop_duplicates(inplace=True)
print("Cantidad de Trigramas sin duplicados con score npmi ",df_trigramas.shape)
df_trigramas.sort_values(by="score",ascending=False).head(10)

Cantidad de Trigramas con score npmi  (42925, 2)
Cantidad de Trigramas sin duplicados con score npmi  (1059, 2)


Unnamed: 0,trigram,score
8279,b'monty python',1.649677
13287,b'queer folk',1.409812
14688,b'jensen ackles',1.403078
14187,b'malcolm mcdowell',1.390499
14672,b'jared padalecki',1.388598
12220,b'alma mater',1.382275
915,b'roller coaster',1.362195
13284,b'colin farrell',1.341222
32029,b'emerald reinhold',1.33082
457,b'johnny depp',1.329881


In [19]:
# Analizo los scores obtenidos para score npmi
df_trigramas['score'].describe()

count    1059.000000
mean        0.717405
std         0.187597
min         0.500085
25%         0.572983
50%         0.666123
75%         0.801927
max         1.649677
Name: score, dtype: float64

In [33]:
# Reviso los bigramas formados para el score npmi.
data_grams=[bigram_mod[doc] for doc in data_words]
result = []
for dream_grams in data_grams:
    for word in dream_grams:
        if re.findall(r".*_.*", word):
            result.append(word)
result=list(set(result))
result.sort()
result

['a___',
 'a____',
 'a_____',
 'a______',
 'able_bodied',
 'aboard_ship',
 'accounting_exam',
 'accounting_test',
 'across_street',
 'acting_weird',
 'actual_participant',
 'adam_baldwin',
 'adam_sandler',
 'adrian_edmondson',
 'advisor_mary',
 'afraid_heights',
 'african_american',
 'aidan_gillen',
 'alain_delon',
 'alarm_clock',
 'alarm_system',
 'alexander_skarsgard',
 'alice_wonderland',
 'alma_mater',
 'alphabetical_order',
 'alternate_universe',
 'alvin_broud',
 'always_sunny',
 'amb_______',
 'american_flag',
 'american_idol',
 'amusement_park',
 'and____came',
 'andrea_bocelli',
 'andrew_mccarthy',
 'angelina_jolie',
 'animal_crossing',
 'annie_thomas',
 'answering_machine',
 'answers_questions',
 'anxious_perplexed',
 'apartment_complex',
 'apollo____',
 'archie_weber',
 'armstrong_high',
 'around____',
 'arrested_development',
 'artificial_respiration',
 'asking_questions',
 'attract_attention',
 'aunt_bridget',
 'aunt_charlotte',
 'aunt_christine',
 'aunt_elaine',
 'aunt_jan

In [32]:
# Reviso los trigramas formados para el score npmi.
data_grams=[trigram_mod[bigram_mod[doc]] for doc in data_words]
result = []
for dream_grams in data_grams:
    for word in dream_grams:
        if re.findall(r".*_.*_.*", word):
            result.append(word)
result=list(set(result))
result.sort()
result

['a___',
 'a____',
 'a_____',
 'a______',
 'actual_participant_neither',
 'actual_participant_neither_pleasant',
 'actual_participant_pleasant',
 'actual_participant_unpleasant',
 'advisor_mary_monroe',
 'always_sunny_philadelphia',
 'amb_______',
 'and____came',
 'answers_questions_anxious',
 'answers_questions_anxious_perplexed',
 'answers_questions_perplexed',
 'anxious_actual_participant',
 'apollo____',
 'armstrong_high_school',
 'around____',
 'aunt_christine_uncle_larry',
 'aunt_janice_uncle_larry',
 'away____',
 'b___',
 'b____',
 'b_____',
 'b______',
 'b_______',
 'b___________',
 'ba____',
 'bancroft_middle_school',
 'beau____',
 'bellick_prison_break',
 'bl___',
 'blah_blah_blah',
 'blah_blah_blah_blah',
 'bob______',
 'boss_mike_hollow',
 'brookland_high_school',
 'buffy_vampire_slayer',
 'buster_arrested_development',
 'c___',
 'c____',
 'c_____',
 'c__________',
 'c_______s',
 'c_____s',
 'ca__',
 'called_______',
 'calvin_silas_samantha',
 'can___something',
 'car_____'

In [22]:
bad_ids=['like','say','remember','dream','think','know','could','go','would','want','tell','thing','start','come','back','look','people','ask','seem','talk','make','take']

In [23]:
# Caso  ['roller_skate'], ['roller_skates'],
# Deberia hacerse lemantizacion sobre los tokens y luego hacer los bigramas. Habria que eliminar duplicados.
def process_words(texts, stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):

    """Bigramas"""    
    texts = [bigram_mod[doc] for doc in texts]
    
    """Trigramas"""   
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    
    """Lemmatization"""
    texts_out = []
    nlp = spacy.load('en', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        
    # remove stopwords once more after lemmatization y quito bad_ids
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words and word not in bad_ids and len(word) > 3] for doc in texts_out]    

    return texts_out

data_ready = process_words(data_words, stop_words)  # processed Text Data!

In [24]:
print("El corpus tiene ",len(data_ready), " sueños y ",sum([len(x) for x in data_ready])," tokens,bigramas,lemas"   )

El corpus tiene  36202  sueños y  1432928  tokens,bigramas,lemas


In [34]:
#Analizar los bigramas de mi corpus. Los que queda luego de aplicar el modelo.
result = []
for dream_grams in data_ready:
    for word in dream_grams:
        if re.findall(r".*_.*", word):
            result.append(word)
result.sort()
result=list(set(result))
print("Cant de Bigrmas sin dupliar ",len(result))
result

Cant de Bigrmas sin dupliar  643


['three_four',
 'body_snatcher',
 'cruise_ship',
 'kissed_lip',
 'cary_grant',
 'brother_stuart',
 'gray_haire',
 'midvale_shop',
 'police_officer',
 'chocolate_chip',
 'flights_stairs',
 'storm_brewe',
 'outer_limit',
 'remote_control',
 'lord_ring',
 'tigger_mitten',
 'crhp_sister',
 'chow_chow',
 'british_accent',
 'puppies_kitten',
 'orange_juice',
 'short_film',
 'sexually_arouse',
 'bowel_movement',
 'golf_cart',
 'facial_hair',
 'feel_queasy',
 'game_throne',
 'peavey_system',
 'shook_hand',
 'climb_ladder',
 'single_file',
 'freshly_painted',
 'light_skinned',
 'chevy_chase',
 'exactly_alike',
 'bradley_cooper',
 'felt_guilty',
 'grand_piano',
 'radio_station',
 'summer_camp',
 'dollar_bill',
 'ping_pong',
 'guinea_pigs',
 'friend_mine',
 'tall_slender',
 'average_size',
 'twin_bed',
 'hundred_dollar',
 'aunt_polly',
 'locked_door',
 'hardly_breathe',
 'stuffed_animal',
 'dark_haire',
 'bunny_cage',
 'lifting_weight',
 'riding_tandem',
 'kiss_cheek',
 'thank_goodness',
 'retrac

In [35]:
#Analizar los trigramas de mi corpus. Los que queda luego de aplicar el modelo.
result = []
for dream_grams in data_ready:
    for word in dream_grams:
        if re.findall(r".*_.*_.*", word):
            result.append(word)
result.sort()
result=list(set(result))
print("Cant de Trigrmas sin dupliar ",len(result))
result

Cant de Trigrmas sin dupliar  8


['friend_matt_yam',
 'can___something',
 'c_____s',
 'dusk_till_dawn',
 'and____came',
 'c_______s',
 'i___he',
 'flew_cuckoo_n']

In [36]:
# Create Dictionary
id2word = corpora.Dictionary(data_ready)
#Nos quedamos con los tokens que aparecen en al menos 10 documentos y que no aparezcan en mas del 50% de mi corpus.
id2word.filter_extremes(no_below=10, no_above=0.5)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

In [37]:
print("#de palabras en el diccionario",len(id2word.values()))
print([item for item in id2word.items()][:35])
print("#ids en el  diccionario en todos los sueños",sum([len(x) for x in corpus]))
print([item for item in corpus][:1])
print([item for item in data_ready][:1])

#de palabras en el diccionario 7012
[(0, 'almost'), (1, 'arrive'), (2, 'aunt'), (3, 'balloon'), (4, 'blue'), (5, 'bridge'), (6, 'convertible'), (7, 'cooking'), (8, 'corner'), (9, 'couple'), (10, 'creek'), (11, 'cross'), (12, 'double'), (13, 'drive'), (14, 'drop'), (15, 'empty'), (16, 'european'), (17, 'fill'), (18, 'find'), (19, 'hallway'), (20, 'house'), (21, 'immediately'), (22, 'little'), (23, 'nice'), (24, 'outside'), (25, 'pick'), (26, 'play'), (27, 'room'), (28, 'round'), (29, 'second'), (30, 'short'), (31, 'side'), (32, 'sort'), (33, 'stre'), (34, 'tiny')]
#ids en el  diccionario en todos los sueños 1106393
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 2), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]]
[['european', 'piper', 'sort', 'ho

In [38]:
lda_model = gensim.models.ldamulticore.LdaMulticore(workers=2,
                                                corpus=corpus,
                                                id2word=id2word,
                                                num_topics=100, 
                                                random_state=100,
                                                #update_every=1, default 1
                                                #chunksize=10, default 2000
                                                passes=10,
                                                iterations = 2000,
                                                #alpha='auto',
                                                per_word_topics=True)


In [39]:
topics = lda_model.print_topics(100)
for topic in topics:
    print(topic)

(0, '0.156*"store" + 0.035*"sell" + 0.031*"counter" + 0.028*"work" + 0.020*"owner" + 0.018*"manager" + 0.017*"mall" + 0.016*"sale" + 0.015*"walk" + 0.015*"rack"')
(1, '0.065*"naked" + 0.053*"dorm" + 0.035*"roommate" + 0.030*"trip" + 0.026*"balloon" + 0.021*"public" + 0.021*"brush" + 0.020*"bleacher" + 0.020*"government" + 0.017*"holiday"')
(2, '0.316*"school" + 0.040*"animal" + 0.028*"tooth" + 0.027*"place" + 0.022*"leave" + 0.018*"teacher" + 0.018*"year" + 0.017*"give" + 0.017*"chip" + 0.017*"stuff"')
(3, '0.123*"plane" + 0.056*"land" + 0.048*"film" + 0.044*"crash" + 0.039*"airport" + 0.030*"guard" + 0.025*"flight" + 0.025*"photograph" + 0.024*"crush" + 0.024*"priest"')
(4, '0.045*"heart" + 0.034*"today" + 0.030*"rate" + 0.025*"sunny" + 0.024*"tank" + 0.022*"pound" + 0.020*"sword" + 0.017*"respect" + 0.016*"weak" + 0.015*"second"')
(5, '0.142*"doctor" + 0.054*"angry" + 0.050*"reply" + 0.045*"charge" + 0.027*"chain" + 0.026*"indian" + 0.022*"cafe" + 0.021*"roll" + 0.020*"shock" + 0.019

In [40]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [41]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

    # Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.to_csv(f"{data_directory}/topic_example_100_analysis.csv", sep=";", index=False)


In [42]:
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,77.0,0.2747,"room, floor, apartment, wall, move, large, wal...","[european, piper, sort, house, side, second, i..."
1,1,8.0,0.2133,"drive, stop, road, leave, turn, driver, park, ...","[large, fine, house, ground, maybe, rent, publ..."
2,2,81.0,0.1084,"time, event, involve, work, also, first, feel,...","[watch, plane, shortly, realize, crash, half, ..."
3,3,50.0,0.2574,"sort, little, rather, place, quite, right, muc...","[pull, green, leave, berry, branch, live, grow..."
4,4,50.0,0.2701,"sort, little, rather, place, quite, right, muc...","[room, remind, definitely, street, live, least..."
5,5,77.0,0.2425,"room, floor, apartment, wall, move, large, wal...","[live, next, door, apartment, move, move, plac..."
6,6,45.0,0.2243,"time, leave, realize, decide, maybe, bill, als...","[kidnap, stop, visit, house, building, togethe..."
7,7,20.0,0.2524,"knife, text, stab, asian, blind, count, word, ...","[alone, apartment, place, build, dark, maybe, ..."
8,8,50.0,0.2799,"sort, little, rather, place, quite, right, muc...","[somewhere, friend, around, much, house, elsew..."
9,9,81.0,0.2898,"time, event, involve, work, also, first, feel,...","[public, place, remind, grocery_store, airport..."


In [43]:
df_dominant_topic['Dominant_Topic'].unique()

array([77.,  8., 81., 50., 45., 20., 35., 91., 89., 11., 56., 75., 70.,
       54., 19., 25., 37.,  2., 32., 68., 60., 39., 84., 46., 12., 28.,
       55., 96., 85., 15., 66., 95., 22., 48., 78., 23., 43., 88., 64.,
       99., 98., 47., 17., 40., 51., 61., 74., 16., 94., 33., 82., 63.,
       26.,  0., 30.,  3., 24., 71., 34.,  7., 44., 92., 79., 58., 76.,
       72., 41., 90.,  9., 80., 52., 69., 10., 36., 13., 86., 97., 18.,
        6., 73., 53., 87., 83., 67., 27., 31., 57., 38., 93., 65., 59.,
       21., 49.,  5., 62.,  1.,  4., 42., 29., 14., nan])

In [44]:
tmp=df_dominant_topic['Dominant_Topic'].value_counts()
df=pd.DataFrame(tmp)
pd.set_option('display.max_rows', df.shape[0]+1)
print("#sueños asignados a topicos: ",df['Dominant_Topic'].sum())
print(df)

#sueños asignados a topicos:  36009
      Dominant_Topic
35.0            1845
77.0            1697
45.0            1363
68.0            1212
8.0             1170
66.0            1095
55.0            1041
32.0             960
56.0             839
84.0             829
50.0             824
39.0             808
46.0             750
2.0              704
15.0             693
75.0             681
25.0             640
24.0             582
51.0             539
64.0             537
21.0             533
54.0             510
76.0             492
74.0             489
97.0             481
17.0             472
85.0             464
78.0             460
89.0             427
41.0             416
81.0             404
11.0             382
60.0             343
0.0              310
43.0             309
83.0             306
82.0             299
18.0             292
28.0             284
19.0             283
16.0             278
71.0             274
91.0             268
40.0             266
99.0             25