In [None]:
import collections
import itertools
import logging
import os
import re
import sys
import traceback
import warnings
from collections import defaultdict
from string import punctuation
from time import time

import gensim
import nltk
import numpy as np
import pandas as pd
import pyLDAvis.gensim
import spacy
# you need to run python -m spacy download en
from gensim import corpora
from gensim import similarities
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, nmf, LdaMulticore
from gensim.utils import simple_preprocess
from nltk import RegexpTokenizer, word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud

pyLDAvis.enable_notebook()
%matplotlib inline


# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

#pd.set_option('display.max_colwidth', -1)

In [None]:
# Announce the run and start the timer that the final "all the work was done" message reads.
print("Start the topic modelling discover ...")
t_total = time()

Configuraciones del logging

In [None]:
# Log record layout: timestamp, source file and line, calling function, then the
# message on a line of its own.
FORMAT = '%(asctime)-15s - %(filename)s:%(lineno)s - %(funcName)20s() \n%(message)s'

# Only errors are recorded; the log file is truncated on every run (filemode='w').
logging.basicConfig(
    level=logging.ERROR,
    format=FORMAT,
    filename='./topic_modelling.log',
    filemode='w',
)
logger = logging.getLogger(__name__)

In [None]:
# Fetch the NLTK stop-word corpus (no-op if it is already present) and load the
# English list.
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Extra high-frequency lemmas that are handed to process_words as `bad_ids`.
bad_ids = [
    'like', 'say', 'remember', 'dream', 'think', 'know', 'could', 'go',
    'would', 'want', 'tell', 'thing', 'start', 'come', 'back', 'look',
    'people', 'ask', 'seem', 'talk', 'make', 'take', 'recall',
]

Configuraciones generales

In [None]:
# Folder that holds the input CSV files and receives the saved models / result CSVs.
data_directory = '../data'
# False: train the LDA models in this run; True: load a previously saved model instead.
from_file = False
# Experiment number; it is embedded in the names of the saved models and result CSVs.
experiment = 11
# One LDA model is trained for each topic count listed here.
number_of_topics_list = [90, 95, 100]

In [None]:
summary = pd.read_csv(f'{data_directory}/dreamers_summary.csv', sep='|')
dream = pd.read_csv(f'{data_directory}/dreams_clean.csv', sep=';')

# Drop dreams without a word count and dreams that belong to the excluded groups.
# The original note says groups 18, 26 and 27 are the German-language series;
# 79 and 80 are excluded as well (NOTE(review): reason not recorded - confirm).
#
# Both conditions are combined into a single boolean mask. Chaining
# `dropna(...).drop(<labels>)` raises KeyError whenever an excluded-group row
# also has a missing `words` value, because dropna has already removed that label.
excluded_group_ids = [18, 26, 27, 79, 80]
dream = dream.loc[dream['words'].notna() & ~dream['group_id'].isin(excluded_group_ids)]



In [None]:
# Attach each dreamer-series summary row to its dreams (dream.group_id == summary.id).
df = dream.merge(summary, left_on='group_id', right_on='id')

In [None]:
# Plain Python list with the `description` column of every merged row; this is the raw
# input of the tokenisation step.
data = df['description'].values.tolist()

In [None]:
def sent_to_words(sentences, word_min_len=2):
    """Lazily tokenise each item of ``sentences`` with gensim's ``simple_preprocess``.

    Args:
        sentences: iterable of documents; every item is passed through ``str()`` first.
        word_min_len: forwarded as ``min_len`` to ``simple_preprocess``.

    Yields:
        The list of tokens produced for each document, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_text in sentences:
        # deacc=True strips accent marks from the tokens; punctuation is already
        # discarded by the tokeniser itself.
        tokens = tokenize(str(raw_text), deacc=True, min_len=word_min_len)
        yield tokens

In [None]:
def process_words(texts, stop_words, bigram_mod, trigram_mod, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), word_min_len=2, bad_ids=None):
    """Remove stop words, form bigrams, lemmatise, and drop unwanted lemmas.

    Args:
        texts: iterable of documents. Each document is either a raw string or a
            list/tuple of tokens.
        stop_words: collection of words removed both before bigram detection and
            after lemmatisation.
        bigram_mod: phrase model applied as ``bigram_mod[tokens]`` to merge bigrams.
        trigram_mod: accepted for interface compatibility; currently unused (the
            trigram step is disabled), so ``None`` is fine.
        allowed_postags: spaCy coarse POS tags (``token.pos_``) whose lemmas are kept.
        word_min_len: forwarded as ``min_len`` to ``simple_preprocess``.
        bad_ids: optional collection of lemmas removed from the final output in
            addition to ``stop_words``. Previously this argument was silently ignored.

    Returns:
        list[list[str]]: one list of cleaned lemmas per input document.

    Raises:
        OSError: if no English spaCy model can be loaded.
    """
    # Sets give O(1) membership tests; the caller's collections are left untouched.
    excluded_words = set(stop_words)
    excluded_lemmas = excluded_words | (set(bad_ids) if bad_ids else set())
    kept_pos = set(allowed_postags)

    def _as_text(doc):
        # Token lists are joined with spaces instead of relying on the repr of a list.
        if isinstance(doc, (list, tuple)):
            return " ".join(map(str, doc))
        return str(doc)

    def _clean(doc, rejected):
        # NOTE(review): simple_preprocess also drops tokens longer than its default
        # max_len (15 characters), which can discard long bigrams after merging - confirm
        # this is intended.
        return [word for word in simple_preprocess(_as_text(doc), min_len=word_min_len) if word not in rejected]

    texts = [_clean(doc, excluded_words) for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]

    # 'en' is the spaCy 2 shortcut this notebook was written against; newer spaCy
    # releases only accept the full package name.
    try:
        nlp = spacy.load('en', disable=['parser', 'ner'])
    except OSError:
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # nlp.pipe processes the documents in batches instead of one call per document.
    texts_out = [
        [token.lemma_ for token in doc if token.pos_ in kept_pos]
        for doc in nlp.pipe(" ".join(sent) for sent in texts)
    ]

    # Filter once more after lemmatisation: a lemma can be a stop word (or one of
    # bad_ids) even when the inflected form was not.
    return [_clean(doc, excluded_lemmas) for doc in texts_out]

In [None]:
def format_topics_sentences(model, corpus, texts, n_words):
    """Build a table with the dominant topic of every document.

    Args:
        model: topic model supporting ``model[corpus]`` and
            ``model.show_topic(topic_id, topn=...)``.
        corpus: bag-of-words corpus handed to ``model[corpus]``.
        texts: original texts, positionally aligned with ``corpus``.
        n_words: number of top words listed for the dominant topic.

    Returns:
        pandas.DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``,
        ``Topic_Keywords`` and ``Original_Text``. Documents for which no dominant
        topic can be determined get ``None`` in the first three columns, and the
        traceback is written to the module logger.
    """
    topic_columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # the keyword string of a topic is the same for every document
    rows = []

    for topic_distribution in model[corpus]:
        # With per_word_topics=True each item is (doc_topics, word_topics, word_phis);
        # otherwise it is the flat list of (topic_id, probability) pairs. A flat list is
        # recognised by its first element being such a pair.
        first = topic_distribution[0] if len(topic_distribution) > 0 else None
        is_flat = (
            isinstance(first, tuple)
            and len(first) == 2
            and not isinstance(first[0], (list, tuple))
        )
        doc_topics = topic_distribution if (first is None or is_flat) else first

        try:
            # max() returns the first maximal pair, like a stable descending sort would.
            topic_number, topic_prob = max(doc_topics, key=lambda pair: pair[1])
            if topic_number not in keywords_by_topic:
                keywords_by_topic[topic_number] = ", ".join(
                    word for word, _ in model.show_topic(topic_number, topn=n_words)
                )
            rows.append((topic_number, topic_prob, keywords_by_topic[topic_number]))
        except Exception:
            # Keep the row so the table stays aligned with `texts`, and record why.
            rows.append((None, None, None))
            logger.error('%s', traceback.format_exc())

    # Build the frame once: appending row by row is quadratic, and DataFrame.append
    # no longer exists in pandas 2.x.
    dominant_topics_df = pd.DataFrame(rows, columns=topic_columns)
    topics_df = pd.concat([dominant_topics_df, pd.Series(texts, name='Original_Text')], axis=1)
    topics_df.columns = topic_columns + ['Original_Text']

    return topics_df

In [None]:
# Tokenise every description; the generator is materialised into a list because
# `data_words` is read more than once below (phrase training and pre-processing).
print("Yielding the words ...")
t0 = time()
data_words = list(sent_to_words(data))
print(f"... done in {time() - t0}s.")

In [None]:
# Train the phrase (bigram) detector on the tokenised documents and wrap it in a Phraser.
# NOTE(review): the progress message mentions trigrams, but only a bigram model is built here.
print("Generating bigram and trigram ...")
t0 = time()
bigram = gensim.models.Phrases(data_words, min_count=10, threshold=0.5, scoring='npmi') # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
print(f"... done in {time() - t0}s.")

In [None]:
# Run the cleaning pipeline over the tokenised documents. No trigram model exists,
# so None is passed in its slot; tokens shorter than 3 characters are discarded.
print("Pre processing words: remove stopwords, form Bigrams and Lemmatization ...")
t0 = time()
data_ready = process_words(data_words, stop_words, bigram_mod, None, word_min_len=3, bad_ids=bad_ids)
print(f"... done in {time() - t0}s.")

In [None]:
# Build the gensim vocabulary and the bag-of-words corpus from the cleaned documents.
print("Creating dictionary and corpus ...")
t0 = time()
# Create Dictionary
id2word = corpora.Dictionary(data_ready)
print(f"Length of the dictionary is {len(id2word)}")

# Prune the vocabulary in place with gensim's filter_extremes (no_below=10, no_above=0.5);
# the thresholds are repeated in the message below, so keep the two in sync.
print("Filtering the extremes no_below=10, no_above=0.5 ...")
id2word.filter_extremes(no_below=10, no_above=0.5)
print(f"Length of the filter dictionary is {len(id2word)}")

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]
print(f"... done in {time() - t0}s.")

In [None]:
if not from_file:
    # Train one LDA model per entry of number_of_topics_list and persist each one.
    print("Applying LDA topic modelling ...")
    t0 = time()

    # Make sure the output folder exists before any training time is spent.
    models_directory = f"{data_directory}/models"
    os.makedirs(models_directory, exist_ok=True)

    lda_model_dict = {}
    for num_topics in number_of_topics_list:
        print(f"Looking for {num_topics} topics ...")
        t1 = time()
        lda_model = gensim.models.ldamulticore.LdaMulticore(
            corpus=corpus,
            workers=5,
            id2word=id2word,
            num_topics=num_topics,
            random_state=100,  # fixed seed so runs are repeatable
            chunksize=1000,
            iterations=1000,
            passes=10,
            # makes model[corpus] yield (doc_topics, word_topics, word_phis) per document
            per_word_topics=True,
        )
        lda_model_dict[num_topics] = lda_model
        print(f"... done in {time() - t1}s for {num_topics} topics.")

        # Save right after training: a failure while fitting a later model must not
        # throw away the models that are already finished.
        lda_model.save(f"{models_directory}/lda_topics_{num_topics}_exp_{experiment}")

    print(f"... all done in {time() - t0}s.")

In [None]:
# Alternative to training: load one previously saved model from disk.
if from_file:
    # NOTE(review): these assignments overwrite the notebook-level `num_topics` and
    # `experiment`, so the result files written further down are named after experiment 3.
    num_topics = 100
    experiment = 3
    lda_model_dict = {}
    lda_model_dict[num_topics] = LdaModel.load(f"{data_directory}/models/lda_topics_{num_topics}_exp_{experiment}")
    print("Load succesfully")

In [None]:
# For every available model, compute the dominant topic of each dream and write the
# resulting table to a ';'-separated CSV named after the topic count and the experiment.
print("Calculating the distribution between topics and dreams ...")
t0 = time()
for num_topics, lda_model in lda_model_dict.items():
    t1 = time()
    print(f"Formating topics, corpus and saving them for lda model with {num_topics} topics ...")
    df_topic_sents_keywords = format_topics_sentences(model=lda_model, corpus=corpus, texts=data, n_words=10)
    df_topic_sents_keywords.to_csv(f"{data_directory}/lda_topic_example_{num_topics}_exp_{experiment}.csv", sep=";", index=False)
    print(f"... done in {time() - t1}s.")
print(f"... all done in {time() - t0}s.")

In [None]:
# Report the c_v coherence of every model, computed from the words of all its topics
# against the cleaned texts.
print("Calculating the coherence values ...")
t0 = time()
for num_topics, model in lda_model_dict.items():
    # Keep only the words of each topic; the probabilities are not needed here.
    topics = [[word for word, prob in topic] for topicid, topic in model.show_topics(num_topics, formatted=False)]
    cm = CoherenceModel(topics=topics, texts=data_ready, dictionary=id2word, coherence='c_v',topn=10)
    #print(f"Coherende per topic {cm.get_coherence_per_topic()}")
    print(f"Coherence total for {num_topics} topics {cm.get_coherence()}")
print(f"... all done in {time() - t0}s.")

In [None]:
# Total elapsed time since `t_total` was set at the start of the run.
print(f"... all the work was done in {time() - t_total}s.")

In [None]:
# Join the tokens of the third cleaned document (index 2) into one comma-separated string.
long_string = ','.join(data_ready[2])
# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()

In [None]:
# NOTE(review): `hb` is not defined anywhere in the visible notebook, so evaluating it
# raises NameError and stops "Run All" at this cell. Presumably a deliberate stop marker
# placed before the exploratory cells below - confirm, and replace with an explicit guard.
hb

In [None]:
# Score a previously saved model (30 topics, experiment 6) against the current texts.
# The topic count is named once and reused: the message used to print the notebook-level
# `num_topics`, which holds whatever an earlier loop left behind, not 30.
temp_num_topics = 30
model_temp = LdaModel.load(f"{data_directory}/models/lda_topics_{temp_num_topics}_exp_6")
topics = [[word for word, prob in topic] for topicid, topic in model_temp.show_topics(temp_num_topics, formatted=False)]
cm = CoherenceModel(topics=topics, texts=data_ready, dictionary=id2word, coherence='c_v',topn=10)
print(f"Coherence total for {temp_num_topics} topics {cm.get_coherence()}")

## Vietnam y Phil

In [None]:
# Split the merged frame into the dreamer series examined in this section, selecting
# rows by the value of the `group` column.
df_vietnam = df[df['group'].isin([
    'Vietnam Vet: 1970-2008 war dreams',
    'Vietnam Vet: 2015 dreams',
    'Vietnam Vet: 2016-17 dreams',
])]
df_phil = df[df['group'].isin([
    'Phil 1: teens',
    'Phil 2: late 20s',
    'Phil 3: retirement',
])]
df_pegasus = df[df['group'].isin(['Pegasus: a factory worker'])]
df_norman = df[df['group'].isin(['Norman: a child molester'])]

# Report, for each subset, how many dreams it holds and the sum of its `words` column.
print(f"Se cuenta con {len(df_vietnam)} sueños de Vietnam. El corpus tiene {int(df_vietnam['words'].sum())} palabras.")
print(f"Se cuenta con {len(df_phil)} sueños de Phil, nuestro conjunto de control. El corpus tiene {int(df_phil['words'].sum())} palabras.")
print(f"Se cuenta con {len(df_pegasus)} sueños de Pegasus. El corpus tiene {int(df_pegasus['words'].sum())} palabras.")
print(f"Se cuenta con {len(df_norman)} sueños de Norman, nuestro conjunto de control. El corpus tiene {int(df_norman['words'].sum())} palabras.")

### NMF solo con hasta 15 tópicos

In [None]:
# Train one gensim NMF model per topic count on the same corpus and dictionary used for
# LDA, keeping the models in a dict keyed by number of topics.
number_of_topics_list_nmf = [10,15]
print("Applying NMF topic modelling ...")
t0 = time()
nmf_model_dict = {}
for num_topics in number_of_topics_list_nmf:
    print(f"Looking for {num_topics} topics ...")
    t1 = time()
    # random_state is fixed, matching the seed used for the LDA models.
    nmf_model = gensim.models.nmf.Nmf(corpus=corpus,
                                id2word=id2word,
                                num_topics=num_topics, 
                                random_state=100,
                                eval_every=5,
                                chunksize=10,
                                passes=10,
                                kappa=0.1)
    nmf_model_dict[num_topics] = nmf_model
    print(f"... done in {time() - t1}s for {num_topics} topics.")

print(f"... all done in {time() - t0}s.")

In [None]:
# For every document keep the id of its highest-weighted topic in the 10-topic NMF
# model, or -1 when the model assigns it no topic at all.
list_of_topics = []
for topic_distribution in nmf_model_dict[10][corpus]:
    if len(topic_distribution) > 0:
        # max() returns the first maximal (topic_id, weight) pair, which is the same
        # element a stable descending sort would put first.
        list_of_topics.append(max(topic_distribution, key=lambda pair: pair[1])[0])
    else:
        # An explicit empty check replaces the former bare `except`, which would also
        # have hidden genuine errors (and swallowed KeyboardInterrupt).
        list_of_topics.append(-1)

topics_df = pd.DataFrame(list_of_topics, columns=['topic_number'])
print("--------------------------------------------")
topics_df['topic_number'].value_counts()

In [None]:
# Coherence (c_npmi) of the 10-topic NMF model.
# `nmf` is the gensim module imported at the top of the notebook, not a trained model,
# so `nmf.show_topics` raised AttributeError. The 10-topic model is used here, the same
# one inspected when counting dominant topics.
# NOTE(review): switch the key to 15 to score the other trained model.
topics = [[word for word, prob in topic] for topicid, topic in nmf_model_dict[10].show_topics(num_topics=10, formatted=False)]
#cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=id2word, coherence='u_mass')
cm = CoherenceModel(topics=topics, texts=data_ready, dictionary=id2word, coherence='c_npmi')
print(f"Coherende per topic {cm.get_coherence_per_topic()}")
print(f"Coherence total {cm.get_coherence()}")

In [None]:
# Diagnostic: list the tokens that occur exactly 10 times in the whole corpus but are
# absent from the filtered dictionary, together with their corpus and document frequencies.
corpus_freqs = collections.Counter(itertools.chain.from_iterable(data_ready))
doc_freqs = collections.Counter(itertools.chain.from_iterable(set(doc) for doc in data_ready))
# Membership is tested against token2id (token -> id). The keys of id2token are integer
# ids, so testing a token string against it reported every token as missing.
missing = [token for token, freq in corpus_freqs.items() if freq == 10 and token not in id2word.token2id]
[(token, corpus_freqs[token], doc_freqs[token]) for token in missing]