In [12]:
from my_weapon import *
from tqdm import tqdm_notebook as tqdm
import matplotlib
from collections import Counter

In [13]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spaCy pipeline used by the lemmatization step further down.
# The parser and NER components are disabled at load time; the cells visible
# in this notebook only read token.lemma_ and token.pos_.
import spacy
nlp = spacy.load('es', disable=['parser', 'ner'])

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Configure root logging with a timestamped format. The threshold is ERROR,
# so log records below that level are not emitted.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Hide DeprecationWarning messages in the cell outputs.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [14]:
import unicodedata

def normalize_lower(text):
    """Return ``text`` lower-cased and reduced to plain ASCII.

    The string is first decomposed with NFKD; every character that has no
    ASCII encoding afterwards (combining accents included) is discarded.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode().lower()

In [15]:
# Build the stop-word set used to filter corpus tokens and, later, lemmas.
# The file is opened in a ``with`` block so the handle is closed as soon as
# the JSON has been parsed. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open("data/spanish-stop-words.json", encoding="utf-8") as stop_words_file:
    stop_words = json.load(stop_words_file)["words"]

# Only the entries loaded from the file are accent-stripped and lower-cased;
# the hand-picked extras below are added verbatim (so "URL" and "RT" keep
# their upper-case spelling).
stop_words = [normalize_lower(w) for w in stop_words]
stop_words.extend([
    "rt", "…", "...", "URL", "http", "https", "“", "”", "‘", "’", "get", "2", "new", "one", "i'm", "make",
    "go", "good", "say", "says", "know", "day", "..", "take", "got", "1", "going", "4", "3", "two", "n",
    "like", "via", "u", "would", "still", "first", "that's", "look", "way", "last", "said", "let",
    "twitter", "ever", "always", "another", "many", "things", "may", "big", "come", "keep", "RT",
    # NOTE(review): "cound" looks like a typo for "could" — confirm before changing.
    "5", "time", "much", "_", "cound", "-", '"'
])
# Stand-alone punctuation tokens.
stop_words.extend([',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%', '|'])
# A set gives constant-time membership tests in the filtering loops.
stop_words = set(stop_words)

In [16]:
# Read the corpus file: each line becomes one document, split on whitespace.
# Stop words and single-character tokens are dropped.
texts_out = []
# ``with`` guarantees the file handle is closed once the loop finishes
# (or fails); iterating the handle keeps the read lazy, line by line.
with open("data/LDA_corpus.txt") as corpus_file:
    for line in tqdm(corpus_file):
        words = [w for w in line.strip().split() if w not in stop_words and len(w) > 1]
        texts_out.append(words)

#         # Create Dictionary
#         self.id2word = corpora.Dictionary(texts_out)

#         # Create Corpus
#         self.texts = texts_out

#         # Term Document Frequency
#         self.corpus = [self.id2word.doc2bow(text) for text in texts_out]

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [15]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
# nlp = spacy.load('en')
import re

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']):
    """Lemmatize tokenised documents, keeping only selected parts of speech.

    Each document is joined into a single string and passed through the
    module-level spaCy pipeline ``nlp``. Before parsing, ``#tag`` and
    ``@name`` are rewritten to purely alphabetic placeholders, and the
    placeholders are turned back into ``#`` / ``@`` on the resulting lemmas
    (presumably so the tokenizer does not split the prefix off — confirm).

    POS tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        Documents, each given as a list of tokens.
    allowed_postags : container of str
        Coarse POS tags (``token.pos_``) whose lemmas are kept. The default
        list is only read, never modified.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order. Empty lemmas
        and lemmas found in the module-level ``stop_words`` are dropped.
    """
    texts_out = []
    for sent in tqdm(texts):
        sent = " ".join(sent)
        # Protect hashtags / mentions with alphabetic placeholders.
        sent = re.sub(r'#(\w+)', r'itstopiczzz\1', sent)
        sent = re.sub(r'@(\w+)', r'itsmentionzzz\1', sent)
        doc = nlp(sent)

        lemmas = [
            token.lemma_
            for token in doc
            if token.lemma_
            and token.pos_ in allowed_postags
            and token.lemma_ not in stop_words
        ]

        # Restore the original '#' / '@' prefixes.
        texts_out.append([
            lemma.replace('itstopiczzz', '#').replace('itsmentionzzz', '@')
            for lemma in lemmas
        ])

    return texts_out

# NOTE: this rebinds ``texts_out`` to the lemmatized result, so running this
# line a second time would lemmatize the already-lemmatized documents.
texts_out = lemmatization(texts_out)

In [17]:
# Token <-> integer-id mapping built from the lemmatized documents.
id2word = corpora.Dictionary(texts_out)

# The same documents are kept under ``texts`` for the coherence computation.
texts = texts_out

# Bag-of-words corpus: doc2bow is applied to every document in order.
corpus = list(map(id2word.doc2bow, texts))

# Show the first document only, as (token_id, count) pairs.
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]]


In [7]:
# Load a previously saved LDA model from disk rather than training one here.
from gensim.models import CoherenceModel, LdaModel

lda_model = LdaModel.load("data/LDA-5.mod")
# To train a new model instead, use the line below in place of the load above:
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=7, chunksize=1000, random_state=43)

In [18]:
# Display each topic as a (topic_id, "weight*term + ...") pair.
lda_model.print_topics()

[(0,
  '0.079*"lopez-gatell" + 0.028*"pandemia" + 0.024*"cubrebocas" + 0.023*"dr" + 0.017*"salud" + 0.015*"@sinlineamx" + 0.015*"vacuna" + 0.013*"carta" + 0.012*"obrador" + 0.012*"@lillytellez"'),
 (1,
  '0.043*"hugo" + 0.018*"te" + 0.012*"tu" + 0.012*"mexicanos" + 0.010*"cul" + 0.010*"@brozoxmiswebs" + 0.010*"peda" + 0.010*"envie" + 0.008*"@nachorgz" + 0.007*"cientifico"'),
 (2,
  '0.172*"lopez" + 0.081*"mexico" + 0.048*"gobernadores" + 0.037*"covid" + 0.033*"19" + 0.026*"@rochaperiodista" + 0.021*"COVID" + 0.019*"fernandez" + 0.013*"personas" + 0.011*"hora"'),
 (3,
  '0.038*"presidente" + 0.019*"dias" + 0.016*"comercial" + 0.016*"casa" + 0.016*"@lopezobrador_" + 0.015*"q" + 0.013*"peores" + 0.013*"marzo" + 0.011*"estrategia" + 0.011*"|"'),
 (4,
  '0.191*"gatell" + 0.025*"renuncia" + 0.023*"mil" + 0.022*"muertos" + 0.021*"culpa" + 0.020*"A" + 0.019*"Y" + 0.015*"muertes" + 0.014*"DE" + 0.013*"piden"'),
 (5,
  '0.030*"fiesta" + 0.028*"super" + 0.024*"frente" + 0.020*"p" + 0.020*"ah" + 0

In [None]:
# Per-word likelihood bound of the model on ``corpus``.
# NOTE(review): per the gensim documentation, log_perplexity() returns this
# bound, not the perplexity itself (perplexity = 2 ** -bound), so a value
# closer to zero is the better one — confirm before comparing models.
print('Perplexity: ', lda_model.log_perplexity(corpus))

# c_v topic coherence of the model, scored against the lemmatized documents.
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Perplexity:  -18.37148673763343


In [19]:
# Visualize the topics with pyLDAvis.
# enable_notebook() turns on in-notebook display; prepare() builds the
# visualisation data from the model, the bag-of-words corpus and the dictionary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

In [None]:
# Export the prepared visualisation to an HTML file under data/.
pyLDAvis.save_html(vis, 'data/lda.html')

In [None]:
# Bare expression: the prepared visualisation becomes this cell's output.
vis

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=None):
    """
    Train one LDA model per candidate topic count and score its c_v coherence.

    Candidate topic counts are ``range(start, limit, step)``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
        Used both to train each model and to score its coherence.
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try
    step : Increment between successive numbers of topics
    random_state : Optional seed forwarded to every LdaModel so runs can be
        reproduced; the default ``None`` leaves seeding to gensim.

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in tqdm(range(start, limit, step)):
        # Train with the dictionary that was passed in, not a notebook global,
        # so the model and its coherence score always share one vocabulary.
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Print the coherence score obtained for each candidate number of topics.
# NOTE(review): ``x`` and ``coherence_values`` are not assigned in any cell
# visible here — presumably ``x = range(start, limit, step)`` and the second
# return value of compute_coherence_values(); confirm on a fresh kernel.
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Pick one of the trained candidate models and print its topics.
# NOTE(review): ``model_list`` is not assigned at notebook level in the cells
# visible here (it is only a local inside compute_coherence_values), and the
# index 3 is hard-coded — confirm which topic count it corresponds to.
optimal_model = model_list[3]
# NOTE(review): ``model_topics`` is not read again in the cells visible here.
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
# Collect the coherence scores recorded per topic count in a saved results
# file. Relevant lines look like:
#     Num Topics = 20  has Coherence Value of 0.3005
# Lines that do not start with "Num" are skipped.
lda_rst = {}

# ``with`` closes the file handle once parsing is done (or fails).
with open("data/lda_rst.txt") as rst_file:
    for line in rst_file:
        if not line.startswith("Num"):
            continue
        w = line.strip().split()
        N = int(w[3])        # fourth field: number of topics
        score = float(w[-1])  # last field: coherence value
        # One list of scores per topic count; several runs may share an N.
        lda_rst.setdefault(N, []).append(score)
# print(lda_rst)

In [None]:
# Flatten {N: [score, ...]} into one record per (N, score) pair.
new_lda_rst = [
    {"N": N, "Coherence": s}
    for N, scores in lda_rst.items()
    for s in scores
]

In [None]:
# Turn the records into a DataFrame with columns "N" and "Coherence".
# Note that this rebinds ``new_lda_rst`` from a list of dicts to a DataFrame.
new_lda_rst = pd.DataFrame(new_lda_rst)

In [None]:
# Plot coherence against the number of topics.
# NOTE(review): ``sns`` is not imported in any cell visible here — presumably
# seaborn arrives via ``from my_weapon import *``; confirm.
plt.figure(figsize=(9, 6))
sns.lineplot(x="N", y="Coherence", data=new_lda_rst)

In [None]:
# After choosing the best model: gather every coherence score written to the
# log file, i.e. the last field of each line starting with "Coherence Score: ".

lda_cohen = []

# ``with`` closes the file handle once the scan is done (or fails).
with open("lda_loglog.txt") as log_file:
    for line in log_file:
        if line.startswith("Coherence Score: "):
            lda_cohen.append(float(line.strip().split()[-1]))

In [None]:
# Highest coherence score found in the log (max() raises ValueError if
# ``lda_cohen`` is empty, i.e. no matching line was found).
max(lda_cohen)

In [None]:
# Load the model selected as best. This rebinds ``lda_model``, replacing the
# model loaded earlier in the notebook.
from gensim.models import CoherenceModel, LdaModel

lda_model = LdaModel.load("model/lda-ira-78.mod") # best model!