In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from pymongo import MongoClient
from nltk.tokenize import TweetTokenizer

import gensim
import pandas as pd
from gensim.models.ldamodel import LdaModel
from gensim import corpora, models
from stop_words import get_stop_words
import snowballstemmer

In [2]:
try:
    client = MongoClient()
    print("Connected to MongoDB\n")
except pymongo.errors.ConnectionFailure as e:
    print("Could not connect to MongoDB",e)

Connected to MongoDB



In [3]:
db = client.sept19_26_keywords_db
tweets = db.sept19_26_keywords_collection

In [4]:
tweets.find().count()

40665

# Tokenizer for Tweets

In [5]:
tknzr = TweetTokenizer(preserve_case=False,       # Convertir a minúsculas
                       reduce_len=True,           # Reducir caracteres repetidos
                       strip_handles=False)       # Mostrar @usuarios

# Create Stemmer of class Snowball

In [6]:
stmmr = snowballstemmer.stemmer('Spanish')

# Stopword

In [7]:
spec_chars = ["…",'"',"“","/","(",")","[","]","?","¿","!","¡",
                 "rt",":","…",",","\n","#","\t","",".","$",
                 "...","-","🤢"]

# create English stop words list
en_stop = get_stop_words('es')

In [8]:
def clean_tweet(tmp_tweet):
    """
    Eliminar tokens que:
    - Estén dentro de lista_de_paro.
    - Sean ligas.
    - Si es una mención i.e @potus, se cambia por token genérico @usuario.
    """
    return [_ for _ in tmp_tweet 
            if _ not in spec_chars 
            and not _.startswith(('http', 'htt'))]

In [9]:
nuevos_tweets = []
for i in tweets.find():
    if "retweeted_status" in i:           # Si es retweet...
        tokens = tknzr.tokenize(i["retweeted_status"]['text'])
        stopped_tokens = [i for i in tokens
                         if not i in en_stop]
        stemmed_tokens = clean_tweet(stopped_tokens)
        nuevos_tweets.append(stemmed_tokens)

    else:                                # Si no es retweet...
        tokens = tknzr.tokenize(i['text'])
        stopped_tokens = [i for i in tokens
                         if not i in en_stop]
        stemmed_tokens = clean_tweet(stopped_tokens)
        nuevos_tweets.append(stemmed_tokens)


In [10]:
df = pd.DataFrame()

In [11]:
df["tweet"] = nuevos_tweets

In [12]:
df.head(100)

Unnamed: 0,tweet
0,"[sabe, hacer, caso, sismo, aquí, siguientes, r..."
1,"[@licalbarran, cada, ser, humano, ayuda, crece..."
2,"[bien, quieres, cerca, necesitas, ahora, olvid..."
3,"[mañana, llevará, cabo, #simulacrocdmx, 11:00,..."
4,"[suspende, activación, #alertasísmica, #simula..."
5,"[#yosoymara, cansada, tener, miedo, simple, he..."
6,"[centro, acopio, ayuda, damnificados, sismo, p..."
7,"[favor, acuérdense, mañana, simulacro, sismo, ..."
8,"[@andaestadiorg, centro, acopio, apoyar, pobla..."
9,"[#comunicado, suspende, activación, alerta, sí..."


# turn our tokenized documents into a id <-> term dictionary

In [13]:
dictionary = corpora.Dictionary(df["tweet"])

# convert tokenized documents into a document-term matrix

In [15]:
corpus = [dictionary.doc2bow(text) for text in df["tweet"]]

# generate LDA model

In [16]:
ldamodel = LdaModel(corpus,
                    num_topics = 20,
                    id2word = dictionary,
                    passes = 20,
                    minimum_probability = 0.05)

In [17]:
print(ldamodel)

LdaModel(num_terms=37385, num_topics=20, decay=0.5, chunksize=2000)


In [18]:
print(ldamodel.print_topics())

[(0, '0.118*"sismo" + 0.029*"#fuerzamexico" + 0.021*"tras" + 0.020*"#mexicounido" + 0.020*"después" + 0.013*"méxico" + 0.012*"septiembre" + 0.011*"hoy" + 0.011*"ser" + 0.010*"nuevo"'), (1, '0.174*"#fuerzamexico" + 0.103*"#sismo" + 0.049*"#cdmx" + 0.034*"#mexicoestadepie" + 0.027*"#sismocdmx" + 0.021*"#mexico" + 0.019*"#sismomexico2017" + 0.015*"#prayformexico" + 0.014*"#fuertemexico" + 0.012*".."'), (2, '0.155*"🇲" + 0.154*"🇽" + 0.079*"#fuerzamexico" + 0.016*"💪" + 0.014*"#mexicotequiero" + 0.013*"méxico" + 0.012*"🏼" + 0.012*"siempre" + 0.012*"️" + 0.010*"si"'), (3, '0.039*"víveres" + 0.023*"#sismo" + 0.022*"seguimos" + 0.018*"|" + 0.017*"recibiendo" + 0.016*"personas" + 0.015*"apoyo" + 0.014*"hola" + 0.013*"damnificados" + 0.013*"siguen"'), (4, '0.045*"ayuda" + 0.030*"️" + 0.028*"#fuerzaméxico" + 0.027*"d" + 0.021*"🚨" + 0.018*"🙌" + 0.018*"difundir" + 0.017*"⚠" + 0.012*"mil" + 0.012*"m"'), (5, '0.066*"sísmica" + 0.064*"alerta" + 0.063*"gracias" + 0.030*"alarma" + 0.018*"parque" + 0.016*"

In [21]:
ldamodel.save(fname="ldamodel")