In [2]:
"https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0"
"https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html"

import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import gensim

#Visualizations
#import plotly.express as px
#import seaborn as sns
#import pyLDAvis.gensim
#import chart_studio
#import chart_studio.plotly as py 
#import chart_studio.tools as tls

plt.style.use('seaborn')

#data_path = "data/santiago_covid_2020.csv"
#data_path = "data/santiago_enero.csv"
data_path = "data/gabrielboric.csv"



# spanish stop words
stop_words = set(stopwords.words('spanish'))
stop_words = ['t', 'si', 'q', 'https', 'co', 'solo', 'ser', 'bien', 
            'así', 'ma', 'mas', 'igual', 'va', 'después',
            'hacer', 'hace', 'creo'] + list(stop_words)



# **Métodos útiles**

In [3]:
def delete_chars(text, unwanted_chars):
    """
    useful method to replace a list of chars on text.
    return:
        (str) the same str without the chars in chars.
    """
    for char in unwanted_chars:
        text = text.replace(char, '')
    return text

def preprocess_tweet(tweet):
    """
    Replaces unwanted characters and performs
    a preprocessing.
    input: 
        (str) tweet.
    return:
        (str[]) final: list of words.
    """
    unwanted = ['#', ',', '.', '!', '?', '¿', '¡', '(',\
                ')', '-', '=', 'jaja', 'jajaja']
    final = delete_chars(tweet, unwanted).split()
    final = [w.lower() for w in final]
    final = [w for w in final if w not in stop_words and len(w) > 3]
    # Se eliminan links y @users
    final = [w for w in final if w[:4] != 'http']
    final = [w for w in final if w[:1] != '@']
    return final
    
# mini mini test
print(preprocess_tweet('##hola) ¿chao? =) jaja!'))


['hola', 'chao']


# **Tweets**

In [19]:
data = pd.read_csv(data_path)[['date', 'tweet']]
print('---------------------------------------------------------')
print('Sample: ', data.iloc[0]['tweet'])
print('---------------------------------------------------------')
print('Cantidad de tweets: ', len(data))
print('---------------------------------------------------------')

---------------------------------------------------------
Sample:  Hoy nos reunimos con el Rector de la @uvalpochile @ocorralesj para conversar sobre educación superior y los desafíos de las universidades estatales para el nuevo Chile. Nuestro gobierno buscará fortalecer la #educación pública de calidad.  https://t.co/aXvXMWoNPx
---------------------------------------------------------
Cantidad de tweets:  39405
---------------------------------------------------------


In [5]:
preprocess_tweet(data.iloc[0]['tweet'])

['reunimos',
 'rector',
 'conversar',
 'educación',
 'superior',
 'desafíos',
 'universidades',
 'estatales',
 'nuevo',
 'chile',
 'gobierno',
 'buscará',
 'fortalecer',
 'educación',
 'pública',
 'calidad']

# **Preprocess**

In [6]:
# Se pre-procesan los tweets: Esto transforma cada
# tweet en una colección de palabras.
# processed_tweets corresponde a una lista de listas de palabras.

processed_tweets = data['tweet'].map(preprocess_tweet)
processed_tweets.head()

# Se van a eliminar tweets pequeños: con menos de 5 palabras después
# del preprocessing.
dropers = []
for ind, tweet in enumerate(processed_tweets):
    if len(tweet) < 5:
        dropers.append(ind) 

processed_tweets = processed_tweets.drop(dropers)
processed_tweets.sample(5)

25036    [mauro, consulta, cubrieron, inscripción, ayer...
3527     [magallanes, región, aislada, geográficamente,...
39198    [seminario, "chile, bicentenario:, educación",...
24537    [grandes, empresarios, derecha, económica, pin...
34873    [obvio, "centro, plrural", jeje, posibilidades...
Name: tweet, dtype: object

In [7]:
# Se crea el vocabulario. 
# Corresponde a crear una lista con todas las palabras involucrada 
# en el corpus asignando un índice único a cada una.

dictionary = gensim.corpora.Dictionary(processed_tweets)

# Se quitan palabras que aparecen en menos de 20 tweets y las
# que aparecen en más del 50% del total de tweets (?).
dictionary.filter_extremes(no_below=20, no_above=0.5)

c = 0
for k, v in dictionary.iteritems():
    print(k,v)
    c += 1
    if c > 5: break
        
print('Largo del diccionario: ', len(dictionary))

0 calidad
1 chile
2 conversar
3 desafíos
4 educación
5 estatales
Largo del diccionario:  1853


In [8]:
# Se transforman las palabras a vectores con el dictionary.
# bow = "bag of words"

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_tweets]
print('tweet 0: ', bow_corpus[0])
print('tweet 1: ', bow_corpus[1])

# Con esto cada tweet se representa como una colección de tuplas (w, a) 
# donde w es el índice de la palabra y a la cantidad de apariciones en ese
# tweet.

tweet 0:  [(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)]
tweet 1:  [(14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1)]


In [9]:
# Se crea y usa el lda model
# Running LDA using Bag of Words

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 5
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary#.id2token

lda_model = LdaModel(
    corpus=bow_corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)
# LdaMulticore


In [10]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.019*"gobierno" + 0.017*"izquierda" + 0.011*"movimiento" + 0.011*"estudiantil" + 0.010*"hecho" + 0.010*"parte" + 0.010*"dice" + 0.009*"años" + 0.008*"mejor" + 0.008*"nacional"
Topic: 1 
Words: 0.065*"ahora" + 0.053*"magallanes" + 0.039*"vamos" + 0.030*"arenas" + 0.028*"punta" + 0.022*"vivo" + 0.020*"reunión" + 0.017*"radio" + 0.017*"noalalzadelgas" + 0.013*"región"
Topic: 2 
Words: 0.037*"proyecto" + 0.036*"senado" + 0.030*"comisión" + 0.024*"reforma" + 0.019*"caso" + 0.014*"pueden" + 0.013*"feliz" + 0.012*"senuniv" + 0.011*"quiere" + 0.011*"asamblea"
Topic: 3 
Words: 0.025*"educación" + 0.021*"política" + 0.015*"acuerdo" + 0.014*"estudiantes" + 0.012*"derecha" + 0.012*"puede" + 0.012*"debate" + 0.010*"fuerza" + 0.009*"problema" + 0.009*"gente"
Topic: 4 
Words: 0.041*"chile" + 0.024*"abrazo" + 0.022*"gracias" + 0.021*"buena" + 0.019*"mañana" + 0.019*"fech" + 0.017*"recomiendo" + 0.015*"aquí" + 0.015*"universidad" + 0.015*"columna"


In [11]:
from pprint import pprint

top_topics = lda_model.top_topics(bow_corpus) #, num_words=20)
pprint(top_topics)

[([(0.024766594, 'educación'),
   (0.021487072, 'política'),
   (0.015382866, 'acuerdo'),
   (0.013553142, 'estudiantes'),
   (0.01239962, 'derecha'),
   (0.012371122, 'puede'),
   (0.011599787, 'debate'),
   (0.009595124, 'fuerza'),
   (0.009045265, 'problema'),
   (0.008895954, 'gente'),
   (0.008856342, 'comparto'),
   (0.008484627, 'tema'),
   (0.008338762, 'debe'),
   (0.008141142, 'trabajadores'),
   (0.008140414, 'lucro'),
   (0.008033856, 'ayer'),
   (0.007570556, 'mismo'),
   (0.007371407, 'derecho'),
   (0.0071880505, 'cierto'),
   (0.007187807, 'propuesta')],
  -4.469048284996395),
 ([(0.040776458, 'chile'),
   (0.024006506, 'abrazo'),
   (0.022195345, 'gracias'),
   (0.02106733, 'buena'),
   (0.019345408, 'mañana'),
   (0.019023027, 'fech'),
   (0.017286511, 'recomiendo'),
   (0.014723864, 'aquí'),
   (0.014672941, 'universidad'),
   (0.014622271, 'columna'),
   (0.013414747, 'aguante'),
   (0.0128342565, 'muchas'),
   (0.012484553, 'seguir'),
   (0.012404797, 'entrevista')

In [12]:
# Visualizar:

#pyLDAvis.enable_notebook()
#pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)

In [13]:
#from gensim.models import CoherenceModel
# Compute Coherence Score
#coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_tweets, dictionary=dictionary, coherence='c_v')
#coherence_model_lda.get_coherence()