In [34]:
#####################################################################################################################
## LDA-COHERENCE-ZILLA  
## Fuentes : https://markroxor.github.io/gensim/static/notebooks/topic_coherence_tutorial.html
##             https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d
##             http://phusewiki.org/wiki/images/c/c9/Weizhong_Presentation_CDER_Nov_9th.pdf
##             The Art and Science of Analyzing Sotfware Data, Christian Bird et al
##             https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html
##             https://www.quora.com/What-should-I-clean-from-text-before-LDA
##             https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
##             https://github.com/derekgreene/topic-model-tutorial/blob/master/3%20-%20Parameter%20Selection%20for%20NMF.ipynb
##             https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
##             https://radimrehurek.com/gensim/models/ldamodel.html
##             https://radimrehurek.com/gensim/models/coherencemodel.html
##             https://towardsdatascience.com/light-on-math-machine-learning-intuitive-guide-to-latent-dirichlet-allocation-437c81220158
##             https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0
##             https://towardsdatascience.com/latent-dirichlet-allocation-lda-9d1cd064ffa2
##             https://stats.stackexchange.com/questions/59684/what-are-typical-values-to-use-for-alpha-and-beta-in-latent-dirichlet-allocation
##             https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
##             https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6
#############################################by#######JMG############2020################################################


import pandas as pd
import numpy as np
import re
import gensim.corpora as corpora
import gensim
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import pyLDAvis.gensim
import pickle 
import pyLDAvis




In [45]:
def remove_stopwords(texts):
    nltk.download('stopwords')
    stop_words = stopwords.words('spanish')
    newStopWords = ['coronavirus','chile','casos','detalles','agrego','embargo','forma','semanas','fecha','francia','espana','cada','agregó','parte','caso','dias','respecto','momento','covid','ascom','tikitakas','personas','tipo','tenía','jorge','habrá','habra','medios','perú','casa','cuales',
                'marzo','abril','lunes','semana','despues','hora','también','tambien','martes','radio','lugar','lugares','anuncio','anunciar',
                'miércoles','jueves','viernes','sábado','domingo','enero','comentarios','reservamos','febrero','italia','según','segun','horas','aquí','virus','veces','radio','eliodoro', 'yanez', 'providencia',
                 'reservamos','derecho','fabiola','shtml','pictwittercom','arrow','biobiochile','https_www','bloquear','april','usuarios','agresivos','commons','noticias','indique','contrario','eliminar','comentarios','debate','espacio','wordpress','desarrollado','detallo','estime','conveniente','manteniendo',]
    stop_words.extend(newStopWords)    
    
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts,bigram_mod):
    return [bigram_mod[doc] for doc in texts]


def to_array(texts):
    arreglo_v=[]
    for docv in range(len(texts)):
        for wordv in data_lemmatized[docv].split():
            if wordv is None:
                pass
            else:
                arreglo_v.append(wordv)
    return arreglo_v
def lemma_sustantivos(texts):
    nlp1 = spacy.load("es_core_news_sm")
    text_out = []
    for sent in texts:
        doc = nlp1(" ".join(sent)) 
        text_out.append(" ".join([token.text for token in doc.noun_chunks])) 
    return text_out

def coherence_zilla(corpus,dictionary,dataset,texts,tmin,tmax):
    
    min_topics = tmin
    max_topics = tmax
    step_size = 1
    topics_range = range(min_topics, max_topics, step_size)
    alpha = list(np.arange(0.01, 1, 0.3))
    beta  = list(np.arange(0.01,0.1, 0.03))
    num_of_docs = len(corpus)
    max_coherence = 0
    best_model = dict();

    corpus_sets = gensim.utils.ClippedCorpus(corpus, num_of_docs*0.8),
    for k in topics_range:
        print(k)
        for a in alpha:
            for b in beta:
                lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=50,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           per_word_topics=True)
                              
                coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus,texts=texts, coherence='c_v')
                cv = coherence_model_lda.get_coherence()
                if max_coherence < cv:
                    max_coherence=cv
                    best_alpha=a
                    best_beta=b
                    best_topics=k
                    print(max_coherence)
                else:
                    pass
         
                    
      
    best_model['coherence'] = max_coherence
    best_model['alpha']   = best_alpha
    best_model['beta'] = best_beta
    best_model['topics'] =  best_topics
    
    return best_model

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
def pre_proceso(df_corpus):
    datos= df_corpus.values.tolist()
    datos_palabras= list(sent_to_words(datos))
    
    bigram = gensim.models.Phrases(datos_palabras,min_count=5) 
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    data_words_nostops = remove_stopwords(datos_palabras)
    data_words_bigrams = make_bigrams(data_words_nostops,bigram_mod)
    data_lemmatized =  lemma_sustantivos(data_words_bigrams)
    return data_lemmatized
            

In [46]:
df_file = pd.read_csv('cleaning_contenido3.csv')

In [47]:
#LIMPIEZA
df_corpus= df_file
df_corpus=df_corpus[1:]
df_corpus=df_corpus.dropna()
df_corpus['contenido'] = df_corpus['contenido'].map(lambda x: re.sub('\s+', ' ', x))
df_corpus['contenido'] = df_corpus['contenido'].map(lambda x: re.sub('[_,";¿*&()=:+#\|@.!?]', ' ', x))
df_corpus['contenido'] = df_corpus['contenido'].map(lambda x: x.lower())
palabras_cortas = re.compile(r'\W*\b\w{1,3}\b')
df_corpus['contenido'] = df_corpus['contenido'].map(lambda x: palabras_cortas.sub('', x))

df_corpus['titulos'] = df_corpus['titulos'].map(lambda x: re.sub('\s+', ' ', x))
df_corpus['titulos'] = df_corpus['titulos'].map(lambda x: re.sub('[_,";¿*&()=:+#\|@.!?]', ' ', x))
df_corpus['titulos'] = df_corpus['titulos'].map(lambda x: x.lower())
palabras_cortas = re.compile(r'\W*\b\w{1,3}\b')
df_corpus['titulos'] = df_corpus['titulos'].map(lambda x: palabras_cortas.sub('', x))
df_corpus['contenido'] = df_corpus['titulos']+" "+df_corpus['contenido']




  df_corpus['contenido'] = df_corpus['contenido'].map(lambda x: re.sub('\s+', ' ', x))
  df_corpus['contenido'] = df_corpus['contenido'].map(lambda x: re.sub('[_,";¿*&()=:+#\|@.!?]', ' ', x))
  df_corpus['titulos'] = df_corpus['titulos'].map(lambda x: re.sub('\s+', ' ', x))
  df_corpus['titulos'] = df_corpus['titulos'].map(lambda x: re.sub('[_,";¿*&()=:+#\|@.!?]', ' ', x))


In [43]:
df_semana_1 = df_corpus[(df_corpus['fecha publicacion'] >= '2020-03-01') & (df_corpus['fecha publicacion'] <= '2020-03-23')]
df_semana_2 = df_corpus[(df_corpus['fecha publicacion'] > '2020-03-23') & (df_corpus['fecha publicacion'] <= '2020-03-30')]
df_semana_3 = df_corpus[(df_corpus['fecha publicacion'] > '2020-03-30') & (df_corpus['fecha publicacion'] <= '2020-04-07')]
df_semana_4 = df_corpus[(df_corpus['fecha publicacion'] > '2020-04-07') & (df_corpus['fecha publicacion'] <= '2020-04-14')]

semanas = [df_semana_1,df_semana_2,df_semana_3,df_semana_4]
s=1
best_model = []
best_corpus=[]
best_id2word=[]
for semana in semanas:
    print(s)
    s_lemma = pre_proceso(semana)
    dataset = [d.split() for d in s_lemma]
    id2word = corpora.Dictionary(dataset)
    texts = dataset
    corpus = [id2word.doc2bow(text) for text in texts]
    best_model_param = coherence_zilla(corpus,id2word,dataset,texts)
    best_corpus.append(corpus)
    best_id2word.append(id2word)
    best_model.append(best_model_param)
    s+=1


1


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jmerc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)
  return _unpackb(packed, **kwargs)


10
11
12


UnboundLocalError: local variable 'best_alpha' referenced before assignment

In [26]:
############# Visualización - Semana 1
s1_model=best_model[0]
lda_model_s1 = gensim.models.LdaMulticore(corpus=best_corpus[0],
                                       id2word=best_id2word[0],
                                       num_topics=s1_model.get('topics'), 
                                       alpha=s1_model.get('alpha'), 
                                       eta=s1_model.get('beta'), 
                                       random_state=100,
                                       chunksize=50,
                                       passes=10,
                                       per_word_topics=True)
    
doc_lda = lda_modelg[corpus]
pyLDAvis.enable_notebook()
LDAvis_prepared_1 = pyLDAvis.gensim.prepare(lda_model_s1, best_corpus[0], best_id2word[0])
LDAvis_prepared_1






In [27]:
############# Visualización - Semana 2
s2_model=best_model[1]
lda_model_s2 = gensim.models.LdaMulticore(corpus=best_corpus[1],
                                       id2word=best_id2word[1],
                                       num_topics=s2_model.get('topics'), 
                                       alpha=s2_model.get('alpha'), 
                                       eta=s2_model.get('beta'), 
                                       random_state=100,
                                       chunksize=50,
                                       passes=10,
                                       per_word_topics=True)
    
doc_lda = lda_modelg[corpus]

LDAvis_prepared_2 = pyLDAvis.gensim.prepare(lda_model_s2, best_corpus[1], best_id2word[1])
LDAvis_prepared_2

IndexError: list index out of range

In [None]:
############# Visualización - Semana 3
s3_model=best_model[2]
lda_model_s3 = gensim.models.LdaMulticore(corpus=best_corpus[2],
                                       id2word=best_id2word[2],
                                       num_topics=s3_model.get('topics'), 
                                       alpha=s3_model.get('alpha'), 
                                       eta=s3_model.get('beta'), 
                                       random_state=100,
                                       chunksize=50,
                                       passes=10,
                                       per_word_topics=True)
    
doc_lda = lda_modelg[corpus]

LDAvis_prepared_3 = pyLDAvis.gensim.prepare(lda_model_s3, best_corpus[2], best_id2word[2])
LDAvis_prepared_3

In [None]:
############# Visualización - Semana 4
s4_model=best_model[3]
lda_model_s4 = gensim.models.LdaMulticore(corpus=best_corpus[3],
                                       id2word=best_id2word[3],
                                       num_topics=s4_model.get('topics'), 
                                       alpha=s4_model.get('alpha'), 
                                       eta=s4_model.get('beta'), 
                                       random_state=100,
                                       chunksize=50,
                                       passes=10,
                                       per_word_topics=True)
    
doc_lda = lda_modelg[corpus]

LDAvis_prepared_4 = pyLDAvis.gensim.prepare(lda_model_s4, best_corpus[3], best_id2word[3])
LDAvis_prepared_4