# Punto 1 - LDA Approach

In [6]:
import json
import os
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
import ssl
from sklearn.feature_extraction.text import CountVectorizer
import pickle as pk

In [7]:
italian_path = 'datasets/italian/italian_out_hash.json'
spanish_path = 'datasets/spanish/spanish_out_hash.json'
english_path = 'datasets/english/english_out_hash.json'

In [8]:
try:
     _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
     pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')#%% md

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/isabelasarmiento/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/isabelasarmiento/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/isabelasarmiento/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## English

In [None]:
# english_path
file = open(english_path, 'r')
data = []
for line in file:
    data.append(json.loads(line))
english_raw = pd.json_normalize(data)

print(english_raw.shape)
english_raw.head(10)

In [None]:
'''
Turn text to Numpy Array
'''
texts_column = english_raw.loc[:,'text']
raw_texts = texts_column.values
raw_texts[0]


## Reading Files

In [None]:
stemmer = SnowballStemmer("english")
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))

    return result

In [None]:
processed_docs = []

for doc in raw_texts:
    processed_docs.append(preprocess(doc))

In [None]:
'''
Preview 'processed_docs'
'''
print(processed_docs[:10])

In [None]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

In [None]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


In [None]:
# LDA multicore
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model =  gensim.models.LdaMulticore(bow_corpus,
                                   num_topics = 8,
                                   id2word = dictionary,
                                   passes = 10,
                                   workers = 2)

In [None]:
with open("results_punto1/eng_lda", "wb") as output_file:
    pk.dump(lda_model, output_file)

with open("results_punto1/eng_lda", "rb") as output_file:
    lda_model = pk.load(output_file)

In [5]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.040*"mask" + 0.030*"spread" + 0.020*"wear" + 0.019*"face" + 0.015*"social" + 0.012*"hous" + 0.011*"question" + 0.011*"white" + 0.010*"hand" + 0.010*"stop"


Topic: 1 
Words: 0.047*"trump" + 0.023*"peopl" + 0.021*"realdonaldtrump" + 0.017*"know" + 0.014*"think" + 0.013*"presid" + 0.013*"say" + 0.012*"go" + 0.012*"american" + 0.010*"die"


Topic: 2 
Words: 0.024*"like" + 0.018*"thank" + 0.018*"good" + 0.016*"virus" + 0.014*"time" + 0.014*"look" + 0.014*"peopl" + 0.012*"come" + 0.011*"news" + 0.009*"hope"


Topic: 3 
Words: 0.018*"pandem" + 0.015*"school" + 0.013*"work" + 0.010*"stay" + 0.009*"home" + 0.009*"chang" + 0.008*"student" + 0.008*"communiti" + 0.008*"impact" + 0.008*"safe"


Topic: 4 
Words: 0.018*"pandem" + 0.013*"busi" + 0.011*"help" + 0.010*"govern" + 0.009*"crisi" + 0.009*"support" + 0.009*"need" + 0.008*"economi" + 0.008*"relief" + 0.008*"fund"


Topic: 5 
Words: 0.073*"vaccin" + 0.020*"health" + 0.014*"patient" + 0.012*"say" + 0.011*"studi" + 0.010*"hos

## Spanish

In [4]:
# spanish_path
file = open(spanish_path, 'r')
data = []
for line in file:
    data.append(json.loads(line))
text_raw = pd.json_normalize(data)

print(text_raw.shape)
text_raw.head(10)

(886351, 4)


Unnamed: 0,id,publication_date,source,text
0,1304391319972790274,1599826000.0,twitter,ORACIÓN DIARIAViernes 11 de Septiembre 2020#or...
1,1308823479077175297,1600883000.0,twitter,Se pudrió todo. Acá la FIFA debe castigar a la...
2,1255929719893221377,1588272000.0,twitter,Última Publicación en la Prensaldia - @ca...
3,1367024221319286784,1614759000.0,twitter,Australia = 0 positivos por coronavirus.¿Vacun...
4,1278689023087849480,1593698000.0,twitter,coronavirus esto ya es personal
5,1374234992969015297,1616478000.0,twitter,"#LadyZopilota, zopiloteando en la noticia."
6,1296604758732570625,1597970000.0,twitter,La noticia que esperaban los mercados. Gracias...
7,1344926278592438272,1609490000.0,twitter,No caigamos en la trampa.En México ya iniciaro...
8,1311046706809786372,1601413000.0,twitter,El coronavirus se ha confirmado ya en más de 2...
9,1257004539875667971,1588528000.0,twitter,@JaimeChincha @RPPNoticias Mi padre acaba de m...


In [5]:
'''
Turn text to Numpy Array
'''
texts_column = text_raw.loc[:,'text']
raw_texts = texts_column.values
raw_texts[0]

'ORACIÓN DIARIAViernes 11 de Septiembre 2020#oraciondiaria #11DeSeptiembre #BuenosDias #BuenosDiasATodos… '

In [7]:
stemmer = SnowballStemmer("spanish")
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))

    return result

In [8]:
processed_docs = []

for doc in raw_texts:
    processed_docs.append(preprocess(doc))

In [9]:
'''
Preview 'processed_docs'
'''
print(processed_docs[:10])

[['oracion', 'diariaviern', 'septiembr', 'oraciondiari', 'deseptiembr', 'buenosdi'], ['pudr', 'tod', 'fif', 'deb', 'castig', 'conmebol', 'flameng'], ['ultim', 'public', 'prensaldi', 'carlosarthurfot', 'quedateencas', 'yaquedamen', 'yomequedoencas'], ['australi', 'posit', 'coronavirus', 'vacun', 'magi', 'cangur', 'cerr', 'fronter', 'durant', 'confin'], ['coronavirus', 'esto', 'personal'], ['ladyzopilot', 'zopilot', 'notici'], ['notici', 'esper', 'merc', 'graci', 'lanacion'], ['caig', 'tramp', 'mexic', 'inici', 'escane', 'codig', 'par', 'entrar', 'tiend', 'cuid'], ['coronavirus', 'confirm', 'person', 'navarr', 'confin', 'restriccion', 'viruschin'], ['jaimechinch', 'rppnotici', 'padr', 'acab', 'mor', 'caus', 'coronavirus', 'sol', 'dig', 'aprovech', 'cad', 'minut']]


In [10]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

In [11]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [12]:
# LDA multicore
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO

num_topics = 8

lda_model =  gensim.models.LdaMulticore(bow_corpus,
                                   num_topics = num_topics,
                                   id2word = dictionary,
                                   passes = 10,
                                   workers = 2)

In [13]:
with open("results_punto1/es_lda", "wb") as output_file:
    pk.dump(lda_model, output_file)

with open("results_punto1/es_lda", "rb") as output_file:
    lda_model = pk.load(output_file)

Topic: 0 
Words: 0.046*"coronavirus" + 0.020*"com" + 0.018*"per" + 0.016*"hac" + 0.015*"par" + 0.015*"esta" + 0.014*"tod" + 0.013*"esto" + 0.012*"tien" + 0.012*"pas"


Topic: 1 
Words: 0.059*"par" + 0.051*"cov" + 0.016*"med" + 0.016*"salud" + 0.011*"ante" + 0.010*"coronavirus" + 0.009*"trabaj" + 0.009*"prueb" + 0.009*"sanitari" + 0.008*"com"


Topic: 2 
Words: 0.085*"coronavirus" + 0.027*"cov" + 0.015*"chin" + 0.015*"pandemi" + 0.012*"nuev" + 0.011*"estad" + 0.010*"unid" + 0.009*"españ" + 0.008*"crisis" + 0.008*"mund"


Topic: 3 
Words: 0.057*"cov" + 0.024*"coronavirus" + 0.022*"pacient" + 0.020*"hospital" + 0.020*"posit" + 0.015*"medic" + 0.010*"president" + 0.009*"sintom" + 0.009*"mur" + 0.008*"años"


Topic: 4 
Words: 0.072*"vacun" + 0.068*"cov" + 0.064*"contr" + 0.033*"par" + 0.022*"coronavirus" + 0.010*"millon" + 0.009*"primer" + 0.008*"luch" + 0.008*"recib" + 0.007*"investig"


Topic: 5 
Words: 0.029*"tod" + 0.024*"cov" + 0.021*"esta" + 0.013*"nuestr" + 0.013*"graci" + 0.012*"par

In [None]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

In [17]:
import pickle

'''
Guardarlo en  una variable especifica
'''
def savePickle(file,obj):
    outfile = open(file,'wb')
    pickle.dump(obj,outfile)
    outfile.close()

  and should_run_async(code)


In [18]:
'''
Guardarlo en  una variable especifica
'''

ita_lda_model = lda_model
savePickle("results_punto1/italian_lda", ita_lda_model )

ita_bow_corpus = bow_corpus
savePickle("results_punto1/ita_bow_corpus", ita_bow_corpus )

ita_dictionary = dictionary
savePickle("results_punto1/ita_dictionary", ita_dictionary )

  and should_run_async(code)


###  Graphical Representantion

In [30]:
import pickle
import pyLDAvis
import os
import pyLDAvis.gensim_models
#import pyLDAvis.gensim

# Visualize the topics
pyLDAvis.enable_notebook()
num_topics = 8

LDAvis_data_filepath = os.path.join('./results_punto1/ldavis_tuned_'+str(num_topics))

# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, './results_punto1/ldavis_tuned_'+ str(num_topics) +'.html')

LDAvis_prepared

  and should_run_async(code)


## Italian

In [None]:
# spanish_path
file = open(italian_path, 'r')
data = []
for line in file:
    data.append(json.loads(line))
text_raw = pd.json_normalize(data)

print(text_raw.shape)
text_raw.head(10)

In [None]:
'''
Turn text to Numpy Array
'''
texts_column = text_raw.loc[:,'text']
raw_texts = texts_column.values
raw_texts[0]

In [None]:
stemmer = SnowballStemmer("italian")
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))

    return result

In [None]:
processed_docs = []

for doc in raw_texts:
    processed_docs.append(preprocess(doc))

In [None]:
'''
Preview 'processed_docs'
'''
print(processed_docs[:10])

In [None]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

In [None]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [None]:
# LDA multicore
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model =  gensim.models.LdaMulticore(bow_corpus,
                                   num_topics = 8,
                                   id2word = dictionary,
                                   passes = 10,
                                   workers = 2)

In [2]:
with open("results_punto1/ita_lda", "wb") as output_file:
    pk.dump(lda_model, output_file)

with open("results_punto1/ita_lda", "rb") as output_file:
    lda_model = pk.load(output_file)

In [4]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.046*"coronavirus" + 0.020*"com" + 0.018*"per" + 0.016*"hac" + 0.015*"par" + 0.015*"esta" + 0.014*"tod" + 0.013*"esto" + 0.012*"tien" + 0.012*"pas"


Topic: 1 
Words: 0.059*"par" + 0.051*"cov" + 0.016*"med" + 0.016*"salud" + 0.011*"ante" + 0.010*"coronavirus" + 0.009*"trabaj" + 0.009*"prueb" + 0.009*"sanitari" + 0.008*"com"


Topic: 2 
Words: 0.085*"coronavirus" + 0.027*"cov" + 0.015*"chin" + 0.015*"pandemi" + 0.012*"nuev" + 0.011*"estad" + 0.010*"unid" + 0.009*"españ" + 0.008*"crisis" + 0.008*"mund"


Topic: 3 
Words: 0.057*"cov" + 0.024*"coronavirus" + 0.022*"pacient" + 0.020*"hospital" + 0.020*"posit" + 0.015*"medic" + 0.010*"president" + 0.009*"sintom" + 0.009*"mur" + 0.008*"años"


Topic: 4 
Words: 0.072*"vacun" + 0.068*"cov" + 0.064*"contr" + 0.033*"par" + 0.022*"coronavirus" + 0.010*"millon" + 0.009*"primer" + 0.008*"luch" + 0.008*"recib" + 0.007*"investig"


Topic: 5 
Words: 0.029*"tod" + 0.024*"cov" + 0.021*"esta" + 0.013*"nuestr" + 0.013*"graci" + 0.012*"par