<a href="https://colab.research.google.com/github/jorgecif/CovidMisinformationDetection/blob/main/TFM_MIA_ExtraccionTemaInformacionCOVID.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Extracción de temáticas de información relacionada con el COVID19** - Comparación de modelos de aprendizaje automático y aprendizaje profundo


> Por: Jorge Orlando Cifuentes Cifuentes

### Librerías

In [107]:
# Generales
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

# Procesamiento de lenguaje natural
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(88)
import nltk
nltk.download('wordnet')
from sklearn.model_selection import train_test_split


# Gráficas
import matplotlib.pyplot as plt
import seaborn as sns



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Carga de datos

In [108]:
# Load the headlines dataset (an Excel file) from the project's GitHub repository.
# Original author's note on this URL: 1000 records per category.
url_datos = 'https://github.com/jorgecif/CovidMisinformationDetection/blob/main/data/CovidHeadlinesDataset.xlsx?raw=true'
datos = pd.read_excel(url_datos)

# Report how many rows were read.
print("Se han cargado ", len(datos), " noticias")

# Row count again, followed by a preview of the first rows.
print(datos.shape[0])
datos.head()


Se han cargado  21721  noticias
21721


Unnamed: 0,Text,country,lang,label,URL,date,verificado_por
0,Britain’s Death Toll from the Coronavirus Riva...,United Kingdom,English,True,https://www.snopes.com/ap/2020/05/05/britains-...,2020-02-02 00:00:00,snopes
1,Timeline Reset: CDC Confirms Weeks-Earlier Cal...,United States,English,True,https://www.snopes.com/ap/2020/04/22/timeline-...,2020-06-03 00:00:00,snopes
2,"Dentists, Hair Salons, Beaches: Lockdowns Ease...",United States,English,True,https://www.snopes.com/ap/2020/04/20/dentists-...,2020-08-03 00:00:00,snopes
3,33 Million Have Sought US Unemployment Aid Sin...,United States,English,True,https://www.snopes.com/ap/2020/05/07/33-millio...,2020-09-03 00:00:00,snopes
4,United States ‘Wasted’ Months Before Preparing...,United States,English,True,https://www.snopes.com/ap/2020/04/05/u-s-waste...,2020-03-04 00:00:00,snopes


### Creación de conjuntos de datos

In [109]:
# Set aside 10% of the rows for final tests; the remaining 90% is the working set.
# Both splits get a fresh 0..n-1 index (the shuffled original index is discarded).
datos_trabajo, datos_reserva = (
    subset.reset_index(drop=True)
    for subset in train_test_split(datos, test_size=0.1, random_state=88)
)

print("Datos para trabajar: ", datos_trabajo.shape[0])
print("Datos reservados para pruebas finales ", datos_reserva.shape[0])

Datos para trabajar:  19548
Datos reservados para pruebas finales  2173


### Creación de lista de documentos

In [110]:
# Collect the "Text" column of each split as a plain Python list of documents.
docs_train = datos_trabajo["Text"].values.tolist()

# `headlines` stays bound to the hold-out texts after this cell.
headlines = datos_reserva["Text"]
docs_test = headlines.values.tolist()


In [111]:
# English Snowball stemmer; also used by the preprocessing functions defined below.
stemmer = SnowballStemmer("english")

# Sample words used to illustrate what stemming does.
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
    'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer', 'plotted',
]
singles = list(map(stemmer.stem, original_words))

# Side-by-side table: each original word next to its stem.
pd.DataFrame({'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [112]:
# Preprocessing helpers

def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the resulting lemma.

    Uses WordNetLemmatizer with pos='v' followed by the module-level `stemmer`.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)


def preprocess(text):
    """Tokenize `text` and return the list of lemmatized + stemmed tokens.

    Tokens that are gensim stopwords, or that are 3 characters long or shorter,
    are discarded before lemmatizing/stemming.
    """
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text)
        if len(token) > 3 and token not in STOPWORDS
    ]


In [113]:
# Example: compare a raw sentence with its preprocessed form.

document_num = 50  # not used within this cell
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

# Raw view: the sentence split on single spaces.
words = doc_sample.split(' ')
print("Original document: ")
print(words)

# Processed view: output of the preprocessing pipeline.
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


In [114]:
# Run the preprocessing pipeline over every training document.
processed_docs = [preprocess(raw_doc) for raw_doc in docs_train]

In [115]:
# Print the first two processed documents as a quick sanity check

print(processed_docs[:2])

[['woman', 'uttar', 'pradesh', 'throw', 'children', 'river', 'food', 'current', 'lockdown'], ['chart', 'show', 'current', 'case', 'turkey']]


### Bolsa de palabras

In [116]:
# Build a gensim Dictionary (token id <-> token mapping) from the processed training documents
dictionary = gensim.corpora.Dictionary(processed_docs)


In [117]:
# Preview the dictionary: print the first 11 (token id, token) pairs.

for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break


0 children
1 current
2 food
3 lockdown
4 pradesh
5 river
6 throw
7 uttar
8 woman
9 case
10 chart


In [118]:
# Prune the vocabulary in place.
# Per gensim's filter_extremes documentation: drop tokens appearing in fewer than
# `no_below` documents or in more than a `no_above` fraction of documents, then keep
# at most the `keep_n` most frequent of the remaining tokens.

dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

In [119]:
# Convert every processed document to its bag-of-words form using the dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


In [120]:
# Inspect the bag-of-words entries of one training document.
document_num = 4
bow_doc_x = bow_corpus[document_num]

# Each entry is a (token id, count) pair; look the token up in the dictionary.
for token_id, occurrences in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id, dictionary[token_id], occurrences))

Word 22 ("outbreak") appears 1 time.
Word 23 ("report") appears 1 time.


### Corro LDA usando la bolsa de palabras

In [121]:
# Training: fit a 5-topic LDA model on the bag-of-words corpus.
# An explicit random_state (the same seed, 88, used elsewhere in this notebook) makes
# the initialisation independent of how much of NumPy's global random stream has
# already been consumed, so re-running this cell starts from the same state.
# NOTE(review): with several workers the topics may still vary slightly between
# runs — confirm against the gensim documentation if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=5,
    id2word=dictionary,
    passes=10,
    workers=2,
    random_state=88,
)

In [122]:
# Display every topic the model learned, with its top weighted words.

for topic_id, topic_terms in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.018*"china" + 0.013*"time" + 0.013*"australia" + 0.011*"video" + 0.010*"window" + 0.010*"set" + 0.009*"public" + 0.007*"virus" + 0.007*"current" + 0.007*"play"


Topic: 1 
Words: 0.024*"claim" + 0.023*"video" + 0.020*"show" + 0.016*"post" + 0.013*"facebook" + 0.013*"novel" + 0.011*"share" + 0.011*"lockdown" + 0.011*"cure" + 0.011*"india"


Topic: 2 
Words: 0.018*"virus" + 0.015*"china" + 0.012*"test" + 0.011*"hospit" + 0.009*"australia" + 0.008*"pictur" + 0.007*"patient" + 0.007*"travel" + 0.006*"mask" + 0.006*"symptom"


Topic: 3 
Words: 0.019*"death" + 0.018*"report" + 0.017*"countri" + 0.012*"vaccin" + 0.012*"govern" + 0.012*"number" + 0.010*"wednesday" + 0.010*"friday" + 0.010*"thursday" + 0.009*"monday"


Topic: 4 
Words: 0.033*"trump" + 0.026*"toilet" + 0.025*"paper" + 0.023*"work" + 0.022*"order" + 0.021*"administr" + 0.020*"resid" + 0.020*"rule" + 0.019*"prove" + 0.019*"requir"




### Prueba de clasificación

In [123]:
# Pick one document from the hold-out list (not used to build the dictionary or train the model)
num = 33
unseen_document = docs_test[num]
print(unseen_document)

Four Australian coronavirus cases confirmedSign up now: Special subscription offer – just $1 for first 28 daysFour South Australians are being tested for the deadly coronavirus. The outbreak started in China, where dozens of people have died and 36 million people are now in lockdown. Four cases have now been confirmed in Australia – one in Victoria and three in New South Wales.The state Health Department is managing four cases but Communicable Disease Control Branch director Louise Flood said the likelihood the patients are actually infected was “low”.Definitive results may not be available for several days.media_cameraPeople wearing masks due to coronavirus walk out of Customs after arriving in Australia from Shanghai. Picture: Chris Pavlich“This afternoon SA public health clinicians, department officials and the Minister for Health and Wellbeing Stephen Wade took part in a national teleconference with Commonwealth and state ministers and officials on the novel coronavirus response,” 

In [124]:
# Preprocess the unseen document and convert it to bag-of-words with the training dictionary
bow_vector = dictionary.doc2bow(preprocess(unseen_document))



In [125]:
# Apply the model: list the topics assigned to the unseen document, highest score first.
ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 15)))

Score: 0.9967902898788452	 Topic: 0.018*"china" + 0.013*"time" + 0.013*"australia" + 0.011*"video" + 0.010*"window" + 0.010*"set" + 0.009*"public" + 0.007*"virus" + 0.007*"current" + 0.007*"play" + 0.006*"chines" + 0.006*"advertis" + 0.006*"test" + 0.006*"south" + 0.006*"minist"


### Guardo modelo

In [None]:
## Save the trained artifacts to files

# The only model trained in this notebook is the LDA topic model; the dictionary is
# saved alongside it because new documents must be converted with the same
# dictionary (via doc2bow) before the model can score them.
lda_model.save('lda_model.gensim')

dictionary.save('lda_dictionary.gensim')