In [1]:
import os
import pickle

In [23]:
import pandas as pd
import numpy as np
import glob
from gensim import corpora, models, similarities

from tqdm import tqdm

import re
import string
import sys
import pickle
# def remove_non_ascii(row):
#     return row.encode('ascii', 'ignore').decode('utf-8')

my_punctuation = '[' + re.escape(''.join(set(string.punctuation).union([' ', '\t']).difference(['-', '_', '']))) + ']'
table = dict.fromkeys(i for i in range(sys.maxunicode) if chr(i) in my_punctuation)

def clean_word(word):
    # return word
    return word.lower().translate(table)

In [196]:
articles = {}
for fn in glob.glob('*.pkl'):
    with open(fn, 'rb') as f:
        arts = pickle.load(f)
        articles.update(arts)

In [197]:
len(articles)

1640

In [198]:
texts = []
to_remove = []
for url, art in articles.items():
    text = art['text']
    text = text.replace('\n', '')
    words = re.split(my_punctuation, text)
    text = [clean_word(w) for w in words if len(w) > 2]
    if 'coronavirus' in text or 'covid' in text or 'covid-19' in text or 'virus' in text:
        texts.append(text)
    else:
        to_remove.append(url)

In [199]:
for u in to_remove:
    del articles[u]

In [200]:
len(texts), len(articles)

(289, 289)

In [201]:
stop_words = ['http', 'https', 'unicef', 'nih', 'facebook', 'google', 'you', 'your', 'www', 'email', 'twitter']

In [204]:
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=5, no_above=0.4)
# dictionary.filter_tokens(bad_ids=list(map(lambda w: dictionary.token2id[w], 
#                                          filter(lambda w: w in dictionary.token2id, 
#                                                                                          stop_words))))

In [205]:
len(dictionary.token2id)

2182

In [206]:
token2id = pd.Series(dictionary.token2id)
id2token = pd.Series(token2id.index, index=token2id)
cfs = pd.Series(dictionary.cfs)
for cfss in cfs.sort_values(ascending=False).index[:50]:
    if cfss in id2token:
        print(id2token[cfss])
# id2token[]

there
application
considerations
información
inglés
submitted
secondary
they
support
more
much
activities
needed
conditions
outbreak
even
hora
pressure
measures
about
may
assist
prevent
all
expected
body
blood
spread
sign
mouth
complications
new
rápidamente
address
implementing
results
household
protective
method
coughs
parts
continuarán
families
monday
authority
patient’s
serious
up-to-date


In [207]:
tfidf = models.TfidfModel(dictionary=dictionary, normalize=True)
corpus = [tfidf[dictionary.doc2bow(text)] for text in texts]
lsi = models.LsiModel(corpus=corpus, id2word=dictionary, num_topics=50)
index = similarities.MatrixSimilarity(lsi[corpus])

In [208]:
lsi.print_topics()

[(0,
  '0.439*"los" + 0.357*"que" + 0.271*"para" + 0.193*"rápidamente" + 0.184*"salud" + 0.182*"vez" + 0.175*"por" + 0.172*"información" + 0.116*"las" + 0.099*"actuar"'),
 (1,
  '0.432*"https" + 0.404*"nih" + 0.399*"www" + 0.371*"latest" + 0.269*"govget" + 0.215*"evolving" + 0.209*"rapidly" + 0.205*"gov" + 0.198*"research" + 0.188*"emerging"'),
 (2,
  '0.249*"you" + 0.242*"your" + 0.145*"should" + 0.119*"patients" + 0.115*"people" + 0.108*"children" + 0.106*"healthcare" + 0.101*"hands" + 0.100*"home" + 0.094*"not"'),
 (3,
  '0.382*"you" + 0.291*"your" + -0.205*"patients" + -0.162*"hcp" + -0.142*"patient" + -0.135*"healthcare" + -0.129*"ppe" + -0.129*"should" + -0.121*"con" + -0.110*"personnel"'),
 (4,
  '0.276*"con" + 0.234*"las" + 0.227*"del" + 0.214*"vacuna" + 0.195*"puede" + 0.182*"que" + 0.181*"vih" + 0.158*"you" + 0.155*"pueden" + 0.137*"los"'),
 (5,
  '0.238*"your" + 0.215*"you" + -0.166*"children" + 0.141*"hands" + -0.136*"researchers" + -0.118*"our" + 0.108*"should" + -0.102*"p

In [212]:
query = 'We also are dealing with the consequences of global shortages of medical supplies'

In [213]:
def do_query(query):
    query_text = re.split(my_punctuation, query)
    query_text = [clean_word(w) for w in query_text]
    print(query_text)
    vec_bow = dictionary.doc2bow(query_text)
    vec_lsi = lsi[tfidf[vec_bow]] # convert the query to LSI space
    sims = index[vec_lsi]
    vv = pd.Series(sims, index=list(articles.keys()))
    for name in vv.sort_values(ascending=False).index[:10]:
        print(name)
        print('Similarity: ', np.round(vv.loc[name], 3))
        print(articles[name]['summary'])

In [214]:
do_query(query)

['we', 'also', 'are', 'dealing', 'with', 'the', 'consequences', 'of', 'global', 'shortages', 'of', 'medical', 'supplies']
https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/cdc-in-action.html
Similarity:  0.63
CDC is aggressively responding to the global outbreak of COVID-19 and preparing for the potential of community spread in the United States.
https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/index.html
Similarity:  0.58
This is an emerging, rapidly evolving situation and CDC will provide updated information as it becomes available, in addition to updated guidance.
CDC is aggressively responding to the global outbreak of COVID-19 and preparing for the potential of community spread in the United States.
https://www.doctorswithoutborders.org/what-we-do/news-stories/news/covid-19-outbreak-crisis-update-march-13
Similarity:  0.564
On March 11, the COVID-19 outbreak was declared a pandemic by the World Health Organization.
The high level of supportive and intensive care requi