In [215]:
import os
import pickle

In [216]:
import pandas as pd
import numpy as np
import glob
from gensim import corpora, models, similarities

from tqdm import tqdm

import re
import string
import sys
import pickle
# def remove_non_ascii(row):
#     return row.encode('ascii', 'ignore').decode('utf-8')

my_punctuation = '[' + re.escape(''.join(set(string.punctuation).union([' ', '\t']).difference(['-', '_', '']))) + ']'
table = dict.fromkeys(i for i in range(sys.maxunicode) if chr(i) in my_punctuation)

def clean_word(word):
    # return word
    return word.lower().translate(table)

In [217]:
articles = {}
for fn in glob.glob('*.pkl'):
    with open(fn, 'rb') as f:
        arts = pickle.load(f)
        articles.update(arts)

In [218]:
len(articles)

1941

In [219]:
texts = []
to_remove = []
for url, art in articles.items():
    text = art['text']
    text = text.replace('\n', '')
    words = re.split(my_punctuation, text)
    text = [clean_word(w) for w in words if len(w) > 2]
    if 'coronavirus' in text or 'covid' in text or 'covid-19' in text or 'virus' in text:
        texts.append(text)
    else:
        to_remove.append(url)

for u in to_remove:
    del articles[u]

In [221]:
len(texts), len(articles)

(481, 481)

In [222]:
stop_words = ['http', 'https', 'unicef', 'nih', 'facebook', 'google', 'you', 'your', 'www', 'email', 'twitter']

In [225]:
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=10, no_above=0.4)
# dictionary.filter_tokens(bad_ids=list(map(lambda w: dictionary.token2id[w], 
#                                          filter(lambda w: w in dictionary.token2id, 
#                                                                                          stop_words))))

In [226]:
len(dictionary.token2id)

1948

In [227]:
token2id = pd.Series(dictionary.token2id)
id2token = pd.Series(token2id.index, index=token2id)
cfs = pd.Series(dictionary.cfs)
for cfss in cfs.sort_values(ascending=False).index[:50]:
    if cfss in id2token:
        print(id2token[cfss])
# id2token[]

cultural
interim
feel
active
investigation
change
reduce
support
rooms
against
middle
identify
hard
share
into
based
recommended
regulations
puis
equipped
washing
tools
every
adults
visit
sneezes
addition
called
sick
age
pui
possible
means
time
treatment
additional
effectively
signs
cases
benefit
urgent
it’s
learning
gloves
asymptomatic
ill
hcp
end


In [241]:
tfidf = models.TfidfModel(dictionary=dictionary, normalize=True)
corpus = [tfidf[dictionary.doc2bow(text)] for text in texts]
lsi = models.LsiModel(corpus=corpus, id2word=dictionary, num_topics=50)
index = similarities.MatrixSimilarity(lsi[corpus])

In [242]:
lsi.print_topics()

[(0,
  '0.175*"you" + 0.159*"ebola" + 0.147*"your" + 0.145*"countries" + 0.135*"cases" + 0.105*"should" + 0.101*"china" + 0.098*"pacific" + 0.097*"nih" + 0.088*"our"'),
 (1,
  '0.442*"https" + 0.430*"www" + 0.423*"nih" + 0.305*"latest" + 0.269*"govget" + 0.233*"gov" + 0.182*"evolving" + 0.169*"emerging" + 0.152*"cdc" + 0.148*"rapidly"'),
 (2,
  '0.522*"pacific" + 0.298*"covid-19in" + 0.273*"named" + 0.271*"equipped" + 0.268*"scaling" + 0.256*"newly" + 0.198*"threat" + 0.195*"ready" + 0.179*"detect" + 0.172*"steps"'),
 (3,
  '-0.356*"ebola" + 0.286*"your" + 0.254*"you" + -0.234*"committee" + -0.218*"democratic" + -0.197*"republic" + -0.187*"congo" + -0.169*"2005" + -0.162*"statement" + -0.156*"meeting"'),
 (4,
  '-0.179*"you" + 0.172*"should" + 0.172*"ebola" + -0.169*"countries" + 0.154*"healthcare" + 0.141*"hcp" + -0.137*"china" + -0.135*"our" + -0.135*"we’re" + 0.118*"respiratory"'),
 (5,
  '-0.405*"you" + -0.394*"your" + -0.185*"committee" + -0.177*"statement" + -0.177*"democratic" +

In [243]:
query = 'We also are dealing with the consequences of global shortages of medical supplies'

In [244]:
def do_query(query):
    query_text = re.split(my_punctuation, query)
    query_text = [clean_word(w) for w in query_text]
    print(query_text)
    vec_bow = dictionary.doc2bow(query_text)
    vec_lsi = lsi[tfidf[vec_bow]] # convert the query to LSI space
    sims = index[vec_lsi]
    vv = pd.Series(sims, index=list(articles.keys()))
    for name in vv.sort_values(ascending=False).index[:10]:
        print(name)
        print('Similarity: ', np.round(vv.loc[name], 3))
        print(articles[name]['summary'])

In [238]:
do_query(query)

['we', 'also', 'are', 'dealing', 'with', 'the', 'consequences', 'of', 'global', 'shortages', 'of', 'medical', 'supplies']
https://www.who.int/news-room/detail/03-03-2020-shortage-of-personal-protective-equipment-endangering-health-workers-worldwide
Similarity:  0.606
Healthcare workers rely on personal protective equipment to protect themselves and their patients from being infected and infecting others.
“Without secure supply chains, the risk to healthcare workers around the world is real.
WHO has so far shipped nearly half a million sets of personal protective equipment to 47 countries,* but supplies are rapidly depleting.
This includes easing restrictions on the export and distribution of personal protective equipment and other medical supplies.
Every day, WHO is providing guidance, supporting secure supply chains, and delivering critical equipment to countries in need.
https://www.doctorswithoutborders.org/what-we-do/news-stories/news/msf-support-urgently-needed-across-europe-prote