# COVID-19 Research Papers LDA Clustering

In [62]:
import numpy as np
import pandas as pd
import json
import itertools
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import nltk
from nltk.stem.snowball import SnowballStemmer
#from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')

import re

import gensim
from gensim import corpora, models

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jayfeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jayfeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jayfeng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
"""Reads in abstracts.csv and filters out rows with missing values."""

# pd.read_csv already parses the literal text "NaN" as a missing value (it is
# one of the default na_values), so comparing the column against the string
# "NaN" can never match a row. dropna() alone removes every row that has a
# missing value in any column, which is the filtering this cell is after.
df = pd.read_csv("abstracts.csv").dropna()

In [3]:
# def tokenize_func(row):
#     return word_tokenize(row[2])

In [4]:
# df["abstract tokens"] = df.apply(lambda row: tokenize_func(row), axis=1)

In [5]:
# lemmatizer = WordNetLemmatizer() 

# def lemmatize_func(row):
#     counter = 0
#     while counter < len(row[3]):
#         row[3][counter] = lemmatizer.lemmatize(row[3][counter])
#         counter += 1

In [6]:
# df["abstract tokens"] = df.apply(lambda row: tokenize_func(row), axis=1)

In [7]:
# Shared NLTK helpers, built once here and read by tokenize_clean.
lemmatizer = WordNetLemmatizer()
snowBallStemmer = SnowballStemmer("english")
# Kept as a set so the per-token membership test in tokenize_clean is cheap.
stop_words = set(stopwords.words('english'))

In [8]:
# Example abstract text, handy for trying tokenize_clean on a single document.
# No other cell visible in this notebook reads this variable.
sample_abstract = 'Abstract Middle-aged female identical twins, one of whom had systemic lupus erythematosus (SLE), were evaluated for immunologic reactivity to previous antigenic challenges, including primary immunization with a foreign antigen, keyhole limpet hemocyanin (KLH). These two women had lived together for all of their 58 years and neither was receiving anti-inflammatory or immunosuppressive drugs at the time of these studies. Both twins demonstrated comparable 7S and 198 humoral antibody response to KLH, as well as similar viral antibody titers. However, the twin with SLE was anergic to common antigens, streptokinase-streptodornase, Trichophyton and Candida; furthermore delayed hypersensitivity to KLH did not develop after immunization. This observed discrepancy between humoral and cellular immunity in genetically similar subjects may be significant in the pathogenesis of SLE.'

In [14]:
def tokenize_clean(abstract):
    """Turn one abstract string into a list of stemmed content tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Every
    token is lemmatized with the module-level ``lemmatizer``. A lemma is then
    discarded when it is a stop word, has three or fewer characters, or is the
    literal word "abstract"; the surviving lemmas are stemmed with
    ``snowBallStemmer``.

    Parameters
    ----------
    abstract : str
        Raw abstract text.

    Returns
    -------
    list of str
        Stemmed tokens, in the order they appear in the text.
    """
    # Lower-casing happens before tokenization, so every later comparison
    # (stop words, the "abstract" marker) sees lower-case text.
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(abstract.lower())
    ]

    # Filtering is done on the lemma, stemming only on what survives.
    return [
        snowBallStemmer.stem(lemma)
        for lemma in lemmas
        if lemma not in stop_words and len(lemma) > 3 and lemma != "abstract"
    ]

In [15]:
# Run the tokenizer over every abstract and keep the resulting token lists in
# a new "abstract tokens" column.
df["abstract tokens"] = df["abstract"].apply(tokenize_clean)

In [16]:
# Show the frame (pandas abbreviates the middle rows) to check the new
# "abstract tokens" column next to the raw abstracts.
df

Unnamed: 0.1,Unnamed: 0,sha,abstract,abstract tokens
0,3,aecbc613ebdab36753235197ffb4f35734b5ca63,"Abstract Middle-aged female identical twins, o...","[middle-ag, femal, ident, twin, system, lupus,..."
1,5,212e990b378e8d267042753d5f9d4a64ea5e9869,Abstract Our understanding of the pathogenesis...,"[understand, pathogenesi, infecti, especi, bac..."
2,6,bf5d344243153d58be692ceb26f52c08e2bd2d2f,Abstract In the pathogenesis of rheumatoid art...,"[pathogenesi, rheumatoid, arthriti, local, pro..."
3,7,ddd2ecf42ec86ad66072962081e1ce4594431f9c,"Abstract Pharyngitis, bronchitis, and pneumoni...","[pharyng, bronchiti, pneumonia, repres, common..."
4,8,a55cb4e724091ced46b5e55b982a14525eea1c7e,"Abstract Acute bronchitis, an illness frequent...","[acut, bronchiti, ill, frequent, encount, prim..."
...,...,...,...,...
27685,40508,179df1e769292dd113cef1b54b0b43213e6b5c97,"Background/introduction COVID−19, a novel coro...","[background/introduct, covid−19, novel, corona..."
27686,40509,9b4445849937393a4b05378653521a9d0c34dc8e,Governments around the world must rapidly mobi...,"[govern, around, world, must, rapid, mobil, ma..."
27687,40510,4e618ec5d2edea031a9ff8058a9bafafe30937be,The 2019-Novel-Coronavirus (COVID-19) has affe...,"[2019-novel-coronavirus, covid-19, affect, cou..."
27688,40511,28b53e0cab53b10ab87431d6cc4ac1e0a7c4d6b9,Object Meteorological parameters are the impor...,"[object, meteorolog, paramet, import, factor, ..."


In [37]:
# One token list per document, in row order.
# The column is selected by name rather than by position: in the frame shown
# above, position 3 is the raw "abstract" string, not "abstract tokens", and
# integer position lookups on a label-indexed row are deprecated in pandas.
texts = df["abstract tokens"].tolist()
    

In [41]:
# Map every distinct token to an integer id, then encode each document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = corpora.Dictionary(texts)

corpus = [dictionary.doc2bow(text) for text in texts]

# LDA training is stochastic. Fixing random_state makes the fitted topics
# repeatable after a kernel restart; without it every run yields different
# topics and the saved output below cannot be reproduced.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=20,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

In [70]:
#print(ldamodel.print_topics(num_topics=20, num_words=5))

In [None]:
[(0, '0.044*"hcov" + 0.029*"measl" + 0.026*"facial" + 0.021*"hcov-nl63" + 0.018*"rhiniti"'),
 (1, '0.080*"virus" + 0.047*"infect" + 0.034*"respiratori" + 0.030*"viral" + 0.020*"detect"'),
 (2, '0.052*"cell" + 0.031*"infect" + 0.021*"respons" + 0.019*"express" + 0.019*"immun"'),
 (3, '0.165*"januari" + 0.094*"decemb" + 0.022*"affili" + 0.013*"aviat" + 0.011*"thailand"'),
 (4, '0.075*"patient" + 0.026*"sever" + 0.026*"clinic" + 0.020*"diseas" + 0.015*"treatment"'),
 (5, '0.040*"activ" + 0.037*"antivir" + 0.028*"effect" + 0.026*"drug" + 0.021*"inhibit"'),
 (6, '0.051*"model" + 0.017*"number" + 0.017*"epidem" + 0.013*"popul" + 0.013*"estim"'),
 (7, '0.042*"sequenc" + 0.030*"gene" + 0.028*"strain" + 0.027*"genom" + 0.018*"analysi"'),
 (8, '0.022*"diseas" + 0.014*"review" + 0.013*"develop" + 0.011*"pathogen" + 0.011*"system"'),
 (9, '0.035*"use" + 0.033*"detect" + 0.028*"assay" + 0.025*"method" + 0.024*"test"'),
 (10, '0.093*"pedv" + 0.073*"porcin" + 0.046*"swine" + 0.043*"diarrhea" + 0.043*"piglet"'),
 (11, '0.028*"health" + 0.024*"outbreak" + 0.016*"public" + 0.014*"china" + 0.014*"diseas"'),
 (12, '0.088*"korea" + 0.051*"felin" + 0.019*"lesion" + 0.015*"spectrometri" + 0.014*"periton"'),
 (13, '0.090*"vaccin" + 0.056*"antibodi" + 0.036*"immun" + 0.021*"protect" + 0.019*"respons"'),
 (14, '0.039*"case" + 0.033*"hospit" + 0.028*"patient" + 0.020*"rate" + 0.013*"infect"'),
 (15, '0.034*"studi" + 0.029*"use" + 0.020*"data" + 0.019*"method" + 0.018*"result"'),
 (16, '0.041*"protein" + 0.024*"virus" + 0.015*"cell" + 0.015*"viral" + 0.011*"structur"'),
 (17, '0.087*"temperatur" + 0.060*"heat" + 0.044*"inactiv" + 0.028*"skin" + 0.027*"humid"'),
 (18, '0.044*"group" + 0.025*"blood" + 0.016*"calf" + 0.016*"signific" + 0.015*"level"'),
 (19, '0.072*"influenza" + 0.022*"pandem" + 0.021*"particip" + 0.019*"health" + 0.016*"work"')]





In [72]:
# Work on a 2000-row random subset to keep the second LDA fit fast.
# random_state pins the draw so the same rows are selected on every run.
partial_df = df.sample(n=2000, random_state=42)

In [73]:
# One token list per sampled document, in row order.
# Selected by column name: positional lookup (row[3]) depends on how many
# columns precede the tokens and is deprecated on label-indexed rows.
partial_texts = partial_df["abstract tokens"].tolist()

In [74]:
# Rebuild the vocabulary from the sampled documents only.
dictionary = corpora.Dictionary(partial_texts)
# Drop tokens that occur in fewer than 3 documents; filter_extremes' other
# default limits still apply.
dictionary.filter_extremes(no_below=3)


corpus = [dictionary.doc2bow(text) for text in partial_texts]

# random_state makes this stochastic fit repeatable, so the topic numbers
# referenced in later cells keep pointing at the same topics between runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=15,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

In [75]:
# Print each of the 15 topics as a formatted string of its five
# highest-weighted terms.
print(ldamodel.print_topics(num_words=5, num_topics=15))

[(0, '0.037*"patient" + 0.023*"case" + 0.016*"sar" + 0.009*"respiratori" + 0.008*"sever"'), (1, '0.040*"patient" + 0.023*"care" + 0.016*"mortal" + 0.013*"unit" + 0.012*"intens"'), (2, '0.034*"protein" + 0.015*"virus" + 0.015*"sequenc" + 0.013*"genom" + 0.012*"viral"'), (3, '0.033*"virus" + 0.026*"influenza" + 0.021*"h1n1" + 0.014*"respiratori" + 0.014*"infect"'), (4, '0.021*"health" + 0.015*"diseas" + 0.014*"outbreak" + 0.011*"public" + 0.011*"emerg"'), (5, '0.024*"infect" + 0.015*"calf" + 0.012*"mers-cov" + 0.010*"virus" + 0.010*"sever"'), (6, '0.037*"cell" + 0.020*"express" + 0.012*"mous" + 0.011*"increas" + 0.010*"immun"'), (7, '0.016*"diseas" + 0.015*"develop" + 0.013*"review" + 0.010*"research" + 0.010*"system"'), (8, '0.038*"vaccin" + 0.021*"antibodi" + 0.018*"virus" + 0.017*"immun" + 0.015*"strain"'), (9, '0.018*"model" + 0.011*"data" + 0.011*"use" + 0.011*"epidem" + 0.009*"popul"'), (10, '0.038*"detect" + 0.028*"virus" + 0.023*"test" + 0.022*"sampl" + 0.017*"assay"'), (11, '0.0

In [147]:
# Same topics as raw (term, weight) pairs, ten terms per topic, instead of
# pre-formatted strings.
ldamodel.show_topics(
    formatted=False,
    num_words=10,
    num_topics=15,
)

[(0,
  [('patient', 0.03690815),
   ('case', 0.023391334),
   ('sar', 0.015882688),
   ('respiratori', 0.00851741),
   ('sever', 0.008492307),
   ('result', 0.008129095),
   ('transmiss', 0.008045725),
   ('clinic', 0.008037074),
   ('pneumonia', 0.007948288),
   ('outbreak', 0.007591929)]),
 (1,
  [('patient', 0.040227696),
   ('care', 0.022541983),
   ('mortal', 0.015562029),
   ('unit', 0.013499925),
   ('intens', 0.012445856),
   ('studi', 0.0114984345),
   ('ventil', 0.011095681),
   ('outcom', 0.010854739),
   ('critic', 0.010292985),
   ('acut', 0.009972063)]),
 (2,
  [('protein', 0.033602025),
   ('virus', 0.015358584),
   ('sequenc', 0.015258561),
   ('genom', 0.0132068405),
   ('viral', 0.011525483),
   ('structur', 0.010865143),
   ('gene', 0.008600501),
   ('function', 0.007009756),
   ('membran', 0.006881289),
   ('bind', 0.006694395)]),
 (3,
  [('virus', 0.03321224),
   ('influenza', 0.026445596),
   ('h1n1', 0.021181157),
   ('respiratori', 0.014148833),
   ('infect', 0.

In [154]:
# Take the entry at position 6 of the topic list and keep only its list of
# (term, weight) pairs; element 0 of each entry is the topic id.
test = ldamodel.show_topics(
    formatted=False,
    num_words=10,
    num_topics=15,
)[6][1]

In [155]:
# Add up the weights of the selected topic's listed terms to see how much of
# the topic's probability mass they account for.
# The result is deliberately not stored in a variable called `sum`: that would
# shadow the built-in sum() for every later cell in the kernel session.
topic_weight_total = sum(weight for _term, weight in test)
topic_weight_total

0.13526788540184498