# COVID-19 Research Papers LDA Clustering

In [1]:
import numpy as np
import pandas as pd
import json
import itertools
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import nltk
from nltk.stem.snowball import SnowballStemmer
#from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')

import re

import gensim
from gensim import corpora, models

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jayfeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jayfeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jayfeng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
unable to import 'smart_open.gcs', disabling that module


In [2]:
"""Reads in abstracts.csv and filters out rows with missing values."""

df = pd.read_csv("abstracts.csv")
df = df[df["abstract"] != "NaN"]
df = df.dropna()

In [3]:
# def tokenize_func(row):
#     return word_tokenize(row[2])

In [4]:
# df["abstract tokens"] = df.apply(lambda row: tokenize_func(row), axis=1)

In [5]:
# lemmatizer = WordNetLemmatizer() 

# def lemmatize_func(row):
#     counter = 0
#     while counter < len(row[3]):
#         row[3][counter] = lemmatizer.lemmatize(row[3][counter])
#         counter += 1

In [6]:
# df["abstract tokens"] = df.apply(lambda row: tokenize_func(row), axis=1)

In [7]:
stop_words = set(stopwords.words('english')) 
snowBallStemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [8]:
sample_abstract = 'Abstract Middle-aged female identical twins, one of whom had systemic lupus erythematosus (SLE), were evaluated for immunologic reactivity to previous antigenic challenges, including primary immunization with a foreign antigen, keyhole limpet hemocyanin (KLH). These two women had lived together for all of their 58 years and neither was receiving anti-inflammatory or immunosuppressive drugs at the time of these studies. Both twins demonstrated comparable 7S and 198 humoral antibody response to KLH, as well as similar viral antibody titers. However, the twin with SLE was anergic to common antigens, streptokinase-streptodornase, Trichophyton and Candida; furthermore delayed hypersensitivity to KLH did not develop after immunization. This observed discrepancy between humoral and cellular immunity in genetically similar subjects may be significant in the pathogenesis of SLE.'

In [9]:
def tokenize_clean(abstract):
    #tokenizes abstract string
    tokens = word_tokenize(abstract.lower())
    
    #lemmatizes tokens
    counter = 0
    while counter < len(tokens):
        tokens[counter] = lemmatizer.lemmatize(tokens[counter])
        counter += 1
    
    #filters, stems, and lowercases tokens
    filtered_tokens = []
    for i in tokens:
        if i not in stop_words and len(i) > 3 and i != "abstract":
            stemmed_word = snowBallStemmer.stem(i)
            filtered_tokens.append(stemmed_word)
    
    return filtered_tokens

In [10]:
df["abstract tokens"] = df.apply(lambda row: tokenize_clean(row.abstract), axis=1)

In [11]:
df

Unnamed: 0.1,Unnamed: 0,sha,abstract,abstract tokens
0,3,aecbc613ebdab36753235197ffb4f35734b5ca63,"Abstract Middle-aged female identical twins, o...","[middle-ag, femal, ident, twin, system, lupus,..."
1,5,212e990b378e8d267042753d5f9d4a64ea5e9869,Abstract Our understanding of the pathogenesis...,"[understand, pathogenesi, infecti, especi, bac..."
2,6,bf5d344243153d58be692ceb26f52c08e2bd2d2f,Abstract In the pathogenesis of rheumatoid art...,"[pathogenesi, rheumatoid, arthriti, local, pro..."
3,7,ddd2ecf42ec86ad66072962081e1ce4594431f9c,"Abstract Pharyngitis, bronchitis, and pneumoni...","[pharyng, bronchiti, pneumonia, repres, common..."
4,8,a55cb4e724091ced46b5e55b982a14525eea1c7e,"Abstract Acute bronchitis, an illness frequent...","[acut, bronchiti, ill, frequent, encount, prim..."
...,...,...,...,...
27685,40508,179df1e769292dd113cef1b54b0b43213e6b5c97,"Background/introduction COVID−19, a novel coro...","[background/introduct, covid−19, novel, corona..."
27686,40509,9b4445849937393a4b05378653521a9d0c34dc8e,Governments around the world must rapidly mobi...,"[govern, around, world, must, rapid, mobil, ma..."
27687,40510,4e618ec5d2edea031a9ff8058a9bafafe30937be,The 2019-Novel-Coronavirus (COVID-19) has affe...,"[2019-novel-coronavirus, covid-19, affect, cou..."
27688,40511,28b53e0cab53b10ab87431d6cc4ac1e0a7c4d6b9,Object Meteorological parameters are the impor...,"[object, meteorolog, paramet, import, factor, ..."


In [None]:
# texts = []
# for index, row in df.iterrows():
#     texts.append(row[3])
    

In [None]:
# dictionary = corpora.Dictionary(texts)

# corpus = [dictionary.doc2bow(text) for text in texts]

# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word = dictionary, passes=20)

In [None]:
#print(ldamodel.print_topics(num_topics=20, num_words=5))

In [None]:
[(0, '0.044*"hcov" + 0.029*"measl" + 0.026*"facial" + 0.021*"hcov-nl63" + 0.018*"rhiniti"'),
 (1, '0.080*"virus" + 0.047*"infect" + 0.034*"respiratori" + 0.030*"viral" + 0.020*"detect"'),
 (2, '0.052*"cell" + 0.031*"infect" + 0.021*"respons" + 0.019*"express" + 0.019*"immun"'),
 (3, '0.165*"januari" + 0.094*"decemb" + 0.022*"affili" + 0.013*"aviat" + 0.011*"thailand"'),
 (4, '0.075*"patient" + 0.026*"sever" + 0.026*"clinic" + 0.020*"diseas" + 0.015*"treatment"'),
 (5, '0.040*"activ" + 0.037*"antivir" + 0.028*"effect" + 0.026*"drug" + 0.021*"inhibit"'),
 (6, '0.051*"model" + 0.017*"number" + 0.017*"epidem" + 0.013*"popul" + 0.013*"estim"'),
 (7, '0.042*"sequenc" + 0.030*"gene" + 0.028*"strain" + 0.027*"genom" + 0.018*"analysi"'),
 (8, '0.022*"diseas" + 0.014*"review" + 0.013*"develop" + 0.011*"pathogen" + 0.011*"system"'),
 (9, '0.035*"use" + 0.033*"detect" + 0.028*"assay" + 0.025*"method" + 0.024*"test"'),
 (10, '0.093*"pedv" + 0.073*"porcin" + 0.046*"swine" + 0.043*"diarrhea" + 0.043*"piglet"'),
 (11, '0.028*"health" + 0.024*"outbreak" + 0.016*"public" + 0.014*"china" + 0.014*"diseas"'),
 (12, '0.088*"korea" + 0.051*"felin" + 0.019*"lesion" + 0.015*"spectrometri" + 0.014*"periton"'),
 (13, '0.090*"vaccin" + 0.056*"antibodi" + 0.036*"immun" + 0.021*"protect" + 0.019*"respons"'),
 (14, '0.039*"case" + 0.033*"hospit" + 0.028*"patient" + 0.020*"rate" + 0.013*"infect"'),
 (15, '0.034*"studi" + 0.029*"use" + 0.020*"data" + 0.019*"method" + 0.018*"result"'),
 (16, '0.041*"protein" + 0.024*"virus" + 0.015*"cell" + 0.015*"viral" + 0.011*"structur"'),
 (17, '0.087*"temperatur" + 0.060*"heat" + 0.044*"inactiv" + 0.028*"skin" + 0.027*"humid"'),
 (18, '0.044*"group" + 0.025*"blood" + 0.016*"calf" + 0.016*"signific" + 0.015*"level"'),
 (19, '0.072*"influenza" + 0.022*"pandem" + 0.021*"particip" + 0.019*"health" + 0.016*"work"')]





In [19]:
partial_df = df#.sample(10000)

In [20]:
partial_texts = []
for index, row in partial_df.iterrows():
    partial_texts.append(row[3])

In [21]:
dictionary = corpora.Dictionary(partial_texts)
dictionary.filter_extremes(no_below=3)


corpus = [dictionary.doc2bow(text) for text in partial_texts]

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=50)


In [22]:
#print(ldamodel.print_topics(num_topics=15, num_words=5))

In [63]:
lda_results = ldamodel.show_topics(num_topics=10, num_words=15, formatted=False)
word_pairs = lda_results[2][1]
word_pairs[2][0]
lda_results

[(0,
  [('detect', 0.033920556),
   ('sampl', 0.027166747),
   ('assay', 0.019891832),
   ('use', 0.01736933),
   ('virus', 0.016513366),
   ('test', 0.01598823),
   ('sensit', 0.011024779),
   ('pedv', 0.010967567),
   ('result', 0.010411655),
   ('infect', 0.010170441),
   ('method', 0.009833486),
   ('specif', 0.0096131945),
   ('calf', 0.009167769),
   ('serum', 0.009100203),
   ('antibodi', 0.0090053165)]),
 (1,
  [('virus', 0.053980086),
   ('infect', 0.05109838),
   ('cell', 0.043939605),
   ('viral', 0.035461344),
   ('replic', 0.022380788),
   ('antivir', 0.017703585),
   ('activ', 0.014894153),
   ('inhibit', 0.012344222),
   ('host', 0.011745705),
   ('effect', 0.008913329),
   ('express', 0.008135327),
   ('induc', 0.006751869),
   ('result', 0.006548479),
   ('entri', 0.0062152296),
   ('studi', 0.0061823665)]),
 (2,
  [('protein', 0.047007475),
   ('structur', 0.013329896),
   ('bind', 0.011568938),
   ('use', 0.008860905),
   ('sars-cov', 0.008660932),
   ('domain', 0.00

In [67]:
topic_words = []
#i is each topic
for i in lda_results:
    topic_i_words = []
    word_pairs = i[1]
    #j is the list of word-probability pairs (we don't care about the probabilities here)
    for j in word_pairs:
        topic_i_words.append(j[0])
    topic_words.append(topic_i_words)
        
#topic_words   

In [69]:
a = [0] * 10
a

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [73]:
partial_df["abstract tokens"]

0        [middle-ag, femal, ident, twin, system, lupus,...
1        [understand, pathogenesi, infecti, especi, bac...
2        [pathogenesi, rheumatoid, arthriti, local, pro...
3        [pharyng, bronchiti, pneumonia, repres, common...
4        [acut, bronchiti, ill, frequent, encount, prim...
                               ...                        
27685    [background/introduct, covid−19, novel, corona...
27686    [govern, around, world, must, rapid, mobil, ma...
27687    [2019-novel-coronavirus, covid-19, affect, cou...
27688    [object, meteorolog, paramet, import, factor, ...
27689    [report, tempor, pattern, viral, shed, laborat...
Name: abstract tokens, Length: 24045, dtype: object

In [77]:
partial_df

assigned_topic = []
for index, row in partial_df.iterrows():
    tokens = row["abstract tokens"]
    counter_array = [0] * 15
    for i in tokens:
        for j in np.arange(10):
            if i in topic_words[j]:
                counter_array[j] += 1
    max_topic = counter_array.index(max(counter_array))
    assigned_topic.append(max_topic)
                
assigned_topic

[4,
 4,
 5,
 6,
 6,
 2,
 3,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 6,
 1,
 2,
 2,
 1,
 1,
 5,
 8,
 0,
 2,
 8,
 1,
 7,
 2,
 7,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 5,
 2,
 1,
 8,
 1,
 1,
 2,
 2,
 1,
 1,
 4,
 4,
 7,
 1,
 2,
 7,
 7,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 2,
 6,
 0,
 4,
 4,
 0,
 0,
 0,
 0,
 0,
 4,
 4,
 4,
 2,
 2,
 1,
 5,
 1,
 7,
 1,
 1,
 8,
 2,
 2,
 2,
 2,
 2,
 3,
 1,
 1,
 2,
 0,
 0,
 4,
 4,
 4,
 8,
 1,
 3,
 3,
 0,
 6,
 4,
 0,
 4,
 3,
 5,
 4,
 6,
 4,
 6,
 1,
 0,
 4,
 3,
 1,
 1,
 3,
 1,
 4,
 1,
 8,
 1,
 2,
 0,
 1,
 2,
 2,
 7,
 8,
 1,
 1,
 8,
 2,
 1,
 1,
 1,
 1,
 1,
 8,
 1,
 1,
 8,
 8,
 1,
 4,
 1,
 1,
 2,
 2,
 8,
 1,
 1,
 2,
 0,
 0,
 4,
 1,
 0,
 4,
 8,
 1,
 8,
 0,
 1,
 8,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 2,
 0,
 1,
 1,
 8,
 8,
 8,
 0,
 8,
 5,
 1,
 0,
 1,
 2,
 8,
 2,
 5,
 1,
 8,
 8,
 8,
 2,
 2,
 4,
 4,
 2,
 8,
 2,
 5,
 1,
 1,
 1,
 4,
 6,
 2,
 2,
 2,
 8,
 2,
 2,
 1,
 1,
 8,
 1,
 0,
 8,
 1,
 2,
 1,
 8,
 8,
 2,
 2,
 2,
 2,
 8,
 4,
 2,
 8,
 8,
 2,
 8,
 2,
 2,
 2,
 2,
 2,
 8,


In [78]:
len(assigned_topic)

24045

In [81]:
partial_df["assigned topic"] = assigned_topic

In [93]:
partial_df["abstract"][26]

'Abstract This study describes the isolation and characterization of an influenza virus subtype H3N2 designated A/Swine/Weybridge/163266/87. The virus was isolated from a severe outbreak of respiratory disease in East Anglia. Haemagglutinin and neuraminidase characterization showed the virus to be very similar to H3N2 strains circulating in the human population during the years 1972–1975, and to H3N2 strains recently isolated from pigs in Belgium and France. A serological survey showed antibodies to the virus to be present in 31% of pigs tested, and reactors were detected on 43% of farms sampled.'

In [92]:
partial_df["assigned topic"][26]

8