# COVID-19 Research Papers LDA Clustering

In [1]:
import numpy as np
import pandas as pd
import json
import itertools
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import nltk
from nltk.stem.snowball import SnowballStemmer
#from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')

import re

import gensim
from gensim import corpora, models

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jayfeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jayfeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jayfeng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
unable to import 'smart_open.gcs', disabling that module


In [2]:
"""Reads in abstracts.csv and filters out rows with missing values."""

# Load the abstracts table, then discard rows whose abstract is the literal
# string "NaN" and, after that, rows with a missing value in any column.
df = (
    pd.read_csv("abstracts.csv")
    .loc[lambda frame: frame["abstract"] != "NaN"]
    .dropna()
)

In [35]:
"""Set up stop words, stemmer, and lemmatizer."""

# Shared text-normalisation objects used by the tokenizing cell below.
lemmatizer = WordNetLemmatizer()
snowBallStemmer = SnowballStemmer("english")
# Stored as a set so each stop-word lookup is a hash probe, not a list scan.
stop_words = set(stopwords.words('english'))

In [8]:
"""Tokenize and clean the abstracts of every paper."""

def tokenize_clean(abstract):
    """Turn one abstract string into a list of cleaned, stemmed tokens.

    The text is lowercased and tokenized, then every token is lemmatized.
    Tokens that are stop words, have three or fewer characters, or equal
    "abstract" are discarded; the remaining tokens are Snowball-stemmed.

    Relies on the notebook-level ``lemmatizer``, ``stop_words`` and
    ``snowBallStemmer`` objects.

    Args:
        abstract: Raw abstract text (a string).

    Returns:
        A list of stemmed token strings, in their original order.
    """
    # Lowercasing happens here, before tokenization.
    lemmas = [lemmatizer.lemmatize(word) for word in word_tokenize(abstract.lower())]

    def is_dropped(word):
        # Same three rejection rules, checked in the same order.
        return word in stop_words or len(word) <= 3 or word == "abstract"

    return [snowBallStemmer.stem(word) for word in lemmas if not is_dropped(word)]

# Apply the cleaner column-wise: Series.apply hands each abstract string
# straight to tokenize_clean, instead of building a full row Series per paper
# (DataFrame.apply with axis=1) only to read a single field from it.
df["abstract tokens"] = df["abstract"].apply(tokenize_clean)

In [15]:
"""Perform LDA topic modelling on a sample of the papers for speed purposes."""

# Draw at most 3000 papers. The cap keeps sample() from raising when fewer
# rows survive filtering, and the fixed random_state makes the subset (and
# everything computed from it) identical on every Restart & Run All.
partial_df = df.sample(n=min(3000, len(df)), random_state=42)

In [37]:
"""Create a list of lists of cleaned tokens of abstracts"""

# Select the token lists by column name rather than by position: a positional
# lookup such as row[3] silently picks a different column if the CSV layout
# changes, and integer keys on a label-indexed Series are deprecated in
# recent pandas. The topic-assignment cell also reads "abstract tokens" by name.
partial_texts = partial_df["abstract tokens"].tolist()

In [17]:
"""Use gensim package to perform LDA topic modelling.
Code from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
used for reference.
"""

# Build the token <-> id vocabulary from the sampled abstracts, then drop
# tokens that occur in fewer than 3 of them (the other filter_extremes
# thresholds are left at their gensim defaults).
dictionary = corpora.Dictionary(partial_texts)
dictionary.filter_extremes(no_below=3)

# Bag-of-words representation of each abstract.
corpus = [dictionary.doc2bow(text) for text in partial_texts]

# random_state pins the model's random initialisation, so the learned topics
# and their numbering are the same on every run of the notebook.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=50,
    random_state=42,
)

In [44]:
# Ask the fitted model for 15 words per topic across 10 topics, as raw
# word-probability pairs rather than a formatted string.
lda_results = ldamodel.show_topics(
    num_topics=10,
    num_words=15,
    formatted=False,
)

In [40]:
"""Create topic_words, a list of buckets of words that represent each topic."""

# Each entry of lda_results holds its word-probability pairs at position 1;
# keep only the word (position 0 of every pair) and discard the probability.
topic_words = [
    [pair[0] for pair in topic_entry[1]]
    for topic_entry in lda_results
]

topic_words

[['health',
  'outbreak',
  'public',
  'care',
  'diseas',
  'case',
  'countri',
  'china',
  'emerg',
  'risk',
  'studi',
  'sar',
  'control',
  'report',
  'infect'],
 ['patient',
  'respiratori',
  'infect',
  'virus',
  'influenza',
  'child',
  'case',
  'studi',
  'clinic',
  'sever',
  'viral',
  'pneumonia',
  'hospit',
  'detect',
  'acut'],
 ['protein',
  'structur',
  'virus',
  'domain',
  'bind',
  'activ',
  'viral',
  'membran',
  'interact',
  'cell',
  'function',
  'studi',
  'acid',
  'target',
  'coronavirus'],
 ['vaccin',
  'virus',
  'antibodi',
  'immun',
  'respons',
  'influenza',
  'infect',
  'antigen',
  'protect',
  'neutral',
  'human',
  'epitop',
  'develop',
  'challeng',
  'high'],
 ['diseas',
  'cancer',
  'regul',
  'inflamm',
  'inflammatori',
  'process',
  'chronic',
  'system',
  'factor',
  'model',
  'viral',
  'mechan',
  'role',
  'infect',
  'review'],
 ['diseas',
  'develop',
  'review',
  'potenti',
  'research',
  'effect',
  'includ'

In [49]:
"""Assign a topic to each of the papers."""

# Sets make each "is this token a topic word?" check a hash lookup.
topic_word_sets = [set(words) for words in topic_words]

assigned_topic = []
for tokens in partial_df["abstract tokens"]:
    # One counter per topic, sized from topic_words itself rather than
    # hard-coded, so the tally always matches the number of topics.
    hits = [
        sum(token in words for token in tokens)
        for words in topic_word_sets
    ]
    # list.index returns the first maximum, so ties (including "no topic word
    # matched at all") go to the lowest-numbered topic.
    assigned_topic.append(hits.index(max(hits)))

partial_df["assigned topic"] = assigned_topic
partial_df = partial_df.sort_values("assigned topic")
partial_df

Unnamed: 0.1,Unnamed: 0,sha,abstract,abstract tokens,assigned topic
15651,41231,d9e4c9b6b809ddc1f9dc8787f77368334e1e538b,Abstract Introduction Sources describing the g...,"[introduct, sourc, describ, global, burden, em...",0
1390,2170,29e0c4a7f3f8e2bd4a8f50fae12cc31f1a863763,"Surveillance is the ongoing, systematic collec...","[surveil, ongo, systemat, collect, analysi, in...",0
14806,28624,eaca17432584c7f2ecbb17e611df70deed0dbec3,Since the World Health Organization declared t...,"[sinc, world, health, organ, declar, global, o...",0
12915,16518,ccdc714272fd3392147edfbc7bf0731811c2b674,Summary This study describes a loophole in the...,"[summari, studi, describ, loophol, intern, qua...",0
25734,40820,5794890e355a8abbec51b46d767b6ee67edcd274,The first human Zika virus (ZIKV) outbreak was...,"[first, human, zika, virus, zikv, outbreak, re...",0
...,...,...,...,...,...
15610,41114,f6525bc180629aca4b0760e0ff9f187f2e2a6fec,Abstract Swine acute diarrhea syndrome coronav...,"[swine, acut, diarrhea, syndrom, coronavirus, ...",9
16232,42779,616525aa865b829ccba0cc30c39be871bbbe7a18,Abstract Respiratory syncytial virus (RSV) is ...,"[respiratori, syncyti, virus, lead, caus, lowe...",9
21285,22207,cbe56b09d64047cba4ee7875c4f55276a0cdf273,BACKGROUND: We previously reported that Entero...,"[background, previous, report, enterovirus, ev...",9
22953,23901,d18636f47e3c7dd93da309d556ba464d964fd24f,"Hantavirus infection, which causes zoonotic di...","[hantavirus, infect, caus, zoonot, diseas, hig...",9
