# COVID-19 Research Papers LDA Topic Modeling

Use the LDA model to create topics, or clusters, present in the dataset using the abstracts of the papers. Each topic is corresponds to a set of word-probability pairs, and I chose to use the top 15 highest probability words of each topic to represent the given topic. Each paper is assigned a topic based on how many of the top 15 words of each topic are included in the abstract, and how often.

In [2]:
import numpy as np
import pandas as pd
import json
import itertools
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import nltk
from nltk.stem.snowball import SnowballStemmer
#from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 
nltk.download('stopwords')
from nltk.tokenize import word_tokenize 
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')

import re

import gensim
from gensim import corpora, models

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jayfeng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jayfeng/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jayfeng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
unable to import 'smart_open.gcs', disabling that module


In [3]:
"""Reads in abstracts.csv and filters out rows with missing values."""

df = pd.read_csv("abstracts.csv")
df = df[df["abstract"] != "NaN"]
df = df.dropna()

In [4]:
"""Set up stop words, stemmer, and lemmatizer."""

stop_words = set(stopwords.words('english')) 
snowBallStemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [5]:
"""Tokenize and clean the abstracts of every paper."""

def tokenize_clean(abstract):
    #tokenizes abstract string
    tokens = word_tokenize(abstract.lower())
    
    #lemmatizes tokens
    counter = 0
    while counter < len(tokens):
        tokens[counter] = lemmatizer.lemmatize(tokens[counter])
        counter += 1
    
    #filters, stems, and lowercases tokens
    filtered_tokens = []
    for i in tokens:
        if i not in stop_words and len(i) > 3 and i != "abstract":
            stemmed_word = snowBallStemmer.stem(i)
            filtered_tokens.append(stemmed_word)
    
    return filtered_tokens

df["abstract tokens"] = df.apply(lambda row: tokenize_clean(row.abstract), axis=1)

In [6]:
"""Perform LDA topic modelling on a sample of the papers for speed purposes."""

partial_df = df#.sample(3000)

In [7]:
"""Create a list of lists of cleaned tokens of abstracts"""

partial_texts = []
for index, row in partial_df.iterrows():
    partial_texts.append(row[3])

In [8]:
"""Use gensim package to perform LDA topic modelling.
Code from https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
used for reference.
"""

dictionary = corpora.Dictionary(partial_texts)
dictionary.filter_extremes(no_below=3)

corpus = [dictionary.doc2bow(text) for text in partial_texts]

ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=50)

In [14]:
lda_results = ldamodel.show_topics(num_topics=10, num_words=15, formatted=False)
lda_results

[(0,
  [('cell', 0.025012113),
   ('diseas', 0.012629243),
   ('mous', 0.0115058),
   ('lung', 0.010627538),
   ('level', 0.0105229085),
   ('express', 0.010188541),
   ('respons', 0.009969395),
   ('increas', 0.008813089),
   ('immun', 0.008296855),
   ('patient', 0.008231886),
   ('studi', 0.00761007),
   ('cytokin', 0.0069219493),
   ('effect', 0.0066934),
   ('activ', 0.0062856604),
   ('signific', 0.006209401)]),
 (1,
  [('use', 0.02553227),
   ('detect', 0.015486624),
   ('method', 0.015484636),
   ('assay', 0.014352027),
   ('develop', 0.01120945),
   ('test', 0.01083256),
   ('sensit', 0.010483696),
   ('specif', 0.008285644),
   ('result', 0.007763063),
   ('studi', 0.006763329),
   ('sampl', 0.006745366),
   ('compound', 0.0064180945),
   ('drug', 0.006367256),
   ('effect', 0.006135174),
   ('evalu', 0.005772998)]),
 (2,
  [('health', 0.016095055),
   ('covid-19', 0.014743369),
   ('diseas', 0.014242308),
   ('model', 0.00953684),
   ('outbreak', 0.0085782325),
   ('public',

In [10]:
"""Create topic_words, a list of buckets of words that represent each topic."""

topic_words = []
#i is each topic
for i in lda_results:
    topic_i_words = []
    word_pairs = i[1]
    #j is the list of word-probability pairs (we don't care about the probabilities here)
    for j in word_pairs:
        topic_i_words.append(j[0])
    topic_words.append(topic_i_words)
        
topic_words   

[['cell',
  'diseas',
  'mous',
  'lung',
  'level',
  'express',
  'respons',
  'increas',
  'immun',
  'patient',
  'studi',
  'cytokin',
  'effect',
  'activ',
  'signific'],
 ['use',
  'detect',
  'method',
  'assay',
  'develop',
  'test',
  'sensit',
  'specif',
  'result',
  'studi',
  'sampl',
  'compound',
  'drug',
  'effect',
  'evalu'],
 ['health',
  'covid-19',
  'diseas',
  'model',
  'outbreak',
  'public',
  'transmiss',
  'control',
  'epidem',
  'spread',
  'emerg',
  'data',
  'popul',
  'infecti',
  'develop'],
 ['patient',
  'respiratori',
  'infect',
  'virus',
  'clinic',
  'sever',
  'hospit',
  'covid-19',
  'influenza',
  'child',
  'studi',
  'case',
  'detect',
  'viral',
  'pneumonia'],
 ['group',
  'infect',
  'sampl',
  'studi',
  'blood',
  'calf',
  'signific',
  'diarrhea',
  'anim',
  'diseas',
  'detect',
  'associ',
  'temperatur',
  'control',
  'test'],
 ['case',
  'china',
  'rate',
  'wuhan',
  'estim',
  'number',
  'result',
  'use',
  'model'

In [11]:
"""Assign a topic to each of the papers."""

assigned_topic = []
for index, row in partial_df.iterrows():
    tokens = row["abstract tokens"]
    counter_array = [0] * 15
    for i in tokens:
        for j in np.arange(10):
            if i in topic_words[j]:
                counter_array[j] += 1
    max_topic = counter_array.index(max(counter_array))
    assigned_topic.append(max_topic)
                
partial_df["assigned topic"] = assigned_topic
partial_df = partial_df.sort_values("assigned topic")
partial_df

Unnamed: 0.1,Unnamed: 0,sha,abstract,abstract tokens,assigned topic
9748,12331,f27a5562dd776c3a927ef078b0038ac690d03d90,Abstract Emergency departments play a critical...,"[emerg, depart, play, critic, role, public, he...",0
4693,6316,9862f8f952ee3c06f71abde040191057aae32175,"Abstract This study assesses viremia, provirus...","[studi, viremia, provirus, blood, cytokin, pro...",0
13388,17085,613b280bd1f7e0a0dd50cbf2501da003caf95eb4,Abstract The cough reflex is an attack of powe...,"[cough, reflex, attack, power, expiratori, eff...",0
27091,39777,9c32d461dc9d4737756a990cf13bae1a03e078a9,The respiratory tract surface is protected fro...,"[respiratori, tract, surfac, protect, inhal, p...",0
13384,17080,b67c1adb9815a8ac0b118d1bd2f563d0d0e7c2bb,Publisher Summary Kawasaki disease (KD) is an ...,"[publish, summari, kawasaki, diseas, acut, feb...",0
...,...,...,...,...,...
15093,28926,868afcaa176cdfdc50900313a5657583d5a74e9e,"In Ohio, United States, in early 2014, a delta...","[ohio, unit, state, earli, 2014, deltacoronavi...",9
15066,28898,8ccaf50414e8f530aaa405630c4e477d377d09ce,The complete genome of hepatitis E virus (HEV)...,"[complet, genom, hepat, virus, laboratori, fer...",9
15048,28879,3399d0fe01c7cb8bff0615a506b7beacc813a05e,Our understanding of human disease and potenti...,"[understand, human, diseas, potenti, therapeut...",9
25659,40632,06989a9659f1b9b10abc5b92a90ecff38a778d55,"An old world fruit bat Pteropus giganteus, hel...","[world, fruit, pteropus, giganteus, held, capt...",9


In [13]:
partial_df.to_csv("document_clusters.csv")