In [1]:
import pandas as pd
import numpy as np
import os, json
import nltk

In [2]:
DATAPATH = 'data'
CSV_FILEPATH ="/".join([DATAPATH, 'metadata_ft_subset.csv'])

In [3]:
df = pd.read_csv(CSV_FILEPATH)
print(df.shape)
df.head()

(33499, 19)


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url,full_body_text
0,xqhn0vbp,1e1286db212100993d03cc22374b624f7caee956,PMC,Airborne rhinovirus detection and effect of ul...,10.1186/1471-2458-3-5,PMC140314,12525263.0,no-cc,"BACKGROUND: Rhinovirus, the most common cause ...",2003-01-13,"Myatt, Theodore A; Johnston, Sebastian L; Rudn...",BMC Public Health,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Rhinoviruses have been associated with 40% to ...
1,gi6uaa83,8ae137c8da1607b3a8e4c946c07ca8bda67f88ac,PMC,Discovering human history from stomach bacteria,10.1186/gb-2003-4-5-213,PMC156578,12734001.0,no-cc,Recent analyses of human pathogens have reveal...,2003-04-28,"Disotell, Todd R",Genome Biol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Charles Darwin recognized that the distributio...
3,fy4w7xz8,0104f6ceccf92ae8567a0102f89cbb976969a774,PMC,Association of HLA class I with severe acute r...,10.1186/1471-2350-4-9,PMC212558,12969506.0,no-cc,BACKGROUND: The human leukocyte antigen (HLA) ...,2003-09-12,"Lin, Marie; Tseng, Hsiang-Kuang; Trejaut, Jean...",BMC Med Genet,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,patient group a further significant increase ...
4,0qaoam29,5b68a553a7cbbea13472721cd1ad617d42b40c26,PMC,A double epidemic model for the SARS propagation,10.1186/1471-2334-3-19,PMC222908,12964944.0,no-cc,BACKGROUND: An epidemic of a Severe Acute Resp...,2003-09-10,"Ng, Tuen Wai; Turinici, Gabriel; Danchin, Antoine",BMC Infect Dis,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Since November 2002 (and perhaps earlier) an o...
5,qj4dh6rg,3ed670f60a7be2e3e2a991ea8af1fdd5fa5e2b2c,PMC,Cloaked similarity between HIV-1 and SARS-CoV ...,10.1186/1471-2180-3-20,PMC222911,14499001.0,no-cc,BACKGROUND: Severe acute respiratory syndrome ...,2003-09-21,"Kliger, Yossef; Levanon, Erez Y",BMC Microbiol,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,Infection by many enveloped viruses requires f...


In [4]:
def extract_simple_docs(df):
    docs = []
    for row in df.iterrows():
        abstract = row[1]['abstract']
        title = row[1]['title']
        if len(title) < 20:
            title = ''
        doc = title + ' ' + abstract
        docs.append(doc)
    
    return docs

In [5]:
docs = extract_simple_docs(df)
len(docs)

33499

In [6]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords 
# Split the documents into tokens.
def simple_preprocess(docs):
    new_docs = []
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        new_doc = docs[idx].lower()  # Convert to lowercase.
        new_doc = tokenizer.tokenize(new_doc)  # Split into words.
        new_doc = [token for token in new_doc if not token.isnumeric()]
        new_doc = [token for token in new_doc if len(token) > 1]
        new_doc = [token for token in new_doc if token not in stop_words]
        new_docs.append(new_doc)
    
    return new_docs

In [7]:
docs = simple_preprocess(docs)
print(len(docs))

33499


In [8]:
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [9]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [10]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 10987
Number of documents: 33499


<h3> The benchmark model uses all gensim defaults, including num_topics which I set for later use 

In [11]:
from gensim.models import LdaMulticore

# Set training parameters.
passes = 10
num_topics = 100

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    passes=passes,
    num_topics=num_topics
)

In [12]:
model.save('models/lda_abs_bench/model_v7')

In [13]:
top_topics = model.top_topics(texts=docs, dictionary=dictionary, coherence='c_v') #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: ', avg_topic_coherence)
rstd_atc = np.std([t[1] for t in top_topics]) / avg_topic_coherence
print('Relative Standard Deviation of ATC: ', rstd_atc)

Average topic coherence:  0.5357130370061749
Relative Standard Deviation of ATC:  0.2812682563676218


In [14]:
# Check a topic distribution

new_doc = docs[0]
doc_vector = dictionary.doc2bow(new_doc)
model[doc_vector]

[(10, 0.33024567),
 (14, 0.21849094),
 (19, 0.052338447),
 (37, 0.03096709),
 (50, 0.1140085),
 (64, 0.038208466),
 (88, 0.21003075)]

In [15]:
check_save = LdaMulticore.load('models/lda_abs_bench/model_v7')

In [16]:
check_topics = check_save.top_topics(texts=docs, dictionary=dictionary, coherence='c_v') #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in check_topics]) / num_topics
print('Average topic coherence: ', avg_topic_coherence)
rstd_atc = np.std([t[1] for t in check_topics]) / avg_topic_coherence
print('Relative Standard Deviation of ATC: ', rstd_atc)

Average topic coherence:  0.5357130370061749
Relative Standard Deviation of ATC:  0.2812682563676218


In [18]:
from gensim.models.nmf import Nmf
nmf = Nmf(
    corpus=corpus,
    id2word=id2word,
    passes=passes,
    num_topics=num_topics
)

<h3> NMF comes up better on baseline, but the numbers look similar, so that's promising

In [21]:
nmf_topics = nmf.top_topics(corpus=corpus, texts=docs, dictionary=dictionary, coherence='c_v') #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in nmf_topics]) / num_topics
print('Average topic coherence: ', avg_topic_coherence)
rstd_atc = np.std([t[1] for t in nmf_topics]) / avg_topic_coherence
print('Relative Standard Deviation of ATC: ', rstd_atc)

Average topic coherence:  0.5782634447700779
Relative Standard Deviation of ATC:  0.25009050078890793


In [22]:
nmf_topics

[([(0.04407656453181088, 'young'),
   (0.03666314717205842, 'park'),
   (0.027863293192796363, 'asthma'),
   (0.025509295570992647, 'children'),
   (0.023601538030535264, 'allergic'),
   (0.01641316150430664, 'min'),
   (0.013691464091810768, 'atopic'),
   (0.013522302097187378, 'ho'),
   (0.012714762740038337, 'dermatitis'),
   (0.012450672514630003, 'rhinitis'),
   (0.010867736937685543, 'cho'),
   (0.010284607897660825, 'allergy'),
   (0.009858494204091467, 'hong'),
   (0.009803042481553796, 'yang'),
   (0.008245015452272228, 'dust'),
   (0.008074400437628762, 'korean'),
   (0.007898177992390993, 'han'),
   (0.007195618979051587, 'ige'),
   (0.0064836323951990215, 'allergen'),
   (0.006340519921268014, 'immunotherapy')],
  0.9600756000366143),
 ([(0.12070746478254404, 'ibv'),
   (0.049148577827555, 'infectious'),
   (0.044790814724205326, 'bronchitis'),
   (0.03873720621334629, 'virus'),
   (0.0212652899830786, 'chickens'),
   (0.016393130894494443, 'strain'),
   (0.015800975012324,

In [24]:
# os.mkdir('models/nmf_abs_bench')
nmf.save('models/nmf_abs_bench/model_v7')