In [1]:
from apyori import apriori
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime
import pandas as pd  
from IPython.display import HTML 

def PrintRuleAssociation(lda, support, confidence, bow):   
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    if len(bow) == 0:
        print('No topics found')
        return
    
    rules = apriori(bow, min_support = support, min_confidence = confidence)
    
    results = list(rules)
    
    df = pd.DataFrame(columns=('Left Hand Side',
                               'Right Hand Side',
                               'Support',
                               'Confidence',
                               'Lift'))
    
    Support =[]
    Confidence = []
    Lift = []
    Items = []
    Antecedent = []
    Consequent=[]
      
    for RelationRecord in results:
        for ordered_stat in RelationRecord.ordered_statistics:
            consequences = list(ordered_stat.items_base)
            antecedent = list(ordered_stat.items_add)
            Support.append(RelationRecord.support)
            Antecedent.append(ordered_stat.items_base)
            Consequent.append(ordered_stat.items_add)
            Confidence.append(ordered_stat.confidence)
            Lift.append(ordered_stat.lift)
                                          
    df['Left Hand Side'] = list(map(set, Antecedent))
    df['Right Hand Side'] = list(map(set, Consequent))
    df['Support'] = Support
    df['Confidence'] = Confidence
    df['Lift'] = Lift
    
    df.sort_values(by ='Lift', ascending = False, inplace = True)
    
    display(HTML(df.to_html()))

In [2]:
import gensim

def ngrams(words, minimumCount=5, threshold=100):
    bigram = gensim.models.Phrases(words, min_count=minimumCount, threshold=threshold) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[words], threshold=threshold)  

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod


In [3]:
from collections import defaultdict
import csv
from pathlib import Path
import json
import os.path

def GetData(startDate, endDate):
    data = defaultdict(list)   
    listofarticles = []
    try:
        with open('metadata.csv') as f_in:
            reader = csv.DictReader(f_in)
            for row in reader:
                if '-' not in row['publish_time']:
                    continue
                elif startDate > datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() or datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() > endDate:
                    continue
                if not row['pmc_json_files']:
                    continue   
            
                for json_path in row['pmc_json_files'].split(';'):
                    listofarticles.append(json_path)
                    data[row['cord_uid']] = True
                
    except ValueError:
        print("An error occurred: ", ValueError, " Please try again.")
    return listofarticles

In [4]:
import json
import re

def GetTextBodies(listOfpdfs):    
    text = []    
    for json_path in listOfpdfs:
        if not os.path.exists(json_path.replace(" ", "")):
            continue
        
        with open(json_path.replace(" ", "")) as f_json:
            full_text_dict = json.load(f_json)
            textBody = []
            for paragraph_dict in full_text_dict['body_text']:          
                paragraph_text = re.sub(r'[^a-zA-Z_\s]+', '', paragraph_dict['text'])             
                textBody.append(paragraph_text)
        text.append(paragraph_text)
    return text

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize  
from nltk.stem import WordNetLemmatizer, PorterStemmer

def CleanTheData(listOfDocs): 
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    stop_words = stopwords.words('english')
    
    stop_words.extend(['correspond','author','submiss','includ','present','studi','editori','contribut','author',
                       'sole','email','financi','info','author','declar','follow','payment','servic','publicli',
                       'avail','activ','express','involv','relationship','twitter','multivari','analysi','journal',
                       'accuraci','obtain','train','perform','appli','preprocess','intern','optim','sequenc',
                       'posit','specif','use','version','manuscript','read','agre','publish','wrote','review',
                       'literatur','final','version','draft','read','approv','increas','mgkg','intraven','ivermectin',
                       'mycoplasma','ovi','administ','metaanalysi','meta','analysi','find','live','peer','preval',
                       'return','meaning','life','engag','hmitoxantron','hmitoxantron','hmitoxantron','mbamb','hepg',
                       'mcfa','note','also','street','imper','parp','people','pandemic','student','ttest','lipid',
                       'accord','anime','committe','research','univers','need','muscl','panel','shown','indic',
                       'control','larger','central','subset','sampl','peerreview','copyright','holder','receiv',
                       'extern','fund','spong','httpsdoiorg','upon','request','interpret','write','data','particip',
                       'month','deidentifi','platform','nation','institut','japan','bayesian','written','inform',
                       'consent','clinic','heparanas','american','umi','test','augment','preprint','post','juli',
                       'medrxiv','acquisit','import','intellectu','content','substanti','concept','design','revis',
                       'critic','workflow','system','consum','funder','age','year','restrict','cubic','spline',
                       'incid','trend','biopsi','start','results','mirna','silico','earli','laboratori','compani',
                       'monoclon','screen','target','valid','group','formal','methodolog','vaccin',
                       'term','time','nodul','worden','wurd','delay','licens','display','tmprss','right','number',
                       'social','perinat','csc','onlin','rappel','waterpip','work','office','total',
                       'confin','chair','hour','ecog','complet','disabl','cannot','carri','selfcar','light',
                       'sector','compet','interest','center','creativ','common','guidelin','properli',
                       'cite','england','rest','patient','treatment','breast','technic','effect','conflict','disclos',
                       'cell','cells','demograph','characterist','demonstr','pictur','boehring','ingelheim','angiotensinconvert',
                       'enzym','june','current','child','january','feburary','march','april','may','august','september',
                       'october','november','december','copi','licenc','visit','httpcreativecommonsorglicen',
                       'complianc','icmj','uniform','disclosur','ethic','hajj','doctor','analyz','mmoll','tabl'])
    
    resultDocs = []
    words = set(nltk.corpus.words.words())

    for doc in listOfDocs:
        result = []
        for word in doc.split(' '):             
            lowerCasedWord = word.lower() 
            #check and see if lemmatizer can be extended
            lowerCasedWord.replace('sarscov', 'covid')
            lemmedWord = lemmatizer.lemmatize(lowerCasedWord)
                          
            if lemmedWord not in stop_words and lemmedWord not in "" and len(lemmedWord) > 3:
                stemmedWord = porter.stem(lemmedWord)
                if stemmedWord not in stop_words:
                    result.append(porter.stem(lemmedWord))
        resultDocs.append(result)
    return resultDocs

In [6]:
from gensim.corpora.dictionary import Dictionary

def ConvertDataToCorpus(cleaned_data):
    dictionary = Dictionary(cleaned_data)
    bigramMod, trigramMod = ngrams(cleaned_data)
    
    ngram =  [trigramMod[bigramMod[review]] for review in cleaned_data]
    
    id2word = gensim.corpora.Dictionary(ngram)
    id2word.compactify()
    
    corpus = [id2word.doc2bow(text) for text in ngram]
    
    return corpus, id2word, dictionary

In [7]:
#re-evaluate the parameters here, mess with the hyper parameter(offset parameter)
def GetLDAModel(corpus, id2word,dictionary, numberOfTopics = 10, chunkSize=2000, passes=10):

    temp = dictionary[0]  

    return gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                   num_topics=numberOfTopics,
                                                   id2word=id2word,
                                                   chunksize=chunkSize,
                                                   workers=5, # Num. Processing Cores - 1
                                                   passes=passes,
                                                   eval_every = 1,
                                                   per_word_topics=True)

In [8]:
#change this to get only top topics
#for the coherence score read this later to get a better understand:
#  http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
#u_mass values seems like as it approaches 0, the score is better
# read this too: https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
def GetTopicTerms(lda, corpus):
    bow = []
    for topic in lda.get_document_topics(corpus):
        transaction = []
        if not topic:
            continue
        for wordId in lda.get_topic_terms(topic[0][0], topn=20):
            #print(wordId)
            transaction.append(id2word[wordId[0]])
                
        bow.append(transaction) 
        
    return bow

In [9]:
def PrintTopicTerms(lda, corpus):
    for topic in lda.get_document_topics(corpus, minimum_probability=.002):
        print("Topic: ")
        for wordId in lda.get_topic_terms(topic[0][0], topn=20):
            print(id2word[wordId[0]])
                

In [20]:
def GetTopicTermsPerWord(lda, word, minProbablity = None):
    
    bow = []
    for key, value in id2word.items():
        porter = PorterStemmer()
        term = porter.stem(word)
        if value == term: 
            for topic in lda.get_term_topics(key, minimum_probability=minProbablity):
                transaction = []
                for wordId in lda.get_topic_terms(topic[0], topn=20):
                    #print(wordId)
                    transaction.append(id2word[wordId[0]])
                
                bow.append(transaction) 
        
    return bow

In [21]:
def PrintTopicTermsPerWord(lda, word, minProbablity = None):
    
    for key, value in id2word.items():
        porter = PorterStemmer()
        term = porter.stem(word)
        if value == term: 
            for topic in lda.get_term_topics(key, minimum_probability= minProbablity):
                print("Topic: ")
                for wordId in lda.get_topic_terms(topic[0], topn=20):
                    print(id2word[wordId[0]])


In [12]:
#split this up in a way that's easier to test, try splitting it up just a month at a time, find a way to visual topics
#per slice

import datetime
from pprint import pprint

listOfpdfs= GetData(datetime.date(2020, 3, 1), datetime.date(2020, 4, 1))

if len(listOfpdfs) == 0:
    print("No PDFs found under this topic")
    exit
    
print('Amount of pdfs gathered: ', len(listOfpdfs))

Amount of pdfs gathered:  2273


In [13]:
data = GetTextBodies(listOfpdfs)

In [14]:
cleaned_data = CleanTheData(data)    

In [15]:
corpus, id2word, dictionary = ConvertDataToCorpus(cleaned_data)

lda = GetLDAModel(corpus, id2word, dictionary, numberOfTopics = 500,chunkSize=2000,passes=10)

In [16]:
bow = GetTopicTerms(lda, corpus)
print(bow)

[['variant', 'induc', 'cidr', 'respons', 'antigen', 'protein', 'multipl', 'combin', 'immun', 'epcrbind', 'potent', 'belong', 'show', 'clarifi', 'mani', 'futur', 'suggest', 'differ', 'previou', 'capabl'], ['repres', 'promot', 'strategi', 'genet', 'antigen', 'mrna', 'instead', 'novel', 'associ', 'must', 'immun', 'might', 'inhibit', 'tumour', 'immunogen', 'diseas', 'determin', 'structur', 'differ', 'therapi'], ['bias', 'holist', 'product', 'adjust', 'medic', 'improv', 'invest', 'allow', 'qualiti', 'across', 'develop', 'amino_acid', 'approach', 'digit', 'public', 'suggest', 'pvalu', 'could', 'normal', 'diseas'], ['safe', 'intervent', 'provid', 'youth', 'quantum', 'carbon', 'maxim', 'investig', 'pathogen', 'administr', 'outcom', 'power', 'process', 'staff', 'adher', 'addit', 'among', 'improv', 'safeti', 'signific'], ['thank', 'strive', 'report', 'case', 'like', 'help', 'outbreak', 'possibl', 'develop', 'initi', 'covid', 'role', 'public', 'health', 'level', 'epidemiolog', 'relat', 'github', 

In [17]:
PrintRuleAssociation(lda, support=0.04, confidence= 0.9, bow=bow)

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
136,"{tool, health, respons}","{implement, public}",0.041646,1.0,24.011905
101,"{implement, public}","{tool, respons}",0.041646,1.0,24.011905
131,"{tool, respons}","{implement, public, health}",0.041646,1.0,24.011905
132,"{implement, health, public}","{tool, respons}",0.041646,1.0,24.011905
135,"{tool, health, public}","{implement, respons}",0.041646,1.0,24.011905
128,"{implement, respons}","{tool, public, health}",0.041646,1.0,24.011905
127,"{implement, public}","{tool, health, respons}",0.041646,1.0,24.011905
105,"{tool, respons}","{implement, public}",0.041646,1.0,24.011905
140,"{tool, public, respons}","{implement, health}",0.041646,1.0,23.453488
125,"{implement, health}","{tool, public, respons}",0.041646,0.976744,23.453488


In [31]:
bow = GetTopicTermsPerWord(lda, 'diabetes', 0.0000002)
print(bow)

[['view', 'inclus', 'gener', 'observ', 'show', 'tune', 'model', 'dataset', 'miss', 'qualiti', 'classif', 'improv', 'condit', 'pain', 'rate', 'expedi', 'approach', 'achiev', 'compar', 'three'], ['covid', 'thiazolidin', 'driver', 'synthesi', 'hybrid', 'like', 'anim', 'transmiss', 'ncov', 'futur', 'hold', 'vehicl', 'emerg', 'speci', 'field', 'differ', 'cleavag', 'mitig', 'epidem', 'case'], ['sarscov', 'infect', 'uncertain', 'often', 'frequent', 'drug', 'limit', 'therapeut', 'adult', 'requir', 'viru', 'symptom', 'measur', 'sever', 'allergi', 'covid', 'develop', 'influenza', 'particular', 'howev'], ['empagliflozin', 'hfref', 'heart_failur', 'improv', 'model', 'experiment', 'myocardi_infarct', 'reduc', 'function', 'injuri', 'confirm', 'mgkgday', 'diabet', 'cardiac', 'attenu', 'eject', 'hemodynam', 'ischemia', 'outcom', 'highlight'], ['covid', 'lung', 'hypertens', 'angiotensin', 'type', 'receptor', 'convent', 'kidney', 'risk', 'acei', 'ard', 'secret', 'advers', 'treat', 'liver', 'inhibitor', 

In [33]:
PrintRuleAssociation(lda, support=0.2, confidence= 0.9, bow=bow)

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift
82,"{protect, diseas}","{associ, infect}",0.25,1.0,4.0
91,"{associ, risk}","{infect, diseas}",0.25,1.0,4.0
111,"{associ, infect}","{protect, risk}",0.25,1.0,4.0
110,{protect},"{associ, risk, infect}",0.25,1.0,4.0
109,{associ},"{infect, protect, risk}",0.25,1.0,4.0
108,"{protect, risk, diseas}",{associ},0.25,1.0,4.0
106,"{associ, risk, diseas}",{protect},0.25,1.0,4.0
104,"{protect, risk}","{associ, diseas}",0.25,1.0,4.0
103,"{protect, diseas}","{associ, risk}",0.25,1.0,4.0
102,"{associ, risk}","{protect, diseas}",0.25,1.0,4.0


In [34]:
#testing out the other coherence calculations, this value should be positive and the closer to 1 the better
# according to https://stackoverflow.com/questions/54762690/coherence-score-0-4-is-good-or-bad 
# but need to confirm
lda.top_topics(texts=cleaned_data, topn=5, coherence='c_v', processes=-1)

  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


[([(0.045435898, 'site'),
   (0.04381551, 'disulfid'),
   (0.038308125, 'hneil'),
   (0.021934073, 'inhibitor'),
   (0.021835381, 'trimer')],
  0.9559453731591374),
 ([(0.09383874, 'pressur'),
   (0.043056224, 'measur'),
   (0.034763675, 'anaesthesia'),
   (0.028929885, 'intracuff'),
   (0.02318091, 'cuff')],
  0.8579983213164002),
 ([(0.00034196093, 'residu'),
   (0.00032689504, 'viru'),
   (0.00030991255, 'hotspot'),
   (0.00029715177, 'stabil'),
   (0.00028488724, 'capsid')],
  0.8223580283453824),
 ([(0.09889201, 'variant'),
   (0.081957154, 'induc'),
   (0.05164739, 'cidr'),
   (0.03100418, 'respons'),
   (0.030990833, 'antigen')],
  0.7796552343724751),
 ([(0.0011196949, 'incub_period'),
   (0.0008325699, 'wuhan'),
   (0.00042963188, 'infect'),
   (0.00036356246, 'cohort'),
   (0.0003428571, 'possibl')],
  nan),
 ([(0.02632182, 'infect'),
   (0.024758784, 'trust'),
   (0.015084242, 'discontinu'),
   (0.015065322, 'graphpad_prism'),
   (0.0130221695, 'stori')],
  nan),
 ([(0.05948

In [35]:
lda.top_topics(corpus=corpus, topn=5, processes=5)

[([(8.982305e-05, 'enlarg'),
   (8.982305e-05, 'camera'),
   (8.982305e-05, 'lfa'),
   (8.982305e-05, 'flir'),
   (8.982305e-05, 'optic')],
  2.273000007849915e-09),
 ([(8.982305e-05, 'enlarg'),
   (8.982305e-05, 'camera'),
   (8.982305e-05, 'lfa'),
   (8.982305e-05, 'flir'),
   (8.982305e-05, 'optic')],
  2.273000007849915e-09),
 ([(8.982305e-05, 'enlarg'),
   (8.982305e-05, 'camera'),
   (8.982305e-05, 'lfa'),
   (8.982305e-05, 'flir'),
   (8.982305e-05, 'optic')],
  2.273000007849915e-09),
 ([(8.982305e-05, 'enlarg'),
   (8.982305e-05, 'camera'),
   (8.982305e-05, 'lfa'),
   (8.982305e-05, 'flir'),
   (8.982305e-05, 'optic')],
  2.273000007849915e-09),
 ([(8.982305e-05, 'enlarg'),
   (8.982305e-05, 'camera'),
   (8.982305e-05, 'lfa'),
   (8.982305e-05, 'flir'),
   (8.982305e-05, 'optic')],
  2.273000007849915e-09),
 ([(8.982305e-05, 'enlarg'),
   (8.982305e-05, 'camera'),
   (8.982305e-05, 'lfa'),
   (8.982305e-05, 'flir'),
   (8.982305e-05, 'optic')],
  2.273000007849915e-09),
 ([(

In [None]:
PrintTopicTerms(lda, corpus)

In [None]:
PrintTopicTermsPerWord(lda, 'diabetes', 0.0000002)