In [1]:
#I had to install nltk to get this project to work. I might have downloaded something for
#I think I used this in jupyter notebook: nltk.download('stopwords')
import pandas as pd
from apyori import apriori
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime

def PrintRuleAssociation(word, maxNumberOfAssociations = 2):
    bow = []
    
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    severe = "severe"
    severe = lemmatizer.lemmatize(severe)
    severe = porter.stem(severe)
    
    covid = "covid"
    covid = lemmatizer.lemmatize(covid)
    covid = porter.stem(covid)
    
    sarscov = "sarscov"
    
    sarscov = lemmatizer.lemmatize(sarscov)
    sarscov = porter.stem(sarscov)
      
    word = lemmatizer.lemmatize(word)
    word = porter.stem(word)
    
    infect= "infect"
    infect = lemmatizer.lemmatize(infect)
    infect = porter.stem(infect)
    
    bow = GetTopicTerms(lda, word) 
    
    if len(bow) == 0:
        print('No topics found')
        return
    
    print('amount of topics for ', word, ": ", len(bow))
    rules = apriori(bow, min_support = 0.01, min_confidence = 0.7, max_length = maxNumberOfAssociations)
    results = list(rules)
    
    df = pd.DataFrame(columns=('Left Hand Side','Right Hand Side','Support','Confidence','Lift'))
    
    Support =[]
    Confidence = []
    Lift = []
    Items = []
    Antecedent = []
    Consequent=[]
      
    for RelationRecord in results:
        for ordered_stat in RelationRecord.ordered_statistics:
            consequences = list(ordered_stat.items_base)
            antecedent = list(ordered_stat.items_add)
            if ((covid in consequences or covid in antecedent) or (sarscov in consequences or sarscov in antecedent) or (infect in consequences or infect in antecedent)) \
            and (word in consequences or word in antecedent) \
            and (severe in consequences or severe in antecedent):
                Support.append(RelationRecord.support)
                Antecedent.append(ordered_stat.items_base)
                Consequent.append(ordered_stat.items_add)
                Confidence.append(ordered_stat.confidence)
                Lift.append(ordered_stat.lift)

                                          
    df['Left Hand Side'] = list(map(set, Antecedent))
    df['Right Hand Side'] = list(map(set, Consequent))
    df['Support'] = Support
    df['Confidence'] = Confidence
    df['Lift'] = Lift
    df.sort_values(by ='Lift', ascending = False, inplace = True)
    print(df)
    #display(HTML(df.to_html()))

In [2]:
import gensim

def ngrams(words, minimumCount=5, threshold=100):
    bigram = gensim.models.Phrases(words, min_count=minimumCount, threshold=threshold) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[words], threshold=threshold)  

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod


In [3]:
def HasPhraseInAbstract(phrases, textToCompare):
    for phrase in phrases:
        if phrase in textToCompare:
            return True

In [46]:
from collections import defaultdict
import csv
from pathlib import Path

def GetData():
    data = defaultdict(list)   
    listofarticles = []
    with open('metadata.csv') as f_in:
        reader = csv.DictReader(f_in)
        for row in reader:
            abstract = row['abstract'].lower()
            if '-' not in row['publish_time']:
                continue
            elif datetime.strptime(row['publish_time'], '%Y-%m-%d').month != datetime(2020, 3, 1).month:
                continue
            if not row['pdf_json_files']:
                continue   
            
            for json_path in row['pdf_json_files'].split(';'):
                json_file = Path(json_path)
                if json_file.is_file():
                    listofarticles.append(json_path)
                    data[row['cord_uid']] = True
                
    return listofarticles

In [47]:
import json
import re

def GetTextBodies(listOfpdfs):    
    text = []    
    for json_path in listOfpdfs:
        with open(json_path.replace(" ", "")) as f_json:
            full_text_dict = json.load(f_json)
            textBody = []
            for paragraph_dict in full_text_dict['body_text']:    
                paragraph_text = paragraph_dict['text'].replace('sars-cov-19', 'covid')          
                paragraph_text = re.sub(r'[^a-zA-Z_\s]+', '', paragraph_text)             
                textBody.append(paragraph_text)
        text.append(paragraph_text)
    return text

In [48]:
def ReplaceAntonymsForSevere(word):
    wordsForSever = ["severe", "critical","hospitalization","death","died","dead",
                     "cytokine storm","serious","icu","critical care","acute","grave",
                     "dire","bleak","mortality","risk"]
    if word in wordsForSever:
        return "severe"
    return word

In [49]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize  
from nltk.stem import WordNetLemmatizer, PorterStemmer

def RemoveStopWords(listOfDocs): 
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    stop_words.extend(['correspond','author','submiss','includ','present','studi','editori','contribut','author',
                       'sole','email','financi','info','author','declar','follow','payment','servic','publicli',
                       'avail','activ','express','involv','relationship','twitter','multivari','analysi','journal',
                       'accuraci','obtain','train','perform','appli','preprocess','intern','optim','sequenc',
                       'posit','specif','use','version','manuscript','read','agre','publish','wrote','review',
                       'literatur','final','version','draft','read','approv','increas','mgkg','intraven','ivermectin',
                       'mycoplasma','ovi','administ','metaanalysi','meta','analysi','find','live','peer','preval',
                       'return','meaning','life','engag','hmitoxantron','hmitoxantron','hmitoxantron','mbamb','hepg',
                       'mcfa','note','also','street','imper','parp','people','pandemic','student','ttest','lipid',
                       'accord','anime','committe','research','univers','need','muscl','panel','shown','indic',
                       'control','larger','central','subset','sampl','peerreview','copyright','holder','receiv',
                       'extern','fund','spong','httpsdoiorg','upon','request','interpret','write','data','particip',
                       'month','deidentifi','platform','nation','institut','japan','bayesian','written','inform',
                       'consent','clinic','heparanas','american','umi','test','augment','preprint','post','juli',
                       'medrxiv','acquisit','import','intellectu','content','substanti','concept','design','revis',
                       'critic','workflow','system','consum','funder','age','year','restrict','cubic','spline',
                       'incid','trend','biopsi','start','results','mirna','silico','earli','laboratori','compani',
                       'monoclon','screen','target','valid','group','formal','methodolog','vaccin',
                       'term','time','nodul','worden','wurd','delay','licens','display','tmprss','right','number',
                       'social','perinat','csc','onlin','rappel','waterpip','work','office','total',
                       'confin','chair','hour','ecog','complet','disabl','cannot','carri','selfcar','light',
                       'sector','compet','interest','center','creativ','common','guidelin','properli',
                       'cite','england','rest','patient','treatment','breast','technic','effect','conflict','disclos',
                       'cell','cells','demograph','characterist','demonstr','pictur','boehring','ingelheim','angiotensinconvert',
                       'enzym','june','current','child','january','feburary','march','april','may','august','september',
                       'october','november','december','copi','licenc','visit','httpcreativecommonsorglicen',
                       'complianc','icmj','uniform','disclosur','ethic','hajj','doctor','analyz','mmoll','tabl'])
    
    resultDocs = []
    words = set(nltk.corpus.words.words())

    for doc in listOfDocs:
        result = []
        for word in doc.split(' '):
            lowerCasedWord = word.lower()
            lemmedWord = lemmatizer.lemmatize(ReplaceAntonymsForSevere(lowerCasedWord))
                          
            if lemmedWord not in stop_words and lemmedWord not in "" and len(lemmedWord) > 3:
                stemmedWord = porter.stem(lemmedWord)
                if stemmedWord not in stop_words:
                    result.append(porter.stem(lemmedWord))
        resultDocs.append(result)
    return resultDocs

In [50]:
from gensim.corpora.dictionary import Dictionary

def ConvertDataToCorpus(cleaned_data):
    dictionary = Dictionary(cleaned_data)
    bigramMod, trigramMod = ngrams(cleaned_data)
    
    ngram =  [trigramMod[bigramMod[review]] for review in cleaned_data]
    
    id2word = gensim.corpora.Dictionary(ngram)
    id2word.compactify()
    
    corpus = [id2word.doc2bow(text) for text in ngram]
    
    return corpus, id2word, dictionary

In [51]:
def GetLDAModel(corpus, id2word,dictionary, numberOfTopics = 10, chunkSize=2000, passes=10):

    temp = dictionary[0]  

    return gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                   num_topics=numberOfTopics,
                                                   id2word=id2word,
                                                   chunksize=chunkSize,
                                                   workers=5, # Num. Processing Cores - 1
                                                   passes=passes,
                                                   eval_every = 1,
                                                   per_word_topics=True)

In [52]:
def GetTopics(word):
    for key, value in id2word.items(): 
        porter = PorterStemmer()
        term = porter.stem(word)
        
        if value == term: 
            #print(key, value)
            for topic in lda.get_term_topics(key):
                #print(topic)
            
                for wordId in lda.get_topic_terms(topic[0]):
                    #print(wordId)
                    print(' ,',id2word[wordId[0]])
                    


In [53]:
def GetTopicTerms(lda, word):
    bow = []
    for key, value in id2word.items(): 
        porter = PorterStemmer()
        term = porter.stem(word)
        
        if value == term: 
            #print(key, value)
            for topic in lda.get_term_topics(key):
                #print(topic)
                transactions = []
                for wordId in lda.get_topic_terms(topic[0]):
                    transactions.append(id2word[wordId[0]])
                bow.append(transactions)   
    return bow

In [58]:
from datetime import datetime
from pprint import pprint

print(datetime.now())

listOfpdfs= GetData()
    
if len(listOfpdfs) == 0:
    print("No PDFs found under this topic")
    
print('Amount of pdfs gathered: ', len(listOfpdfs))
data = GetTextBodies(listOfpdfs)
cleaned_data = RemoveStopWords(data)
corpus, id2word, dictionary = ConvertDataToCorpus(cleaned_data)
lda = GetLDAModel(corpus, id2word, dictionary, numberOfTopics = 40,chunkSize=2000)
lda.show_topics()
#PrintRuleAssociation(word, 3)
    
print(datetime.now())

2020-12-15 21:12:35.343807
Amount of pdfs gathered:  6476


ValueError: One of texts or corpus has to be provided.

In [59]:
lda.show_topics()

[(26,
  '0.015*"sever" + 0.014*"diseas" + 0.011*"result" + 0.010*"infect" + 0.008*"public_health" + 0.008*"provid" + 0.007*"covid" + 0.007*"approach" + 0.006*"conclus" + 0.006*"spread"'),
 (9,
  '0.005*"materi" + 0.004*"infect" + 0.004*"process" + 0.004*"autopsi" + 0.004*"intub" + 0.004*"pellet" + 0.004*"respir" + 0.004*"para" + 0.004*"hcw" + 0.003*"region"'),
 (11,
  '0.017*"model" + 0.014*"method" + 0.009*"result" + 0.008*"propos" + 0.007*"futur" + 0.007*"user" + 0.006*"protein" + 0.005*"show" + 0.005*"approach" + 0.005*"provid"'),
 (14,
  '0.026*"infect" + 0.021*"sever" + 0.013*"diseas" + 0.012*"viru" + 0.011*"transmiss" + 0.007*"infecti" + 0.007*"covid" + 0.007*"pathogen" + 0.007*"respiratori" + 0.005*"provid"'),
 (15,
  '0.019*"viru" + 0.017*"infect" + 0.014*"protein" + 0.013*"viral" + 0.010*"gene" + 0.010*"develop" + 0.009*"respons" + 0.007*"antivir" + 0.007*"bind" + 0.007*"sever"'),
 (1,
  '0.015*"infect" + 0.012*"sever" + 0.007*"respons" + 0.006*"associ" + 0.006*"viru" + 0.005*