In [1]:
from apyori import apriori
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime
import pandas as pd  
from IPython.display import HTML 

def PrintRuleAssociation(lda, support, confidence):
    bow = []
    
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    bow = GetTopicTerms(lda) 
    
    if len(bow) == 0:
        print('No topics found')
        return
    
    rules = apriori(bow, min_support = support, min_confidence = confidence)
    
    results = list(rules)
    
    df = pd.DataFrame(columns=('Left Hand Side',
                               'Right Hand Side',
                               'Support',
                               'Confidence',
                               'Lift'))
    
    Support =[]
    Confidence = []
    Lift = []
    Items = []
    Antecedent = []
    Consequent=[]
      
    for RelationRecord in results:
        for ordered_stat in RelationRecord.ordered_statistics:
            consequences = list(ordered_stat.items_base)
            antecedent = list(ordered_stat.items_add)
            Support.append(RelationRecord.support)
            Antecedent.append(ordered_stat.items_base)
            Consequent.append(ordered_stat.items_add)
            Confidence.append(ordered_stat.confidence)
            Lift.append(ordered_stat.lift)
                                          
    df['Left Hand Side'] = list(map(set, Antecedent))
    df['Right Hand Side'] = list(map(set, Consequent))
    df['Support'] = Support
    df['Confidence'] = Confidence
    df['Lift'] = Lift
    
    df.sort_values(by ='Lift', ascending = False, inplace = True)
    
    print(df)
    display(HTML(df.to_html()))

In [2]:
import gensim

def ngrams(words, minimumCount=5, threshold=100):
    bigram = gensim.models.Phrases(words, min_count=minimumCount, threshold=threshold) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[words], threshold=threshold)  

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod


In [3]:
from collections import defaultdict
import csv
from pathlib import Path
import json
import os.path

def GetData(startDate, endDate):
    data = defaultdict(list)   
    listofarticles = []
    try:
        with open('metadata.csv') as f_in:
            reader = csv.DictReader(f_in)
            for row in reader:
                if '-' not in row['publish_time']:
                    continue
                elif startDate > datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() or datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() > endDate:
                    continue
                if not row['pmc_json_files']:
                    continue   
            
                for json_path in row['pmc_json_files'].split(';'):
                    listofarticles.append(json_path)
                    data[row['cord_uid']] = True
                
    except ValueError:
        print("An error occurred: ", ValueError, " Please try again.")
    return listofarticles

In [4]:
import json
import re

def GetTextBodies(listOfpdfs):    
    text = []    
    for json_path in listOfpdfs:
        if not os.path.exists(json_path.replace(" ", "")):
            continue
        
        with open(json_path.replace(" ", "")) as f_json:
            full_text_dict = json.load(f_json)
            textBody = []
            for paragraph_dict in full_text_dict['body_text']:          
                paragraph_text = re.sub(r'[^a-zA-Z_\s]+', '', paragraph_dict['text'])             
                textBody.append(paragraph_text)
        text.append(paragraph_text)
    return text

In [34]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize  
from nltk.stem import WordNetLemmatizer, PorterStemmer

def CleanTheData(listOfDocs): 
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    
    resultDocs = []
    words = set(nltk.corpus.words.words())

    for doc in listOfDocs:
        result = []
        for word in doc.split(' '):
            lowerCasedWord = word.lower() 
            #check and see if lemmatizer can be extended
            lowerCasedWord.replace('sarscov', 'covid')
            lemmedWord = lemmatizer.lemmatize(lowerCasedWord)
                          
            if lemmedWord not in stop_words and lemmedWord not in "" and len(lemmedWord) > 3:
                stemmedWord = porter.stem(lemmedWord)
                if stemmedWord not in stop_words:
                    result.append(porter.stem(lemmedWord))
        resultDocs.append(result)
    return resultDocs

In [35]:
from gensim.corpora.dictionary import Dictionary

def ConvertDataToCorpus(cleaned_data):
    dictionary = Dictionary(cleaned_data)
    bigramMod, trigramMod = ngrams(cleaned_data)
    
    ngram =  [trigramMod[bigramMod[review]] for review in cleaned_data]
    
    id2word = gensim.corpora.Dictionary(ngram)
    id2word.compactify()
    
    corpus = [id2word.doc2bow(text) for text in ngram]
    
    return corpus, id2word, dictionary

In [36]:
#re-evaluate the parameters here, mess with the hyper parameter(offset parameter)
def GetLDAModel(corpus, id2word,dictionary, numberOfTopics = 10, chunkSize=2000, passes=10):

    temp = dictionary[0]  

    return gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                   num_topics=numberOfTopics,
                                                   id2word=id2word,
                                                   chunksize=chunkSize,
                                                   workers=5, # Num. Processing Cores - 1
                                                   passes=passes,
                                                   eval_every = 1,
                                                   per_word_topics=True)

In [37]:
#change this to get only top topics
#for the coherence score read this later to get a better understand:
#  http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
#u_mass values seems like as it approaches 0, the score is better
# read this too: https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
def GetTopicTerms(lda):
    bow = []
   
    for topic in lda.top_topics(corpus=corpus,topn=10):
        transaction = []
        for word in topic[0]:
            transaction.append(word[1])
        bow.append(transaction)  
        
    return bow

In [38]:
#split this up in a way that's easier to test, try splitting it up just a month at a time, find a way to visual topics
#per slice

import datetime
from pprint import pprint

listOfpdfs= GetData(datetime.date(2020, 3, 1), datetime.date(2020, 4, 1))
    
print(len(listOfpdfs))
if len(listOfpdfs) == 0:
    print("No PDFs found under this topic")
    exit
    
print('Amount of pdfs gathered: ', len(listOfpdfs))

2273
Amount of pdfs gathered:  2273


In [39]:
data = GetTextBodies(listOfpdfs)

In [40]:
cleaned_data = CleanTheData(data)    

In [41]:
corpus, id2word, dictionary = ConvertDataToCorpus(cleaned_data)

lda = GetLDAModel(corpus, id2word, dictionary, numberOfTopics = 200,chunkSize=2000,passes=10)

In [47]:
lda.top_topics(corpus=corpus, topn=20, processes=2)

[([(0.043948494, 'case'),
   (0.020824976, 'number'),
   (0.019289218, 'estim'),
   (0.015204335, 'travel'),
   (0.013967068, 'wuhan'),
   (0.0134283155, 'covid'),
   (0.012447829, 'januari'),
   (0.011834697, 'incub_period'),
   (0.011666821, 'detect'),
   (0.011066739, 'public'),
   (0.010828302, 'import'),
   (0.009870556, 'effect'),
   (0.008824487, 'model'),
   (0.008528512, 'health'),
   (0.0076180953, 'patient'),
   (0.007579131, 'epidem'),
   (0.0075247893, 'diseas'),
   (0.007364897, 'sever'),
   (0.0071209553, 'viru'),
   (0.007113982, 'use')],
  -1.6784207123273607),
 ([(0.059808783, 'patient'),
   (0.03713663, 'infect'),
   (0.026678102, 'covid'),
   (0.015201948, 'treatment'),
   (0.01289982, 'care'),
   (0.011799896, 'manag'),
   (0.011435694, 'medic'),
   (0.011109723, 'provid'),
   (0.010902106, 'prevent'),
   (0.010346228, 'health'),
   (0.009994185, 'critic'),
   (0.009766004, 'clinic'),
   (0.00929166, 'diseas'),
   (0.008954015, 'risk'),
   (0.008421485, 'isol'),
  

In [43]:
PrintRuleAssociation(lda, support=0.04, confidence= 0.9)

Empty DataFrame
Columns: [Left Hand Side, Right Hand Side, Support, Confidence, Lift]
Index: []


Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift


In [48]:
#testing out the other coherence calculations, this value should be positive and the closer to 1 the better
# according to https://stackoverflow.com/questions/54762690/coherence-score-0-4-is-good-or-bad 
# but need to confirm
lda.top_topics(texts=cleaned_data, topn=10, coherence='c_v', processes=2)

[([(0.041793656, 'infect'),
   (0.025679376, 'covid'),
   (0.022881377, 'procedur'),
   (0.019486638, 'endoscopi'),
   (0.017628338, 'patient'),
   (0.015585302, 'transmiss'),
   (0.012853877, 'control'),
   (0.011885714, 'spread'),
   (0.010967948, 'china'),
   (0.010177117, 'diseas')],
  0.6211934085134743),
 ([(0.030900158, 'inform'),
   (0.025783697, 'crisi'),
   (0.012414702, 'studi'),
   (0.011647672, 'behavior'),
   (0.011386172, 'profession'),
   (0.009717556, 'import'),
   (0.0093272785, 'relev'),
   (0.009320417, 'librarian'),
   (0.00910088, 'pattern'),
   (0.008956155, 'final')],
  0.6131701866392273),
 ([(0.01940911, 'pandem'),
   (0.018188719, 'implement'),
   (0.01279476, 'vaccin'),
   (0.01202763, 'countri'),
   (0.010396772, 'studi'),
   (0.009939326, 'season_influenza'),
   (0.009786827, 'effect'),
   (0.009586816, 'use'),
   (0.008504322, 'polici'),
   (0.008467808, 'earli')],
  nan),
 ([(0.020532068, 'use'),
   (0.01760341, 'epitop'),
   (0.015661748, 'cell'),
   (0