In [1]:
from apyori import apriori
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime
import pandas as pd  
from IPython.display import HTML 

def PrintRuleAssociation(lda, support, confidence):
    bow = []
    
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    bow = GetTopicTerms(lda) 
    
    if len(bow) == 0:
        print('No topics found')
        return
    
    rules = apriori(bow, min_support = support, min_confidence = confidence)
    
    results = list(rules)
    
    df = pd.DataFrame(columns=('Left Hand Side',
                               'Right Hand Side',
                               'Support',
                               'Confidence',
                               'Lift'))
    
    Support =[]
    Confidence = []
    Lift = []
    Items = []
    Antecedent = []
    Consequent=[]
      
    for RelationRecord in results:
        for ordered_stat in RelationRecord.ordered_statistics:
            consequences = list(ordered_stat.items_base)
            antecedent = list(ordered_stat.items_add)
            Support.append(RelationRecord.support)
            Antecedent.append(ordered_stat.items_base)
            Consequent.append(ordered_stat.items_add)
            Confidence.append(ordered_stat.confidence)
            Lift.append(ordered_stat.lift)
                                          
    df['Left Hand Side'] = list(map(set, Antecedent))
    df['Right Hand Side'] = list(map(set, Consequent))
    df['Support'] = Support
    df['Confidence'] = Confidence
    df['Lift'] = Lift
    
    df.sort_values(by ='Lift', ascending = False, inplace = True)
    
    print(df)
    display(HTML(df.to_html()))

In [2]:
import gensim

def ngrams(words, minimumCount=5, threshold=100):
    bigram = gensim.models.Phrases(words, min_count=minimumCount, threshold=threshold) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[words], threshold=threshold)  

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod


In [3]:
from collections import defaultdict
import csv
from pathlib import Path
import json
import os.path

def GetData(startDate, endDate):
    data = defaultdict(list)   
    listofarticles = []
    try:
        with open('metadata.csv') as f_in:
            reader = csv.DictReader(f_in)
            for row in reader:
                if '-' not in row['publish_time']:
                    continue
                elif startDate > datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() or datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() > endDate:
                    continue
                if not row['pmc_json_files']:
                    continue   
            
                for json_path in row['pmc_json_files'].split(';'):
                    listofarticles.append(json_path)
                    data[row['cord_uid']] = True
                
    except ValueError:
        print("An error occurred: ", ValueError, " Please try again.")
    return listofarticles

In [4]:
import json
import re

def GetTextBodies(listOfpdfs):    
    text = []    
    for json_path in listOfpdfs:
        if not os.path.exists(json_path.replace(" ", "")):
            continue
        
        with open(json_path.replace(" ", "")) as f_json:
            full_text_dict = json.load(f_json)
            textBody = []
            for paragraph_dict in full_text_dict['body_text']:          
                paragraph_text = re.sub(r'[^a-zA-Z_\s]+', '', paragraph_dict['text'])             
                textBody.append(paragraph_text)
        text.append(paragraph_text)
    return text

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize  
from nltk.stem import WordNetLemmatizer, PorterStemmer

def CleanTheData(listOfDocs): 
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    
    resultDocs = []
    words = set(nltk.corpus.words.words())

    for doc in listOfDocs:
        result = []
        for word in doc.split(' '):
            lowerCasedWord = word.lower() 
            #check and see if lemmatizer can be extended
            lowerCasedWord.replace('sarscov', 'covid')
            lemmedWord = lemmatizer.lemmatize(lowerCasedWord)
                          
            if lemmedWord not in stop_words and lemmedWord not in "" and len(lemmedWord) > 3:
                stemmedWord = porter.stem(lemmedWord)
                if stemmedWord not in stop_words:
                    result.append(porter.stem(lemmedWord))
        resultDocs.append(result)
    return resultDocs

In [6]:
from gensim.corpora.dictionary import Dictionary

def ConvertDataToCorpus(cleaned_data):
    dictionary = Dictionary(cleaned_data)
    bigramMod, trigramMod = ngrams(cleaned_data)
    
    ngram =  [trigramMod[bigramMod[review]] for review in cleaned_data]
    
    id2word = gensim.corpora.Dictionary(ngram)
    id2word.compactify()
    
    corpus = [id2word.doc2bow(text) for text in ngram]
    
    return corpus, id2word, dictionary

In [7]:
#re-evaluate the parameters here, mess with the hyper parameter(offset parameter)
def GetLDAModel(corpus, id2word,dictionary, numberOfTopics = 10, chunkSize=2000, passes=10):

    temp = dictionary[0]  

    return gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                   num_topics=numberOfTopics,
                                                   id2word=id2word,
                                                   chunksize=chunkSize,
                                                   workers=5, # Num. Processing Cores - 1
                                                   passes=passes,
                                                   eval_every = 1,
                                                   per_word_topics=True)

In [8]:
#change this to get only top topics
#for the coherence score read this later to get a better understand:
#  http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
#u_mass values seems like as it approaches 0, the score is better
# read this too: https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
def GetTopicTerms(lda):
    bow = []
   
    for topic in lda.top_topics(corpus=corpus,topn=10):
        transaction = []
        for word in topic[0]:
            transaction.append(word[1])
        bow.append(transaction)  
        
    return bow

In [9]:
#split this up in a way that's easier to test, try splitting it up just a month at a time, find a way to visual topics
#per slice

import datetime
from pprint import pprint

listOfpdfs= GetData(datetime.date(2020, 3, 1), datetime.date(2020, 4, 1))
    
print(len(listOfpdfs))
if len(listOfpdfs) == 0:
    print("No PDFs found under this topic")
    exit
    
print('Amount of pdfs gathered: ', len(listOfpdfs))

2273
Amount of pdfs gathered:  2273


In [10]:
data = GetTextBodies(listOfpdfs)

In [11]:
cleaned_data = CleanTheData(data)    

In [12]:
corpus, id2word, dictionary = ConvertDataToCorpus(cleaned_data)

lda = GetLDAModel(corpus, id2word, dictionary, numberOfTopics = 500,chunkSize=2000,passes=10)

In [13]:



lda.top_topics(corpus=corpus, topn=20, processes=2)

[([(8.77193e-05, 'heftigen'),
   (8.77193e-05, 'normalisierung'),
   (8.77193e-05, 'rahmen'),
   (8.77193e-05, 'belastungen'),
   (8.77193e-05, 'reduzierendabei'),
   (8.77193e-05, 'rolle\n'),
   (8.77193e-05, 'psychisch'),
   (8.77193e-05, 'redukt'),
   (8.77193e-05, 'deren'),
   (8.77193e-05, 'grundbedrfnissen'),
   (8.77193e-05, 'gesundheitssystem'),
   (8.77193e-05, 'gesundheitsfachkrften'),
   (8.77193e-05, 'gesundheitsfachkrft'),
   (8.77193e-05, 'gegeben'),
   (8.77193e-05, 'funktionsfhigkeit'),
   (8.77193e-05, 'fhrungskrft'),
   (8.77193e-05, 'erst'),
   (8.77193e-05, 'hinweis'),
   (8.77193e-05, 'individuel'),
   (8.77193e-05, 'konfrontiertdi')],
  -0.19699972275217387),
 ([(8.77193e-05, 'heftigen'),
   (8.77193e-05, 'normalisierung'),
   (8.77193e-05, 'rahmen'),
   (8.77193e-05, 'belastungen'),
   (8.77193e-05, 'reduzierendabei'),
   (8.77193e-05, 'rolle\n'),
   (8.77193e-05, 'psychisch'),
   (8.77193e-05, 'redukt'),
   (8.77193e-05, 'deren'),
   (8.77193e-05, 'grundbedrfnis

In [14]:
PrintRuleAssociation(lda, support=0.04, confidence= 0.9)

Empty DataFrame
Columns: [Left Hand Side, Right Hand Side, Support, Confidence, Lift]
Index: []


Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift


In [15]:
#testing out the other coherence calculations, this value should be positive and the closer to 1 the better
# according to https://stackoverflow.com/questions/54762690/coherence-score-0-4-is-good-or-bad 
# but need to confirm
lda.top_topics(texts=cleaned_data, topn=10, coherence='c_v', processes=2)

  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


[([(0.11761191, 'servic'),
   (0.053664368, 'mental_health'),
   (0.051706586, 'outbreak'),
   (0.03796014, 'covid'),
   (0.03074877, 'psycholog'),
   (0.027809734, 'profession'),
   (0.022275142, 'health'),
   (0.01692962, 'onlin'),
   (0.016815094, 'limit'),
   (0.014695931, 'china')],
  nan),
 ([(0.039617132, 'chloroquin'),
   (0.026545938, 'covid'),
   (0.025761522, 'hydroxychloroquin'),
   (0.020527331, 'effect'),
   (0.01668034, 'definit'),
   (0.016531792, 'studi'),
   (0.015497381, 'patient'),
   (0.014055117, 'coronaviru'),
   (0.013985955, 'chloroquin_hydroxychloroquin'),
   (0.0134264175, 'sever')],
  nan),
 ([(0.023617635, 'task'),
   (0.02337623, 'heurist'),
   (0.020160856, 'deliber'),
   (0.016856292, 'resourc'),
   (0.01680542, 'ignor'),
   (0.016690578, 'graphpad_prism'),
   (0.016622147, 'varianc'),
   (0.013737445, 'action'),
   (0.013438662, 'termin'),
   (0.0130301975, 'mean')],
  nan),
 ([(8.77193e-05, 'heftigen'),
   (8.77193e-05, 'hinweis'),
   (8.77193e-05, 'er

In [41]:
for key, value in id2word.items():
    porter = PorterStemmer()
    term = porter.stem('diabetes')
    if value == term: 
        #print(key, value)
        for topic in lda.get_term_topics(key, minimum_probability=.002):
            print('topic:\n')
            for wordId in lda.get_topic_terms(topic[0]):
                #print(wordId)
                print(wordId[1],' + ',id2word[wordId[0]])
                #gets topics from term topics and see those topics

topic:

0.057450406  +  cancer
0.038848348  +  patient
0.026365692  +  studi
0.02592696  +  stage
0.01850926  +  analysi
0.01645379  +  covid
0.015528777  +  tumour
0.015509732  +  could
0.0154992975  +  sever
0.014928781  +  includ
topic:

0.049078632  +  behavior
0.02048491  +  risk
0.020273184  +  toward
0.01965402  +  viru
0.018816233  +  april
0.017399246  +  potenti
0.016752968  +  citrullin
0.016662024  +  talk
0.01617844  +  target
0.013852518  +  popul
topic:

0.027369713  +  state
0.02210863  +  video
0.02027462  +  strategi
0.015869278  +  variou
0.0152003635  +  featur
0.015173094  +  immunotherapi
0.014202444  +  system
0.013536755  +  medium
0.01339143  +  model
0.011573121  +  optim
topic:

0.111539826  +  assess
0.04641379  +  chemic
0.035938453  +  period
0.030963568  +  toxic
0.03092574  +  tool
0.029620431  +  bring
0.026033904  +  factor
0.021379003  +  appli
0.017732542  +  use
0.017016102  +  make
topic:

0.061739393  +  covid
0.04678524  +  center
0.032838754  + 