# Got the medical documents from:
https://github.com/socd06/medical-nlp <br />
There's another tool someone made to get all medical terminologies on that link

In [1]:
def GetMedicalTerms(filePath): 
    
    lines = []
    with open(filePath) as f:
        for line in f:
            lines.append(line.strip())
            
    return lines

# Initializing the LDA Model

In [2]:
#re-evaluate the parameters here, mess with the hyper parameter(offset parameter)
def GetLDAModel(corpus, id2word,dictionary, numberOfTopics = 10, chunkSize=2000, passes=20, iterations=50):

    temp = dictionary[0]  

    return gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                   num_topics=numberOfTopics,
                                                   id2word=id2word,
                                                   chunksize=chunkSize,
                                                   iterations=iterations,
                                                   workers=5, # Num. Processing Cores - 1
                                                   passes=passes,
                                                   eval_every = 1)

# Get the rule associations

In [3]:
from apyori import apriori
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime
import pandas as pd  
from IPython.display import HTML 

def PrintRuleAssociation(lda, support, confidence, bow, lift, length = None):   
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    if len(bow) == 0:
        print('No topics found')
        return
    
    rules = apriori(bow,
                    min_support= support,
                    min_confidence= confidence,
                    min_lift= lift,
                    max_length= length)
    
    results = list(rules)
    
    df = pd.DataFrame(columns=('Left Hand Side',
                               'Right Hand Side',
                               'Support',
                               'Confidence',
                               'Lift'))
    
    Support =[]
    Confidence = []
    Lift = []
    Items = []
    Antecedent = []
    Consequent=[]
      
    for RelationRecord in results:
        for ordered_stat in RelationRecord.ordered_statistics:
            consequences = list(ordered_stat.items_base)
            antecedent = list(ordered_stat.items_add)
            Support.append(RelationRecord.support)
            Antecedent.append(ordered_stat.items_base)
            Consequent.append(ordered_stat.items_add)
            Confidence.append(ordered_stat.confidence)
            Lift.append(ordered_stat.lift)
                                          
    df['Left Hand Side'] = list(map(set, Antecedent))
    df['Right Hand Side'] = list(map(set, Consequent))
    df['Support'] = Support
    df['Confidence'] = Confidence
    df['Lift'] = Lift
    
    df.sort_values(by ='Lift', ascending = False, inplace = True)
    #display(HTML(df.to_html()))
    print(df)

# Get the ngram mods

In [4]:
import gensim

def ngrams(words, minimumCount=5, threshold=5):
    bigram = gensim.models.Phrases(words, min_count=minimumCount, threshold=threshold) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[words], threshold=threshold)  

    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return bigram_mod, trigram_mod


# Get the qualifying articles from the metadata

In [5]:
from collections import defaultdict
import csv
from pathlib import Path
import json
import os.path

def GetData(startDate, endDate):
    data = defaultdict(list)   
    listofarticles = []
    try:
        with open('metadata.csv') as f_in:
            reader = csv.DictReader(f_in)
            for row in reader:
                if '-' not in row['publish_time']:
                    continue
                elif startDate > datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() or datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() > endDate:
                    continue
                if not row['pmc_json_files']:
                    continue   
            
                for json_path in row['pmc_json_files'].split(';'):
                    listofarticles.append(json_path)
                    data[row['cord_uid']] = True
                
    except ValueError:
        print("An error occurred: ", ValueError, " Please try again.")
    return listofarticles

# Get the textbodies from the list of pdfs

In [6]:
import json
import re

def GetTextBodies(listOfpdfs):    
    text = []    
    for json_path in listOfpdfs:
        if not os.path.exists(json_path.replace(" ", "")):
            continue
        
        with open(json_path.replace(" ", "")) as f_json:
            full_text_dict = json.load(f_json)
            textBody = []
            for paragraph_dict in full_text_dict['body_text']:  
                paragraph_text = re.sub(r'[^a-zA-Z_\s]+', '', paragraph_dict['text'])             
                textBody.append(paragraph_text)
        text.append(paragraph_text)
    return text

# Do the data pre-processing
removing stop words, lemming the words, (maybe porting?) Need to confirm not over processing

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize  
from nltk.stem import WordNetLemmatizer, PorterStemmer

def CleanTheData(listOfDocs): 
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    stop_words = set(stopwords.words('english'))
  
    resultDocs = []
    words = set(nltk.corpus.words.words())
    medicalWords = GetMedicalTerms("vocab.txt")
    medicalStopWords = GetMedicalTerms("clinical-stopwords.txt")
    
    for doc in listOfDocs:
        result = []
        for word in doc.split(' '):  
            lowerCasedWord = word.lower()
            
            if lowerCasedWord not in lowerCasedWord or lowerCasedWord in stop_words:
                continue         
            
            if lowerCasedWord == "sarscov" or lowerCasedWord == "sars-cov-2" or lowerCasedWord == "disease" or lowerCasedWord == "covid":
                lowerCasedWord = "coronavirus"
                
            if lowerCasedWord not in medicalWords or lowerCasedWord in medicalStopWords:
                continue
                
            if lowerCasedWord not in "" and len(lowerCasedWord) > 2:
                result.append(lowerCasedWord)
                
        resultDocs.append(result)
    return resultDocs

# Convert the list of documents to a corpus, create ngrams where appropriate and filter out extreme word occurences

In [8]:
from gensim.corpora.dictionary import Dictionary
import spacy

def ConvertDataToCorpus(cleaned_data):

    dictionary = Dictionary(cleaned_data)
    bigramMod, trigramMod = ngrams(cleaned_data)
    
    ngram =  [trigramMod[bigramMod[review]] for review in cleaned_data]
       
    id2word = gensim.corpora.Dictionary(ngram)
    id2word.filter_extremes(no_above=0.5, keep_tokens=['coronavirus'])
    
    id2word.compactify()
    
    corpus = [id2word.doc2bow(text) for text in ngram]
    
    return corpus, id2word, dictionary

# Get all the document topics from the given corpus

minimum_probability maybe changes as the LDA model is updated

In [22]:
#change this to get only top topics
#for the coherence score read this later to get a better understand:
#  http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
#u_mass values seems like as it approaches 0, the score is better
# read this too: https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
import pprint as pp

def GetTopicTerms(lda, lowerEndCoherenceScore):

#testing out the other coherence calculations, this value should be positive and the closer to 1 the better
# according to https://stackoverflow.com/questions/54762690/coherence-score-0-4-is-good-or-bad 
# but need to confirm
    print(datetime.datetime.now())
    topTopics = lda.top_topics(corpus=None,
                               texts=cleaned_data,
                               dictionary=id2word,
                               coherence='c_npmi',
                               topn=20, 
                               processes=10)
    print(len(topTopics))
    bow = []
    for topic in topTopics:
        transaction = []
        if topic[1] > lowerEndCoherenceScore:
            print("Coherence score: ", topic[1])
            pp.pprint(topic[0])
            for word in topic[0]:
                transaction.append(word[1])
            bow.append(transaction)
    print(datetime.datetime.now())
    return bow

# Printing the topic terms per a given word

In [10]:
def PrintTopicTermsPerWord(lda, word, minProbablity = None):
    bow = []
    for key, value in id2word.items():
        porter = PorterStemmer()
        term = porter.stem(word)
        if value == term: 
            topics = lda.get_term_topics(key, minimum_probability= minProbablity)
            print(len(topics))
            for topic in topics:
                print("Topic: ")
                transaction = []
                for wordId in lda.get_topic_terms(topic[0], topn=20):
                    print(id2word[wordId[0]])
                    transaction.append(id2word[wordId[0]])
                bow.append(transaction)
                print("\n")
    return bow

In [11]:
#split this up in a way that's easier to test, try splitting it up just a month at a time, find a way to visual topics
#per slice

import datetime
from pprint import pprint

listOfpdfs= GetData(datetime.date(2020, 3, 1), datetime.date(2020, 4, 1))

if len(listOfpdfs) == 0:
    print("No PDFs found under this topic")
    exit
    
print('Amount of pdfs gathered: ', len(listOfpdfs))

Amount of pdfs gathered:  2273


In [12]:
data = GetTextBodies(listOfpdfs)

In [13]:
cleaned_data = CleanTheData(data)    

In [14]:
print(datetime.datetime.now())
corpus, id2word, dictionary = ConvertDataToCorpus(cleaned_data)

print(len(id2word))
print(datetime.datetime.now())
#3930 when filtering out 25% extremes
#1612 when filtering out 50% extremes

2021-01-02 11:26:17.552789
1650
2021-01-02 11:26:19.167483


In [15]:
#look into https://radimrehurek.com/gensim/models/ldaseqmodel.html
print(datetime.datetime.now())
lda = GetLDAModel(corpus,
                  id2word, 
                  dictionary,
                  numberOfTopics = 500,
                  chunkSize=2000,
                  passes=50, 
                  iterations=50)

print(datetime.datetime.now())

2021-01-02 11:26:19.174177
2021-01-02 11:28:10.067714


In [23]:
bow = GetTopicTerms(lda, 0.7)
#lda.top_topics(corpus)

2021-01-02 11:32:59.160597
500
Coherence score:  inf
[(0.0006064713, 'coronavirus'),
 (0.0006064033, 'effect'),
 (0.0006063341, 'role'),
 (0.0006062688, 'epidemic'),
 (0.0006062593, 'affinity'),
 (0.00060625194, 'pneumonia'),
 (0.00060624967, 'treating'),
 (0.00060624967, 'unknown'),
 (0.0006062456, 'needs'),
 (0.0006062365, 'inhibitor'),
 (0.0006062344, 'enzyme'),
 (0.0006062342, 'tool'),
 (0.00060622994, 'specific'),
 (0.0006062286, 'absence'),
 (0.0006062269, 'vitamin'),
 (0.0006062245, 'drug'),
 (0.00060622324, 'receptor'),
 (0.00060621527, 'polymerase'),
 (0.00060621486, 'binding_site'),
 (0.0006062118, 'strong')]
Coherence score:  inf
[(0.20614797, 'risk'),
 (0.1680231, 'basis'),
 (0.12571235, 'coronavirus'),
 (0.054085292, 'increased_risk'),
 (0.04569072, 'coronavirus_infection'),
 (0.045526456, 'screen'),
 (0.037608016, 'initiation'),
 (0.03560043, 'view'),
 (0.028438082, 'screening'),
 (0.025883216, 'physicians'),
 (0.023873279, 'increased'),
 (0.021308988, 'history'),
 (0.012

[(0.18084317, 'increase'),
 (0.13735566, 'decrease'),
 (0.048756722, 'west'),
 (0.04164039, 'reduced'),
 (0.030675702, 'strength'),
 (0.028157331, 'time'),
 (0.027988061, 'study'),
 (0.027865242, 'layers'),
 (0.02097584, 'exposure'),
 (0.017263083, 'using'),
 (0.017191576, 'protection'),
 (0.015813803, 'diseases'),
 (0.015079831, 'detection'),
 (0.014827125, 'small'),
 (0.0148188835, 'pulmonary'),
 (0.014597876, 'significant'),
 (0.014553792, 'higher'),
 (0.013940702, 'early_stage'),
 (0.013940698, 'swab'),
 (0.013936924, 'degree')]
Coherence score:  inf
[(0.06724246, 'minimal'),
 (0.058834177, 'school'),
 (0.058822762, 'respiratory_symptoms'),
 (0.05859776, 'closure'),
 (0.050436348, 'member'),
 (0.05043523, 'close_contact'),
 (0.044749226, 'coronavirus'),
 (0.033628546, 'adult'),
 (0.025224777, 'case_series'),
 (0.02522461, 'repeated'),
 (0.025130332, 'epidemic'),
 (0.024846638, 'negative'),
 (0.020115284, 'environment'),
 (0.018624797, 'transmission'),
 (0.018118687, 'community'),
 

 (0.009148461, 'case'),
 (0.008881176, 'viral'),
 (0.008755057, 'presentation')]
Coherence score:  inf
[(0.20400178, 'following'),
 (0.15708806, 'events'),
 (0.09056649, 'event'),
 (0.0622334, 'coronavirus'),
 (0.048213366, 'current'),
 (0.041491605, 'acquired'),
 (0.038057804, 'due'),
 (0.03301149, 'recognition'),
 (0.024900343, 'special_attention'),
 (0.023111971, 'pneumonia'),
 (0.019114036, 'attention'),
 (0.018841831, 'cardiovascular'),
 (0.016609974, 'acute'),
 (0.016607465, 'phase'),
 (0.01636881, 'resolution'),
 (0.013538745, 'period'),
 (0.012593706, 'community'),
 (0.00910928, 'adults'),
 (0.008872142, 'complications'),
 (0.008853062, 'management')]
Coherence score:  inf
[(0.1410681, 'manifestations'),
 (0.0919229, 'coronavirus'),
 (0.05212683, 'rest'),
 (0.044196405, 'suspect'),
 (0.027823105, 'subpleural'),
 (0.026526658, 'liver'),
 (0.023767486, 'typical'),
 (0.01769025, 'damage'),
 (0.01769025, 'immune'),
 (0.017690204, 'density'),
 (0.017690204, 'admission'),
 (0.0176902

In [21]:
len(bow)

9

In [26]:
PrintRuleAssociation(lda, support=0.01, confidence=0.9, bow=bow, lift = 1,length=3)

                   Left Hand Side  \
1386  {polymerase_chain_reaction}   
1170                {recognition}   
1171      {cardiovascular, phase}   
2161            {regimen, sepsis}   
604                        {died}   
...                           ...   
1702     {hypertension, receptor}   
93                     {receptor}   
1910        {transmission, virus}   
1727    {transmission, infection}   
41        {coronavirus_pneumonia}   

                                  Right Hand Side   Support  Confidence  \
1386                           {chest, opacities}  0.012987    1.000000   
1170                      {cardiovascular, phase}  0.012987    1.000000   
1171                                {recognition}  0.012987    1.000000   
2161                                       {died}  0.012987    1.000000   
604   {cell, acute_respiratory_distress_syndrome}  0.012987    1.000000   
...                                           ...       ...         ...   
1702                          