## TO DO
1. See if maybe some kind of cluster helps with making accurate topics and rule associations
2. Make 10? different clusters of documents and then run LDA on them

# Got the medical documents from:
https://github.com/socd06/medical-nlp <br />
There's another tool someone made to get all medical terminologies on that link

In [1]:
def GetMedicalTerms(filePath): 
    
    lines = []
    with open(filePath) as f:
        for line in f:
            lines.append(line.strip().lower())
            
    return lines

# Initializing the LDA Model
https://radimrehurek.com/gensim/models/ldaseqmodel.html

In [2]:
def GetLDAModel(corpus, id2word,dictionary, numberOfTopics = 10, chunkSize=2000, passes=20, iterations=50):

    temp = dictionary[0]  

    return gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                   num_topics=numberOfTopics,
                                                   id2word=id2word,
                                                   chunksize=chunkSize,
                                                   iterations=iterations,
                                                   workers=5, # Num. Processing Cores - 1
                                                   passes=passes)

# Get the rule associations

In [3]:
from apyori import apriori
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime
import pandas as pd  
from IPython.display import HTML 

def PrintRuleAssociation(lda, support, confidence, bow, lift, length = None):   
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    if len(bow) == 0:
        print('No topics found')
        return
    
    rules = apriori(bow,
                    min_support= support,
                    min_confidence= confidence,
                    min_lift= lift,
                    max_length= length)
    
    results = list(rules)
    
    df = pd.DataFrame(columns=('Left Hand Side',
                               'Right Hand Side',
                               'Support',
                               'Confidence',
                               'Lift',
                               'Count'))
    
    Support =[]
    Confidence = []
    Lift = []
    Items = []
    Antecedent = []
    Consequent=[]
    Count = []
      
    maxCount = 0
    for RelationRecord in results:
        for ordered_stat in RelationRecord.ordered_statistics:
            #print("count:", len(ordered_stat.items_base) + len(ordered_stat.items_add))
            #print("cons:", ordered_stat.items_add)
            if IsAntecedentDifferentFromConsequent(ordered_stat.items_base, ordered_stat.items_add):
                if maxCount < len(ordered_stat.items_base) + len(ordered_stat.items_add):
                    maxCount = len(ordered_stat.items_base) + len(ordered_stat.items_add)
                    
                consequences = list(ordered_stat.items_base)
                antecedent = list(ordered_stat.items_add)
                Support.append(RelationRecord.support)
                Antecedent.append(ordered_stat.items_base)
                Consequent.append(ordered_stat.items_add)
                Confidence.append(ordered_stat.confidence)
                Lift.append(ordered_stat.lift)
                Count.append(len(ordered_stat.items_base) + len(ordered_stat.items_add))
                                          
    df['Left Hand Side'] = list(map(set, Antecedent))
    df['Right Hand Side'] = list(map(set, Consequent))
    df['Support'] = Support
    df['Confidence'] = Confidence
    df['Lift'] = Lift
    df['Count'] = Count
    
    df.sort_values(by ='Lift', ascending = False, inplace = True)
            
    #for index, row in df.iterrows():
    #    if row['Count'] >= maxCount-1:
    #        print(row)
    
    display(HTML(df.to_html()))
    #print(df)

In [4]:
def RemoveAllLowerCountAssociationRules():
    print("tbd")

In [5]:
from nltk.stem import LancasterStemmer

def SameStem(word1, word2, printStemmedWords = False):
    lancaster = LancasterStemmer()
    if printStemmedWords:
        print(lancaster.stem(word1), lancaster.stem(word2))

    return lancaster.stem(word1) == lancaster.stem(word2)

In [6]:
def IsAntecedentDifferentFromConsequent(Antecedent, Consequent):
    for ant in Antecedent:
        for cons in Consequent:
            if SameStem(ant, cons):
                return False
        
    return True
    

# Create the ngrams and set the ngram thresholds
min_count – Ignore all words and bigrams with total collected count lower than this value.
threshold – Represent a score threshold for forming the phrases. A phrase of words a followed by b is accepted if the score of the phrase is greater than threshold. Heavily depends on concrete scoring-function, see the scoring parameter.


In [7]:
import gensim
from gensim.models.phrases import Phrases,Phraser

def ngrams(words, minimumCount=5, threshold=15):
    bigram = Phrases(words,
                     min_count=minimumCount,
                     threshold=threshold)
    
    trigram = Phrases(bigram[words],
                      threshold=threshold)  

    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    return bigram_mod, trigram_mod


# Get the qualifying articles from the metadata
1. Qualifies if the publication date is within a given date range 
2. Has a pmc jcon file associated

In [8]:
from collections import defaultdict
import csv
from pathlib import Path
import json
import os.path

def GetData(searchTerm, startDate, endDate):
    data = defaultdict(list)   
    listofarticles = []
    try:
        with open('metadata.csv') as f_in:
            reader = csv.DictReader(f_in)
            for row in reader:
                if '-' not in row['publish_time']:
                    continue
                elif startDate > datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() or datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() > endDate:
                    continue
                if not row['pmc_json_files']:
                    continue   
                if not searchTerm.lower() in row['abstract'].lower():
                    continue
            
                for json_path in row['pmc_json_files'].split(';'):
                    listofarticles.append(json_path)
                    data[row['cord_uid']] = True
                
    except ValueError:
        print("An error occurred: ", ValueError, " Please try again.")
    return listofarticles

# Get the textbodies from the list of pdfs
1. Remove all special characters

In [9]:
import json
import re

def GetTextBodies(listOfpdfs):    
    text = []    
    for json_path in listOfpdfs:
        if not os.path.exists(json_path.replace(" ", "")):
            continue
        
        with open(json_path.replace(" ", "")) as f_json:
            full_text_dict = json.load(f_json)
            textBody = []
            for paragraph_dict in full_text_dict['body_text']:          
                paragraph_text = re.sub(r'[^a-zA-Z_\s]+', '', paragraph_dict['text'])             
                textBody.append(paragraph_text)
                
        text.append(paragraph_text)
    return text

# Do the data pre-processing
1. remove stop words
2. lower case all words
3. Check to see if the word is within the list of medical terms given

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize  
from nltk.stem import WordNetLemmatizer, PorterStemmer

def CleanTheData(listOfDocs): 
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    stop_words = set(stopwords.words('english'))
  
    resultDocs = []
    words = set(nltk.corpus.words.words())
    medicalWords = GetMedicalTerms("vocab.txt")
    medicalStopWords = GetMedicalTerms("clinical-stopwords.txt")
    
    for doc in listOfDocs:
        result = []
        for word in doc.split(' '):  
            lowerCasedWord = word.lower()
            
            if lowerCasedWord in ['sarscov','sars-cov-2','coronavirus','sarscov']:
                lowerCasedWord = 'covid'
            
            if lowerCasedWord in "" or len(lowerCasedWord) <= 2 or lowerCasedWord in stop_words or lowerCasedWord not in medicalWords or lowerCasedWord in medicalStopWords:
                continue
                
            result.append(lowerCasedWord)
                
        resultDocs.append(result)
    return resultDocs

# Create the corpus
1. Create the dictionary with all words and word ids
2. Create the bi,tri, and quadgrams if applicable
3. remove extreme occurences of words

In [11]:
from gensim.corpora.dictionary import Dictionary
import spacy

def ConvertDataToCorpus(cleaned_data):

    dictionary = Dictionary(cleaned_data)
    bigramMod, trigramMod = ngrams(cleaned_data)
    
    ngram =  [trigramMod[bigramMod[review]] for review in cleaned_data]
       
    id2word = gensim.corpora.Dictionary(ngram)
    #id2word.filter_extremes(no_below=10, no_above=0.90, keep_tokens=['covid'])
    
    id2word.compactify()
    
    corpus = [id2word.doc2bow(text) for text in ngram]
    
    return corpus, id2word, dictionary

# Get all qualifying topics generated from LDA model
1. Get all topics from the LDA model
2. Per each topic, check to see if the coherence score is greater than the lower bounce given
3. Per word, check to see if the word or a related word exists in the current topic, if it does, do not add
4. Per word, check to see if the probablity that the word is in the current topic is greater than the lower bound given
5. Do not consider topics with only one qualifying word

Notes: can read about coherence scores here: http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
https://stackoverflow.com/questions/54762690/coherence-score-0-4-is-good-or-bad 
https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
https://levyomer.files.wordpress.com/2014/04/dependency-based-word-embeddings-acl-2014.pdf


In [12]:
import pprint as pp

def GetTopicTerms(searchTerm, lda, id2word, cleaned_data, lowerEndCoherenceScore, numberOfWords, minimumprobablity, windowSize, processes, printTopic = False):

    topTopics = lda.top_topics(corpus=None,
                               texts=cleaned_data,
                               dictionary=id2word,
                               coherence='c_v',
                               window_size=windowSize,
                               topn=numberOfWords, 
                               processes=processes)
    bow = []
    for topic in topTopics:
        transaction = []
        displayTransaction = []
        if printTopic:
            pp.pprint(topic)
        
        if topic[1] < lowerEndCoherenceScore:
            continue
            
        for word in topic[0]:
            l = []
            l.append(word[1])
            
            if word[0] > minimumprobablity and IsAntecedentDifferentFromConsequent(transaction, l):
                transaction.append(word[1])
                displayTransaction.append(word)
                
        if len(transaction) > 1:
            #pp.pprint(displayTransaction)
            bow.append(transaction)
            
    return bow


In [15]:
import datetime
def GetInformation(searchTerm, minProbablity, startDate, endDate):
    
    listOfpdfs= GetData(searchTerm, startDate, endDate)
    
    if len(listOfpdfs) == 0:
        print("No PDFs found under this topic")
        exit
        
    print(len(listOfpdfs))
    data = GetTextBodies(listOfpdfs)
    print('Finished gathering data', datetime.datetime.now())
    cleaned_data = CleanTheData(data)  
    print('Finished cleaning data', datetime.datetime.now())  
    corpus, id2word, dictionary = ConvertDataToCorpus(cleaned_data) 
    print('Finished converting data to corpus', datetime.datetime.now())  
    
    lda = GetLDAModel(corpus,
                      id2word, 
                      dictionary,
                      numberOfTopics = 100,
                      chunkSize=2000,
                      passes=100, 
                      iterations=100)  
    print('Finished creating LDA model', datetime.datetime.now())  
    
    topics = GetTopicTerms(searchTerm,
                           lda = lda, 
                           id2word = id2word,
                           cleaned_data=cleaned_data, 
                           lowerEndCoherenceScore=0, 
                           numberOfWords=30, 
                           minimumprobablity= minProbablity,
                           windowSize= 15,
                           processes=10)
    
    print('Finished creating LDA model', datetime.datetime.now())  
    
    PrintRuleAssociation(lda, 
                         support=0.1,
                         confidence=0.8,
                         bow=topics, 
                         lift = 1, 
                         length = None)
    print('Finished', datetime.datetime.now()) 



In [19]:
searchTerm = 'diabetes'

GetInformation(searchTerm, 0.01, datetime.date(2020, 3, 1), datetime.date(2020, 4, 1))

20
Finished gathering data 2021-01-15 21:13:53.965387
Finished cleaning data 2021-01-15 21:13:54.760669
Finished converting data to corpus 2021-01-15 21:13:54.778365
Finished creating LDA model 2021-01-15 21:14:04.872478
Finished creating LDA model 2021-01-15 21:14:15.874312


Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift,Count
145,"{patients, outcomes, covid, risk}",{admission},0.142857,1.0,7.0,5
133,"{covid, risk, admission}","{patients, outcomes}",0.142857,1.0,7.0,5
121,{admission},"{patients, outcomes, covid, risk}",0.142857,1.0,7.0,5
123,"{covid, admission}","{patients, outcomes, risk}",0.142857,1.0,7.0,5
124,"{outcomes, admission}","{patients, covid, risk}",0.142857,1.0,7.0,5
125,"{patients, admission}","{outcomes, covid, risk}",0.142857,1.0,7.0,5
126,"{risk, admission}","{patients, outcomes, covid}",0.142857,1.0,7.0,5
127,"{outcomes, covid}","{patients, admission, risk}",0.142857,1.0,7.0,5
128,"{covid, risk}","{patients, outcomes, admission}",0.142857,1.0,7.0,5
129,"{patients, outcomes}","{admission, covid, risk}",0.142857,1.0,7.0,5


Finished 2021-01-15 21:14:16.031875


In [20]:
GetInformation(searchTerm, 0.01,datetime.date(2020, 4, 1), datetime.date(2020, 5, 1))

72
Finished gathering data 2021-01-15 21:14:32.145951
Finished cleaning data 2021-01-15 21:14:34.647764
Finished converting data to corpus 2021-01-15 21:14:34.707953
Finished creating LDA model 2021-01-15 21:14:49.599708


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


Finished creating LDA model 2021-01-15 21:15:03.946460


Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift,Count
17,"{patients, risk}",{diabetes},0.128205,0.833333,3.25,3
16,"{diabetes, risk}",{patients},0.128205,0.833333,1.911765,3
15,"{severe, risk}",{covid},0.102564,1.0,1.56,3
14,"{patients, treatment}",{covid},0.128205,0.833333,1.3,3
13,"{patients, mortality}",{covid},0.102564,1.0,1.56,3
12,"{covid, diabetes}",{patients},0.179487,0.875,2.007353,3
1,{chronic},{covid},0.102564,1.0,1.56,2
11,{results},{risk},0.102564,1.0,3.545455,2
10,{diabetes},{patients},0.230769,0.9,2.064706,2
0,{cardiovascular},{covid},0.102564,1.0,1.56,2


Finished 2021-01-15 21:15:03.990914


In [21]:
GetInformation(searchTerm, 0.01,datetime.date(2020, 5, 1), datetime.date(2020, 6, 1))

162
Finished gathering data 2021-01-15 21:15:20.786261
Finished cleaning data 2021-01-15 21:15:26.573796
Finished converting data to corpus 2021-01-15 21:15:26.725638
Finished creating LDA model 2021-01-15 21:15:53.164250


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


Finished creating LDA model 2021-01-15 21:16:11.705571


Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift,Count
56,"{patients, outcomes, risk}",{covid},0.109091,1.0,1.410256,4
55,"{outcomes, covid, risk}",{patients},0.109091,1.0,1.774194,4
54,"{patients, outcomes, covid}",{risk},0.109091,0.857143,2.773109,4
53,"{outcomes, risk}","{patients, covid}",0.109091,1.0,1.964286,4
52,"{patients, outcomes}","{covid, risk}",0.109091,0.857143,3.367347,4
41,"{patients, outcomes}",{covid},0.127273,1.0,1.410256,3
31,{drugs},"{patients, covid}",0.109091,0.857143,1.683673,3
32,"{drugs, covid}",{patients},0.109091,1.0,1.774194,3
33,"{patients, drugs}",{covid},0.109091,0.857143,1.208791,3
34,{infection},"{patients, covid}",0.163636,0.9,1.767857,3


Finished 2021-01-15 21:16:11.768968


In [22]:
GetInformation(searchTerm, 0.01,datetime.date(2020, 6, 1), datetime.date(2020, 7, 1))

178
Finished gathering data 2021-01-15 21:16:27.101562
Finished cleaning data 2021-01-15 21:16:33.611230
Finished converting data to corpus 2021-01-15 21:16:33.777544
Finished creating LDA model 2021-01-15 21:17:02.920060


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


Finished creating LDA model 2021-01-15 21:17:24.591578


Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift,Count
50,"{patients, severe, risk}",{covid},0.142857,1.0,1.272727,4
49,"{covid, severe, risk}",{patients},0.142857,1.0,1.707317,4
48,"{severe, risk}","{patients, covid}",0.142857,1.0,1.794872,4
37,"{patients, risk}",{covid},0.214286,1.0,1.272727,3
28,"{health, patients}",{covid},0.142857,1.0,1.272727,3
29,"{patients, high}",{covid},0.1,1.0,1.272727,3
30,"{patients, infection}",{covid},0.171429,1.0,1.272727,3
31,"{severe, infection}",{covid},0.1,1.0,1.272727,3
32,"{patients, mortality}",{covid},0.114286,1.0,1.272727,3
33,{outcomes},"{patients, covid}",0.114286,0.888889,1.595442,3


Finished 2021-01-15 21:17:24.647482


In [23]:
GetInformation(searchTerm, 0.01,datetime.date(2020, 7, 1), datetime.date(2020, 8, 1))

210
Finished gathering data 2021-01-15 21:17:40.175995
Finished cleaning data 2021-01-15 21:17:46.195793
Finished converting data to corpus 2021-01-15 21:17:46.368154
Finished creating LDA model 2021-01-15 21:26:46.512548


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


Finished creating LDA model 2021-01-15 21:27:26.379398


Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift,Count
45,"{patients, diabetes, risk}",{covid},0.115942,1.0,1.408163,4
44,"{covid, diabetes, risk}",{patients},0.115942,1.0,1.604651,4
43,"{patients, associated, severe}",{covid},0.101449,0.875,1.232143,4
42,"{associated, covid, severe}",{patients},0.101449,1.0,1.604651,4
33,"{mortality, covid}",{patients},0.15942,0.916667,1.47093,3
26,"{healthcare, patients}",{covid},0.101449,1.0,1.408163,3
27,{high},"{patients, covid}",0.115942,1.0,1.725,3
28,"{covid, high}",{patients},0.115942,1.0,1.604651,3
29,"{patients, high}",{covid},0.115942,1.0,1.408163,3
30,"{covid, infection}",{patients},0.130435,0.818182,1.312896,3


Finished 2021-01-15 21:27:26.433032


In [None]:
GetInformation(searchTerm, 0.01,datetime.date(2020, 8, 1), datetime.date(2020, 9, 1))

193
Finished gathering data 2021-01-15 21:27:47.164758
Finished cleaning data 2021-01-15 21:28:03.270497
Finished converting data to corpus 2021-01-15 21:28:03.550808
Finished creating LDA model 2021-01-15 21:29:24.758310


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


In [None]:
GetInformation(searchTerm, 0.01,datetime.date(2020, 9, 1), datetime.date(2020, 10, 1))

In [None]:
GetInformation(searchTerm, 0.01,datetime.date(2020, 10, 1), datetime.date(2020, 11, 1))

In [None]:
GetInformation(searchTerm, 0.01,datetime.date(2020, 11, 1), datetime.date(2020, 12, 1))