In [1]:
def GetArticlesFromDateRange(startDate, endDate, dateToCheck):
    castedDateToCheck = datetime.datetime.strptime(dateToCheck, '%Y-%m-%d').date()
    if castedDateToCheck >= startDate and castedDateToCheck < endDate:
        return True
    
    return False

In [2]:
class article(object):
    def __init__(self, date, information):
        self.Date = date
        self.Information = information

# Got the medical documents from:
https://github.com/socd06/medical-nlp <br />
There's another tool someone made to get all medical terminologies on that link

In [3]:
def GetMedicalTerms(filePath): 
    
    lines = []
    with open(filePath) as f:
        for line in f:
            lines.append(line.strip().lower())
            
    return lines

# Initializing the LDA Model
https://radimrehurek.com/gensim/models/ldaseqmodel.html

In [4]:
from gensim.models import LdaSeqModel

def GetTopicModel(corpus, id2word,dictionary, numberOfTopics = 10, chunkSize=2000):

    temp = dictionary[0]  
    
    return LdaSeqModel(corpus=corpus,
                       time_slice=[2330, 5992, 9234],#,9486,8986,8470,9178,9325,8368,7284,8857,5059],
                       num_topics=numberOfTopics,
                       id2word=id2word,
                       chunksize=chunkSize)

# Get the rule associations

In [5]:
from apyori import apriori
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime
import pandas as pd  
from IPython.display import HTML 

def PrintRuleAssociation(support, confidence, bow, listOfDocs, lift, length = None):   
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    if len(bow) == 0:
        print('No topics found')
        return
    
    rules = apriori(bow,
                    min_support= support,
                    min_confidence= confidence,
                    min_lift= lift,
                    max_length= length)
    
    results = list(rules)
    
    df = pd.DataFrame(columns=('Left Hand Side',
                               'Right Hand Side',
                               'Support',
                               'Confidence',
                               'Lift',
                               'Count'))
    
    Support =[]
    Confidence = []
    Lift = []
    Items = []
    Antecedent = []
    Consequent=[]
    Count = []
    tfidf = []
      
    maxCount = 0
    tfidfAverage = 0
    
    for RelationRecord in results:
        for ordered_stat in RelationRecord.ordered_statistics:
            #print("count:", len(ordered_stat.items_base) + len(ordered_stat.items_add))
            #print("cons:", ordered_stat.items_add)
            if IsAntecedentDifferentFromConsequent(ordered_stat.items_base, ordered_stat.items_add):
                if maxCount < len(ordered_stat.items_base) + len(ordered_stat.items_add):
                    maxCount = len(ordered_stat.items_base) + len(ordered_stat.items_add)
                    
                consequences = list(ordered_stat.items_base)
                antecedent = list(ordered_stat.items_add)
                Support.append(RelationRecord.support)
                Antecedent.append(ordered_stat.items_base)
                Consequent.append(ordered_stat.items_add)
                Confidence.append(ordered_stat.confidence)
                Lift.append(ordered_stat.lift)
                Count.append(len(ordered_stat.items_base) + len(ordered_stat.items_add))
                tfidfsum = 0
                for c in consequences:
                    tfidfsum += GetWordTFIDFMeasure(c, docsAsString)
                    
                for a in antecedent:
                    tfidfsum += GetWordTFIDFMeasure(a, docsAsString)
                
                tfidfAverage =  tfidfsum/maxCount
                
    df['Left Hand Side'] = list(map(set, Antecedent))
    df['Right Hand Side'] = list(map(set, Consequent))
    df['Support'] = Support
    df['Confidence'] = Confidence
    df['Lift'] = Lift
    df['Count'] = Count
    df['TF-IDF Average'] = tfidfAverage
    
    df.sort_values(by ='Lift', ascending = False, inplace = True)
            
    display(HTML(df.to_html()))

In [6]:
from nltk.stem import LancasterStemmer

def SameStem(word1, word2, printStemmedWords = False):
    lancaster = LancasterStemmer()
    if printStemmedWords:
        print(lancaster.stem(word1), lancaster.stem(word2))

    return lancaster.stem(word1) == lancaster.stem(word2)

In [7]:
def IsAntecedentDifferentFromConsequent(Antecedent, Consequent):
    for ant in Antecedent:
        for cons in Consequent:
            if SameStem(ant, cons):
                return False
        
    return True
    

# Create the ngrams and set the ngram thresholds
min_count – Ignore all words and bigrams with total collected count lower than this value.
threshold – Represent a score threshold for forming the phrases. A phrase of words a followed by b is accepted if the score of the phrase is greater than threshold. Heavily depends on concrete scoring-function, see the scoring parameter.


In [8]:
import gensim
from gensim.models.phrases import Phrases,Phraser

def ngrams(words, minimumCount=5, threshold=15):
    bigram = Phrases(words,
                     min_count=minimumCount,
                     threshold=threshold)
    
    trigram = Phrases(bigram[words],
                      threshold=threshold)  

    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    return bigram_mod, trigram_mod


# TF-IDF WORD CHECK
Gets the TF-IDF value for a single word

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer 
import numpy as np

def GetWordTFIDFMeasure(wordToFind, docsAsString):

    tfidf_vectorizer=TfidfVectorizer(use_idf=True)
    tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docsAsString)

    tfidf = tfidf_vectorizer_vectors.todense()

    tfidf[tfidf == 0] = np.nan

    means = np.nanmean(tfidf, axis=0)

    means = dict(zip(tfidf_vectorizer.get_feature_names(), means.tolist()[0]))

    tfidf = tfidf_vectorizer_vectors.todense()

    ordered = np.argsort(tfidf*-1)
    words = tfidf_vectorizer.get_feature_names()

    for i, doc in enumerate(docsAsString):
        result = { }
        for t in range(len(doc)):
            if(words[ordered[i,t]] == wordToFind):
                return means[words[ordered[i,t]]]

# KMEANS CLUSTERING
https://www.kaggle.com/jbencina/clustering-documents-with-tfidf-and-kmeans

In [10]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd

def GetTopicsUsingTFIDFAndKMeansClustering(data, numberOfClusters, topNumberOfTerms):
    tfidf = TfidfVectorizer(use_idf=True)
    tfidf.fit(docsAsString)
    text = tfidf.transform(docsAsString)
    clusters = MiniBatchKMeans(n_clusters=10).fit_predict(text)
    
    df = pd.DataFrame(text.todense()).groupby(clusters).mean()

    labels = tfidf.get_feature_names()
    
    results = []
    for i,r in df.iterrows():
        topic = [labels[t] for t in np.argsort(r)[-topNumberOfTerms:]]
        
        #results.append(','.join([labels[t] for t in np.argsort(r)[-topNumberOfTerms:]]))
    
        results.append(topic)
    
    return results
    
    

# Get the qualifying articles from the metadata
1. Qualifies if the publication date is within a given date range 
2. Has a pmc jcon file associated

In [11]:
from collections import defaultdict
import csv
from pathlib import Path
import json
import os.path

def GetData(startDate, endDate, path):
    listofarticles = []
    try:
        with open(path + 'metadata.csv') as f_in:
            reader = csv.DictReader(f_in)
            for row in reader:
                if '-' not in row['publish_time']:
                    continue
                elif startDate > datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() or datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() > endDate:
                    continue
                if not row['pmc_json_files']:
                    continue 
            
                for json_path in row['pmc_json_files'].split(';'):
                    listofarticles.append(article(row['publish_time'], json_path))
                
    except ValueError:
        print("An error occurred: ", ValueError, " Please try again.")
    return listofarticles

# Get the textbodies from the list of pdfs
1. Remove all special characters

In [12]:
import json
import re

def GetTextBodies(listOfpdfs, path):    
    text = []    
    
    medicalWords = GetMedicalTerms("vocab.txt")

    for pdf in listOfpdfs:
        filePath = path + pdf.Information.replace(" ", "")

        if not os.path.exists(filePath):
            continue
        
        with open(filePath) as f_json:
            full_text_dict = json.load(f_json)
            textBody = []
            for paragraph_dict in full_text_dict['body_text']:          
                paragraph_text = re.sub(r'[^a-zA-Z_\s]+', '', paragraph_dict['text'])   
                paragraph_text = paragraph_text.lower()
                
                pdf.Information = paragraph_text
    return listOfpdfs

# Do the data pre-processing
1. remove stop words
2. lower case all words
3. Check to see if the word is within the list of medical terms given

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize  
from nltk.stem import WordNetLemmatizer, PorterStemmer

def CleanTheData(listOfDocs): 
    porter = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    stop_words = set(stopwords.words('english'))
  
    resultDocs = []
    words = set(nltk.corpus.words.words())
    medicalWords = GetMedicalTerms("vocab.txt")
    medicalStopWords = GetMedicalTerms("clinical-stopwords.txt")
    
    for doc in listOfDocs:
        result = []
        for word in doc.Information.split(' '):  
            
            if word in "" or len(word) <= 3 or word in stop_words or word not in medicalWords or word in medicalStopWords:
                continue
                
            result.append(word)
                
        doc.Information = result
    return listOfDocs

# Create the corpus
1. Create the dictionary with all words and word ids
2. Create the bi,tri, and quadgrams if applicable
3. remove extreme occurences of words

In [14]:
from gensim.corpora.dictionary import Dictionary

def ConvertDataToCorpus(cleaned_data):

    dictionary = Dictionary(cleaned_data)
    bigramMod, trigramMod = ngrams(cleaned_data)
    
    ngram =  [trigramMod[bigramMod[review]] for review in cleaned_data]
       
    id2word = gensim.corpora.Dictionary(ngram)
    id2word.filter_extremes(no_below=10, no_above=0.90, keep_tokens=['covid', 'coronavirus','sarscov'])
    
    id2word.compactify()
    
    corpus = [id2word.doc2bow(text) for text in ngram]
    
    return corpus, id2word, dictionary

# Get all the topics from the dynamic topic model
1. Print the top 20 words

In [15]:
def GetAllTopicsFromModel(dtm):
    topics = dtm.print_topics(3)
    #print(topics)
    
    for topic in topics:
        print("word:", topic[0]);

# Get all qualifying topics generated from LDA model
1. Get all topics from the LDA model
2. Per each topic, check to see if the coherence score is greater than the lower bounce given
3. Per word, check to see if the word or a related word exists in the current topic, if it does, do not add
4. Per word, check to see if the probablity that the word is in the current topic is greater than the lower bound given
5. Do not consider topics with only one qualifying word

Notes: can read about coherence scores here: http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
https://stackoverflow.com/questions/54762690/coherence-score-0-4-is-good-or-bad 
https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
https://levyomer.files.wordpress.com/2014/04/dependency-based-word-embeddings-acl-2014.pdf


In [16]:
import datetime
def GetInformation(minProbablity, startDate, endDate, path):
    
    print('Starting', datetime.datetime.now())
    listOfpdfs= GetData(startDate, endDate,path)
    
    if len(listOfpdfs) == 0:
        print("No PDFs found under this topic")
        exit
        
    print(len(listOfpdfs))
    
    print('Finished gathering data', datetime.datetime.now())
    data = GetTextBodies(listOfpdfs, path)
    data.sort(key=lambda x: x.Date, reverse=False)
    print('Finished sorting data', datetime.datetime.now())
    cleaned_data = CleanTheData(data)  
    print('Finished cleaning data', datetime.datetime.now())  

    justDoc = []
    for datum in cleaned_data:
        justDoc.append(datum.Information)
        
        
    
    corpus, id2word, dictionary = ConvertDataToCorpus(justDoc) 
    print('Finished converting data to corpus', datetime.datetime.now())  
    
    dtm = GetTopicModel(corpus,
                      id2word, 
                      dictionary,
                      numberOfTopics = 50,
                      chunkSize=2000)  
    print('Finished creating LDA model', datetime.datetime.now())  
    
    GetAllTopicsFromModel(dtm)
    
#    topics = GetTopicTerms(searchTerm,
#                           lda = lda, 
#                           id2word = id2word,
#                           cleaned_data=cleaned_data, 
#                           lowerEndCoherenceScore=0, 
#                           numberOfWords=30, 
#                           minimumprobablity= minProbablity,
#                           windowSize= 15,
#                           processes=10)
    
    print('Finished creating DTM model', datetime.datetime.now())  
    
 #   PrintRuleAssociation(lda, 
 #                        support=0.1,
 #                        confidence=0.8,
 #                        bow=topics, 
 #                        lift = 1, 
 #                        length = None)
 #   print('Finished', datetime.datetime.now()) 



In [17]:
def PrintTopicsInTimeSlice(index, dtm, docsAsString):
    dtmResults = dtm.print_topics(index,top_terms=10)  

    dtmJustTheWords = []

    for r in dtmResults:
        words = {}
        av = 0
        for word in r:
            #print(word)
            if "_" in word[0]:
                continue
            
            tfidf = GetWordTFIDFMeasure(word[0], docsAsString)
            print(word[0], tfidf)
            if tfidf is None:
                continue
            av += tfidf
        print('average TF-IDF: ', av/10)
        print('\n')

In [18]:
def GetRulesFromTFIDFClusters(start, end, cleaned_data):

    docsAsString = []

    for datum in cleaned_data:
        if GetArticlesFromDateRange(start,end, datum.Date):
            docsAsString.append(' '.join(str(info) for info in datum.Information))
                
    t = GetTopicsUsingTFIDFAndKMeansClustering(docsAsString, 20,10)
    PrintRuleAssociation(support=0.175,
                         confidence=0.9,
                         bow=t, 
                         listOfDocs = docsAsString,
                         lift = 1, 
                         length = None)

#first month = 2330
#second month = 5992
#third month = 9234
#fourth month = 9486
#fifth =  8986
#sixth = 8470
#7th = 9178
#8th = 9325
#9th = 8368
#10th = 7284
#11th = 8857
#12th = 5059
#Finished gathering data 2021-03-16 21:17:36.279353

#Finished gathering data 2021-03-16 21:17:52.412954
#Finished gathering data 2021-03-16 21:20:53.203475


with stop word filter:
Finished gathering data 2021-03-16 21:55:08.355088
88349
Finished gathering data 2021-03-16 21:55:24.637484
Finished gathering data 2021-03-16 21:58:17.915549
Finished cleaning data 2021-03-16 22:31:32.614923

In [19]:
startDate = datetime.date(2020, 3, 1)
endDate = datetime.date(2020, 6, 1)
path = '/Users/josekowsky/Documents/2021-01-22/'

print('Starting', datetime.datetime.now())
listOfpdfs= GetData(startDate, endDate,path)
    
if len(listOfpdfs) == 0:
    print("No PDFs found under this topic")
    exit
        
print(len(listOfpdfs))
    
print('Finished gathering data', datetime.datetime.now())


Starting 2021-04-15 06:38:45.529910
17045
Finished gathering data 2021-04-15 06:39:09.360065


In [20]:
print('Finished sorting data', datetime.datetime.now())
data = GetTextBodies(listOfpdfs, path)
data.sort(key=lambda x: x.Date, reverse=True)
print('Finished sorting data', datetime.datetime.now())

Finished sorting data 2021-04-15 06:39:09.370353
Finished sorting data 2021-04-15 06:39:29.146157


In [21]:
print('Finished cleaning data', datetime.datetime.now()) 
cleaned_data = CleanTheData(data)  
print('Finished cleaning data', datetime.datetime.now())  

Finished cleaning data 2021-04-15 06:39:29.153910
Finished cleaning data 2021-04-15 06:46:11.285593


In [22]:

print('Finished converting data to corpus', datetime.datetime.now())  
justDoc = []
docsAsString = []
for datum in cleaned_data:
    justDoc.append(datum.Information)
    
    
    docsAsString.append(' '.join(datum.Information))
          
    
corpus, id2word, dictionary = ConvertDataToCorpus(justDoc) 
print('Finished converting data to corpus', datetime.datetime.now())  
    

Finished converting data to corpus 2021-04-15 06:46:11.293620
Finished converting data to corpus 2021-04-15 06:46:20.169294


In [23]:
print('Finished DTM', datetime.datetime.now())  
dtm = GetTopicModel(corpus,
                    id2word, 
                    dictionary,
                    numberOfTopics = 5,
                    chunkSize=2000)  

print('Finished DTM', datetime.datetime.now())  

Finished DTM 2021-04-15 06:46:20.176332


  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  convergence = np.fabs((bound - old_bound) / old_bound)
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
  converged = np.fabs((lhood_old 

Finished DTM 2021-04-15 08:29:22.821506


In [24]:
PrintTopicsInTimeSlice(0,dtm, docsAsString)
print('Finished DTM 0', datetime.datetime.now()) 
PrintTopicsInTimeSlice(1,dtm, docsAsString)
print('Finished DTM 1', datetime.datetime.now()) 
PrintTopicsInTimeSlice(2,dtm, docsAsString)
print('Finished DTM 2', datetime.datetime.now()) 


patients 0.1504107076480307
covid 0.1320474047154264
infection 0.13243188517335358
treatment 0.13986611443214986
risk 0.13214325918658423
severe 0.14337737668073447
transmission 0.15657144343291882
symptoms 0.15899537076589046
conclusion 0.148222635611599
evidence 0.14060499149695882
average TF-IDF:  0.14346711891436464


approach 0.1451953625303303
results 0.13393466303536294
method 0.17895648377631135
system 0.14663821456002155
performance 0.1782543318720085
problem 0.1756125754109563
framework 0.20322382569978176
process 0.15439505632161565
network 0.1916543720862092
average TF-IDF:  0.15078648852925974


conflict 0.7503063765229514
performed 0.19859214795717375
samples 0.20195982508384402
results 0.13393466303536294
testing 0.1685599737674308
number 0.13847628202711199
author 0.6028439872928887
review 0.24261539019356848
assay 0.279479070821539
relevant 0.2953331470922841
average TF-IDF:  0.3012100863794155


virus 0.14702349928322322
infection 0.13243188517335358
covid 0.132047404

NameError: name 'start' is not defined

In [25]:
startDate = datetime.date(2020, 3, 1)
endDate = datetime.date(2020, 4, 1)
GetRulesFromTFIDFClusters(startDate, endDate, cleaned_data)
print('Finished TFIDF 0', datetime.datetime.now()) 

startDate = datetime.date(2020, 4, 1)
endDate = datetime.date(2020, 5, 1)
GetRulesFromTFIDFClusters(startDate, endDate, cleaned_data)
print('Finished TFIDF 1', datetime.datetime.now()) 
    
startDate = datetime.date(2020, 5, 1)
endDate = datetime.date(2020, 6, 1)
GetRulesFromTFIDFClusters(startDate, endDate, cleaned_data)
print('Finished TFIDF 2', datetime.datetime.now()) 

Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift,Count,TF-IDF Average
5,{author},"{conflict, relationships}",0.2,1.0,5.0,3,0.436179
8,"{conflict, relationships}",{author},0.2,1.0,5.0,3,0.436179
9,{relevant},"{conflict, reported}",0.2,1.0,5.0,3,0.436179
11,"{conflict, reported}",{relevant},0.2,1.0,5.0,3,0.436179
0,{author},{conflict},0.2,1.0,3.333333,2,0.436179
1,{author},{relationships},0.2,1.0,3.333333,2,0.436179
2,{relevant},{conflict},0.2,1.0,3.333333,2,0.436179
3,{personal},{relationships},0.2,1.0,3.333333,2,0.436179
4,{relevant},{reported},0.2,1.0,3.333333,2,0.436179
6,"{conflict, author}",{relationships},0.2,1.0,3.333333,3,0.436179


Finished TFIDF 0 2021-04-15 10:07:16.882336


Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift,Count,TF-IDF Average
4,{commercial},{relationships},0.2,1.0,5.0,2,0.520115
5,{relationships},{commercial},0.2,1.0,5.0,2,0.520115
6,{commercial},"{relationships, author}",0.2,1.0,5.0,3,0.520115
7,{relationships},"{author, commercial}",0.2,1.0,5.0,3,0.520115
8,"{author, commercial}",{relationships},0.2,1.0,5.0,3,0.520115
9,"{relationships, author}",{commercial},0.2,1.0,5.0,3,0.520115
0,{associated},{reported},0.2,1.0,3.333333,2,0.520115
1,{commercial},{author},0.2,1.0,2.5,2,0.520115
2,{conflict},{author},0.2,1.0,2.5,2,0.520115
3,{relationships},{author},0.2,1.0,2.5,2,0.520115


Finished TFIDF 1 2021-04-15 10:13:27.239126


Unnamed: 0,Left Hand Side,Right Hand Side,Support,Confidence,Lift,Count,TF-IDF Average
0,{relationships},{reported},0.2,1.0,5.0,2,0.353637
1,{reported},{relationships},0.2,1.0,5.0,2,0.353637


Finished TFIDF 2 2021-04-15 10:14:23.794330
