# Installs
1. pip install gensim
2. pip install apyori
3. pip install pandas
4. pip install nltk
5. pip install yellowbrick
6. pip install sentence_transformers
7. pip install scikit-learn-extra

In [1]:
def GetArticlesFromDateRange(startDate, endDate, dateToCheck):
    """Tell whether a ``YYYY-MM-DD`` date string lies in [startDate, endDate).

    Args:
        startDate: inclusive lower bound (compared against a ``date``).
        endDate: exclusive upper bound (compared against a ``date``).
        dateToCheck: date written as ``YYYY-MM-DD``.

    Returns:
        True when the parsed date is on or after ``startDate`` and strictly
        before ``endDate``; False otherwise.

    Raises:
        ValueError: if ``dateToCheck`` does not match ``%Y-%m-%d``.
    """
    parsed = datetime.datetime.strptime(dateToCheck, '%Y-%m-%d').date()
    return startDate <= parsed < endDate

In [2]:
class article:
    """Pairs a publication date with a piece of information about an article.

    Both values are stored exactly as given (no validation or conversion):
    ``Date`` holds the publication date and ``Information`` holds whatever
    the caller supplies (other cells store a file path first and the
    document text later).
    """

    def __init__(self, date, information):
        self.Information = information
        self.Date = date

# Got the medical documents from:
https://github.com/socd06/medical-nlp <br />
There's another tool someone made to get all medical terminologies on that link

In [3]:
def GetMedicalTerms(filePath):
    """Read a term list from a text file, one entry per line.

    Each line is stripped of surrounding whitespace and lower-cased. Blank
    lines are kept as empty strings.

    Args:
        filePath: path of the text file to read.

    Returns:
        A list with one normalised string per line of the file.
    """
    with open(filePath) as handle:
        return [rawLine.strip().lower() for rawLine in handle]

# Initializing the LDA Model
https://radimrehurek.com/gensim/models/ldaseqmodel.html

In [4]:
from gensim.models import LdaSeqModel

def GetTopicModel(corpus, id2word, dictionary, numberOfTopics=10, chunkSize=2000, timeSlice=None):
    """Build a gensim ``LdaSeqModel`` (dynamic topic model) over ``corpus``.

    Args:
        corpus: bag-of-words corpus, ordered by time.
        id2word: id -> token mapping passed through to the model.
        dictionary: gensim dictionary; only indexed once (see below).
        numberOfTopics: number of topics to fit.
        chunkSize: passed to the model as ``chunksize``.
        timeSlice: number of documents in each consecutive time slice.
            Defaults to the original hard-coded ``[2330, 5992, 9234]``
            (the first three monthly counts noted at the end of the notebook).

    Returns:
        The fitted ``LdaSeqModel``.
    """
    # Kept from the original: indexing the dictionary once. Presumably this
    # forces gensim to build its lazy id -> token table (TODO confirm).
    temp = dictionary[0]

    if timeSlice is None:
        # Further monthly counts noted by the author:
        # 9486, 8986, 8470, 9178, 9325, 8368, 7284, 8857, 5059
        timeSlice = [2330, 5992, 9234]

    return LdaSeqModel(corpus=corpus,
                       time_slice=list(timeSlice),
                       num_topics=numberOfTopics,
                       id2word=id2word,
                       chunksize=chunkSize)

# Get the rule associations

In [5]:
from apyori import apriori
from nltk.stem import WordNetLemmatizer, PorterStemmer
from datetime import datetime
import pandas as pd  
from IPython.display import HTML 

def PrintRuleAssociation(support, confidence, bow, listOfDocs, lift, length = None):
    """Mine association rules with apriori and display them as an HTML table.

    Rules whose antecedent and consequent share a word stem (as decided by
    ``IsAntecedentDifferentFromConsequent``) are dropped. Each remaining rule
    gets its own "TF-IDF Average": the mean of ``GetWordTFIDFMeasure`` over
    the rule's items, computed against ``listOfDocs``. Items for which no
    TF-IDF value is available (``None``) contribute 0 to the sum.

    Args:
        support: minimum support passed to ``apriori``.
        confidence: minimum confidence passed to ``apriori``.
        bow: transactions (one list of terms per topic/cluster).
        listOfDocs: documents (strings) used for the TF-IDF lookups.
        lift: minimum lift passed to ``apriori``.
        length: optional maximum rule length passed to ``apriori``.

    Returns:
        None. Prints 'No topics found' and returns early when ``bow`` is empty;
        otherwise displays the table sorted by lift, highest first.
    """
    if len(bow) == 0:
        print('No topics found')
        return

    rules = apriori(bow,
                    min_support=support,
                    min_confidence=confidence,
                    min_lift=lift,
                    max_length=length)

    # GetWordTFIDFMeasure refits a vectorizer on every call, so remember the
    # value of each word instead of recomputing it for every rule.
    tfidfByWord = {}

    def wordTfidf(word):
        if word not in tfidfByWord:
            tfidfByWord[word] = GetWordTFIDFMeasure(word, listOfDocs)
        return tfidfByWord[word]

    columns = ['Left Hand Side',
               'Right Hand Side',
               'Support',
               'Confidence',
               'Lift',
               'Count',
               'TF-IDF Average']
    rows = []

    for relationRecord in rules:
        for orderedStat in relationRecord.ordered_statistics:
            antecedent = orderedStat.items_base
            consequent = orderedStat.items_add
            if not IsAntecedentDifferentFromConsequent(antecedent, consequent):
                continue

            items = list(antecedent) + list(consequent)
            scores = [wordTfidf(item) for item in items]
            tfidfSum = sum(score for score in scores if score is not None)

            rows.append({
                'Left Hand Side': set(antecedent),
                'Right Hand Side': set(consequent),
                'Support': relationRecord.support,
                'Confidence': orderedStat.confidence,
                'Lift': orderedStat.lift,
                'Count': len(items),
                'TF-IDF Average': tfidfSum / len(items) if items else 0,
            })

    df = pd.DataFrame(rows, columns=columns)
    df.sort_values(by='Lift', ascending=False, inplace=True)

    # Imported here so the function also works outside a notebook session,
    # where ``display`` is not a builtin.
    from IPython.display import display
    display(HTML(df.to_html()))

In [6]:
from nltk.stem import LancasterStemmer

def SameStem(word1, word2, printStemmedWords = False):
    """Report whether two words reduce to the same Lancaster stem.

    Args:
        word1: first word.
        word2: second word.
        printStemmedWords: when true, print both stems before comparing.

    Returns:
        True if the Lancaster stems of both words are equal, else False.
    """
    stemmer = LancasterStemmer()
    firstStem = stemmer.stem(word1)
    secondStem = stemmer.stem(word2)

    if printStemmedWords:
        print(firstStem, secondStem)

    return firstStem == secondStem

In [7]:
def IsAntecedentDifferentFromConsequent(Antecedent, Consequent):
    """Check that no antecedent word shares a stem with a consequent word.

    Args:
        Antecedent: iterable of words on the left-hand side of a rule.
        Consequent: iterable of words on the right-hand side of a rule.

    Returns:
        False as soon as ``SameStem`` matches any (antecedent, consequent)
        pair; True when no pair matches (including when either side is empty).
    """
    return not any(
        SameStem(leftWord, rightWord)
        for leftWord in Antecedent
        for rightWord in Consequent
    )
    

# Create the ngrams and set the ngram thresholds
min_count – Ignore all words and bigrams with total collected count lower than this value.
threshold – Represent a score threshold for forming the phrases. A phrase of words a followed by b is accepted if the score of the phrase is greater than threshold. Heavily depends on concrete scoring-function, see the scoring parameter.


In [8]:
import gensim
from gensim.models.phrases import Phrases,Phraser

def ngrams(words, minimumCount=5, threshold=15):
    """Train bigram and trigram phrase detectors on tokenised documents.

    Args:
        words: the tokenised documents to learn phrases from.
        minimumCount: ``min_count`` for the bigram model only; the trigram
            model is built without an explicit ``min_count``.
        threshold: score threshold used by both phrase models.

    Returns:
        A ``(bigram_phraser, trigram_phraser)`` tuple of ``Phraser`` objects.
    """
    bigramPhrases = Phrases(words, min_count=minimumCount, threshold=threshold)

    # The trigram model is trained on the bigram-transformed documents.
    trigramPhrases = Phrases(bigramPhrases[words], threshold=threshold)

    return Phraser(bigramPhrases), Phraser(trigramPhrases)


# TF-IDF WORD CHECK
Gets the TF-IDF value for a single word

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer 
import numpy as np

def GetWordTFIDFMeasure(wordToFind, docsAsString):
    """Return the mean non-zero TF-IDF weight of one word across documents.

    A ``TfidfVectorizer`` is fitted on ``docsAsString`` and the weights of
    ``wordToFind`` are averaged over the documents where the weight is
    non-zero (the original did the same via NaN-masking and ``nanmean``).

    The original found the word by scanning ranked feature lists with a loop
    bounded by each document's character count, which could index past the
    vocabulary and raise IndexError. It also densified the matrix twice and
    used ``get_feature_names``, which newer scikit-learn releases no longer
    provide. The fitted ``vocabulary_`` gives the same lookup directly.

    Args:
        wordToFind: the token to look up, as the vectorizer would tokenise it.
        docsAsString: iterable of document strings.

    Returns:
        The mean non-zero TF-IDF weight as a float, or None when the word is
        not in the fitted vocabulary.
    """
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform(docsAsString)

    column = tfidf_vectorizer.vocabulary_.get(wordToFind)
    if column is None:
        return None

    # Only the stored entries of this one column are needed; the matrix
    # stays sparse.
    weights = tfidf_matrix[:, column].data
    weights = weights[weights != 0]
    if weights.size == 0:
        return None

    return float(weights.mean())

# KMEANS CLUSTERING
https://www.kaggle.com/jbencina/clustering-documents-with-tfidf-and-kmeans

In [10]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd

def GetTopicsUsingTFIDFAndKMeansClustering(data, numberOfClusters, topNumberOfTerms):
    """Cluster documents on TF-IDF vectors and return each cluster's top terms.

    Fixes over the original: ``numberOfClusters`` is now honoured (it used to
    be ignored in favour of a hard-coded 10), and feature labels come from
    the fitted ``vocabulary_`` instead of ``get_feature_names``, which newer
    scikit-learn releases no longer provide.

    Args:
        data: iterable of document strings.
        numberOfClusters: number of MiniBatchKMeans clusters.
        topNumberOfTerms: how many highest-mean-weight terms to keep per cluster.

    Returns:
        One list of terms per cluster. Each list is ordered by ascending
        mean TF-IDF weight, so the strongest term is last.
    """
    tfidf = TfidfVectorizer(use_idf=True)
    text = tfidf.fit_transform(data)
    clusters = MiniBatchKMeans(n_clusters=numberOfClusters).fit_predict(text)

    # Mean TF-IDF weight of every term within each cluster.
    clusterMeans = pd.DataFrame(text.toarray()).groupby(clusters).mean()

    # Column index -> term, rebuilt from the fitted vocabulary.
    labels = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)

    results = []
    for _, meanWeights in clusterMeans.iterrows():
        topColumns = np.argsort(meanWeights.to_numpy())[-topNumberOfTerms:]
        results.append([labels[column] for column in topColumns])

    return results
    
    

# Get the qualifying articles from the metadata
1. Qualifies if the publication date is within a given date range 
2. Has an associated PMC JSON file

In [11]:
from collections import defaultdict
import csv
from pathlib import Path
import json
import os.path

def GetData(startDate, endDate, path):
    """Collect the articles in ``metadata.csv`` published within a date range.

    A row qualifies when its ``publish_time`` is a full ``YYYY-MM-DD`` date
    with ``startDate <= date <= endDate`` and ``pmc_json_files`` is not
    empty. One ``article`` is created per ';'-separated JSON path, with
    ``Information`` set to that path.

    Fixes over the original: a row whose date cannot be parsed is skipped
    and counted, instead of aborting the scan and silently returning a
    truncated list. The date is parsed once per row. The file is opened
    with ``newline=''`` as the ``csv`` module requires.

    Args:
        startDate: inclusive lower bound (``datetime.date``).
        endDate: inclusive upper bound (``datetime.date``).
        path: directory prefix; ``'metadata.csv'`` is appended verbatim, so it
            must end with a path separator.

    Returns:
        A list of ``article`` objects, in file order.
    """
    listofarticles = []
    skippedRows = 0

    with open(path + 'metadata.csv', newline='') as f_in:
        for row in csv.DictReader(f_in):
            publishTime = row['publish_time']

            # Values without '-' (e.g. a year only) cannot be placed in a range.
            if '-' not in publishTime:
                continue

            try:
                publishDate = datetime.datetime.strptime(publishTime, '%Y-%m-%d').date()
            except ValueError:
                skippedRows += 1
                continue

            if publishDate < startDate or publishDate > endDate:
                continue
            if not row['pmc_json_files']:
                continue

            for json_path in row['pmc_json_files'].split(';'):
                listofarticles.append(article(publishTime, json_path))

    if skippedRows:
        print("Skipped", skippedRows, "row(s) with an unparseable publish_time")

    return listofarticles

# Get the textbodies from the list of pdfs
1. Remove all special characters

In [12]:
import json
import re

def GetTextBodies(listOfpdfs, path):
    """Replace each article's JSON path with the lower-cased body text.

    For every entry, ``path`` plus the entry's ``Information`` (with spaces
    removed) is opened as JSON and the ``text`` of all ``body_text``
    paragraphs is concatenated, lower-cased, and stored back in
    ``Information``.

    Fixes over the original: the unused ``vocab.txt`` load is gone, so the
    function no longer fails when that file is absent. The unused ``text``
    list is removed, and paragraphs are joined once instead of by repeated
    string concatenation.

    NOTE(review): entries whose file does not exist are left untouched, so
    their ``Information`` is still a path and downstream steps will treat it
    as text. Confirm whether such entries should be dropped instead.

    Args:
        listOfpdfs: objects with a writable ``Information`` attribute.
        path: directory prefix prepended verbatim to each relative JSON path.

    Returns:
        The same ``listOfpdfs`` list, modified in place.
    """
    for pdf in listOfpdfs:
        filePath = path + pdf.Information.replace(" ", "")

        if not os.path.exists(filePath):
            continue

        with open(filePath) as f_json:
            full_text_dict = json.load(f_json)

        pdf.Information = "".join(
            paragraph_dict['text'].lower()
            for paragraph_dict in full_text_dict['body_text']
        )

    return listOfpdfs

# Do the data pre-processing
1. remove stop words
2. lower case all words
3. Check to see if the word is within the list of medical terms given

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize  
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem import LancasterStemmer


def CleanTheData(listOfDocs):
    """Filter and Porter-stem the text of each document in place.

    The text is split on single spaces. A word is dropped when it has three
    or fewer characters, starts with a backslash, '{' or '$', or is an NLTK
    English stop word. Every kept word is stemmed and written back followed
    by one space, so a non-empty result ends with a trailing space (as in
    the original).

    Fixes over the original: the NLTK ``words`` corpus, ``vocab.txt`` and
    ``clinical-stopwords.txt`` were loaded on every call but never used
    (their filters are disabled), which cost time and made the function fail
    when those resources were missing. Stems are now cached per distinct
    word, and the output is built with one join instead of repeated string
    concatenation.

    Args:
        listOfDocs: objects with a string ``Information`` attribute.

    Returns:
        The same ``listOfDocs`` list, with ``Information`` replaced.
    """
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    # The same word is stemmed many times across a large corpus; stemming
    # depends only on the word, so compute it once.
    stemCache = {}

    for doc in listOfDocs:
        stems = []
        for word in doc.Information.split(' '):
            # The length test also covers the empty strings produced by
            # consecutive spaces, so word[0] is safe afterwards.
            if len(word) <= 3 or word[0] in ('\\', '{', '$') or word in stop_words:
                continue

            stem = stemCache.get(word)
            if stem is None:
                stem = stemCache[word] = porter.stem(word)
            stems.append(stem)

        doc.Information = "".join(stem + " " for stem in stems)

    return listOfDocs

# Create the corpus
1. Create the dictionary with all words and word ids
2. Create the bi,tri, and quadgrams if applicable
3. Remove extreme occurrences of words

In [14]:
from gensim.corpora.dictionary import Dictionary

def ConvertDataToCorpus(cleaned_data):
    """Turn tokenised documents into a bag-of-words corpus with n-gram phrases.

    Args:
        cleaned_data: tokenised documents (one token sequence per document).

    Returns:
        A ``(corpus, id2word, dictionary)`` tuple:
        ``corpus`` is the bag-of-words form of the phrase-merged documents,
        ``id2word`` is the dictionary built from those documents after
        filtering extremes (the tokens 'covid', 'coronavirus' and 'sarscov'
        are always kept), and ``dictionary`` is the unfiltered dictionary
        built from the raw ``cleaned_data``.
    """
    dictionary = Dictionary(cleaned_data)
    bigramMod, trigramMod = ngrams(cleaned_data)

    # Apply the bigram model first, then the trigram model on its output.
    phrasedDocs = []
    for tokens in cleaned_data:
        phrasedDocs.append(trigramMod[bigramMod[tokens]])

    id2word = gensim.corpora.Dictionary(phrasedDocs)
    id2word.filter_extremes(no_below=10, no_above=0.90, keep_tokens=['covid', 'coronavirus','sarscov'])
    id2word.compactify()

    corpus = list(map(id2word.doc2bow, phrasedDocs))

    return corpus, id2word, dictionary

# Get all the topics from the dynamic topic model
1. Print the top 20 words

In [15]:
def GetAllTopicsFromModel(dtm):
    """Print the first entry of every topic returned by ``dtm.print_topics(3)``.

    Args:
        dtm: a model exposing ``print_topics``; the argument 3 is passed
            positionally (presumably the time-slice index — TODO confirm
            against the model's API).

    Returns:
        None; one ``word: <first entry>`` line is printed per topic.
    """
    for topicEntries in dtm.print_topics(3):
        print("word:", topicEntries[0])

# Get all qualifying topics generated from LDA model
1. Get all topics from the LDA model
2. For each topic, check whether the coherence score is greater than the given lower bound
3. Per word, check to see if the word or a related word exists in the current topic; if it does, do not add it
4. Per word, check to see if the probability that the word is in the current topic is greater than the lower bound given
5. Do not consider topics with only one qualifying word

Notes: can read about coherence scores here: http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
https://stackoverflow.com/questions/54762690/coherence-score-0-4-is-good-or-bad 
https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
https://levyomer.files.wordpress.com/2014/04/dependency-based-word-embeddings-acl-2014.pdf


In [16]:
import datetime
def GetInformation(minProbablity, startDate, endDate, path):
    """Run the dynamic-topic-model pipeline for articles in a date range.

    Steps: gather qualifying articles, load their text, sort by date
    (ascending), clean, build the corpus, fit the topic model, and print the
    topics. Progress messages with timestamps are printed along the way.

    Fixes over the original:
    * the bare ``exit`` was a no-op expression, so an empty result fell
      through to the rest of the pipeline; the function now returns;
    * each cleaned document is split into tokens before being handed to
      ``ConvertDataToCorpus``, which builds gensim dictionaries and
      therefore needs token lists rather than whole strings.

    The topic-term extraction and rule-association steps that were commented
    out in the original remain disabled.

    Args:
        minProbablity: currently unused (it was only referenced by the
            disabled topic-term step).
        startDate: lower date bound passed to ``GetData``.
        endDate: upper date bound passed to ``GetData``.
        path: dataset directory prefix passed to ``GetData``/``GetTextBodies``.

    Returns:
        None.
    """
    print('Starting', datetime.datetime.now())
    listOfpdfs = GetData(startDate, endDate, path)

    if len(listOfpdfs) == 0:
        print("No PDFs found under this topic")
        return

    print(len(listOfpdfs))

    print('Finished gathering data', datetime.datetime.now())
    data = GetTextBodies(listOfpdfs, path)
    data.sort(key=lambda x: x.Date, reverse=False)
    print('Finished sorting data', datetime.datetime.now())
    cleaned_data = CleanTheData(data)
    print('Finished cleaning data', datetime.datetime.now())

    # One token list per document; split() also discards the empty token
    # that the trailing space left by CleanTheData would otherwise produce.
    justDoc = [datum.Information.split() for datum in cleaned_data]

    corpus, id2word, dictionary = ConvertDataToCorpus(justDoc)
    print('Finished converting data to corpus', datetime.datetime.now())

    dtm = GetTopicModel(corpus,
                        id2word,
                        dictionary,
                        numberOfTopics = 50,
                        chunkSize=2000)
    print('Finished creating LDA model', datetime.datetime.now())

    GetAllTopicsFromModel(dtm)

    print('Finished creating DTM model', datetime.datetime.now())



In [17]:
def PrintTopicsInTimeSlice(index, dtm, docsAsString):
    """Print TF-IDF values for the top terms of each topic in one time slice.

    For every topic returned by ``dtm.print_topics(index, top_terms=10)``,
    each term without an underscore is looked up with
    ``GetWordTFIDFMeasure`` and printed with its value. Terms containing
    '_' are skipped, and values of ``None`` are printed but not summed.
    The sum is then divided by a fixed 10 (regardless of how many terms
    contributed) and printed as the average.

    Args:
        index: first argument passed to ``dtm.print_topics``.
        dtm: model exposing ``print_topics``.
        docsAsString: documents passed to ``GetWordTFIDFMeasure``.

    Returns:
        None.
    """
    for topic in dtm.print_topics(index,top_terms=10):
        runningTotal = 0
        for entry in topic:
            term = entry[0]
            if "_" in term:
                continue

            score = GetWordTFIDFMeasure(term, docsAsString)
            print(term, score)
            if score is not None:
                runningTotal += score

        print('average TF-IDF: ', runningTotal/10)
        print('\n')

In [18]:
def GetRulesFromTFIDFClusters(start, end, cleaned_data):
    """Cluster the documents of a date range and display association rules.

    Documents whose ``Date`` falls in ``[start, end)`` (per
    ``GetArticlesFromDateRange``) are clustered with TF-IDF + k-means
    (20 clusters, top 10 terms each). The per-cluster term lists are then
    mined for association rules.

    Fix over the original: ``Information`` is a plain string after
    ``CleanTheData``, and joining a string with ``' '.join(...)`` splits it
    into single characters, leaving nothing for the TF-IDF tokenizer to
    work with. Strings are now used as they are; other iterables of tokens
    are still joined with spaces as before.

    Args:
        start: inclusive start date.
        end: exclusive end date.
        cleaned_data: objects with ``Date`` (``YYYY-MM-DD`` string) and
            ``Information`` (string or iterable of tokens).

    Returns:
        None; the rules table is displayed by ``PrintRuleAssociation``.
    """
    docsAsString = []

    for datum in cleaned_data:
        if not GetArticlesFromDateRange(start, end, datum.Date):
            continue

        information = datum.Information
        if isinstance(information, str):
            docsAsString.append(information)
        else:
            docsAsString.append(' '.join(str(token) for token in information))

    t = GetTopicsUsingTFIDFAndKMeansClustering(docsAsString, 20, 10)
    PrintRuleAssociation(support=0.175,
                         confidence=0.9,
                         bow=t,
                         listOfDocs=docsAsString,
                         lift=1,
                         length=None)

In [19]:
def plot_tsne_pca(data, labels):
    """Draw side-by-side PCA and t-SNE scatter plots of a random document sample.

    Args:
        data: matrix supporting ``data[rows, :]`` indexing and ``.todense()``
            (presumably the sparse TF-IDF matrix — TODO confirm).
        labels: cluster label per row; must support indexing with an index
            array (``labels[max_items]``).

    NOTE(review): ``cm`` and ``plt`` are not imported in this cell; they are
    only imported in a later cell, which must run before this is called.
    """
    max_label = max(labels)
    # Draw 30000 row indices; no ``replace`` argument is given, so NumPy's
    # default sampling mode applies.
    max_items = np.random.choice(range(data.shape[0]), size=30000)
    
    # 2-component PCA for the first plot.
    pca = PCA(n_components=2).fit_transform(data[max_items,:].todense())
    # t-SNE is run on a 50-component PCA reduction of the same sample.
    tsne = TSNE().fit_transform(PCA(n_components=50).fit_transform(data[max_items,:].todense()))
    
    
    # Plot only 300 distinct points out of the sample.
    idx = np.random.choice(range(pca.shape[0]), size=300, replace=False)
    label_subset = labels[max_items]
    # Map each label to an HSV colour scaled by the largest label.
    label_subset = [cm.hsv(i/max_label) for i in label_subset[idx]]
    
    f, ax = plt.subplots(1, 2, figsize=(14, 6))
    
    ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset)
    ax[0].set_title('PCA Cluster Plot')
    
    ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset)
    ax[1].set_title('TSNE Cluster Plot')

In [20]:
def get_top_keywords(data, clusters, labels, n_terms):
    """Print the ``n_terms`` highest-mean-weight labels of every cluster.

    Args:
        data: matrix exposing ``.todense()``, one row per document.
        clusters: cluster id per row, used as the group-by key.
        labels: column index -> term lookup.
        n_terms: number of terms to print per cluster.

    Returns:
        None. For each cluster a ``Cluster <id>`` header is printed, followed
        by the comma-joined terms in ascending order of mean weight.
    """
    clusterMeans = pd.DataFrame(data.todense()).groupby(clusters).mean()

    for clusterId, meanWeights in clusterMeans.iterrows():
        print('\nCluster {}'.format(clusterId))
        topTerms = [labels[column] for column in np.argsort(meanWeights)[-n_terms:]]
        print(','.join(topTerms))

In [21]:
def GetTextSummaryData(startDate, endDate, path):
    """Cluster the articles of a date range and print each cluster's core documents.

    Steps: gather and load articles, keep an untouched deep copy of the
    text, clean the documents, drop those with fewer than 20
    space-separated tokens, choose k with the yellowbrick elbow method over
    k = 2..19, fit k-means on TF-IDF vectors, print the top 30 terms per
    cluster, then print the original text of the three documents closest to
    each cluster centre.

    Fixes over the original:
    * the bare ``exit`` was a no-op; the function now returns when nothing
      is found;
    * ``range(len(...) - 1)`` skipped the last document;
    * short documents were popped from ``data`` (the same list that
      ``CleanTheData`` returns) instead of from the untouched copy, so the
      printed documents did not line up with the clustered ones; the
      originals are now filtered in lock-step;
    * k-means was fitted twice independently, so the keyword clusters and
      the cluster centres came from different models; one fit is used now;
    * a missing elbow (``elbow_value_`` is None) is reported instead of
      being passed to ``KMeans``;
    * term labels come from ``vocabulary_`` rather than
      ``get_feature_names``, which newer scikit-learn releases no longer
      provide.

    Args:
        startDate: lower date bound passed to ``GetData``.
        endDate: upper date bound passed to ``GetData``.
        path: dataset directory prefix.

    Returns:
        None.
    """
    import copy
    from sklearn.cluster import KMeans
    from sklearn.metrics import pairwise_distances
    from yellowbrick.cluster import KElbowVisualizer

    print('Starting', datetime.datetime.now())
    listOfpdfs = GetData(startDate, endDate, path)

    if len(listOfpdfs) == 0:
        print("No PDFs found under this topic")
        return

    print(len(listOfpdfs))
    print('Finished gathering data', datetime.datetime.now())

    data = GetTextBodies(listOfpdfs, path)
    # CleanTheData rewrites the documents in place, so keep the readable
    # text separately for printing at the end.
    keep_data = copy.deepcopy(data)

    print('Started cleaning data', datetime.datetime.now())
    cleaned_data = CleanTheData(data)
    print('Finished cleaning data', datetime.datetime.now())

    # Filter the cleaned documents and their originals together so that row
    # i of the TF-IDF matrix always corresponds to keptOriginals[i].
    justDoc = []
    keptOriginals = []
    for cleanedDoc, originalDoc in zip(cleaned_data, keep_data):
        if len(cleanedDoc.Information.split(' ')) < 20:
            continue
        justDoc.append(re.sub(r'[^a-zA-Z_\s]+', '', cleanedDoc.Information))
        keptOriginals.append(originalDoc)

    # Number of short documents that were dropped.
    print(len(cleaned_data) - len(justDoc))
    print('Finished removing short documents', datetime.datetime.now())

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(justDoc)

    visualizer = KElbowVisualizer(KMeans(), k=(2,20), timings=True)
    visualizer.fit(X)        # Fit the data to the visualizer

    numberOfClusters = visualizer.elbow_value_
    print(numberOfClusters)
    if numberOfClusters is None:
        print('No elbow found; cannot choose a number of clusters')
        return

    print('Finished clustering documents', datetime.datetime.now())

    kmeansCluster = KMeans(n_clusters=numberOfClusters).fit(X)

    # Column index -> term, rebuilt from the fitted vocabulary.
    featureNames = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
    get_top_keywords(X, kmeansCluster.labels_, featureNames, 30)

    print('Finished getting tf-idf labels', datetime.datetime.now())

    distances = pairwise_distances(kmeansCluster.cluster_centers_, X)
    # Up to three nearest documents per cluster centre, closest first.
    ind = [np.argsort(row)[:3] for row in distances]

    print('Finished getting close documents to cluster centers', datetime.datetime.now())
    for x in ind:
        print(x, '\n')
        for y in x:
            print(keptOriginals[y].Information, '\n')

    print('Finished close documents', datetime.datetime.now())
    
    

#first month = 2330
#second month = 5992
#third month = 9234
#fourth month = 9486
#fifth =  8986
#sixth = 8470
#7th = 9178
#8th = 9325
#9th = 8368
#10th = 7284
#11th = 8857
#12th = 5059
#Finished gathering data 2021-03-16 21:17:36.279353

#Finished gathering data 2021-03-16 21:17:52.412954
#Finished gathering data 2021-03-16 21:20:53.203475


with stop word filter:
Finished gathering data 2021-03-16 21:55:08.355088
88349
Finished gathering data 2021-03-16 21:55:24.637484
Finished gathering data 2021-03-16 21:58:17.915549
Finished cleaning data 2021-03-16 22:31:32.614923

In [22]:
#GetTextSummaryData(datetime.date(2020, 3, 1),datetime.date(2020, 4, 1),'/Volumes/External HD/2022-02-07/')

In [23]:
#GetTextSummaryData(datetime.date(2020, 6, 1),datetime.date(2020, 7, 1),'/Volumes/External HD/2022-02-07/')

In [24]:
#GetTextSummaryData(datetime.date(2021, 1, 1),datetime.date(2021, 2, 1),'/Volumes/External HD/2022-02-07/')

In [25]:
#GetTextSummaryData(datetime.date(2021, 3, 1),datetime.date(2021, 4, 1),'/Volumes/External HD/2022-02-07/')

In [26]:
#GetTextSummaryData(datetime.date(2021, 6, 1),datetime.date(2021, 7, 1),'/Volumes/External HD/2022-02-07/')

In [27]:
#GetTextSummaryData(datetime.date(2022, 1, 1),datetime.date(2022, 2, 1),'/Volumes/External HD/2022-02-07/')



In [28]:
import datetime

# Date window and dataset location for this run.
startDate = datetime.date(2020, 3, 1)
endDate = datetime.date(2021, 3, 1)
path = '/Volumes/External HD/2022-02-07/'

print('Starting', datetime.datetime.now())
listOfpdfs = GetData(startDate, endDate, path)

if len(listOfpdfs) == 0:
    print("No PDFs found under this topic")
    # The original bare `exit` was a no-op expression and let the cell
    # continue; raising SystemExit actually stops execution here.
    raise SystemExit

print(len(listOfpdfs))

print('Finished gathering data', datetime.datetime.now())


Starting 2022-03-12 19:08:49.299246
100846
Finished gathering data 2022-03-12 19:09:14.095567


In [29]:
# NOTE(review): 262461 appears to be a count noted from an earlier run; its
# meaning is not shown here — confirm or remove.
# Show the first article's Information, which at this point is still the JSON
# path stored by GetData.
print(listOfpdfs[0].Information)

document_parses/pmc_json/PMC7052093.xml.json


In [30]:
import copy
# Replace each article's JSON path with its lower-cased body text (in place).
data = GetTextBodies(listOfpdfs, path)
# Independent copy of the loaded text, taken before cleaning rewrites `data`;
# later cells print documents from this copy.
keep_data = copy.deepcopy(data)

In [31]:
# Stop-word filtering and stemming; CleanTheData modifies the objects in
# `data` in place and returns that same list.
print('Started cleaning data', datetime.datetime.now()) 
cleaned_data = CleanTheData(data)  
print('Finished cleaning data', datetime.datetime.now())  

Started cleaning data 2022-03-12 19:11:19.170452
Finished cleaning data 2022-03-12 19:57:54.457065


In [32]:
# Sort both lists newest-first with the same key so that position i in
# cleaned_data and keep_data keeps referring to the same article
# (Date strings are compared as text).
cleaned_data.sort(key=lambda x: x.Date, reverse=True)
keep_data.sort(key=lambda x: x.Date, reverse=True)

In [33]:
print('Finished converting data to corpus', datetime.datetime.now())
justDoc = []
docsAsString = []

# Record the indices of short documents (fewer than 10 space-separated
# tokens) so they can be removed from keep_data in descending order later.
# Every other document has non-letter characters stripped and goes into
# justDoc. The original loop used range(len(cleaned_data)-1) and therefore
# never looked at the last document.
listOfIndices = []
for n, cleanedDoc in enumerate(cleaned_data):
    if len(cleanedDoc.Information.split(' ')) < 10:
        listOfIndices.append(n)
    else:
        justDoc.append(re.sub(r'[^a-zA-Z_\s]+', '', cleanedDoc.Information))

#corpus, id2word, dictionary = ConvertDataToCorpus(justDoc)
print('Finished converting data to corpus', datetime.datetime.now())


Finished converting data to corpus 2022-03-12 19:57:54.921559
Finished converting data to corpus 2022-03-12 19:58:54.361792


In [34]:
# Remove the short documents from keep_data. Indices are processed from
# highest to lowest so that each pop leaves the remaining (smaller) indices
# valid.
listOfIndices.sort(reverse = True)

for number in listOfIndices:
    # Prints one line per removed document.
    print(number)
    keep_data.pop(number)

100808
100806
100785
100769
100762
100760
100748
100722
100716
100703
100678
100670
100657
100651
100617
100613
100600
100519
100504
100462
100460
100441
100436
100371
100333
100320
100257
100188
100166
100139
100138
100081
100075
100057
100053
100025
100010
100009
99981
99895
99883
99835
99822
99816
99808
99790
99785
99738
99690
99686
99665
99619
99615
99610
99606
99604
99533
99528
99508
99483
99475
99468
99464
99317
99314
99303
99299
99201
99143
99113
99106
99087
99072
99060
99054
99019
98906
98871
98823
98794
98755
98752
98748
98683
98675
98670
98665
98664
98662
98655
98599
98595
98591
98578
98549
98525
98460
98457
98455
98436
98418
98411
98395
98371
98370
98361
98358
98327
98300
98277
98276
98256
98255
98249
98246
98240
98230
98213
98190
98177
98175
98169
98168
98162
98161
98132
98122
98115
98106
98102
98088
98086
98067
98044
98035
98030
98029
98023
98019
98007
97981
97930
97892
97887
97883
97876
97860
97850
97821
97787
97761
97752
97747
97744
97741
97737
97724
97716
97689
97659
97

57088
57047
56952
56864
56863
56824
56794
56776
56742
56728
56700
56686
56610
56600
56595
56527
56508
56479
56475
56440
56439
56432
56431
56391
56371
56311
56288
56243
56233
56156
56110
56096
56060
56057
56024
55997
55977
55952
55898
55894
55835
55759
55719
55546
55512
55490
55415
55410
55403
55396
55300
55284
55280
55245
55242
55238
55169
55162
55127
55106
55087
55076
54974
54965
54911
54906
54872
54869
54798
54763
54585
54583
54338
54274
54270
54216
54208
54202
54158
54148
54145
54144
54015
53997
53982
53936
53869
53859
53800
53713
53712
53546
53484
53467
53400
53376
53351
53335
53289
53276
53237
53196
53081
53003
53002
52922
52919
52896
52823
52733
52707
52661
52659
52656
52579
52567
52544
52513
52492
52416
52414
52395
52323
52190
52182
52173
52166
52155
52121
52076
51987
51965
51954
51853
51835
51786
51783
51730
51713
51682
51637
51585
51542
51506
51499
51468
51453
51397
51362
51352
51317
51296
51289
51219
51166
51155
51146
51119
51076
51049
51047
51007
51004
50882
50879
50873
5086

In [35]:
# Number of documents that were flagged as too short and removed.
print(len(listOfIndices))

2922


In [None]:
# Tokenise every cleaned document on single spaces and build the gensim
# corpus, the filtered id2word dictionary and the raw dictionary from them.
print('Converting data to corpus Started', datetime.datetime.now())
c = [datum.Information.split(' ') for datum in cleaned_data]

corpus, id2word, dictionary = ConvertDataToCorpus(c)
# Show the bag-of-words form of the first document.
print(corpus[0])
print('Converting data to corpus Ended', datetime.datetime.now())

In [None]:
# Elbow method: fit k-means for k = 2..9 on the TF-IDF vectors of justDoc and
# plot the inertia of each fit against k.
print('Kmeans started', datetime.datetime.now()) 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer 

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(justDoc)
# Inertia of each fitted model, in the same order as K.
distortions = []
K = range(2,10)
for k in K:
    # No random_state is set, so repeated runs can give different inertias.
    kmeanModel = KMeans(n_clusters=k)
    kmeanModel.fit(X)
    print(kmeanModel.inertia_)
    distortions.append(kmeanModel.inertia_)
    
plt.figure(figsize=(16,8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
print('Kmeans Ended', datetime.datetime.now()) 

93138.32143135884
92015.58617199177
91295.82636503247
90813.94407996573
90522.70958229611
90213.61363953436
89962.98583621046
89665.71079006679


92003.66571725105
90550.52568512512
89842.1898318387
89367.18884255717
89079.56905941298
88797.46062755861
88507.39039362066
88247.09173319419

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.cluster import KMeans

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer 

# TF-IDF vectors for the retained documents.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(justDoc)

# Assign every document to one of 5 clusters (k is fixed here; presumably
# chosen from the elbow plot — TODO confirm).
clusters = KMeans(5).fit_predict(X)

# Optional visual check of the clusters (disabled).
#plot_tsne_pca(X, clusters)




In [None]:
# `vectorizer` was already fitted on justDoc when X was built, so refitting it
# is unnecessary; transforming with the fitted vocabulary is enough.
text = vectorizer.transform(justDoc)

# Column index -> term, rebuilt from the fitted vocabulary. This replaces
# vectorizer.get_feature_names(), which newer scikit-learn releases no longer
# provide.
featureNames = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Print the 30 highest-mean-weight terms of every cluster.
get_top_keywords(text, clusters, featureNames, 30)








In [None]:
from sklearn.metrics import pairwise_distances

# NOTE(review): this is a fresh k-means fit, independent of the earlier
# `clusters` assignment; with no random_state the two may not agree.
kmeansCluster = KMeans(n_clusters=5).fit(X)

print(datetime.datetime.now())

# Distance from every cluster centre (rows) to every document (columns).
distances = pairwise_distances(kmeansCluster.cluster_centers_, X)
# For each centre: indices of the three smallest distances. argpartition does
# not sort within those three, so their order is not guaranteed to be by
# distance.
ind = [np.argpartition(i, 3)[:3][::-1] for i in distances]
# TF-IDF rows of those nearest documents.
closest = [X[indexes] for indexes in ind]
# Bare expression: displayed as the cell's output in the notebook.
ind

In [None]:
# For every cluster, print the indices of its nearest documents followed by
# the untouched (pre-cleaning) text of each of them from keep_data.
for x in ind:
    print(x, '\n')
    for y in x:
        print(keep_data[y].Information, '\n')
        
    print('\n')
        

In [None]:
# Bare expression: shows the stored text of the document at index 5 as the
# cell's output.
keep_data[5].Information




In [None]:

pip install --upgrade tensorflow


In [None]:
# NOTE(review): `Summarize` is not defined anywhere in the visible notebook;
# this cell only runs if it is defined elsewhere.
Summarize("dear dr concerned about coronavirus: as of march 17, a total of 179,111 confirmed covid-19 cases have been reported to the world health organization, 3503 of which are within the united states.1 the coronavirus has broad virulence and a 14-day latent period, making risk of viral transmission and subsequent illness high.2 in the outpatient setting, dermatologists are challenged with upholding seemingly competing professional duties. for example, triaging a patient who requires urgent in-person evaluation but is at high-risk of covid-19 transmission or illness illustrates the current moral dilemma facing dermatologists. values also conflict when the very measures that protect staff and others from infection threaten employee salary and practice solvency.dermatologists at west china hospital, located in a province hard-hit by 2019-ncov, initially closed outpatient clinics and cancelled elective operations.3 as the pandemic progressed, they resumed outpatient office visits and operations on a case-by-case b", 'sshleifer/distilbart-cnn-6-6', maxLength = 125, minLength = 10, lengthPenalty = 2.0, beams = 4)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import datetime
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Sample document (lower-cased article text) used by the summarisation cells.
document = "dear dr concerned about coronavirus: as of march 17, a total of 179,111 confirmed covid-19 cases have been reported to the world health organization, 3503 of which are within the united states.1 the coronavirus has broad virulence and a 14-day latent period, making risk of viral transmission and subsequent illness high.2 in the outpatient setting, dermatologists are challenged with upholding seemingly competing professional duties. for example, triaging a patient who requires urgent in-person evaluation but is at high-risk of covid-19 transmission or illness illustrates the current moral dilemma facing dermatologists. values also conflict when the very measures that protect staff and others from infection threaten employee salary and practice solvency.dermatologists at west china hospital, located in a province hard-hit by 2019-ncov, initially closed outpatient clinics and cancelled elective operations.3 as the pandemic progressed, they resumed outpatient office visits and operations on a case-by-case b"
# Generation settings passed to model.generate below.
minLength =10
lengthPenalty = 2.0
beams = 4


# Checkpoint name; the model and its tokenizer are loaded from the same id.
embedding = "sshleifer/distilbart-cnn-6-6"
model = AutoModelForSeq2SeqLM.from_pretrained(embedding)
tokenizer = AutoTokenizer.from_pretrained(embedding)
# Upper bound on the generated summary length (the input is separately
# truncated to 1024 tokens on the next line).
maxLength = 1024
# NOTE(review): the "summarize: " prefix is also used in the T5 cell; confirm
# it is wanted for this checkpoint too.
inputs = tokenizer.encode("summarize: " + document, return_tensors="pt", max_length=1024, truncation=True)
outputs = model.generate(inputs, max_length=maxLength, min_length=minLength, length_penalty=lengthPenalty, num_beams=beams, early_stopping=True)

# Decode the first generated sequence back to text.
print(tokenizer.decode(outputs[0]))

In [None]:
from transformers import pipeline

# Summarise `document` with the transformers "summarization" pipeline; no
# model name is given, so the library's default for that task is used.
summarizer = pipeline("summarization")

# do_sample=False requests deterministic decoding; summary length is bounded
# to 30..130 tokens.
print(summarizer(document, max_length=130, min_length=30, do_sample=False))

In [None]:
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

# Summarise `document` with the TensorFlow T5-base checkpoint.
model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# T5 uses a max_length of 512 so we cut the article to 512 tokens.
# truncation=True is required for that cut to happen: max_length alone does
# not shorten the input. This also matches the distilbart cell, which
# already passes truncation=True.
inputs = tokenizer("summarize: " + document, return_tensors="tf", max_length=512, truncation=True)
outputs = model.generate(
    inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
)

# Decode the first generated sequence back to text.
print(tokenizer.decode(outputs[0]))

In [None]:
conda update -n base -c defaults conda