Tests for this notebook:

1. Check to see if the keywords in each document correspond with a certain topic
2. Check to see if there's a difference between TF-IDF and BERT with this keyword extraction
3. Check to see if there's a difference using the different topic modeling


https://www.analyticsvidhya.com/blog/2021/07/topic-modeling-with-naive-bayes-classifier/


TODO tomorrow - Do the Kmeans first and elbow test and use that k value to determine lda topics and then compare documents in clusters


TODO:

1. Add Ngram
2. Add back data cleaning and stemming/Lemming
3. check the data and cosine simularity before 1 and 2 though

In [1]:
def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = (A ∩ B)/(A ∪ B)
    - Goal is low Jaccard scores for coverage of the diverse elements
    """
    intersection = set(topic_1).intersection(set(topic_2))
    union = set(topic_1).union(set(topic_2))
                    
    return float(len(intersection))/float(len(union))

In [2]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

from yellowbrick.cluster import KElbowVisualizer

def GetOptimalKMeans(X, maxK):
    model = KMeans(n_init=1000,random_state=3425, max_iter=3000, tol=0.001)
    visualizer = KElbowVisualizer(model, k=(2,maxK), metric='distortion', timings=False, locate_elbow=True)

    visualizer.fit(X)        # Fit the data to the visualizer
    visualizer.show()        # Finalize and render the figure
    
    print("Optimal K-value:", visualizer.elbow_value_)
    
#     visualizer = KElbowVisualizer(model, k=(2,maxK), metric='silhouette', timings=False)

#     visualizer.fit(X)        # Fit the data to the visualizer
#     visualizer.show()        # Finalize and render the figure
    
#     print("Optimal K-value:", visualizer.elbow_value_)
    return visualizer.elbow_value_

In [3]:
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Error loading wordnet: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>
[nltk_data] Error loading stopwords: <urlopen error [Errno 8] nodename
[nltk_data]     nor servname provided, or not known>


False

In [4]:
from collections import defaultdict
import csv
from pathlib import Path
import json
import os.path
from langdetect import detect

import re


def GetData(startDate, endDate, path):
    listofarticles = []
    try:
        with open(path + 'metadata.csv') as f_in:
            reader = csv.DictReader(f_in)
            for row in reader:
                if '-' not in row['publish_time']:
                    continue
                elif startDate > datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() or datetime.datetime.strptime(row['publish_time'], '%Y-%m-%d').date() > endDate:
                    continue
                if not row['abstract']:
                    continue 
                      
                try:
                    language = detect(row['abstract'])
                    if 'en' != language:
                        continue
                except:
                    continue
                filePath = path + row['pmc_json_files']
#                 if( not os.path.isfile(filePath)):
#                     continue
#                 with open(filePath) as ff_in:
#                     reader = csv.DictReader(f_in)
#                     full_text_dict = json.load(f_json
#                     print(reader)
                                                                                    
                introduction = ''    
                if row['pmc_json_files']:
                    for json_path in row['pmc_json_files'].split('; '):
                        with open(path + json_path) as f_json:
                            full_text_dict = json.load(f_json)

                            for paragraph_dict in full_text_dict['body_text']:
                                paragraph_text = paragraph_dict['text']
                                section_name = paragraph_dict['section']
                                introduction += paragraph_text
                if '\\' in introduction or introduction.strip() == "" or len(introduction.split()) < 50:
                    continue
                               
                listofarticles.append(re.sub(r'[^A-Za-z ]+', '', introduction.lower()))
                
    except ValueError:
        print("An error occurred: ", ValueError, " Please try again.")
    return listofarticles

In [5]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True) # initialize(36) or initialize(os.cpu_count()-1)

def tokenize_lemma_stopwords(text):
    tokens = []
    for token in nltk.tokenize.word_tokenize(text.lower()):
#         if(len(token) <= 3 and token in stopwords.words()):
#             continue
        if(len(token) <= 3 or token in stemmedStopWords):
            continue
        tokens.append(ps.stem(token))
            
    return tokens


def dataCleaning(data):
    data["content"] = data["content"].parallel_apply(tokenize_lemma_stopwords)
    return data

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
import gensim
from gensim.models.phrases import Phrases,Phraser

def ngrams(words, minimumCount=5, threshold=15):
    bigram = Phrases(words,
                     min_count=minimumCount,
                     threshold=threshold)
    
    trigram = Phrases(bigram[words],
                      threshold=threshold)  

    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)
    return bigram_mod, trigram_mod

In [7]:
def ConvertDataToCorpus(cleaned_data):

    print('Finished creating first dictionary', datetime.datetime.now())
    bigramMod, trigramMod = ngrams(cleaned_data)
    print('Finished initializing bi/tri grams', datetime.datetime.now())
    ngram =  [trigramMod[bigramMod[review]] for review in cleaned_data]
    print('Finished creating bi/tri grams', datetime.datetime.now())
       
    print('Finished creating first dictionary', datetime.datetime.now())
    id2word = gensim.corpora.Dictionary(cleaned_data)
    id2word.filter_extremes(no_below=100, no_above=0.5)
    
    id2word.compactify()
    
    bow = [id2word.doc2bow(text) for text in cleaned_data]
    
    print('Finished creating bag of words', datetime.datetime.now())
    
    return bow, id2word

In [None]:
import datetime
import copy
print(datetime.datetime.now())
startDate = datetime.date(2020,1, 1)
endDate = datetime.date(2021, 1, 1)
path = '/Volumes/External HD/2022-02-07/'


allpdfs=GetData(startDate, endDate,path)

print(datetime.datetime.now())

2022-08-15 13:46:00.441809


In [None]:
print(len(allpdfs))

In [None]:
stemmedStopWords = {}

for word in stopwords.words():
    stemmedStopWords[ps.stem(word)] = 0


In [None]:
# Create a dictionary for vocabulary words with it's index and count

print(datetime.datetime.now())
X  = pd.DataFrame(allpdfs, columns=["content"])

wordnet_lemmatizer = WordNetLemmatizer()

cleanedData = dataCleaning(X)

X = cleanedData["content"]

print(datetime.datetime.now())

In [None]:
bow_corpus, globaldictionary = ConvertDataToCorpus(X)

In [None]:
# stemmedStopWords = {}

# for word in stopwords.words():
#     stemmedStopWords[ps.stem(word)] = 0

# toRemove = []
# for k, v in globaldictionary.items():
#     if(v in stemmedStopWords):
#         toRemove.append[k]
#     else:     
#         gDictionary[v.lower()] = 0
        
# print(len(globaldictionary))
# globaldictionary.filter_tokens(bad_ids=toRemove)
# globaldictionary.compactify()
# print(len(globaldictionary))
        

In [None]:
#iterate through the new dictionary and remove stop words, short words, words that aren't alpha numeric and lemmatize
import nltk
# english_vocab = set(w.lower() for w in nltk.corpus.words.words())
# new_english_vocab = {}

# print('Creating english dictionary', datetime.datetime.now())
# for word in english_vocab:
#     new_english_vocab[ps.stem(word)] = 0

# print('Finished english dictionary', datetime.datetime.now())
gDictionary = {}
wordsToRemove = []
for k, v in globaldictionary.items():
#    if(v in new_english_vocab):
    gDictionary[v.lower()] = 0
        

print('Finished cleaning dictionary', datetime.datetime.now())

In [None]:
# toKeep = []
# for k, v in gDictionary.items():
#     toKeep.append(globaldictionary.token2id[k])

In [None]:
# print(len(globaldictionary))
# globaldictionary.filter_tokens(good_ids=toKeep)
# print(len(globaldictionary))

for k, v in globaldictionary.items():
    print(v)
    break

In [None]:
bow_corpus = [globaldictionary.doc2bow(doc) for doc in X]

2 -0.9257380753909639
3 -0.8924911617047728
4 -0.8770794330024948
5 -0.8871086585216401
6 -0.90274696657373
7 -0.9096775832232503
8 -0.8935286011309913
9 -0.9039521317399287
10 -0.8986779384062977
11 -0.9037140381713342
12 -0.9141724166730084
13 -0.9029035359997102
14 -0.911527758475253
15 -0.8950295260971027
16 -0.8968096791358515
17 -0.9066689359608383
18 -0.9053312826265838
19 -0.9105579896968713
20 -0.904133068759054
21 -0.9121805585867648
22 -0.9072281446347834
23 -0.9119270569951463
24 -0.9145402880275769
25 -0.9162855533727349
26 -0.9177669915896141
27 -0.9217723688770518
28 -0.9194462575804779
29 -0.9177366729866188

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import LdaModel, CoherenceModel
from gensim import corpora  

dirichlet_dict = globaldictionary
bow_corpus = [dirichlet_dict.doc2bow(text) for text in X]

# Considering 1-15 topics, as the last is cut off
num_topics = list(range(31)[1:])
num_keywords = 30

LDA_models = {}
LDA_topics = {}
for i in num_topics:
    print(i)
    LDA_models[i] = LdaModel(corpus=bow_corpus,
                             id2word=dirichlet_dict,
                             num_topics=i,
                             update_every=1,
                             chunksize=len(bow_corpus),
                             passes=20,
                             alpha='auto',
                             random_state=42)

    shown_topics = LDA_models[i].show_topics(num_topics=i, 
                                             num_words=num_keywords,
                                             formatted=False)
    LDA_topics[i] = [[word[0] for word in topic[1]] for topic in shown_topics]
    
LDA_stability = {}
for i in range(0, len(num_topics)-1):
    jaccard_sims = []
    for t1, topic1 in enumerate(LDA_topics[num_topics[i]]): # pylint: disable=unused-variable
        sims = []
        for t2, topic2 in enumerate(LDA_topics[num_topics[i+1]]): # pylint: disable=unused-variable
            sims.append(jaccard_similarity(topic1, topic2))    
        
        jaccard_sims.append(sims)    
    
    LDA_stability[num_topics[i]] = jaccard_sims
                
mean_stabilities = [np.array(LDA_stability[i]).mean() for i in num_topics[:-1]]

coherences = [CoherenceModel(model=LDA_models[i], texts=corpus, dictionary=dirichlet_dict, coherence='c_v').get_coherence()\
              for i in num_topics[:-1]]

coh_sta_diffs = [coherences[i] - mean_stabilities[i] for i in range(num_keywords)[:-1]] # limit topic numbers to the number of keywords
coh_sta_max = max(coh_sta_diffs)
coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max]
ideal_topic_num_index = coh_sta_max_idxs[0] # choose less topics in case there's more than one max
ideal_topic_num = num_topics[ideal_topic_num_index]

plt.figure(figsize=(20,10))
ax = sns.lineplot(x=num_topics[:-1], y=mean_stabilities, label='Average Topic Overlap')
ax = sns.lineplot(x=num_topics[:-1], y=coherences, label='Topic Coherence')

ax.axvline(x=ideal_topic_num, label='Ideal Number of Topics', color='black')
ax.axvspan(xmin=ideal_topic_num - 1, xmax=ideal_topic_num + 1, alpha=0.5, facecolor='grey')

y_max = max(max(mean_stabilities), max(coherences)) + (0.10 * max(max(mean_stabilities), max(coherences)))
ax.set_ylim([0, y_max])
ax.set_xlim([1, num_topics[-1]-1])
                
ax.axes.set_title('Model Metrics per Number of Topics', fontsize=25)
ax.set_ylabel('Metric Level', fontsize=20)
ax.set_xlabel('Number of Topics', fontsize=20)
plt.legend(fontsize=20)
plt.show()  
    

In [None]:
lda_model = CreateLdaModel(20)

lda_model.top_topics(bow_corpus)

In [None]:
#68

In [None]:
#Now get data for a time slice
print(datetime.datetime.now())
startDate = datetime.date(2020,1, 1)
endDate = datetime.date(2020, 2, 1)
path = '/Volumes/External HD/2022-02-07/'


listOfpdfs2=GetData(startDate, endDate,path)
print(datetime.datetime.now())

In [None]:

print(datetime.datetime.now())
# Create a dictionary for vocabulary words with it's index and count
XX  = pd.DataFrame(listOfpdfs2, columns=["content"])

wordnet_lemmatizer = WordNetLemmatizer()

cleanedData = dataCleaning(XX)

XX = cleanedData["content"]

print(datetime.datetime.now())


In [None]:
# XX = []
# count = 0 
# for l in cleanedData["content"]:
#     XX.append(l.split())

bow_corpus, dictionary = ConvertDataToCorpus(XX)



In [None]:
# # Create a dictionary for vocabulary words with it's index and count
# dictionary = gensim.corpora.Dictionary(XX)


# # filter words that occurs in less than 5 documents and words that occurs in more than 50% of total documents
# # keep top 100000 frequent words
# #dictionary.filter_extremes(no_below=5, no_above=0.5)


In [None]:
# #iterate through the new dictionary and remove stop words, short words, words that aren't alpha numeric and lemmatize
# print(dictionary[0])
# goodTokens = []
# goodTokensFromLocalDictionary = []
# for k, v in dictionary.items():
#     if v in stopwords.words():
#         continue
#     goodTokensFromLocalDictionary.append(v)       
        
# for k, v in gDictionary.items():
#     if(k in goodTokensFromLocalDictionary):
#         goodTokens.append(dictionary.token2id[k])
        
        

In [None]:
# print(len(dictionary))
# dictionary.filter_tokens(good_ids=goodTokens)
# print(len(dictionary))



In [None]:
print(len(dictionary))
print(len(gDictionary))

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in XX]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import LdaModel, CoherenceModel
from gensim import corpora  

dirichlet_dict = globaldictionary
bow_corpus = [dirichlet_dict.doc2bow(text) for text in XX]

# Considering 1-15 topics, as the last is cut off
num_topics = list(range(31)[1:])
num_keywords = 30

LDA_models = {}
LDA_topics = {}
for i in num_topics:
    print(i)
    LDA_models[i] = LdaModel(corpus=bow_corpus,
                             id2word=dirichlet_dict,
                             num_topics=i,
                             update_every=1,
                             chunksize=len(bow_corpus),
                             passes=20,
                             alpha='auto',
                             random_state=42)

    shown_topics = LDA_models[i].show_topics(num_topics=i, 
                                             num_words=num_keywords,
                                             formatted=False)
    LDA_topics[i] = [[word[0] for word in topic[1]] for topic in shown_topics]
    
LDA_stability = {}
for i in range(0, len(num_topics)-1):
    jaccard_sims = []
    for t1, topic1 in enumerate(LDA_topics[num_topics[i]]): # pylint: disable=unused-variable
        sims = []
        for t2, topic2 in enumerate(LDA_topics[num_topics[i+1]]): # pylint: disable=unused-variable
            sims.append(jaccard_similarity(topic1, topic2))    
        
        jaccard_sims.append(sims)    
    
    LDA_stability[num_topics[i]] = jaccard_sims
                
mean_stabilities = [np.array(LDA_stability[i]).mean() for i in num_topics[:-1]]

coherences = [CoherenceModel(model=LDA_models[i], texts=corpus, dictionary=dirichlet_dict, coherence='c_v').get_coherence()\
              for i in num_topics[:-1]]

coh_sta_diffs = [coherences[i] - mean_stabilities[i] for i in range(num_keywords)[:-1]] # limit topic numbers to the number of keywords
coh_sta_max = max(coh_sta_diffs)
coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max]
ideal_topic_num_index = coh_sta_max_idxs[0] # choose less topics in case there's more than one max
ideal_topic_num = num_topics[ideal_topic_num_index]

plt.figure(figsize=(20,10))
ax = sns.lineplot(x=num_topics[:-1], y=mean_stabilities, label='Average Topic Overlap')
ax = sns.lineplot(x=num_topics[:-1], y=coherences, label='Topic Coherence')

ax.axvline(x=ideal_topic_num, label='Ideal Number of Topics', color='black')
ax.axvspan(xmin=ideal_topic_num - 1, xmax=ideal_topic_num + 1, alpha=0.5, facecolor='grey')

y_max = max(max(mean_stabilities), max(coherences)) + (0.10 * max(max(mean_stabilities), max(coherences)))
ax.set_ylim([0, y_max])
ax.set_xlim([1, num_topics[-1]-1])
                
ax.axes.set_title('Model Metrics per Number of Topics', fontsize=25)
ax.set_ylabel('Metric Level', fontsize=20)
ax.set_xlabel('Number of Topics', fontsize=20)
plt.legend(fontsize=20)
plt.show()  
    

In [None]:
lda_model2 = gensim.models.LdaModel(corpus=bow_corpus,
                                   id2word=dictionary,
                                   num_topics=2,
                                   chunksize=len(bow_corpus),
                                   alpha='auto',
                                   eta='auto',
                                   offset=0.75,
                                   passes=10,
                                   random_state=42)


In [None]:
lda_model2.show_topic(0, len(dictionary))

In [None]:
baseVector = []
for n in range(0,len(gDictionary)):
    baseVector.append(0) 

In [None]:
index = 0

# import copy
# d2 = copy.deepcopy(d)
timesliceDictionaries = []
for n in range(0,2):
    for word in lda_model2.show_topic(n, len(dictionary)):
        if word[0] in gDictionary:
            gDictionary[word[0]] = word[1]
    timesliceDictionaries.append(copy.deepcopy(gDictionary))

    for k, v in gDictionary.items():
        gDictionary[k] = 0
    

In [None]:
#make base vector from global dictionary
#loop over number of topics chosen from the whole corpus and initlaize the a vector of vectors
#loop over number of topics from the timeslice and intialize in vector of vectors
#nested for loop with both vectors and compare cosine simularity
#graph each point in a graph with the overall having a different color than 

len(timesliceDictionaries)
        

In [None]:
corpusDictionaries = []
for n in range(0,10):
    for word in lda_model.show_topic(n, len(gDictionary)):
        if word[0] in gDictionary:
            gDictionary[word[0]] = word[1]
    corpusDictionaries.append(copy.deepcopy(gDictionary))

    for k, v in gDictionary.items():
        gDictionary[k] = 0

In [None]:
list(corpusDictionaries[0].values())

In [None]:
from scipy import spatial
import pprint

results = []
for topic in corpusDictionaries:
    rowResults = []
    for topic2 in timesliceDictionaries:
        rowResults.append(1-spatial.distance.cosine(list(topic.values()), list(topic2.values())))
        #print(spatial.distance.cosine(list(topic.values()), list(topic2.values())))
    results.append(rowResults)
    #print('\n')
    
pprint.pprint(results)

In [None]:
print(gDictionary)