In [None]:
import glob
import os
import pprint
import re
import sys
from operator import itemgetter

import gensim
import matplotlib.pyplot as plt
import nltk as nltk
import numpy as np
import pandas as pd
import pytest
import yake
from click.testing import CliRunner
from gensim import corpora
from gensim import models
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tika import parser # pip install tika

In [None]:
# myPath = os.path.dirname(os.path.abspath(__file__))
# sys.path.insert(0, myPath + '/../')
# print(myPath)

## Load PDFs and convert to Pandas Dataframe

In [None]:
directory = "practice_pdfs"
# glob returns matches in arbitrary, filesystem-dependent order, and later cells
# refer to documents by position (e.g. docs[0]); sorting keeps those positions
# stable between runs and machines.
files = sorted(glob.glob(os.path.join(directory,'*.*')))
print(files)
#https://stackoverflow.com/questions/34000914/how-to-create-a-list-from-filenames-in-a-user-specified-directory-in-python
#https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
#https://stackoverflow.com/questions/33912773/python-read-txt-files-into-a-dataframe

In [None]:
#https://stackoverflow.com/questions/34837707/how-to-extract-text-from-a-pdf-file

# Run the Tika parser over every listed file, keeping each result in listing order.
document_list = [parser.from_file(path) for path in files]

# print(document_list)


# raw = parser.from_file('CIR.0000000000000749.pdf')
# # print(raw['content'])
# # print(type(raw))
# print(raw.keys())
# metadata = raw["metadata"]
# content = raw["content"]
# # print(metadata)
# # print(content)
# print(type(content))



In [None]:
# Build a DataFrame from document_list (one row per entry); later cells read its 'content' column.
text_df = pd.DataFrame(document_list)
# text_df.head()
# print(text_df["content"][1])

## Extracting Keywords with TF-IDF and Python’s Scikit-Learn

In [None]:
#https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.X7RHltBKiUn

def pre_process(text):
    """Normalize raw document text for keyword extraction.

    Lowercases the text and collapses every run of digits and non-word
    characters into a single space.

    Args:
        text: Raw document text. Any non-string value (e.g. ``None`` or NaN)
            is treated as an empty document.

    Returns:
        str: The normalized text, or ``""`` when ``text`` is not a string.
    """
    # Guard against missing content: calling .lower() on None/NaN would raise.
    # (Missing values are assumed possible because a later cell calls dropna()
    # on the column derived from this text.)
    if not isinstance(text, str):
        return ""

    # lowercase
    text = text.lower()

    # The former "remove tags" step was re.sub("", "", text): an empty pattern
    # replaced with an empty string, which changes nothing. It is dropped; any
    # '<' / '>' characters are removed by the substitution below regardless.

    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)

    return text


# NOTE: this overwrites the 'content' column in place, so every later cell that reads
# text_df['content'] sees the normalized text rather than the original extraction.
text_df['content'] = text_df['content'].apply(lambda x:pre_process(x))

#show the second 'text' just for fun
text_df['content'][1]


In [None]:
def get_stop_words(stop_file_path):
    """Read a UTF-8 stop-word file into an immutable set.

    Every line of the file becomes one entry after surrounding whitespace is
    stripped.

    Args:
        stop_file_path: Path of the text file to read, one entry per line.

    Returns:
        frozenset: The stripped lines of the file.
    """
    with open(stop_file_path, 'r', encoding="utf-8") as handle:
        return frozenset(line.strip() for line in handle)

#load a set of stop words
# NOTE: this rebinds the name `stopwords`, replacing the `stopwords` object imported
# from nltk.corpus at the top of the notebook.
stopwords=get_stop_words("stop_words.txt")

In [None]:
#get the text column 
docs=text_df['content'].tolist()

#create a vocabulary of words, 
#ignore words that appear in 85% of documents, 
#eliminate stop words
#imit our vocabulary size to 10,000
cv=CountVectorizer(max_df=0.85,stop_words=stopwords,max_features=10000)
word_count_vector=cv.fit_transform(docs)

In [None]:
# Peek at ten of the terms in the fitted CountVectorizer's vocabulary.
list(cv.vocabulary_.keys())[:10]

In [None]:
# TfidfTransformer is imported with the other dependencies in the notebook's import cell.
# Fit the IDF weights on the term-count matrix produced by the CountVectorizer.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [None]:
def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, largest value first.

    Pairs with equal values are ordered by column index, also descending.

    Args:
        coo_matrix: Object exposing parallel ``col`` and ``data`` sequences.

    Returns:
        list: ``(col, value)`` tuples sorted in descending order.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=itemgetter(1, 0), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` scored items to their feature names.

    Args:
        feature_names: Sequence indexed by feature id, giving the term.
        sorted_items: Sequence of ``(feature id, score)`` pairs, already
            ordered by the caller.
        topn: Number of leading pairs to keep.

    Returns:
        dict: ``{feature name: score rounded to 3 decimals}`` in the order the
        pairs were given.
    """
    return {
        feature_names[feature_id]: round(weight, 3)
        for feature_id, weight in sorted_items[:topn]
    }

In [None]:
# you only need to do this once; this is a mapping of index to feature name
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0), so use whichever this version provides.
if hasattr(cv, "get_feature_names_out"):
    feature_names=cv.get_feature_names_out()
else:
    feature_names=cv.get_feature_names()

# get the document that we want to extract keywords from
doc=docs[0]

#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)

# now print the results
print("\n=====Doc=====")
print(doc)
print("\n===Keywords===")
for k, score in keywords.items():
    print(k,score)


## Topic Modeling using Gensim LDA Library on Stemmed Tokens

Topic modeling is a technique for taking unstructured text and automatically extracting its common themes; it is a great way to get a bird's-eye view of a large text collection.

Gensim (short for “Generate Similar”) is a popular open-source natural language processing library used for unsupervised topic modeling.

The Gensim library implements a popular topic-modeling algorithm, Latent Dirichlet Allocation (LDA). LDA requires documents to be represented as a bag of words (in the gensim library, some API calls shorten this to "bow"). This representation ignores word order in the document but retains information on how many times each word appears.

The main distinguishing feature of LDA is that it allows for mixed membership, which means that each document can partially belong to several different topics. Note that the word probabilities sum to 1 within every topic, but words with lower weights are often truncated from the output.

Text modified from: 
* <https://notebook.community/ethen8181/machine-learning/clustering/topic_model/LDA>
* <https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py>
* <https://www.tutorialspoint.com/gensim/index.htm>


In [None]:
##Pre-process the text by making all terms lower case, remove special characters and numbers
##Code from: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.X7RHltBKiUn

def pre_process(text):
    """Normalize document text before tokenization for topic modeling.

    Lowercases the text and collapses every run of digits and non-word
    characters into a single space.

    Args:
        text: Document text. Any non-string value (e.g. ``None`` or NaN) is
            treated as an empty document.

    Returns:
        str: The normalized text, or ``""`` when ``text`` is not a string.
    """
    # Guard against missing content: calling .lower() on None/NaN would raise.
    if not isinstance(text, str):
        return ""

    ##lowercase
    text = text.lower()

    ##remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)

    ## Optional alternatives, kept for reference (disabled):
    ## - keep periods and numbers:        text = re.sub('[^A-Za-z0-9.]+|\\s', ' ', text)
    ## - remove e-mail addresses:          text = re.sub('\\S*@\\S*\\s?', '', text)
    ## - collapse whitespace / new lines:  text = re.sub('\\s+', ' ', text)
    ## - remove single quotes:             text = re.sub("\\'", "", text)
    ## These would have to run BEFORE the substitution above to have any effect,
    ## because that step already removes every non-word character.

    return text


# Store the normalized text in a separate 'preprocess' column. The input here is
# text_df['content'], which an earlier cell already overwrote with normalized text.
text_df['preprocess'] = text_df['content'].apply(lambda x:pre_process(x))

# list(text_df.columns)
#show the second 'text' just for fun
# text_df['preprocess'][1]


In [None]:
##Then break the document text into tokens, remove the stopwords, and stem the tokens
##Code from: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.X7RHltBKiUn
doc_token_list=[]

def tokenize_stem(documents, stop_file_path="stop_words.txt"):
    """Tokenize each document, drop stop words, and stem the remaining tokens.

    Results are appended to the module-level ``doc_token_list`` (one token list
    per document), which is also returned.

    Args:
        documents: pandas Series of document strings; missing values are
            skipped via ``dropna()``.
        stop_file_path: UTF-8 text file with one stop word per line.

    Returns:
        list: ``doc_token_list`` after one stemmed-token list has been appended
        for every non-missing document.
    """
    ##The better stemmer is the SnowballStemmer for English
    ##https://www.nltk.org/howto/stem.html
    sb_stemmer = SnowballStemmer("english")

    ##Stopword list comes from the Terrier pacakge with 733 words and another 86 custom terms:
    ##https://github.com/kavgan/stop-words/blob/master/terrier-stop.txt
    ##https://github.com/kavgan/stop-words/blob/master/minimal-stop.txt
    ##Other stopword list options can be reviewed here:
    ##https://medium.com/towards-artificial-intelligence/stop-the-stopwords-using-different-python-libraries-ffa6df941653
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stop_set = {line.strip() for line in f}

    for doc in documents.dropna():

        # Tokenize documents by splitting into words using NLTK's word_tokenize
        token_list = nltk.word_tokenize(doc)

        # Remove stop words from token_list.
        # Fix: the filter used to iterate over `doc` (the raw string), which yields
        # single characters; the length filter below then discarded all of them and
        # every document ended up with an empty token list.
        token_nostop_list = [token for token in token_list if token not in stop_set]

        # Use Snowball Stemmer to stem tokens to create more like-words;
        # only tokens longer than 3 characters are kept.
        token_stem_list = [sb_stemmer.stem(token) for token in token_nostop_list if len(token) > 3]

        #Append token_stem_list to the doc_token_list
        doc_token_list.append(token_stem_list)

    return doc_token_list

# Trailing semicolon: keep the notebook from echoing every token list as cell output.
tokenize_stem(text_df['preprocess']);
# print(type(doc_token_list))
# print(doc_token_list[1])


In [None]:
##Run the gensim topic modeling and return the topics
##Code from: https://notebook.community/ethen8181/machine-learning/clustering/topic_model/LDA

def get_gensim_corpus_dictionary(data, no_below=2, no_above=1.0):
    """Build the gensim dictionary and bag-of-words corpus for tokenized documents.

    Args:
        data: List of documents, each a list of string tokens.
        no_below: Keep only tokens that appear in at least this many documents.
        no_above: Keep only tokens that appear in at most this fraction of the
            documents. The default of 1.0 disables the upper bound.

    Returns:
        tuple: ``(bow_corpus, dictionary)`` where ``bow_corpus`` holds one
        ``doc2bow`` result per document and ``dictionary`` is the filtered
        ``corpora.Dictionary``.
    """
    ##Build the id2word dictionary and the corpus
    ##The dictionary associates each word in the corpus with a unique integer ID
    dictionary = corpora.Dictionary(data)
    print('Number of unique tokens: ', len(dictionary))

    ## Filter out words that appear in fewer than `no_below` documents.
    ## Fix: filter_extremes() has its own default no_above=0.5, so calling it with only
    ## no_below ALSO dropped every token present in more than half of the documents;
    ## on a small corpus (e.g. 3 documents) that leaves no token satisfying both
    ## bounds. no_above is now passed explicitly (1.0 = no upper bound).
    ## (filter_extremes' default keep_n cap on vocabulary size still applies.)
    dictionary.filter_extremes(no_below = no_below, no_above = no_above)

    # Remove gaps in id sequence after words that were removed
    dictionary.compactify()
    print('Number of unique tokens appearing in at least', no_below, 'documents: ', len(dictionary))

    ##Use code below to print terms in dictionary with their IDs
    #print("Dictionary Tokens with ID: ")
    #pprint.pprint(dictionary.token2id)

    ##Map terms in corpus to words in dictionary with ID
    ##Each entry is (token id, number of times the token occurs in that document)
    bow_corpus = [dictionary.doc2bow(text) for text in data]

    ##Print the word counts of one sample document: the third one when available,
    ##otherwise the last one, so short corpora no longer raise IndexError.
    if bow_corpus:
        sample_index = min(2, len(bow_corpus) - 1)
        sample_counts = [(dictionary[token_id], count) for token_id, count in bow_corpus[sample_index]]
        print("Word Count in each Vector: ")
        pprint.pprint(sample_counts)

    return bow_corpus, dictionary




bow_corpus, dictionary = get_gensim_corpus_dictionary(doc_token_list)



In [None]:
## Run the Gensim Library LDA Model
## See link below if you want to save and load a model
## https://notebook.community/ethen8181/machine-learning/clustering/topic_model/LDA

def run_gensim_LDA_model(corpus, dictionary, texts=None):
    """Train (or load a cached) LDA model and report perplexity and coherence.

    Args:
        corpus: Bag-of-words corpus (list of ``doc2bow`` results).
        dictionary: The gensim dictionary used to build ``corpus``.
        texts: Optional tokenized documents (list of token lists). When given,
            'c_v' coherence is computed from them; otherwise 'u_mass' coherence
            is computed from ``corpus``.

    Returns:
        list: ``show_topics(num_words=10, formatted=False)`` output, i.e. one
        ``(topic id, [(word, probability), ...])`` tuple per topic.
    """
    ##Directory for storing all lda models
    model_dir = 'lda_checkpoint'
    os.makedirs(model_dir, exist_ok=True)

    path = os.path.join(model_dir, 'topic_model.lda')
    if os.path.isfile(path):
        ##Load the model if we've already trained it before.
        ##NOTE: the cached model is reused even if `corpus`/`dictionary` have changed
        ##since it was saved; delete the file to force retraining.
        topic_model = LdaModel.load(path)
    else:
        ##Training LDA can take some time, we could set eval_every = None to not evaluate the model perplexity
        ##Other parameters for LdaModel, include: random_state=100, update_every=1,chunksize=100,passes=10,alpha='auto',per_word_topics=True
        topic_model = LdaModel(corpus, id2word = dictionary, num_topics = 3, iterations = 200)
        topic_model.save(path)

    # Each element of the list is a tuple containing the topic and word / probability list
    topics = topic_model.show_topics(num_words = 10, formatted = False)

    ##Model perplexity and topic coherence provide a convenient measure to judge how good a given topic model is.
    #https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

    ## log_perplexity returns the per-word likelihood bound; perplexity is 2**(-bound),
    ## so a higher (less negative) value printed here means lower perplexity.
    print('\nPerplexity: ', topic_model.log_perplexity(corpus))

    ## Compute Coherence Score
    ## Fix: 'c_v' coherence needs the tokenized documents; the BoW corpus (lists of
    ## (id, count) tuples) used to be passed as `texts`, which is not valid input.
    if texts is not None:
        coherence_model_lda = CoherenceModel(model=topic_model, texts=texts, dictionary=dictionary, coherence='c_v')
    else:
        # Without tokenized texts, fall back to 'u_mass', which works from the BoW corpus.
        coherence_model_lda = CoherenceModel(model=topic_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    return topics

# Keep the returned topics: the CSV-export cell below reads `topics`.
topics = run_gensim_LDA_model(bow_corpus, dictionary, texts=doc_token_list)

In [None]:
# Save topics to CSV

def create_topic_CSV(topics, path="topic_modeling.csv"):
    """Write topics to a CSV file with 'TopicNum' and 'Terms' columns.

    Args:
        topics: Sequence of two-element records (topic number, terms).
        path: Destination file; defaults to ``topic_modeling.csv`` in the
            current working directory.

    Returns:
        None
    """
    ##Create dataframe for topics
    df_topics = pd.DataFrame(topics, columns = ['TopicNum', 'Terms'])

    ## Save dataframe to csv
    ## Hand pandas the path rather than an already-open text handle: the handle was
    ## opened without newline='', which can produce blank rows on Windows, and the
    ## explicit close() inside the with-block was redundant.
    df_topics.to_csv(path, encoding='utf-8')
    
create_topic_CSV(topics)

In [None]:
## Run the Gensim Library TFIDF Model
## TF-IDF down-weights terms that occur in many documents of the corpus and
## up-weights terms that are frequent in one document but rare elsewhere.
##https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html#sphx-glr-auto-examples-core-run-core-concepts-py

tfidf_frequency = []

def run_gensim_tfidf_model(corpus, dictionary):
    """Fit a gensim TF-IDF model on ``corpus`` and print one document's weights.

    The TF-IDF vector of every document is appended to the module-level
    ``tfidf_frequency`` list.

    Args:
        corpus: Bag-of-words corpus (list of ``doc2bow`` results).
        dictionary: The gensim dictionary, used to map token ids back to words.

    Returns:
        None
    """
    ##Initialize the tf-idf model, training it on our corpus
    tfidf = models.TfidfModel(corpus)

    ##if working with a new document, you can get tfidf from the model, e.g.
    ##tfidf[dictionary.doc2bow("abbott bra adolesc".lower().split())]

    tfidf_frequency.extend(tfidf[corpus])

    ##Print the weighted words of one sample document: the third one when available,
    ##otherwise the last one, so short corpora no longer raise IndexError.
    if tfidf_frequency:
        sample_index = min(2, len(tfidf_frequency) - 1)
        sample_weights = [(dictionary[token_id], weight) for token_id, weight in tfidf_frequency[sample_index]]
        print("Word Frequency by Vector: ")
        pprint.pprint(sample_weights)

run_gensim_tfidf_model(bow_corpus, dictionary)

#pprint.pprint(tfidf_frequency)
    

## Topic Modeling using Gensim LDA Library on Lemmatized Tokens

In [None]:
##Pre-process the text by making all terms lower case, remove special characters and numbers
##Code from: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.X7RHltBKiUn

data_pre_process = []

def preprocess_data(data):
    """Normalize each text in ``data`` and collect the results.

    Per text: lowercase it, delete e-mail-like tokens (anything containing
    '@', plus one trailing whitespace character), then replace every run of
    digits / non-word characters with a single space.

    Args:
        data: Iterable of strings.

    Returns:
        list: The module-level ``data_pre_process`` list, extended with one
        normalized string per input text.
    """
    def _normalize(raw_text):
        lowered = raw_text.lower()
        without_emails = re.sub('\\S*@\\S*\\s?', '', lowered)
        return re.sub("(\\d|\\W)+", " ", without_emails)

    data_pre_process.extend(_normalize(entry) for entry in data)
    return data_pre_process

# `data` is not defined in any visible earlier cell, so the call raised NameError on a
# fresh kernel. NOTE(review): the extracted document text is assumed to be the
# intended input — confirm. That column was already normalized in place by an
# earlier cell, so the e-mail removal step has little left to match.
data = text_df['content'].dropna().tolist()
preprocess_data(data)
print(data_pre_process[1])

In [None]:
data_words = []

def tokenize(documents):
    """Tokenize every document with gensim's ``simple_preprocess``.

    Each document is converted with ``str()`` first and tokenized with
    ``deacc=True``; the resulting token lists are appended to the module-level
    ``data_words`` list.

    Args:
        documents: Iterable of documents.

    Returns:
        list: ``data_words`` after one token list per document has been added.
    """
    data_words.extend(
        gensim.utils.simple_preprocess(str(text), deacc=True) for text in documents
    )
    return data_words


tokenize(data_pre_process)
# print(type(data_words))
print(data_words[1])

In [None]:
def built_bigram_trigram_models(documents):
    """Train bigram and trigram phrase models on tokenized documents.

    Args:
        documents: List of documents, each a list of string tokens.

    Returns:
        tuple: ``(bigram_mod, trigram_mod)``, the ``Phraser`` wrappers built
        from the trained bigram and trigram ``Phrases`` models.
    """
    ##Building Bigram & Trigram Models
    ##higher threshold fewer phrases.
    bigram = gensim.models.Phrases(documents, min_count=5, threshold=100)
    trigram = gensim.models.Phrases(bigram[documents], threshold=100)

    ##Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    ##See trigram example on the first document.
    ##Fix: this used to index the unrelated global string `doc` (doc[0] is a single
    ##character) instead of the first document passed to this function.
    if len(documents) > 0:
        print(trigram_mod[bigram_mod[documents[0]]])

    return bigram_mod, trigram_mod

bigram_mod, trigram_mod = built_bigram_trigram_models(data_words)
 

In [None]:
doc_no_stop_list = []

def remove_stop_words(documents):
    """Drop stop words from every tokenized document.

    Stop words are read from ``stop_words.txt`` (UTF-8, one entry per line,
    stripped of surrounding whitespace). The filtered token lists are appended
    to the module-level ``doc_no_stop_list``.

    Args:
        documents: Iterable of documents, each an iterable of tokens.

    Returns:
        list: ``doc_no_stop_list`` after one filtered token list per document
        has been added.
    """
    ##Stopword list comes from the Terrier pacakge with 733 words and another 86 custom terms:
    ##https://github.com/kavgan/stop-words/blob/master/terrier-stop.txt
    ##https://github.com/kavgan/stop-words/blob/master/minimal-stop.txt
    ##Other stopword list options can be reviewed here:
    ##https://medium.com/towards-artificial-intelligence/stop-the-stopwords-using-different-python-libraries-ffa6df941653
    with open("stop_words.txt", 'r', encoding="utf-8") as handle:
        stop_set = {line.strip() for line in handle}

    doc_no_stop_list.extend(
        [token for token in tokens if token not in stop_set]
        for tokens in documents
    )
    return doc_no_stop_list
            
# Filter the tokens produced in THIS section (data_words), which are also what the
# bigram/trigram models above were trained on. The call used to pass doc_token_list,
# the stemmed tokens from the previous section. NOTE(review): confirm this is the
# intended input. The trailing semicolon stops the notebook echoing every token list.
remove_stop_words(data_words);


In [None]:
bigram_list = []

def make_bigrams(documents, bigram_mod):
    """Index ``bigram_mod`` with each document and collect the results.

    The results go into a new local list; the module-level ``bigram_list`` is
    not modified by this function.

    Args:
        documents: Iterable of documents (token lists).
        bigram_mod: Object supporting ``bigram_mod[document]``.

    Returns:
        list: One ``bigram_mod[document]`` result per input document.
    """
    merged_docs = []
    for tokens in documents:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

# make_bigrams returns a new list and does not fill the module-level bigram_list,
# so the result must be captured here; previously it was discarded.
bigram_list = make_bigrams(doc_no_stop_list, bigram_mod)

In [None]:
def make_trigrams(texts):
    """Apply the module-level ``bigram_mod`` and then ``trigram_mod`` to each document.

    Args:
        texts: Iterable of documents (token lists).

    Returns:
        list: One ``trigram_mod[bigram_mod[document]]`` result per document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(trigram_mod[bigram_mod[tokens]])
    return merged_docs