# Include-NLP-Functions
Define NLP functions used in the other notebooks

In [2]:
import gensim # used for the Mallet LDA model
from gensim.corpora import Dictionary, MmCorpus
#from gensim import corpora, models, similarities
from gensim import corpora, models
#from gensim.similarities import Similarity
from gensim.models.phrases import Phrases, Phraser

from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors
from gensim.models.word2vec import LineSentence # use when reading sentences from large files
from gensim.models import TfidfModel

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# For pre-trained GloVe word2vec models
## See https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
from gensim.scripts.glove2word2vec import glove2word2vec

# For pre-trained FastText word2vec models
## To be implemented
## See https://datascience.stackexchange.com/questions/20071/how-do-i-load-fasttext-pretrained-model-with-gensim
#from gensim.models.wrappers import FastText
#model = FastText.load_fasttext_format('wiki.simple')
#print(model.most_similar('teacher'))
# Output = [('headteacher', 0.8075869083404541), ('schoolteacher', 0.7955552339553833), ('teachers', 0.733420729637146), ('teaches', 0.6839243173599243), ('meacher', 0.6825737357139587), ('teach', 0.6285147070884705), ('taught', 0.6244685649871826), ('teaching', 0.6199781894683838), ('schoolmaster', 0.6037642955780029), ('lessons', 0.5812176465988159)]
#print(model.similarity('teacher', 'teaches'))
# Output = 0.683924396754


# From https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

## Simple corpus-building functions

In [2]:
# Build a corpus from a list of strings
# Output is a list containing lists of tokens

def build_corpus_from_strings(listOfStrings, remove_words=[]):
    """Build a tokenized corpus from an iterable of raw text strings.

    Each string has its punctuation removed, is lower-cased, and is then
    split on whitespace. Tokens found in ``remove_words`` are dropped.

    Parameters:
        listOfStrings: iterable of text strings, one per document.
        remove_words: collection of tokens to exclude (never mutated here).

    Returns:
        A list of documents, each document being a list of tokens.
    """
    # The translation table is the same for every document, so build it once.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    corpus = []
    for text in listOfStrings:
        normalized = text.translate(strip_punctuation).lower()
        corpus.append([token for token in normalized.split() if token not in remove_words])

    return corpus

In [3]:
# Given the text occurring in text_field, returns a tokenized corpus
# with a list of documents, each document itself a list of tokens.
#### NOTE: NO CLEANING ####
def build_corpus(dataFrame, text_field, remove_words=[]):
    """Tokenize the texts stored under ``text_field`` without any cleaning.

    Each text is split on whitespace only; tokens present in ``remove_words``
    are left out.

    Returns:
        A list of documents, each document being a list of tokens.
    """
    def _split_and_filter(raw_text):
        return [word for word in raw_text.split() if word not in remove_words]

    return [_split_and_filter(raw_text) for raw_text in dataFrame[text_field]]

In [4]:
# Given tokenized lists, build it into one list -- i.e., a corpus
#### NOTE: The output will be a list of documents where each document is a list of tokens ####
#### Flatten it as needed to build a single list of tokens
def build_corpus_from_tokenized_lists(dataFrame, tokenized_text_field):
    """Collect already-tokenized documents from a field into a plain list.

    The documents are not flattened: the result is a list in which every
    element is one document (itself a list of tokens). The elapsed time is
    printed.
    """
    started = time.time()
    documents = dataFrame[tokenized_text_field][0:]
    # chain() over a single iterable simply walks that iterable.
    corpus = list(itertools.chain(documents))
    elapsed = round(time.time() - started, 1)
    print("Execution time = {} seconds.".format(elapsed))
    return corpus

In [5]:
# The best way to create a corpus from a list of text strings, each text string being a document in the corpus
## Depends on the prep_doc function below
# Create a corpus using documents as input and the prep_doc function below
# A document is a text string, e.g. 'The fox jumped over the hen.'
def build_prepped_corpus(dataFrame, text_field):
    """Build a corpus by running ``prep_doc`` on every text in ``text_field``.

    Parameters:
        dataFrame: mapping/dataframe holding the documents.
        text_field: key/column whose values are the raw text strings.

    Returns:
        A list with one entry per document, each entry being whatever
        ``prep_doc`` returns for that document.

    NOTE(review): ``prep_doc`` must be defined before this is called; in this
    notebook its definition lives in a commented-out cell -- confirm it is
    available.
    """
    # Previously the list of prepped documents was appended to an outer list,
    # yielding a single-element corpus [[doc1, doc2, ...]]; return the
    # documents themselves instead.
    return [prep_doc(text_string) for text_string in dataFrame[text_field]]

In [6]:
#### LATEST - USE THIS ####
# Build a text corpus (a list of tokenized lists of words) given 
## a dataframe and a text field in that dataframe
def build_text_corpus(dataFrame, text_field):
    """Build a cleaned, tokenized corpus from a text field.

    Parameters:
        dataFrame: mapping/dataframe holding the documents.
        text_field: key/column whose values are the raw text strings.

    Returns:
        A list with one ``clean_doc`` result (a list of tokens) per document.
        Progress and timing messages are printed along the way.
    """
    t0 = time.time()
    # Get the text field as a list of documents
    docs = list(dataFrame[text_field])
    t1 = time.time()
    print("Documents pulled from dataframe in {} seconds.".format(round(t1 - t0, 2)))

    # Clean each document with the default clean_doc settings
    t2 = time.time()
    print("Starting to clean documents...this can take a while....patience...")
    text_corpus = [clean_doc(doc) for doc in docs]
    t3 = time.time()
    # The precision belongs inside round(); it used to be passed to format().
    print("Documents tokenized and cleaned in {} seconds.".format(round(t3 - t2, 2)))

    return text_corpus

In [7]:
# Get the descriptive stats on number of words in each document in the corpus
def get_corpus_stats(text_corpus):
    """Print and return word-count statistics for a tokenized corpus.

    Parameters:
        text_corpus: list of documents; each document is a list of tokens.

    Returns:
        ``[mean, std_dev, max, min]`` of the per-document token counts, with
        mean and standard deviation rounded to zero decimals.
    """
    t0 = time.time()
    num_docs = len(text_corpus)
    doc_lengths = [len(doc) for doc in text_corpus]
    # Total token count is the sum of document lengths; no need to build a
    # flattened copy of the whole corpus just to measure it.
    total_words = sum(doc_lengths)
    mean_word_count = round(np.mean(doc_lengths), 0)
    std_dev_word_count = round(np.std(doc_lengths), 0)
    max_word_count = np.max(doc_lengths)
    min_word_count = np.min(doc_lengths)

    print("Number of documents in the corpus = {}".format(num_docs))
    print("Total number of words in the corpus = {}.".format(total_words))
    print("Average number of words per document = {}.".format(mean_word_count))
    print("Std Dev of words per document = {}.".format(std_dev_word_count))
    print("Largest document has {} words.".format(max_word_count))
    print("Smallest document has {} words.".format(min_word_count))
    t1 = time.time()
    print("Execution time = {} seconds.".format(round(t1 - t0, 3)))
    print("[Mean Word Count, Std Word Count, Max Word Count, Min Word Count]")
    return [mean_word_count, std_dev_word_count, max_word_count, min_word_count]


## Document cleaning functions

Jason Brownlee has a good [tutorial](https://machinelearningmastery.com/clean-text-machine-learning-python/) on cleaning text using plain Python or using NLTK.

 - Tokenize the document
 - Convert the tokens to lower case
 - Remove punctuation and clean up empty strings
 - Remove hex sequences and clean up empty strings
 - Optional cleaning
  - Remove standard English stop words
  - Remove additional words (given a list of these words)
  - Remove numbers
  - Remove additional characters (given a regex)
  - Lemmatize the tokens

In [8]:
# Text processing packages
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [9]:
# Tokenize a string and remove all whitespaces
def tokenize_text(text):
    """Tokenize ``text`` with the NLTK word tokenizer.

    ``text`` may contain punctuation, numerals, non-ASCII characters, etc.
    If tokenizing raises ``TypeError`` or ``AttributeError`` an empty list is
    returned instead.
    """
    try:
        return nltk.word_tokenize(text)
    except (TypeError, AttributeError):
        # Unusable input is treated as an empty document.
        return []

In [10]:
# Remove empty strings from a tokenized list of strings (utility function)
def remove_empty_tokens(doc):
    """Return the tokens of ``doc`` that are truthy (drops empty strings)."""
    kept = []
    for token in doc:
        # Empty strings are falsy, so a plain truth test filters them out.
        if token:
            kept.append(token)
    return kept

In [11]:
# Convert the tokens into lower case
def lower_case(doc):
    """Return a new list with every token of ``doc`` lower-cased."""
    lowered = []
    for token in doc:
        lowered.append(token.lower())
    return lowered

In [12]:
#### DEPRECATED ####
# Remove punctuation
def remove_punctuation(doc):
    """Strip every ``string.punctuation`` character from each token (deprecated).

    Parameters:
        doc: tokenized list of strings.

    Returns:
        The stripped tokens, with tokens that became empty removed.
    """
    # Escape the punctuation before putting it in a character class: unescaped,
    # the backslash in string.punctuation escapes the following ']' and is
    # itself never matched, and the bare '[' reads as a nested set.
    punctuation_class = '[' + re.escape(string.punctuation) + ']'
    tokens = [re.sub(punctuation_class, r'', token) for token in doc]

    # Drop the tokens that are empty once punctuation has been removed
    return [token for token in tokens if token]

In [13]:
# A better way to remove punctuation
def remove_punct(doc, extent='full'):
    """Remove punctuation characters from every token in ``doc``.

    Parameters:
        doc: tokenized list of strings.
        extent: ``'select'`` removes only the reduced character list defined
            below; ``'full'`` or any other value removes everything in
            ``string.punctuation``.

    Returns:
        The stripped tokens, with tokens that became empty removed.
    """
    # Reduced list used when extent == 'select'
    punct_list = '!"()\',.:;?<>[]$`{}'

    # Anything other than 'select' falls back to the full punctuation set.
    unwanted = punct_list if extent == 'select' else string.punctuation

    stripped = []
    for token in doc:
        stripped.append(''.join(ch for ch in token if ch not in unwanted))

    return remove_empty_tokens(stripped)

In [14]:
# Remove hex sequences
def remove_hex(doc):
    """Delete every character outside the ASCII range from each token.

    Tokens left empty by the removal are dropped from the result.
    """
    ascii_only = []
    for token in doc:
        ascii_only.append(re.sub(r'[^\x00-\x7F]', r'', token))
    return remove_empty_tokens(ascii_only)

In [15]:
# Remove stopwords
def remove_stopwords(doc):
    """Drop NLTK English stop words from a tokenized document.

    Example input: ['the', 'little', 'brown', 'fox', 'jumped', 'over'].
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in doc:
        if token not in english_stops:
            kept.append(token)
    return kept

In [16]:
# Remove additional words
def remove_words(doc, word_list=[]):
    """Return the tokens of ``doc`` that do not appear in ``word_list``."""
    kept = []
    for token in doc:
        if token in word_list:
            continue
        kept.append(token)
    return kept

In [17]:
# Remove sequences of numerals that occur as separate tokens
## This will remove '1234' but not 'dec1234'
def remove_numbers(doc):
    """Remove digit runs that are delimited by word boundaries in each token.

    A token such as '1234' is emptied (and then dropped), while 'dec1234' is
    left untouched because the digits are not bounded on both sides.
    """
    without_numbers = []
    for token in doc:
        without_numbers.append(re.sub(r'\b\d+\b', r'', token))
    return remove_empty_tokens(without_numbers)

In [18]:
#### TO DO - DOESN'T WORK YET ####
# Not sure how to pass the raw regex string pattern to the function
# Remove a select sequence of characters
def remove_custom_token(doc, regex_pattern):
    """Delete every match of ``regex_pattern`` from each token in ``doc``.

    Parameters:
        doc: list of tokenized strings.
        regex_pattern: pattern handed straight to ``re.sub``. When writing it
            as a literal, use a raw string (e.g. r'\\b\\d+\\b') so that
            backslashes reach the regex engine unchanged.

    Returns:
        The substituted tokens, with tokens that became empty removed.
    """
    substituted = []
    for token in doc:
        substituted.append(re.sub(regex_pattern, r'', token))
    return remove_empty_tokens(substituted)

In [19]:
# Lemmatize the tokens
def lemmatize(doc):
    """Lemmatize every token in ``doc`` with NLTK's ``WordNetLemmatizer``.

    Parameters:
        doc: list of tokenized strings.

    Returns:
        A list with the lemmatized form of each token, in the same order.
    """
    # Build the lemmatizer here: the module-level `wordnet_lemmatizer` is only
    # created in a commented-out cell, so relying on it raises NameError.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in doc]

In [20]:
# Use the functions above in sequence to tokenize and clean a document
def clean_doc(text, rem_punct=1, rem_hex=1, rem_nums=1, rem_stopwords=1):
    """Tokenize ``text`` and run the cleaning steps that are switched on.

    The text is always tokenized and lower-cased. Each optional step runs
    only when its flag equals 1, in this order: punctuation, non-ASCII
    characters, numbers, stop words.

    Returns:
        The resulting list of tokens.
    """
    tokens = lower_case(tokenize_text(text))

    # (flag, step) pairs, applied in order
    optional_steps = (
        (rem_punct, lambda toks: remove_punct(toks, extent='full')),
        (rem_hex, remove_hex),
        (rem_nums, remove_numbers),
        (rem_stopwords, remove_stopwords),
    )
    for flag, step in optional_steps:
        if flag == 1:
            tokens = step(tokens)

    return tokens

## Functions for building n-grams

In [21]:
# Build n-grams models by training the Gensim phraser on the corpus
## 2-grams model is built by providing the 1-gram corpus as input
## 3-grams model is built by providing the 2-gram corpus as input
## and so on...

def build_ngram_model(text_corpus_ngram, ngram_size, file_path):
    """Train a Gensim phrase model on a corpus, save it, and return it.

    Parameters:
        text_corpus_ngram: list of documents, each a list of tokens.
        ngram_size: the n of the n-gram; only used in messages and in the
            name of the saved file.
        file_path: directory in which the phraser is saved as
            'phraser_ngram_<ngram_size>'.

    Returns:
        The ``Phraser`` built from the trained ``Phrases`` model.
    """
    # Train the phrase detector on the corpus
    train_start = time.time()
    phrases_model = Phrases(text_corpus_ngram, min_count=1, threshold=1)
    train_end = time.time()
    trained_msg = "-gram phraser model trained in {} seconds.".format(round(train_end - train_start, 1))
    print(str(ngram_size) + trained_msg)

    # Wrap it in the lighter Phraser and persist that for later use
    frozen_phraser = Phraser(phrases_model)
    target = os.path.join(file_path, 'phraser_ngram_' + str(ngram_size))
    frozen_phraser.save(target)
    save_end = time.time()

    print("Efficient N-gram phraser model created and saved to {} in {} seconds.".format(target, round(save_end - train_end, 1)))
    print("Load the efficient phraser model using models.phrases.Phraser.load(" + target +")")

    return frozen_phraser

In [22]:
# For any document in the text corpus, apply an n-gram model 
## to get the n-gram version of that document
def apply_ngram_model(n_gram_model, document, ngram_size):
    """Apply an n-gram model to one document by indexing the model with it.

    Parameters:
        n_gram_model: object supporting ``model[document]`` (e.g. a phraser).
        document: the document to transform.
        ngram_size: not used; kept so existing callers keep working.

    Returns:
        ``n_gram_model[document]``.
    """
    # The timing values that used to be taken here were never reported, so
    # they have been removed.
    return n_gram_model[document]

In [23]:
# Build and apply the Gensim n-gram model to the text_corpus
## 2-grams model is built by providing the 1-gram text corpus as input
## 3-grams model is built by providing the 2-gram text corpus as input
## and so on...
def gensim_n_gram_corpus(text_corpus_n_gram, n_gram_size, file_path, file_name):
    """Train an n-gram model on a corpus, apply it, and pickle the result.

    Parameters:
        text_corpus_n_gram: corpus used both to train the model and as the
            input that gets transformed.
        n_gram_size: the n of the n-gram (used in messages and file naming).
        file_path: directory for the saved model and the pickled corpus.
        file_name: file name of the pickled corpus inside ``file_path``.

    Returns:
        The list of transformed documents.
    """
    phraser = build_ngram_model(text_corpus_n_gram, n_gram_size, file_path)

    # Run every document through the trained model
    apply_start = time.time()
    transformed = []
    for doc in text_corpus_n_gram:
        transformed.append(apply_ngram_model(phraser, doc, n_gram_size))
    apply_end = time.time()
    print("Created Gensim " + str(n_gram_size) + "-gram corpus in {} seconds.".format(round(apply_end - apply_start, 2)))

    # Persist the transformed corpus for the next step
    pickle_list(os.path.join(file_path, file_name), transformed)
    print("Saved the " + str(n_gram_size) + "-gram text corpus to {}".format(os.path.join(file_path, file_name)))

    return transformed

In [24]:
# For a given corpus, create the Gensim dictionary
def create_dictionary(corpus_text, ngram_size, save_location,
                      no_below=1, no_above=0.99, keep_n=100000):
    """Create, filter, compact and save a Gensim dictionary for a corpus.

    Parameters:
        corpus_text: an n-gram corpus (n >= 1): list of token lists.
        ngram_size: size of the n-grams in the corpus; used in messages only.
        save_location: path the dictionary is saved to, e.g.
            os.path.join(intermediate_dir_path, 'dictionary_ngram_size.dict').
        no_below: passed to ``Dictionary.filter_extremes``; tokens contained
            in fewer than this many documents are dropped.
        no_above: passed to ``Dictionary.filter_extremes``; tokens contained
            in more than this fraction (0-1, not a percentage) of the
            documents are dropped.
        keep_n: passed to ``Dictionary.filter_extremes``; after the two
            filters above, only this many of the most frequent tokens are
            kept (``None`` keeps all).

    Returns:
        The filtered dictionary. Reload it later with
        ``corpora.Dictionary.load(save_location)``.
    """
    t0 = time.time()
    print("Starting to create dictionary...")
    dic = Dictionary(corpus_text)
    t1 = time.time()
    print("N-gram size " + str(ngram_size) + " Dictionary created in {} seconds...".format(round(t1 - t0, 1)))
    print("The N-gram size {} dictionary is {:,} words long.".format(ngram_size, len(dic)))

    # Remove words that are very rare or too common from the dictionary
    dic.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    print("The filtered N-gram size {} dictionary is {:,} words long.".format(ngram_size, len(dic)))

    # Reassign integer ids (compactify) the filtered dictionary
    t2 = time.time()
    dic.compactify()
    # Save the dictionary for future use
    dic.save(save_location)
    t3 = time.time()
    print("N-gram size " + str(ngram_size) + " Dictionary compacted and saved in {} seconds...".format(round(t3 - t2, 1)))

    print("Load the dictionary using corpora.Dictionary.load(path_to_file)")

    return dic

In [25]:
# For a given corpus, use the dictionary to create a bag of words
def create_bag_of_words(corpus_text, dictionary, ngram_size, save_location):
    """Convert a corpus to bag-of-words form and serialize it to disk.

    Parameters:
        corpus_text: an n-gram corpus (n >= 1): list of token lists.
        dictionary: the matching dictionary; its ``doc2bow`` is applied to
            every document.
        ngram_size: size of the n-grams in the corpus; used in messages only.
        save_location: target path, e.g.
            os.path.join(intermediate_dir_path, 'bow_ngram_size.mm').

    Returns:
        The in-memory list of bag-of-words documents. The saved copy can be
        reloaded with ``corpora.MmCorpus(save_location)``.
    """
    print("Starting to create the Bag of Words...")
    build_start = time.time()
    bag = []
    for document in corpus_text:
        bag.append(dictionary.doc2bow(document))
    build_end = time.time()
    print("N-grams size {} bag of words created in {} seconds.".format(ngram_size, round(build_end - build_start, 1)))

    # Write the bag-of-words corpus to disk for later use
    save_start = time.time()
    corpora.MmCorpus.serialize(save_location, bag)
    save_end = time.time()
    print("N-grams size {} bag of words saved in {} seconds.".format(ngram_size, round(save_end - save_start, 1)))
    print("Load the BoW corpus using - corpora.MmCorpus(full_path_to_BoW_corpus_name)")

    return bag

In [26]:
# N-gram generation
## Another way to do n-grams without relying on Gensim's Phraser
# Generates a list of strings, each string consisting of n words
# From https://programminghistorian.org/lessons/keywords-in-context-using-n-grams#from-text-to-n-grams
def getNGrams(wordlist, n, join_charac = '_'):
    """Return every run of ``n`` consecutive tokens, joined into one string.

    ``wordlist`` must be a tokenized list, e.g.
    ['the', 'quick', 'brown', 'fox']; with n=2 and the default joiner this
    gives ['the_quick', 'quick_brown', 'brown_fox'].
    """
    window_starts = range(len(wordlist) - (n - 1))
    return [join_charac.join(wordlist[start:start + n]) for start in window_starts]

In [27]:
# N-gram generation -- NLTK style
# Modified from https://programminghistorian.org/lessons/keywords-in-context-using-n-grams#from-text-to-n-grams
# Generates a list of strings, each string consisting of 1, 2, ..., n words
# This is the way NLTK generates n-grams
def getNGramsFull(wordlist, n, join_charac = '_'):
    """Return joined 1..n-grams for every start position that fits a full n-gram.

    For each start index that still has ``n`` tokens ahead of it, the 1-gram,
    2-gram, ... n-gram beginning there are emitted in that order.
    ``wordlist`` must be a tokenized list of strings.
    """
    return [
        join_charac.join(wordlist[start:start + width])
        for start in range(len(wordlist) - (n - 1))
        for width in range(1, n + 1)
    ]

In [28]:
# Modified from http://www.albertauyeung.com/post/generating-ngrams-python/
def generate_ngrams(doc, n, join_charac = '_'):
    """Return the n-grams of ``doc`` as strings joined by ``join_charac``.

    ``doc`` is one document of a text corpus (a list of tokens). The n-grams
    are produced by zipping ``n`` progressively shifted copies of ``doc``.
    """
    shifted_views = [doc[offset:] for offset in range(n)]
    joined = []
    for gram in zip(*shifted_views):
        joined.append(join_charac.join(gram))
    return joined

In [29]:
# Generate simple n-grams and pickle the list for later use
def gen_pkl_ngrams(text_corpus, size, pkl_file_path_plus_name):
    """Generate simple n-grams for every document and pickle the result.

    Parameters:
        text_corpus: list of documents, each a list of tokens.
        size: the n of the n-grams passed to ``generate_ngrams``.
        pkl_file_path_plus_name: target handed to ``pickle_list``.

    Nothing is returned.
    """
    n_gram_corpus = []
    for doc in text_corpus:
        n_gram_corpus.append(generate_ngrams(doc, size))
    # Persist the n-gram corpus for the next step
    pickle_list(pkl_file_path_plus_name, n_gram_corpus)

In [30]:
# Generating n-grams at the character level
## Modified from https://bergvca.github.io/2017/10/14/super-fast-string-matching.html
def char_ngrams(string, size=3):
    """Return the character-level n-grams of a cleaned version of ``string``.

    The input is run through ``clean_doc`` and the resulting tokens are
    re-joined with single spaces; n-grams of ``size`` characters are then
    taken over that joined text.
    """
    cleaned_text = " ".join(clean_doc(string))
    shifted_views = [cleaned_text[offset:] for offset in range(size)]
    return [''.join(chars) for chars in zip(*shifted_views)]

In [31]:
#### DEPRECATED ####
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# import string

# # Translation table for removing punctuations
# table = str.maketrans('', '', string.punctuation)

# #### Stop words ####
# STOP_WORDS_0 = []
# #STOP_WORDS_1 = set(stopwords.words('english'))
# #### SET STOP_WORDS HERE ####
# STOP_WORDS = STOP_WORDS_0

# # Remove specific words that occur in the corpus
# REMOVE_WORDS_0 = [] # Use when you don't need to remove anything specific
# # Identified after an initial analysis of the corpus
# REMOVE_WORDS_1 = ['bc', 'at', 'asking', 'we', 'hoping', 'meeting', 'understand', 'inquiry', 
#                 'could', 'need', 'request', 'looking', 'v', 'u', 'etc', 'client', 'would', 
#                 'you', 'like', 'speak', 'schedule', 'call', 'analyst', 'discus', 'me', 'hi', 
#                 'hello', 'follow', 'up', 'set', 'question', 'thought', 'please', 'thank',
#                ]

# #### SET REMOVE_WORDS HERE ####
# REMOVE_WORDS = REMOVE_WORDS_0

# #### SET LEMMATIZE OFF/ON HERE ####
# LEMMATIZE = 1 # Set to 1 to turn on

# wordnet_lemmatizer = WordNetLemmatizer()

# def prep_doc(document, stop_words=STOP_WORDS_0, remove_words=REMOVE_WORDS_0, lemmatize=0):
#     '''
#     Following https://machinelearningmastery.com/clean-text-machine-learning-python/
    
#     1. Tokenize the entire document on whitespace.
#     2. Remove punctuation.
#     3. Normalize case.
#     4. Remove stopwords.
#     5. Lemmatize
#     6. Clean up the remaining items -- non-ASCII characters, empty strings, specific words, numbers
#     '''
#     # Function defaults are set conservatively - just remove English stop words
    
#     # Tokenize
#     #tokens = nltk.word_tokenize(document)
#     # Handle NULL documents
#     try:
#         tokens = document.split()
#     except AttributeError:
#         return ['EMPTY']
    
#     # Strip all punctuations
#     stripped = [token.translate(table) for token in tokens]
    
#     # Normalize case
#     normalized = [strip.lower() for strip in stripped]
    
#     # Remove stopwords
#     stopped = [norm for norm in normalized if not norm in stop_words]
    
#     # Lemmatize
#     if lemmatize == 1:
#         lemmatized = [wordnet_lemmatizer.lemmatize(stop) for stop in stopped]
#     else:
#         lemmatized = stopped
    
#     # Remove non-ASCII tokens (e.g., '\x96')
#     asciied = [re.sub(r'[^\x00-\x7F]', r'', lem) for lem in lemmatized]
    
#     # Remove empty tokens ''
#     # Empty strings have truth value FALSE; hence non-empty strings are TRUE
#     # https://stackoverflow.com/questions/9573244/most-elegant-way-to-check-if-the-string-is-empty-in-python
#     misc = [asc for asc in asciied if asc]
    
#     # remove strings that are numerals
#     cleaned = [mis for mis in misc if mis.isdigit() == False]
    
#     final = [clean for clean in cleaned if not clean in remove_words]
    
#     return final

## Simple text functions

In [32]:
# Frequency counts of words in a corpus from the top of the list to a specific point
## or from any specific point in the list all the way to the bottom of the list
# Calculate the frequency of occurrence of words
from collections import defaultdict
import operator

def word_freq(token_list, num_results=30, list_type='corpus'):
    """Count token occurrences and return them sorted by descending frequency.

    Parameters:
        token_list: a corpus (list of token lists) or a flat list of tokens.
        num_results: > 0 returns that many entries from the top of the
            ranking, < 0 returns that many from the bottom, 0 returns the
            whole vocabulary.
        list_type: 'corpus' (the input is flattened with ``flatten_list``
            first) or 'simple list' (used as is).

    Returns:
        A list of ``(token, count)`` tuples, or an explanatory string when
        ``list_type`` is neither of the accepted values.
    """
    if list_type == 'corpus':
        words = flatten_list(token_list)
    elif list_type == 'simple list':
        words = token_list
    else:
        return 'Sorry, the list has to be a simple list or a corpus. Try again.'

    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1

    # Rank once; every return path is just a different slice of the ranking.
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    if num_results > 0:
        return ranked[0:num_results]
    if num_results < 0:
        return ranked[num_results:]
    return ranked

In [33]:
# Return words that occur between 2 frequency values
def word_freq_between(token_list, greater_than=200, less_than=500, list_type='corpus'):
    """Return the ``(token, count)`` pairs whose count is in a frequency band.

    A pair is kept when ``greater_than <= count < less_than``. The full
    vocabulary ranking is obtained from ``word_freq`` and its order is kept.
    """
    ranked_vocabulary = word_freq(token_list, 0, list_type)

    selected = []
    for entry in ranked_vocabulary:
        if entry[1] >= greater_than and entry[1] < less_than:
            selected.append((entry[0], entry[1]))
    return selected

## Search functions that don't require any NLP model

In [34]:
# Very simple search function
## Given a document and a word in that document, find all occurrences of the word
## Then return the word along with its n neighbors on either side of it

# If the corpus is small, it can be turned into a single document -- i.e.,
## a single list of words.

def get_word_context(document, word, context_type='full', context_window=3, include_word=False):
    """Return the neighbouring tokens of every occurrence of ``word``.

    Parameters:
        document: list of tokens, e.g. ['The', 'grey', 'fox', 'jumped'].
        word: the token whose occurrences are looked up.
        context_type: 'left' for the tokens before each occurrence, 'right'
            for the tokens after it; 'full' or any other value gives both
            sides.
        context_window: maximum number of tokens taken on each side; the
            window is clipped at the start and end of the document.
        include_word: only affects the 'full' context. When False, every
            token equal to ``word`` is removed from the window.

    Returns:
        A list with one token list per occurrence (empty if no occurrence).
    """
    total_tokens = len(document)

    around_all = []   # both sides of each occurrence
    before_all = []   # tokens to the left of each occurrence
    after_all = []    # tokens to the right of each occurrence

    for position, token in enumerate(document):
        if token == word:
            # Clip the window to the document boundaries
            start = max(0, position - context_window)
            stop = min(total_tokens, position + context_window + 1)

            around = document[start:stop]
            if not include_word:
                around = [tok for tok in around if tok != word]

            around_all.append(around)
            before_all.append(document[start:position])
            after_all.append(document[position + 1:stop])

    if context_type == 'left':
        return before_all
    if context_type == 'right':
        return after_all
    # 'full' and any unrecognised value both give the two-sided context
    return around_all

In [35]:
# Get the context for each document in the corpus for a given word
def get_word_context_corpus(corpus, word, context_type='full', context_window=3, include_word=True):
    """Collect the context of ``word`` for every document in a corpus.

    Parameters:
        corpus: list of documents, i.e. a list of token lists.
        word, context_type, context_window, include_word: forwarded to
            ``get_word_context`` for each document.

    Returns:
        A DataFrame with columns 'Note Number' (position of the document in
        ``corpus``) and 'Context' (the contexts found in that document).
        Documents whose context is reported empty by ``isListEmpty`` are left
        out. The number of matching documents and the total number of
        matches are printed.
    """
    # Raw context for every document, paired with the document's position
    per_document = [get_word_context(item, word, context_type, context_window, include_word)
                    for item in corpus]
    context_array = list(zip(range(len(corpus)), per_document))

    # Keep only the documents that actually produced some context
    context_array = [x for x in context_array if isListEmpty(x[1]) == False]

    num_notes_matched = len(context_array)

    # Count the contexts inside each (index, contexts) pair; taking len() of
    # the pair itself always gives 2.
    num_total_matches = sum(len(contexts) for _, contexts in context_array)

    print("Number of call note matches = {}".format(num_notes_matched))
    print("Total number of matches = {}".format(num_total_matches))
    return pd.DataFrame(context_array, columns=['Note Number', 'Context'])

In [36]:
# Very simple search function to get word context across the entire corpus
# This is for a corpus that's built out of individual files
def get_word_context_fileList(corpus, file_list, word, window=6):
    """Find the context of ``word`` in a corpus built from individual files.

    Parameters:
        corpus: list of documents (token lists), one per file.
        file_list: names associated with the documents, in the same order.
        word: the token to look up.
        window: number of tokens taken on each side of every occurrence.

    Returns:
        A DataFrame with columns 'Call Note' (the name from ``file_list``)
        and 'Matching Phrases' (the contexts of that document, each joined
        into a space-separated string). Documents without a match are
        omitted.
    """
    file_info = []
    doc_info = []

    for doc_index, document in enumerate(corpus):
        doc_name = file_list[doc_index]
        # Pass the window by keyword: positionally it would land in the
        # context_type slot and the window size would silently stay at its
        # default.
        context = get_word_context(document, word, context_window=window)
        # Only documents with at least one occurrence are reported
        if context:
            file_info.append(doc_name)
            doc_info.append([" ".join(item) for item in context])

    # Put it into a dataframe for display
    return pd.DataFrame({'Call Note': file_info, 'Matching Phrases': doc_info})

In [37]:
# Get all the rows of the dataframe that match the given key word or phrase 
## which can be a unigram, bigram, or trigram
def get_keyword_matches(dataFrameFull, field_name, key_word):
    """Return the rows whose ``field_name`` value contains ``key_word``.

    Parameters:
        dataFrameFull: DataFrame holding the token columns (unigram, bigram,
            trigram, ...).
        field_name: name of the column to search.
        key_word: value tested with ``in`` against each cell of the column.

    Returns:
        A DataFrame with the matching rows, in their original order and with
        their original index labels. The match count and elapsed time are
        printed.
    """
    t0 = time.time()
    # Work purely by position so that frames with a non-default index (e.g.
    # after filtering) are handled; mixing label lookup with .iloc is not.
    positions = [pos for pos, value in enumerate(dataFrameFull[field_name])
                 if key_word in value]
    matches = dataFrameFull.iloc[positions]

    t1 = time.time()
    print("Found {} matches in {} seconds".format(len(positions), round(t1 - t0)))

    return matches

In [38]:
def bag_of_words_match(text_to_check, search_phrase):
    """Check whether every token of ``search_phrase`` occurs in ``text_to_check``.

    Both inputs are treated as bags of words, so word order does not matter.

    Parameters:
        text_to_check: a clean tokenized list, e.g. a document of a cleaned
            unigram corpus.
        search_phrase: any string, e.g. 'the brown Fox, jumped Over, the hen';
            it is cleaned and tokenized with ``clean_doc`` first.

    Returns:
        ``text_to_check`` when all cleaned search tokens are present,
        otherwise the string "NO MATCH".
    """
    # Compare as sets: measuring the intersection against the length of the
    # token *list* made phrases with a repeated word impossible to match.
    wanted_tokens = set(clean_doc(search_phrase))

    if wanted_tokens.issubset(set(text_to_check)):
        return text_to_check
    return "NO MATCH"

In [39]:
# Get all the rows of the dataframe that match the given key word or phrase
## Only makes sense to use the unigram corpus 
def get_corpus_phrase_matches(corpus, search_phrase):
    """
    Find every document of the corpus that contains all the words of search_phrase.

    corpus is a cleaned unigram text corpus (a sequence of token lists)
    search_phrase can be any string, e.g., 'the brown Fox, jumped Over, the hen'

    Returns a dataframe with one row per matching document: its position in the
    corpus ('Note Number') and its tokens ('Content').
    """
    started = time.time()

    # Run bag_of_words_match on each document and keep (position, tokens)
    # for the ones that did not come back as the "NO MATCH" sentinel
    matches = []
    for note_number, document in enumerate(corpus):
        outcome = bag_of_words_match(document, search_phrase)
        if outcome != "NO MATCH":
            matches.append((note_number, outcome))

    finished = time.time()
    print("Phrase matched {} times in {} seconds".format(len(matches), round(finished-started, 2)))
    return pd.DataFrame(matches, columns=['Note Number', 'Content'])

## Create and Load Search Indices for TF-IDF Models

In [40]:
#### DISABLE FOR NOW - GENSIM INSTALL ISSUE ####
# Index a bow corpus using a TFIDF model
def create_search_index(model, index_path_and_name, bow, dictionary):
    """
    Index a bag-of-words corpus in the space of the given model (e.g., TF-IDF).

    model is the transformation applied to the corpus (used as model[bow])
    index_path_and_name is passed to gensim's Similarity as the index location/prefix
    bow is the bag-of-words corpus to index
    dictionary is the dictionary of the corpus; its length sets num_features

    Returns the gensim Similarity index.
    """
    # Imported here because the notebook-level import of `similarities` is
    # commented out; without it this function fails with a NameError.
    from gensim import similarities

    t0 = time.time()
    # transform corpus to the model's space and index it
    index = similarities.Similarity(index_path_and_name,
                                    model[bow],
                                    num_features=len(dictionary)
                                   )
    t1 = time.time()
    print("Index created in {} seconds.".format(round(t1-t0, 2)))

    return index

In [41]:
#### DISABLE FOR NOW - GENSIM INSTALL ISSUE ####
def load_search_index(index_path_and_name):
    """
    Load a previously saved gensim Similarity index.

    index_path_and_name is the location the index was saved to.

    Returns the loaded index.
    """
    # Imported here because the notebook-level import of `similarities` is
    # commented out; without it this function fails with a NameError.
    from gensim import similarities

    t0 = time.time()
    index = similarities.Similarity.load(index_path_and_name)
    t1 = time.time()
    print("Index loaded in {} seconds.".format(round(t1-t0, 2)))
    return index

## Search functions on pre-built NLP models

In [42]:
def prep_input_string(input_string, phraser):
    """
    Clean a raw string and run the resulting tokens through a phraser.

    input_string is any input of the form "this is  a string"
    phraser is indexed with the cleaned token list (phraser[tokens])

    Returns whatever the phraser yields for the cleaned tokens.
    """
    tokens = clean_doc(input_string)
    return phraser[tokens]

In [43]:
# Return the results of a query
def get_query_results(query_string, 
                      search_dict, 
                      df_corpus, 
                      df_cols_to_display, 
                      num_results=25
                     ):
    
    '''
    Return the results of a query as a dataframe.

    query_string is a string of any length
    search_dict is a dict that contains the phraser, dictionary,
      model, and index to use for the search:
        'phraser'    -- a phraser, or '' when no phrasing is needed
        'index'      -- a similarity index, or the string 'doc2vec' to search
                        with the Doc2Vec model in 'model'
        'model'      -- the model that transforms the query
        'dictionary' -- the dictionary used to build the bag of words
                        (not needed for the 'doc2vec' case)
    df_corpus is the complete dataframe of the corpus being searched  
    df_cols_to_display are the cols of df_corpus to display in the search results dataframe
    num_results is the maximum number of results to return

    Returns a dataframe with one row per result, best match first.
    '''
    t0 = time.time()
    # Process the query string into a list of tokens
    clean_query = clean_doc(query_string)
    
    # Convert the list of tokens into phrases if necessary
    phraser = search_dict['phraser']
    if phraser != '':
        phrased_query = phraser[clean_query]
    else:
        phrased_query = clean_query
    
    model = search_dict['model']

    # For everything EXCEPT Doc2Vec proceed as follows
    if search_dict['index'] != 'doc2vec':
        # Use the dictionary to transform the phrased_query into a bag of words vector
        bow_query = search_dict['dictionary'].doc2bow(phrased_query)
    
        # Transform the bag of words vector into a vector in the model's space
        model_query = model[bow_query]
    
        # Calculate the similarity of the query to each document in the corpus
        sims = search_dict['index'][model_query]
    
        # Sort the similarity scores in descending order
        sims_sorted = sorted(enumerate(sims), key=lambda item: -item[1])[0:num_results]
    else:
        # Create the Doc2Vec sims on the fly, using the same model that
        # inferred the query vector (not a notebook-level global)
        query_vector = model.infer_vector(phrased_query)
        sims_sorted = model.docvecs.most_similar(positive=[query_vector], topn=num_results)
    
    # Build a dataframe for displaying the search results;
    # item[0] is used as the positional row number of the document in df_corpus
    dataFrame_content = []
    for item in sims_sorted:
        dataFrame_content.append(df_corpus.iloc[item[0]][df_cols_to_display].values)
        
    df_results = pd.DataFrame.from_records(dataFrame_content, columns=df_cols_to_display)
    
    t1 = time.time()
    print("Search results obtained in {:.3} seconds.".format(t1-t0))
    print("The query is: {}".format(query_string))
    print("Here are the top {} results...".format(num_results))
    
    return df_results

In [44]:
# Jaccard similarity is (among other things) a metric for measuring how well a model matches an observation to an 
# existing observation.
## The essence of Jaccard Similarity is to find, for any two sets, the number of elements in the intersection divided by the number of elements 
## in the union of the sets.
# Slightly modified from 
#    http://dataconomy.com/2015/04/implementing-the-five-most-popular-similarity-measures-in-python/
def jaccard_similarity(x,y):
    """
    Jaccard similarity of two tokenized sentences: the number of distinct
    tokens they share divided by the number of distinct tokens in either.

    x and y are tokenized sentences (any iterables of hashable tokens).

    Returns a float between 0 and 1; 0. when both inputs are empty.
    """
    tokens_x, tokens_y = set(x), set(y)
    shared = tokens_x & tokens_y
    combined = tokens_x | tokens_y

    # nothing in either input -> the ratio is undefined, report 0
    if len(combined) == 0:
        return 0.

    return len(shared)/float(len(combined))

In [45]:
## Extend the jaccard_similarity function above to any number of inputs
## which are specified as a list of lists.

# Jaccard similarity is (among other things) a metric for measuring how well a model matches an observation to an 
# existing observation. This extends jaccard_similarity above to any number of inputs.
## The essence of Jaccard Similarity is to find, for any two sets, the number of elements in the intersection divided by the number of elements 
## in the union of the sets.
## From https://stackoverflow.com/questions/2541752/best-way-to-find-the-intersection-of-multiple-sets
def jaccard_similarity_general(list_of_lists):
    """
    Jaccard similarity extended to any number of token lists: the number of
    distinct tokens common to ALL the lists divided by the number of distinct
    tokens that occur in ANY of them.

    list_of_lists is an iterable of token lists.

    Returns a float between 0 and 1; 0. when there are no lists at all or when
    every list is empty.
    """
    # Convert the lists into sets
    set_list = [set(item) for item in list_of_lists]

    # set.intersection(*[]) raises TypeError, so handle "no lists" explicitly
    if not set_list:
        return 0.

    intersection_cardinality = len(set.intersection(*set_list))
    union_cardinality = len(set.union(*set_list))
    
    try:
        jac_score = intersection_cardinality/float(union_cardinality)
    except ZeroDivisionError:
        jac_score = 0.
 
    return jac_score

In [46]:
# Measure the jaccard similarity between the search results returned for any query
## This is a measure of the variation in the search results that are output for any single query string
def intra_search_overlap(df_search_output, plot_title=''): 
    '''
    Show, as a heatmap, how similar the results of a single search are to one another.

    Every pair of results is scored with jaccard_similarity, so the grid is a
    measure of the variation in the search results returned for one query string.

    df_search_output is the result of a search which returns ONLY the 'CLIENT_QUESTION_PROCESSED' column;
      the first value of each row is taken as that result's token list.
    plot_title is the title placed on the heatmap.
    '''
    num_results = len(df_search_output)

    # token list of every search result, in result order
    result_tokens = [df_search_output.iloc[k].values[0] for k in range(num_results)]

    # pairwise jaccard scores: one row per result, one column per result
    j_scores_intra = [
        [jaccard_similarity(tokens_i, tokens_j) for tokens_j in result_tokens]
        for tokens_i in result_tokens
    ]

    # Put the jaccard scores in a dataframe for display using Seaborn
    grid_titles = ["Search Result " + str(k) for k in range(num_results)]
    df_display = pd.DataFrame(j_scores_intra, columns=grid_titles)
    
    # Create the heatmap
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.set_title(plot_title)
    
    heatmap_options = dict(cmap="YlGnBu", 
                           annot=True, 
                           annot_kws={"size": 9}, 
                           linewidths=2, 
                           linecolor='yellow', 
                           yticklabels=grid_titles)
    sns.heatmap(df_display, **heatmap_options)

In [47]:
def inter_search_overlap(df_search_ouputs, 
                         column_name,
                         grid_titles, 
                         plot_title='Inter-Search Overlap of Core Topics'
                        ):
    '''
    Show, as a heatmap, how much the results of several searches overlap.

    All the tokens returned by one search are pooled into a single list, and
    every pair of searches is then scored with jaccard_similarity.

    df_search_ouputs is a list of search-result dataframes, one per query
      (the misspelled parameter name is kept so that keyword callers keep working)
    column_name is the column of each dataframe that holds the token lists
    grid_titles are the labels for the searches, in the same order as df_search_ouputs
    plot_title is the title placed on the heatmap
    '''
    # Join the tokens returned by each search result into a single big list of tokens for that search query
    search_output_tokens = [
        [token for token_list in df_out[column_name].values for token in token_list]
        for df_out in df_search_ouputs
    ]
        
    # for each pair of lists in search_output_tokens, get the jaccard similarity
    j_scores_inter = [
        [jaccard_similarity(tokens_i, tokens_j) for tokens_j in search_output_tokens]
        for tokens_i in search_output_tokens
    ]

    # Put the jaccard scores in a dataframe for display using Seaborn
    df_display = pd.DataFrame(j_scores_inter, columns=grid_titles)
    
    # Create the heatmap
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.set_title(plot_title)
    sns.heatmap(df_display, 
                cmap="BuPu", 
                annot=True, 
                annot_kws={"size": 9}, 
                linewidths=2, 
                linecolor='yellow', 
                yticklabels=grid_titles) 

## Create Word2Vec Models and Functions to Explore Them

In [48]:
# This creates a gensim word2vec vectors object. 
## To access it and see the vectors, use the function display_vectors below
def create_word2vec_model(n_gram_corpus, vector_size=100, context_window=5, alg=0, num_iter=30,
                          min_count=20, workers=4):
    """
    Train a gensim Word2Vec model and return only its keyed vectors.

    n_gram_corpus is the tokenized corpus to train on
    vector_size is the dimensionality of the word vectors
    context_window is the size of the context window
    alg = 1 uses SkipGram algorithm; alg = 0 uses CBOW algorithm
    num_iter sets the number of training epochs
    min_count is passed to Word2Vec as its min_count setting
    workers is the number of worker threads -- set it to the num of CPU cores on the machine

    Returns the model's KeyedVectors (the full model is deleted to reduce memory).
    To see the vectors, use the function display_vectors.
    """
    t0 = time.time()
    print("Creating the Word2Vec model with vector size {}".format(vector_size))
    print("This takes time...patience...")
    w2v_model = Word2Vec(n_gram_corpus, 
                         size=vector_size, 
                         window=context_window, 
                         min_count=min_count, 
                         sg=alg, 
                         workers=workers, 
                         iter=num_iter
                        )
    t1 = time.time()
    print("Word2Vec model with {} training eopchs was created in {} seconds.".format(num_iter, round(t1-t0, 3)))
    
    # Use the model's KeyedVectors to reduce memory and delete the model
    w2v_model_keyed_vecs = w2v_model.wv
    del w2v_model
    
    # Number of words in the vocabulary
    print("{} terms in the Word2Vec model's vocabulary.".format(len(w2v_model_keyed_vecs.vocab)))
    
    return w2v_model_keyed_vecs

In [49]:
# Save the Word2Vec model's keyed vectors
def save_word2vec_model(model_keyed_vectors, file_name):
    """
    Save a Word2Vec model's keyed vectors under intermediate_dir_path.

    model_keyed_vectors is the output of create_word2vec_model
    file_name is the name of the file to create inside intermediate_dir_path
    """
    started = time.time()
    target_path = os.path.join(intermediate_dir_path, file_name)
    model_keyed_vectors.save(target_path)
    elapsed = round(time.time() - started, 3)
    print("Model save to {} in {} seconds".format(target_path, elapsed))

In [50]:
# Load the Word2Vec model keyed vectors
def load_word2vec_model(file_name):
    """
    Load Word2Vec keyed vectors from a file under intermediate_dir_path.

    file_name is the name of a file written by save_word2vec_model.

    Returns the keyed vectors, loaded with mmap='r'.
    """
    started = time.time()
    source_path = os.path.join(intermediate_dir_path, file_name)
    keyed_vectors = KeyedVectors.load(source_path, mmap='r')
    elapsed = round(time.time() - started, 3)
    print("Model loaded from {} in {} seconds".format(source_path, elapsed))
    return keyed_vectors

In [51]:
# Display the entire table of vector embedding values
def display_vectors(w2v_KeyedVecs):
    """
    Build a dataframe holding the entire table of vector embedding values.

    w2v_KeyedVecs is a model's KeyedVectors (NOTE: not the full model).

    Returns a dataframe with one row per vocabulary term, labelled by the term,
    with the most common terms first.
    """
    # collect (term, integer index, term count) for every vocabulary entry
    vocab_rows = []
    for term, entry in w2v_KeyedVecs.vocab.items():
        vocab_rows.append((term, entry.index, entry.count))

    # most common terms first
    vocab_rows.sort(key=lambda row: -row[2])

    # split the rows back into parallel sequences
    ordered_terms, term_indices, _term_counts = zip(*vocab_rows)

    # pick the vectors in the same order as the terms
    word_vectors = w2v_KeyedVecs.vectors[term_indices, :]

    return pd.DataFrame(word_vectors, index=ordered_terms)

In [52]:
# Based on Patrick Harrison and Radim Řehůřek

def pos_related_terms(w2v_KeyedVec, token, topn=20):
    """
    Print the topn terms most similar to token, one per line,
    each followed by its similarity rounded to 3 places.
    Prints an apology instead when the lookup raises KeyError.
    """
    try:
        neighbours = w2v_KeyedVec.most_similar(positive=[token], topn=topn)
        for neighbour, score in neighbours:
            print(u'{:20} {}'.format(neighbour, round(score, 3)))
    except KeyError:
        print("Sorry, try a different term")
        
def neg_related_terms(w2v_KeyedVec, token, topn=20):
    """
    Query most_similar with token as a NEGATIVE example and print the
    topn terms it returns, one per line, each followed by its score
    rounded to 3 places.
    Prints an apology instead when the lookup raises KeyError.
    """
    try:
        results = w2v_KeyedVec.most_similar(negative=[token], topn=topn)
        for result_term, score in results:
            print(u'{:20} {}'.format(result_term, round(score, 3)))
    except KeyError:
        print("Sorry, try a different term")
        

def word_algebra(w2v_KeyedVec, add_string, subtract_string, topn=20):
    """
    combine the vectors associated with the words provided
    in add_string and subtract_string, look up the topn most similar
    terms to the combined vector, and print the result(s)
    Use add_string=None or '' or subtract=None or '' to leave the fields empty

    Each string is cleaned and tokenized with clean_doc before the lookup.
    A message is printed (nothing is raised) when no terms are supplied at all
    or when a term is missing from the vocabulary.
    """
    # Prep the strings; None is passed through untouched
    add = clean_doc(add_string) if add_string is not None else add_string
    subtract = clean_doc(subtract_string) if subtract_string is not None else subtract_string

    # With nothing to add and nothing to subtract there is no vector to look
    # up; most_similar raises ValueError in that case, which the KeyError
    # handler below would not catch.
    if not add and not subtract:
        print("Sorry, no terms were supplied - please provide at least one term to add or subtract.")
        return
    
    try:
        answers = w2v_KeyedVec.most_similar(positive=add, negative=subtract, topn=topn)
        for term, similarity in answers:
            print(term)
    except KeyError:
        print("Sorry, one or more terms is not in the vocabulary - please try different terms.")
    
        
def odd_one_out(w2v_KeyedVec, token_string):
    """
    Clean and tokenize token_string, then ask the keyed vectors which of the
    tokens doesn't match the others.

    Returns that token, or an apology string when the lookup raises ValueError.
    """
    candidates = clean_doc(token_string)

    try:
        return w2v_KeyedVec.doesnt_match(candidates)
    except ValueError:
        return "Sorry, one or more terms is not in the vocabulary - please try different terms."

## Create Doc2Vec Models

### Pre-Trained Doc2Vec Model

In [53]:
# Load a pretrained doc2vec model given the file path
import gensim.models as gm
#### NOTE: The only pre-trained doc2vec model we have is 'doc2vec_pretrained'
#### which points to 'Repos/Gartner/Doc2Vec-Pretrained-Vectors/doc2vec.bin'
def load_doc2vec_pre_model(pre_trained_model_file):
    """
    Load a pretrained doc2vec model given its file path.

    Returns the loaded model.
    """
    started = time.time()
    pretrained_model = gm.Doc2Vec.load(pre_trained_model_file)
    elapsed = round(time.time() - started, 2)
    print("Pre-trained Doc2Vec model {} loaded in {} seconds".format(pre_trained_model_file, elapsed))

    return pretrained_model

In [54]:
# Create document vectors given a text corpus and a pre-trained doc2vec model
def create_doc_vectors_pre(text_corpus, 
                           doc2vec_model_pre, 
                           start_alpha=0.001, 
                           infer_epoch=1000):
    """
    Create document vectors given a text corpus and a pre-trained doc2vec model.

    text_corpus is an iterable of tokenized documents
    doc2vec_model_pre is the pre-trained model; its infer_vector method is
      called once per document
    start_alpha and infer_epoch are the inference hyperparameters, passed to
      infer_vector as alpha and steps. The lower the start_alpha and the higher
      the infer_epoch the longer it takes to get the vectors and the more
      refined the vectors become.

    Returns a list with one vector (a list of floats) per document; an empty
    list when the corpus is empty.
    """
    doc2vec_vectors_pre = []
    t0 = time.time()
    print("Starting to vectorize corpus ... this can take a while ...")
    print("Using hyperparameters start_alpha = {} and infer_epoch = {}".format(start_alpha, infer_epoch))
    for d in text_corpus:
        inferred = doc2vec_model_pre.infer_vector(d, 
                                                  alpha=start_alpha, 
                                                  steps=infer_epoch)
        # Each component is converted to its string form and back to a plain
        # Python float, exactly as before, so the stored values are unchanged.
        doc_vector = [float(str(x)) for x in inferred]
        doc2vec_vectors_pre.append(doc_vector)
    t1 = time.time()
    print("Corpus vectors created using the pre-trained doc2vec model in {} seconds".format(round(t1-t0, 2)))
    # Only report a vector length when at least one document was vectorized;
    # with an empty corpus there is no vector to measure.
    if doc2vec_vectors_pre:
        print("Each document vector has length {}".format(len(doc2vec_vectors_pre[-1])))
    
    return doc2vec_vectors_pre
    

In [55]:
# Create a doc2vec model on a given text corpus
def create_doc2vec_model(text_corpus, 
                         vector_size, 
                         window_size=2, 
                         min_count=20, 
                         workers=4, 
                         epochs=500, 
                         tagged=0
                        ):
    """
    Create a doc2vec model on a given text corpus.

    text_corpus is an iterable of tokenized documents; each document is tagged
      with its position in the corpus
    vector_size, window_size, min_count, workers, and epochs are handed to Doc2Vec
      (set workers = num of CPU cores on the machine; epochs sets the number of
      training epochs)
    tagged -- when tagged=1, just return the tagged text corpus without training

    Returns the trained Doc2Vec model, or the tagged corpus when tagged=1.
    """
    # create the tagged corpus
    print("Creating a tagged text corpus...")
    tag_start = time.time()
    text_corpus_tagged = []
    for doc_number, doc in enumerate(text_corpus):
        text_corpus_tagged.append(TaggedDocument(doc, [doc_number]))
    tag_end = time.time()
    print("Tagged text corpus created in {} seconds".format(round(tag_end-tag_start, 2)))

    if tagged==1:
        return text_corpus_tagged

    # Report the training settings
    settings_report = (("vector size = {}", vector_size),
                       ("window size = {}", window_size),
                       ("min count = {}", min_count),
                       ("num of workers = {}", workers),
                       ("num of training epochs = {}", epochs))
    for template, value in settings_report:
        print(template.format(value))
    print("Starting to create a trained Doc2Vec model...this takes time...")

    # Train the model
    train_start = time.time()
    training_options = dict(vector_size=vector_size, 
                            window=window_size, 
                            min_count=min_count, 
                            workers=workers, 
                            epochs=epochs)
    d2v_model = Doc2Vec(text_corpus_tagged, **training_options)
    train_end = time.time()
    print("Doc2Vec model created {} seconds".format(round(train_end-train_start, 2)))

    return d2v_model
    

In [56]:
# Save a doc2vec model for later use
def save_doc2vec_model(model, destination):
    """
    Save a doc2vec model to destination, then delete the temporary training
    data of the in-memory model (doctag vectors and inference are kept).
    """
    model.save(destination)

    # delete the temp training data -- done after the save, so it affects only
    # the model object held in memory
    trim_options = {'keep_doctags_vectors': True, 'keep_inference': True}
    model.delete_temporary_training_data(**trim_options)

    for message in ("Doc2Vec model {} saved to {}".format(model, destination),
                    "Load the model using load_doc2vec_model(destination)"):
        print(message)

In [57]:
# Load a doc2vec model from its saved destination
def load_doc2vec_model(destination):
    """
    Load a doc2vec model from its saved destination.

    Returns the loaded model.
    """
    started = time.time()
    loaded_model = Doc2Vec.load(destination)
    elapsed = round(time.time() - started, 2)
    print("Doc2Vec model from {} loaded in {} seconds".format(destination, elapsed))

    return loaded_model

In [58]:
# Get the vectors of a doc2vec model
def doc2vec_get_vecs(doc2vec_n_grams_model):
    """Return the document vectors (docvecs.vectors_docs) of a doc2vec model."""
    document_vectors = doc2vec_n_grams_model.docvecs
    return document_vectors.vectors_docs

## Create LDA Models

In [59]:
def create_lda_model(bow_corpus,
                     num_topics,
                     dictionary,
                     workers=3,
                     chunksize=200, 
                     passes=10, 
                     eval_every=2, 
                     iterations=500,  
                     alpha='asymmetric', 
                     eta='auto'
                    ):
    """
    Create an LDA topic model with LdaMulticore.

    bow_corpus is the bag-of-words corpus to train on
    num_topics is the number of topics to extract
    dictionary is handed to the model as id2word
    workers, chunksize, passes, eval_every, iterations, alpha, and eta are
      passed straight through to LdaMulticore

    The model is always created with per_word_topics=True.
    Returns the trained model.
    """
    started = time.time()
    print("Creating LDA model with {} topics...this takes time...patience".format(num_topics))

    model_settings = {
        'num_topics': num_topics,
        'id2word': dictionary,
        'workers': workers,
        'chunksize': chunksize,
        'passes': passes,
        'eval_every': eval_every,
        'iterations': iterations,
        'alpha': alpha,
        'eta': eta,
        'per_word_topics': True,
    }
    lda_model = LdaMulticore(bow_corpus, **model_settings)

    finished = time.time()
    print("LDA model with {} topics created in {} seconds.".format(num_topics, round(finished-started)))

    return lda_model

In [60]:
def save_lda_model(model, destination):
    """Save the model to destination for later use and report where it went."""
    model.save(destination)
    confirmation = "The model {} is saved to {}".format(model, destination)
    print(confirmation)

In [61]:
def load_lda_model(destination):
    """
    Load an LDA model that was saved to destination.

    Returns the loaded model. The confirmation message is printed only after
    the load has succeeded, so a failed load is not reported as "loaded".
    """
    lda_model = models.LdaModel.load(destination)
    print("LDA model loaded from {}".format(destination))
    return lda_model
    

In [62]:
# Calculate the coherence of the LDA model
def lda_model_coherence(lda_model, doc_corpus, dictionary, coherence="c_v"):
    """
    Calculate the coherence of the LDA model.

    lda_model is the model to score
    doc_corpus is passed to CoherenceModel as its texts
    dictionary is the dictionary of the corpus
    coherence is the name of the coherence measure to use

    Returns the coherence score.
    """
    started = time.time()
    print("Starting to calculate coherence score...this can take some time...")
    scorer = CoherenceModel(model=lda_model, 
                            texts=doc_corpus, 
                            dictionary=dictionary, 
                            coherence=coherence)
    coherence_score = scorer.get_coherence()
    elapsed = round(time.time() - started, 3)
    print("Coherence score = {} calculated in {} seconds.".format(coherence_score, elapsed))
    return coherence_score

In [63]:
# Perplexity of a topic model
## The lower the perplexity, the better the model
## From https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

def lda_model_perplexity(lda_model, bow_corpus):
    """
    Score a topic model on a bag-of-words corpus with its log_perplexity method.

    lda_model is the model to evaluate
    bow_corpus is the bag-of-words corpus to evaluate it on

    Returns the value reported by lda_model.log_perplexity(bow_corpus).
    NOTE(review): gensim documents log_perplexity as a per-word likelihood
    bound rather than the perplexity itself -- confirm which direction is
    "better" before comparing scores.
    """
    t0 = time.time()
    # The progress message names the score that is actually being computed
    print("Starting to calculate perplexity score...this can take some time...")
    perplexity_score = lda_model.log_perplexity(bow_corpus)
    t1 = time.time()
    print("Perplexity score = {} calculated in {} seconds.".format(perplexity_score, round(t1-t0,3)))
    return perplexity_score

## Find the best combination of hyperparameters for an LDA n-gram model

Inspired by https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [64]:
def tune_lda_model(n_gram_bow_corpus, n_gram_dictionary, n_gram_doc_corpus, num_combinations=2, seed=None):
    """
    Try out randomly chosen LDA hyperparameter settings and score each one.

    n_gram_bow_corpus is the bag-of-words corpus used to train and to compute perplexity
    n_gram_dictionary is the dictionary of the corpus
    n_gram_doc_corpus is the tokenized corpus used to compute coherence
    num_combinations is the number of distinct hyperparameter combinations to try;
      it is capped at the size of the grid defined below
    seed makes the choice of combinations reproducible; with the default None the
      module-level random generator is used

    Returns a dataframe with one row per combination:
    'Num Topics', 'alpha', 'eta', 'Coherence', 'Perplexity'.
    """
    import itertools

    # Hyperparameters of the LDA model
    num_topics = [5, 10, 15, 20]
    alpha = [0.01, 0.31, 0.61, 0.90, 'symmetric', 'asymmetric']
    eta = [0.01, 0.31, 0.61, 0.90, 'symmetric', 'auto']
    
    # Enumerate every distinct (num_topics, alpha, eta) combination exactly once
    hyp_param_lists = [list(combo) for combo in itertools.product(num_topics, alpha, eta)]
    
    # Try out num_combinations different combinations of hyperparameter settings;
    # never ask for more combinations than exist
    rng = random if seed is None else random.Random(seed)
    sample_size = min(num_combinations, len(hyp_param_lists))
    hyp_params = rng.sample(hyp_param_lists, sample_size)
    print("The hyperparameter lists are \n {}".format(hyp_params))
    
    # Create a bunch of n_gram LDA models and evaluate their coherence and perplexity
    tuning_vals = []
    for hyp in hyp_params:
        print("Creating an LDA model with hyperparameters {}".format(hyp))
        lda_model = create_lda_model(n_gram_bow_corpus, hyp[0], n_gram_dictionary, alpha=hyp[1], eta=hyp[2])
        coherence_val = lda_model_coherence(lda_model, n_gram_doc_corpus, n_gram_dictionary)
        perplex_val = lda_model_perplexity(lda_model, n_gram_bow_corpus)
        # a fresh list per result, so the sampled settings are not modified in place
        hyp_vals = hyp + [coherence_val, perplex_val]
        print("Hyperparameter settings, coherence, and perplexity values = {}".format(hyp_vals))
        tuning_vals.append(hyp_vals)
    
    df_tuning_vals = pd.DataFrame(tuning_vals, columns=['Num Topics', 'alpha', 'eta', 'Coherence', 'Perplexity'])
    return df_tuning_vals
    

In [65]:
# For the NMF topic model
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

# scikit learn model persistence -- saving and loading
#from sklearn.externals import joblib
# DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and 
## will be removed in 0.23. Please import this functionality directly from joblib, 
## which can be installed with: pip install joblib. If this warning is raised when 
## loading pickled models, you may need to 
## re-serialize those models with scikit-learn 0.21+.

# joblib.dump(scikit_model, 'filename.pkl') # save the model
# scikit_model = joblib.load('filename.pkl') # load the model

In [66]:
def explore_topic(topic_model, topic_number, topn=20):
    """
    Print a two-column table (term, frequency) of the topn top terms
    of topic topic_number in the given topic model.
    """
    header = u'{:20} {}'.format(u'Term', u'Frequency')
    print(header)

    top_terms = topic_model.show_topic(topic_number, topn=topn)
    for term, frequency in top_terms:
        rounded = round(frequency, 3)
        print(u'{:20} {:.3f}'.format(term, rounded))

In [67]:
# From 
## https://medium.com/ml2vec/topic-modeling-is-an-unsupervised-learning-approach-to-clustering-documents-to-discover-topics-fdfbf30e27df

def get_lda_topics(lda_model, num_topics, top_n=20):
    '''
    Show the words that make up the topics in an LDA topic model.

    Returns a dataframe with one column per topic ('Topic # 00', 'Topic # 01', ...)
    holding that topic's top_n words.
    '''
    word_dict = {}
    for topic_id in range(num_topics):
        column_name = 'Topic # ' + '{:02d}'.format(topic_id)
        top_words = lda_model.show_topic(topic_id, topn = top_n)
        word_dict[column_name] = [entry[0] for entry in top_words]

    return pd.DataFrame(word_dict)

In [68]:
# For a given LDA model, get the dominant topic for each document in the corpus
def get_dominant_topics(lda_n_grams_model, bow_n_grams):
    """
    For a given LDA model, get the dominant topic for each document in the corpus.

    lda_n_grams_model is indexed with the whole corpus; the first element of
      each per-document result is read as that document's (topic, probability) pairs
    bow_n_grams is the bag-of-words corpus

    Returns a list with one [(topic, probability), document number] entry per document.
    """
    lookup_start = time.time()
    corpus_info = lda_n_grams_model[bow_n_grams]
    lookup_end = time.time()
    print("Got topic model information for the entire corpus in {} seconds.".format(round(lookup_end-lookup_start, 3)))
    print("Starting to get the dominant topic for each document in the corpus...this takes a while...patience...")

    rank_start = time.time()
    dom_topics_and_probs = []
    for doc_number in range(len(corpus_info)):
        # highest-probability topic first
        ranked_topics = sorted(corpus_info[doc_number][0], key=lambda pair: pair[1], reverse=True)
        dom_topics_and_probs.append([ranked_topics[0], doc_number])
    rank_end = time.time()
    print("Got dominant topic for each document in the corpus in {} seconds.".format(round(rank_end-rank_start, 3)))

    return dom_topics_and_probs

In [69]:
# Given a document number and an LDA model, get the document's dominant topic number
## the topic number's dominance probability, and the raw text of the document.
def get_dominant_topic(doc_num, lda_n_grams_model, bow_n_grams, raw_text_corpus):
    """
    Get the dominant topic of one document.

    doc_num is the position of the document in bow_n_grams and raw_text_corpus
    lda_n_grams_model is indexed with the document's bag of words; the first
      element of the result is read as its (topic, probability) pairs
    bow_n_grams is the bag-of-words corpus
    raw_text_corpus holds the raw text of each document

    Returns a one-row dataframe: the dominant topic number, its probability,
    the topic's top 10 keywords, and the document's raw text.
    """
    doc_info = lda_n_grams_model[bow_n_grams[doc_num]]

    # highest-probability (topic, probability) pair comes first
    ranked_topics = sorted(doc_info[0], key=lambda pair: pair[1], reverse=True)
    dom_topic, dom_topic_prob = ranked_topics[0][0], ranked_topics[0][1]

    keywords = [entry[0] for entry in lda_n_grams_model.show_topic(dom_topic, 10)]

    result_row = [dom_topic, dom_topic_prob, keywords, raw_text_corpus[doc_num]]
    column_names = ['Dominant Topic', 'Probability', 'Topic Keywords', 'Document Text']

    return pd.DataFrame([result_row], columns=column_names)

In [70]:
# Given a topic number and the dominant topics for an LDA model from get_dominant_topics above, 
## get the topic's r most representative documents
def get_representative_docs(topic_num, dominant_topics_corpus, raw_text_corpus, num_display=5):
    """
    Given a topic number, get the documents that represent the topic best.

    topic_num is the topic of interest
    dominant_topics_corpus is the output of the get_dominant_topics function:
      one [(topic, probability), document number] entry per document
    raw_text_corpus holds the raw text of each document
    num_display is the maximum number of documents to return

    Returns a dataframe of the documents whose dominant topic is topic_num,
    highest probability first.
    """
    # keep the documents whose dominant topic is the requested one
    rel_docs = []
    for entry in dominant_topics_corpus:
        if entry[0][0] == topic_num:
            rel_docs.append(entry)

    # then sort them from highest to lowest probability and keep the top few
    ranked_docs = sorted(rel_docs, key=lambda entry: entry[0][1], reverse=True)
    repr_docs = ranked_docs[0:num_display]

    # Pull the results together into a dataframe
    results = []
    for (topic, probability), doc_number in repr_docs:
        results.append([doc_number, probability, raw_text_corpus[doc_number]])
    df_results = pd.DataFrame(results, columns=['Document Number', 'Probability of Topic for Document', 'Document Text'])
    print("Here are top {} documents where topic {} is dominant.".format(num_display, topic_num))

    return df_results
        

In [71]:
# From https://medium.com/ml2vec/topic-modeling-is-an-unsupervised-learning-approach-to-clustering-documents-to-discover-topics-fdfbf30e27df

def get_nmf_topics(nmf_model, num_topics, n_top_words=20, vectorizer=None, feature_names=None):
    """
    Show the words that make up the topics in an NMF topic model.

    nmf_model is the fitted NMF model; its components_ rows are the topics
    num_topics is the number of topics to show
    n_top_words is the number of highest-weighted words to list per topic
    vectorizer is the FITTED vectorizer that produced the model's input;
      it is used to map word ids back to words
    feature_names can be given instead of vectorizer: the sequence of words,
      indexed by word id

    One of vectorizer / feature_names is required: a brand-new vectorizer has
    no vocabulary, so the word ids cannot be reverse-mapped without it.

    Returns a dataframe with one column per topic ('Topic # 01', 'Topic # 02', ...).
    Raises ValueError when neither vectorizer nor feature_names is supplied.
    """
    # the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    if feature_names is None:
        if vectorizer is None:
            raise ValueError("get_nmf_topics needs the fitted vectorizer (vectorizer=...) "
                             "or its feature names (feature_names=...) to map word ids to words.")
        # newer scikit-learn releases expose get_feature_names_out;
        # older ones only have get_feature_names
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()
    
    word_dict = {}
    for i in range(num_topics):
        
        # for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = nmf_model.components_[i].argsort()[:-n_top_words - 1:-1]
        words = [feature_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words
    
    return pd.DataFrame(word_dict)

In [72]:
# From https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# Finding the dominant topic in any document
def format_topics_sentences(topic_model, corpus, texts):
    """
    Find the dominant topic of every document in the corpus.

    topic_model is the topic model; it is indexed with the corpus and asked
      for each dominant topic's keywords with show_topic
    corpus is the bag-of-words corpus
    texts are the original documents, in the same order as corpus

    Returns a dataframe with one row per document: 'Dominant_Topic',
    '% Contribution', 'Topic_Keywords', and the original text appended as the
    last column. A document for which the model reports no topics gets empty
    values, so that the rows stay aligned with texts.
    """
    column_names = ['Dominant_Topic', '% Contribution', 'Topic_Keywords']

    # Models created with per_word_topics=True return a
    # (topics, word topics, phi values) triple per document; only the first
    # element is the list of (topic, proportion) pairs wanted here.
    per_word_topics = getattr(topic_model, 'per_word_topics', False)

    # Collect one record per document and build the dataframe once at the end
    records = []
    for row in topic_model[corpus]:
        doc_topics = row[0] if per_word_topics else row
        if not doc_topics:
            # keep a placeholder so this document still lines up with its text
            records.append([None, None, None])
            continue

        # Get the Dominant topic, Perc Contribution and Keywords for the document
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        wp = topic_model.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        records.append([int(topic_num), round(prop_topic, 4), topic_keywords])

    sent_topics_df = pd.DataFrame(records, columns=column_names)
    # topic numbers are stored as floats, as they were when the rows were
    # appended one Series at a time
    sent_topics_df = sent_topics_df.astype({'Dominant_Topic': float})

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


#### And here's how to get the dominant topics and display them ####
#df_topic_sents_keywords = format_topics_sentences(topic_model, corpus, texts)

# Format
#df_dominant_topic = df_topic_sents_keywords.reset_index()
#df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
#df_dominant_topic.head(10)

In [73]:
# From https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# Find the most representative document for each topic
# Group top 5 sentences under each topic
#sent_topics_sorteddf_mallet = pd.DataFrame()

#sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

#for i, grp in sent_topics_outdf_grpd:
#    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
#                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
#                                            axis=0)

# Reset Index    
#sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
#sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
#sent_topics_sorteddf_mallet.head()

In [74]:
# From https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# Number of Documents for Each Topic
#topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
#topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
#topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
#df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
#df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
#df_dominant_topics

## Prep to display an LDA model using pyLDAvis

In [75]:
# pyLDAvis for visualizing topic models
import pyLDAvis
#import pyLDAvis.gensim ###DEPRECATED##
# Use this instead - see https://stackoverflow.com/questions/66759852/no-module-named-pyldavis
# The gensim adapter is aliased to gensimvis; prep_lda_viz calls gensimvis.prepare
import pyLDAvis.gensim_models as gensimvis
# Turn on pyLDAvis notebook mode so prepared visualizations are displayed in output cells
pyLDAvis.enable_notebook()
# NOTE(review): warnings is not used by any of the cells that follow in this section - confirm it is still needed
import warnings

In [76]:
# Create LDA viz content
def prep_lda_viz(n_gram_lda_model, n_gram_bow, n_gram_dictionary):
    '''
    Build the pyLDAvis content for an LDA model and report how long it took.

    The three arguments are handed, in this order, to gensimvis.prepare:
    the LDA model, its bag-of-words corpus and its dictionary.

    Returns whatever gensimvis.prepare returns; that object can be saved with
    save_lda_viz_content or rendered with show_lda_viz.
    '''
    print("This can take a while...patience...")
    started = time.time()
    prepared = gensimvis.prepare(n_gram_lda_model, n_gram_bow, n_gram_dictionary)
    elapsed = round(time.time() - started, 3)
    print("LDA Viz prep data created in {} seconds.".format(elapsed))
    return prepared

  and should_run_async(code)


## Serialize (save and load) LDAvis content

In [77]:
# Serialize the pyLDAvis prepared file to disk
def save_lda_viz_content(lda_viz_content, file_path):
    '''
    Pickle lda_viz_content to file_path.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Nothing is returned.
    '''
    with open(file_path, 'wb') as out_file:
        pickle.dump(lda_viz_content, out_file)

  and should_run_async(code)


In [78]:
# load the pre-prepared ldavis_content from disk
def load_lda_viz_content(file_path):
    '''
    Read back an object that was pickled to file_path
    (for example by save_lda_viz_content) and return it.
    '''
    # NOTE: unpickling can run arbitrary code stored in the file -
    # only load files that you created yourself.
    with open(file_path, 'rb') as in_file:
        return pickle.load(in_file)

  and should_run_async(code)


## Show LDA Visualization

In [79]:
def show_lda_viz(lda_viz_content):
    '''
    Render prepared pyLDAvis content (see prep_lda_viz / load_lda_viz_content).

    Returns the object produced by pyLDAvis.display; leave the call as the last
    expression of a cell so the notebook shows it.
    '''
    rendered = pyLDAvis.display(lda_viz_content)
    return rendered

  and should_run_async(code)


## t-SNE Visualizations

In [80]:
# Import the Scikit Learn t-SNE model
from sklearn.manifold import TSNE

  and should_run_async(code)


In [81]:
def prep_tsne_input(keyed_vectors, num_words=1000):
    '''
    Turn the KeyedVectors of a Word2Vec or Doc2Vec model into 2-D t-SNE coordinates.

    keyed_vectors: passed as-is to display_vectors, which returns the dataframe
        of vectors used as t-SNE input.
    num_words: only the first num_words rows of that dataframe are embedded,
        which keeps the t-SNE run time manageable (intended to be the most
        frequent words - this relies on the row order produced by display_vectors).

    Returns a dataframe with columns 'x_coord' and 'y_coord', indexed like the
    selected input rows. It can be visualized with plot_tsne.
    '''
    frame_started = time.time()
    all_vectors = display_vectors(keyed_vectors)
    print("t-SNE input dataframe created in {:.2f} secs.".format(time.time() - frame_started))
    print("Creating t-SNE vectors ... this will take some time ...")

    # Restrict the t-SNE input to the leading rows of the vector dataframe
    subset = all_vectors.head(num_words)

    fit_started = time.time()
    embedded = TSNE().fit_transform(subset.values)
    print("t-SNE vectors created in {:.2f} secs.".format(time.time() - fit_started))

    # Re-attach the original index so every coordinate pair keeps its label
    return pd.DataFrame(embedded,
                        index=pd.Index(subset.index),
                        columns=[u'x_coord', u'y_coord'])

  and should_run_async(code)


In [82]:
# Bokeh pieces used by plot_tsne below
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value

# Send Bokeh output to the notebook so show() displays plots in the output cell
output_notebook()

def plot_tsne(df_tsne_vectors, dot_color='blue', title_add=''):
    '''
    Draw an interactive Bokeh scatter plot of 2-D t-SNE coordinates.

    df_tsne_vectors: dataframe with 'x_coord' and 'y_coord' columns whose index
        holds the text shown on hover (e.g. the output of prep_tsne_input).
        NOTE: a 'word' column (a copy of the index) is added to this dataframe in place.
    dot_color: color of the plotted circles
    title_add: text appended to the plot title

    Nothing is returned; the plot is displayed with Bokeh's show().
    '''
    # Expose the index as a regular column so the hover tool can refer to it as @word
    df_tsne_vectors[u'word'] = df_tsne_vectors.index

    # Bokeh reads the plotted columns from a ColumnDataSource
    source = ColumnDataSource(df_tsne_vectors)

    # Title, dimensions and toolbar of the figure
    figure_options = dict(
        title=u't-SNE Word Embeddings of Corpus' + title_add,
        plot_width=800,
        plot_height=800,
        tools=u'pan, wheel_zoom, box_zoom,box_select, reset',
        active_scroll=u'wheel_zoom',
    )
    tsne_plot = figure(**figure_options)

    # Display the word under the cursor on roll-over
    hover = HoverTool(tooltips=u'@word')
    tsne_plot.add_tools(hover)

    # One translucent circle per row of the dataframe
    tsne_plot.circle(u'x_coord', u'y_coord', source=source,
                     color=dot_color, line_alpha=0.2, fill_alpha=0.1,
                     size=10, hover_line_color=u'black')

    # Hide axes, grid and outline; only the title and the points stay visible
    tsne_plot.title.text_font_size = value(u'16pt')
    tsne_plot.xaxis.visible = False
    tsne_plot.yaxis.visible = False
    tsne_plot.grid.grid_line_color = None
    tsne_plot.outline_line_color = None

    show(tsne_plot)

  and should_run_async(code)


In [83]:
# Utility function that combines prep_tsne_input and plot_tsne above
def show_tsne(nlp_n_gram_model, num_words=1000, title_add="", dot_color="magenta"):
    '''
    Compute and plot the t-SNE embedding of a model's vectors in one call.

    nlp_n_gram_model: a model exposing its keyed vectors as .wv
        (for example doc2vec_model_2_grams; .wv is used for both doc2vec and word2vec)
    num_words: limit on the number of words considered - the lower the number,
        the faster the plotting
    title_add, dot_color: forwarded to plot_tsne

    Returns the dataframe produced by prep_tsne_input, after plot_tsne has used it.
    '''
    word_vectors = nlp_n_gram_model.wv

    # Step 1: reduce the vectors to 2-D coordinates
    df_tsne = prep_tsne_input(word_vectors, num_words=num_words)

    # Step 2: draw them with Bokeh
    plot_tsne(df_tsne, title_add=title_add, dot_color=dot_color)

    return df_tsne

  and should_run_async(code)


## Visualizing NLP Model Vector Clusters

In [84]:
#### TO DO: Modify to handle Word2Vec models ####
# Assume you have the vectors from a doc2vec or word2vec model. We want to take the vectors and see how they cluster.
## Here we set up the number of clusters, run a K-Means model and see how the vectors fall into the various clusters or groups.
## We then take each vector, reduce it to 2 dimensions using Principal Component Analysis (PCA) and plot the clusters
## Requires Imports ##
# from sklearn.cluster import KMeans
## Modified from https://medium.com/@ermolushka/text-clusterization-using-python-and-doc2vec-8c499668fa61
#### NOTE: Plotting will return an error if a centroid has no data points attached to it. 
#### This can happen when the number of clusters is too high. The error can be avoided by plotting a simple matplotlib plot: together=2

def show_vec_clusters(nlp_n_grams_vecs, num_clusters, together=1):
    '''
    Cluster model vectors with K-Means, project them to 2-D with PCA and plot the clusters.

    nlp_n_grams_vecs: the vectors of an nlp model, one row per vector. For doc2vec
        this is e.g. doc2vec_n_grams_model.docvecs.vectors_docs; word2vec models
        directly return the vecs.
    num_clusters: number of K-Means clusters
    together:
        1 -> one seaborn plot showing all clusters together
        0 -> one seaborn plot per cluster (clusters without data points are skipped)
        any other value -> a simple matplotlib scatter plot with a fixed color
            palette; the 12 colors are reused cyclically when there are more clusters

    Returns the list of cluster labels, one per input vector, for further analysis.
    '''
    t0 = time.time()
    print("Creating a K-Means model for the document vectors...")
    kmeans_model = KMeans(n_clusters=num_clusters, init="k-means++", max_iter=1000)
    # Fit exactly once: labels_ and cluster_centers_ both come from this fit.
    # (A second fit would only repeat the most expensive step of the function.)
    kmeans_model.fit(nlp_n_grams_vecs)
    labels = kmeans_model.labels_.tolist()
    centroids = kmeans_model.cluster_centers_
    t1 = time.time()
    print("K-Means model created in {} seconds".format(round(t1-t0)))

    # Reduce the dimensionality of the vectors
    t2 = time.time()
    print("Reducing the dimensionality of the vectors in the corpus...")
    pca = PCA(n_components=2).fit(nlp_n_grams_vecs)
    datapoints = pca.transform(nlp_n_grams_vecs)
    # The centroids go through the same projection so they land in the same plane
    centroidpoints = pca.transform(centroids)
    t3 = time.time()
    print("Reduced dimensionality of the corpus vectors in {} seconds".format(round(t3-t2)))

    # Create dataframe for plotting in Seaborn
    t4 = time.time()
    print("Putting the data in the right format for plotting...")
    # Built column-wise so the cluster numbers stay integers
    df_c = pd.DataFrame({"Group": labels, "x": datapoints[:, 0], "y": datapoints[:, 1]})
    t5 = time.time()
    print("Data formatted for plotting in {} seconds".format(round(t5-t4)))

    def mark_centroids():
        # Draw every centroid on the current figure as a black triangle named by its cluster number
        for centroid_id, (cx, cy) in enumerate(centroidpoints):
            plt.text(cx+0.05, cy+0.05, str(centroid_id), fontsize=12, fontweight="bold")
            plt.scatter(cx, cy, marker='^', color='black', s=100)

    t6 = time.time()
    if together == 1:
        # plot data with seaborn - all clusters together
        print("Starting to plot...")
        facet = sns.lmplot(data=df_c, x='x', y='y', hue='Group', height=6, aspect=2,
                           fit_reg=False, legend=True, legend_out=False)
        # Uncomment these limits when plotting doc2vec or word2vec vectors that are within a tighter range
        #facet.set(xlim=(-10, 10))
        #facet.set(ylim=(-10,10))
        mark_centroids()
    elif together == 0:
        # plot each cluster separately
        for cluster_id in range(len(centroidpoints)):
            cluster_df = df_c[df_c['Group'] == cluster_id]
            if cluster_df.empty:
                # Nothing to draw for a cluster that has no data points attached to it
                print("Group {} has no data points - skipping its plot".format(cluster_id))
                continue
            facet = sns.lmplot(data=cluster_df, x='x', y='y', hue='Group', height=4, aspect=2,
                               fit_reg=False, legend=False, legend_out=False)
            # Uncomment these limits when plotting doc2vec or word2vec vectors that are within a tighter range
            #facet.set(xlim=(-10, 10))
            #facet.set(ylim=(-10,10))
            facet.fig.suptitle("Group " + str(cluster_id))
            # All centroids are shown on every per-cluster plot
            mark_centroids()
    else:
        # Simple matplotlib plot
        fig, ax = plt.subplots(figsize=(10,8))
        colors = ["darkorange", "green", "magenta", "violet", "aqua", "lime", "yellow", "dodgerblue", "fuchsia", "maroon", "chocolate", "darksalmon"]
        # Wrap around the palette so more than len(colors) clusters do not raise an IndexError
        color = [colors[label % len(colors)] for label in labels]
        ax.scatter(datapoints[:, 0], datapoints[:, 1], c=color)
        ax.scatter(centroidpoints[:, 0], centroidpoints[:, 1], marker="^", s=150, c="#000000")
        plt.show()

    t7 = time.time()
    print("Plot complete in {} seconds".format(round(t7-t6)))
    print("Model to plot in {} seconds".format(round(t7-t0)))

    # Return the classification labels of each of the vectors for further analysis
    return labels

  and should_run_async(code)


In [85]:
#### TO DO: Modify to handle Word2Vec models ####

# Assume you have a doc2vec or word2vec model. We want to take the vectors and see how they cluster.
## Here we set up the number of clusters, run a K-Means model and see how the vectors fall into the various clusters or groups.
## We then take each vector, reduce it to 2 dimensions using Principal Component Analysis (PCA) and plot the clusters
## Requires Imports ##
# from sklearn.cluster import KMeans
## Modified from https://medium.com/@ermolushka/text-clusterization-using-python-and-doc2vec-8c499668fa61

def show_clusters(doc2vec_n_gram_model, num_clusters, together=1):
    '''
    Cluster the document vectors of a doc2vec model with K-Means, project them
    to 2-D with PCA and plot the clusters.

    doc2vec_n_gram_model: doc2vec model; its document vectors are read from
        model.dv.vectors when the model has a dv attribute (Gensim 4 naming),
        otherwise from model.docvecs.vectors_docs.
    num_clusters: number of K-Means clusters
    together:
        1 -> one seaborn plot showing all clusters together
        0 -> one seaborn plot per cluster (clusters without data points are skipped)
        any other value -> a simple matplotlib scatter plot with a fixed color
            palette; the 12 colors are reused cyclically when there are more clusters
        The seaborn plots use fixed axis limits of (-10, 10) on both axes.

    Returns the list of cluster labels, one per document vector, for further analysis.
    '''
    # Locate the document vectors once instead of repeating the attribute chain
    doc_keyed_vectors = getattr(doc2vec_n_gram_model, "dv", None)
    if doc_keyed_vectors is not None:
        doc_vectors = doc_keyed_vectors.vectors
    else:
        doc_vectors = doc2vec_n_gram_model.docvecs.vectors_docs

    t0 = time.time()
    print("Creating a K-Means model for the document vectors...")
    kmeans_model = KMeans(n_clusters=num_clusters, init="k-means++", max_iter=1000)
    # Fit exactly once: labels_ and cluster_centers_ both come from this fit.
    # (A second fit would only repeat the most expensive step of the function.)
    kmeans_model.fit(doc_vectors)
    labels = kmeans_model.labels_.tolist()
    centroids = kmeans_model.cluster_centers_
    t1 = time.time()
    print("K-Means model created in {} seconds".format(round(t1-t0)))

    # Reduce the dimensionality of the vectors
    t2 = time.time()
    print("Reducing the dimensionality of the vectors in the corpus...")
    pca = PCA(n_components=2).fit(doc_vectors)
    datapoints = pca.transform(doc_vectors)
    # The centroids go through the same projection so they land in the same plane
    centroidpoints = pca.transform(centroids)
    t3 = time.time()
    print("Reduced dimensionality of the corpus vectors in {} seconds".format(round(t3-t2)))

    # Create dataframe for plotting in Seaborn
    t4 = time.time()
    print("Putting the data in the right format for plotting...")
    # Built column-wise so the cluster numbers stay integers
    df_c = pd.DataFrame({"Group": labels, "x": datapoints[:, 0], "y": datapoints[:, 1]})
    t5 = time.time()
    print("Data formatted for plotting in {} seconds".format(round(t5-t4)))

    def draw_centroids():
        # Draw every centroid on the current figure as a black triangle named by its cluster number
        for centroid_id, (cx, cy) in enumerate(centroidpoints):
            plt.text(cx+0.05, cy+0.05, str(centroid_id), fontsize=12, fontweight="bold")
            plt.scatter(cx, cy, marker='^', color='black', s=100)

    t6 = time.time()
    if together == 1:
        # plot data with seaborn - all clusters together
        print("Starting to plot...")
        facet = sns.lmplot(data=df_c, x='x', y='y', hue='Group',
                           fit_reg=False, legend=True, legend_out=False,
                           height=6, aspect=2)
        facet.set(xlim=(-10, 10))
        facet.set(ylim=(-10,10))
        draw_centroids()
    elif together == 0:
        # plot each cluster separately
        for cluster_id in range(len(centroidpoints)):
            cluster_df = df_c[df_c['Group'] == cluster_id]
            if cluster_df.empty:
                # Nothing to draw for a cluster that has no data points attached to it
                print("Group {} has no data points - skipping its plot".format(cluster_id))
                continue
            facet = sns.lmplot(data=cluster_df, x='x', y='y', hue='Group', height=4, aspect=2,
                               fit_reg=False, legend=False, legend_out=False)
            facet.set(xlim=(-10, 10))
            facet.set(ylim=(-10,10))
            facet.fig.suptitle("Group " + str(cluster_id))
            # All centroids are shown on every per-cluster plot
            draw_centroids()
    else:
        # Simple matplotlib plot
        fig, ax = plt.subplots(figsize=(10,8))
        colors = ["darkorange", "green", "magenta", "violet", "aqua", "lime", "yellow", "dodgerblue", "fuchsia", "maroon", "chocolate", "darksalmon"]
        # Wrap around the palette so more than len(colors) clusters do not raise an IndexError
        color = [colors[label % len(colors)] for label in labels]
        ax.scatter(datapoints[:, 0], datapoints[:, 1], c=color)
        ax.scatter(centroidpoints[:, 0], centroidpoints[:, 1], marker="^", s=150, c="#000000")
        plt.show()

    t7 = time.time()
    print("Plot complete in {} seconds".format(round(t7-t6)))
    print("Model to plot in {} seconds".format(round(t7-t0)))

    # Return the classification labels of each of the vectors for further analysis
    return labels

  and should_run_async(code)
