In [None]:
#general imports
import pandas as pd
import numpy as np
from numpy import array
import pprint
import warnings
import string 
import pickle
import unicodedata
import re

#nltk imports 
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
from nltk.probability import FreqDist
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
stopword_list = nltk.corpus.stopwords.words('english')
from nltk.util import ngrams 

#sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

#spaCy imports 
import spacy
nlp = spacy.load('en', parse=True, tag=True, entity=True)

# plotting imports
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from wordcloud import WordCloud

#Gensim imports 
import gensim
from gensim.models.nmf import Nmf
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# Set up log to terminal for Gensim 
import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

#pyLDAvis import 
import pyLDAvis.gensim

#For progress bar to monitor status of things running 
from tqdm import tqdm

#Load contraction map python file 
from contractions import CONTRACTION_MAP

# Import Dataset

In [None]:
essay_df = pd.read_csv("essays_export.csv")

# Data Exploration

In [None]:
# Frequency table of the 500 most common lower-cased, whitespace-separated
# tokens across all essays (value_counts sorts by descending count).
top_words = (
    pd.Series(' '.join(essay_df['ESSAY']).lower().split())
    .value_counts()
    .head(500)
)

# Same counts with English stop words removed.
wordsFiltered = {
    word: count
    for word, count in top_words.items()
    if word not in stopword_list
}

#wordsFiltered
#top_words

In [None]:
# Uncleaned Data Top Tri-grams 

def get_top_ngram(corpus, n=None, top_k=10):
    """Return the most frequent n-grams in a collection of documents.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count n-grams in; tokenisation uses the
        ``CountVectorizer`` defaults.
    n : int, optional
        Size of the n-grams. ``None`` (the default) is treated as 1, because
        ``None`` is not a usable bound for ``ngram_range``.
    top_k : int, optional
        Number of n-grams to return (default 10).

    Returns
    -------
    list of (str, int)
        ``(ngram, total_count)`` pairs sorted by descending count.
    """
    if n is None:
        n = 1
    vec = CountVectorizer(ngram_range=(n, n))
    # fit_transform learns the vocabulary and vectorises in a single pass.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums are corpus-wide counts; flatten the 1 x V result to 1-D.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:top_k]

# Ten most frequent trigrams in the essays as loaded (before normalize_text runs).
top_tri_grams=get_top_ngram(essay_df['ESSAY'],n=3)
# Split the (trigram, count) pairs into parallel lists: x = trigrams, y = counts.
x,y=map(list,zip(*top_tri_grams))
sns.set_context("talk")
# Counts are passed as x and trigrams as y, which yields horizontal bars.
sns.barplot(x=y,y=x, palette="muted").set_title('Top Trigrams from Essay Data',fontsize=25)

# Text Pre-Processing

In [None]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Replace contractions in ``text`` with their expanded forms.

    Parameters
    ----------
    text : str
        Input text.
    contraction_mapping : dict of str -> str
        Maps a contraction (e.g. ``"don't"``) to its expansion.

    Returns
    -------
    str
        ``text`` with every mapped contraction expanded and all remaining
        apostrophes (``'``) removed. The first character of each match is kept
        as written, so capitalisation at the start of a contraction survives.
    """
    # Empty keys would make the alternation match the empty string everywhere.
    keys = [key for key in contraction_mapping.keys() if key]
    if not keys:
        return re.sub("'", "", text)

    # The pattern ignores case, so the fallback lookup must ignore case as well;
    # otherwise a match such as "i'm" against the key "I'm" finds no expansion.
    lowered_map = {key.lower(): value for key, value in contraction_mapping.items()}

    # Longest alternatives first so that e.g. "he'd've" is not pre-empted by its
    # prefix "he'd"; keys are escaped because they are literals, not patterns.
    keys.sort(key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in keys)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        # Exact-case entry wins; otherwise fall back to the case-folded table.
        expanded_contraction = contraction_mapping.get(match)
        if not expanded_contraction:
            expanded_contraction = lowered_map.get(match.lower())
        if not expanded_contraction:
            # No usable expansion: leave the text as it was.
            return match
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def lemmatize_text(text):
    """Lemmatise ``text`` with the module-level spaCy pipeline ``nlp``.

    Each token is replaced by its lemma, except tokens whose lemma is the
    placeholder ``'-PRON-'``, which keep their original text. The resulting
    tokens are joined with single spaces.
    """
    pieces = []
    for token in nlp(text):
        pieces.append(token.text if token.lemma_ == '-PRON-' else token.lemma_)
    return ' '.join(pieces)

def remove_stopwords(text, is_lower_case=False, stopwords=stopword_list):
    """Remove stop words from ``text``.

    Parameters
    ----------
    text : str
        Input text; it is split with the module-level ``tokenizer`` object
        (whatever is bound to that name when the function is called).
    is_lower_case : bool
        If True the tokens are compared as-is; otherwise each token is
        lower-cased for the comparison only (its original casing is kept).
    stopwords : iterable of str
        Words to drop.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    # Set membership is constant-time; scanning the list once per token is not.
    stopword_set = set(stopwords)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)

def normalize_text(text):
    """Run the full cleaning pipeline on one essay.

    Steps, in order: expand contractions, lower-case, turn line breaks and tabs
    into spaces, drop '¿', strip accents / non-ASCII characters, delete ASCII
    punctuation, remove stop words, lemmatise, collapse runs of spaces.

    Parameters
    ----------
    text : str
        Raw essay text.

    Returns
    -------
    str
        The normalised text.
    """
    text = expand_contractions(text)
    #lowercase
    text = text.lower()
    # Line breaks and tabs separate words, so each becomes a space; deleting
    # them outright would fuse the neighbouring words into one token.
    for separator in ('\n', '\r', '\t'):
        text = text.replace(separator, ' ')
    #remove unwanted characters
    text = text.replace('¿', '')
    #remove accented characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    #remove punctuation (single pass instead of one replace per character)
    text = text.translate(str.maketrans('', '', string.punctuation))
    #remove stop words
    text = remove_stopwords(text, is_lower_case=True)
    #lemmatize
    text = lemmatize_text(text)
    # remove extra whitespace
    text = re.sub(' +', ' ', text)
    return text

In [None]:
# NOTE(review): this overwrites the raw 'ESSAY' column in place, so the cell is
# not idempotent -- re-running it normalises already-normalised text. Later
# cells read the cleaned text from 'ESSAY'; switching to the commented-out
# 'CLEAN_ESSAY' variant below would require updating those cells as well.
essay_df['ESSAY'] = essay_df['ESSAY'].apply(normalize_text)
#essay_df['CLEAN_ESSAY'] = essay_df['ESSAY'].apply(normalize_text)

In [None]:
# Cleaned Data Top Tri-grams 

# Same trigram chart as before, now computed on the normalised 'ESSAY' column.
top_tri_grams=get_top_ngram(essay_df['ESSAY'],n=3)
# x = trigrams, y = counts (parallel lists).
x,y=map(list,zip(*top_tri_grams))
sns.set_context("talk")
# Counts are passed as x and trigrams as y, which yields horizontal bars.
sns.barplot(x=y,y=x).set_title('Top Trigrams from Essay Data',fontsize=25)

In [None]:
# Horizontal bar chart of the most frequent single words in the cleaned essays.
top_n_grams=get_top_ngram(essay_df['ESSAY'],n=1)
x,y=map(list,zip(*top_n_grams))
# Select the style before the figure and axes are created; it was previously
# applied afterwards (and twice), too late to influence the axes being drawn.
plt.style.use('tableau-colorblind10')
fig, ax = plt.subplots()
ax.barh(x, y, align='center')
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('Count')
ax.yaxis.set_tick_params(labelsize='large')
ax.set_title('Top Words in Essay Data', fontsize=20)
plt.tight_layout()
plt.savefig("Top_Essay_Words.png")
plt.show()

# Tokenization

In [None]:
docs = essay_df.ESSAY.tolist()

# Split the documents into lower-cased word tokens.
# A dedicated name is used on purpose: rebinding the module-level `tokenizer`
# here would silently change the tokenizer that remove_stopwords() looks up
# the next time normalize_text is run.
word_tokenizer = RegexpTokenizer(r'\w+')
docs = [word_tokenizer.tokenize(doc.lower()) for doc in docs]

# Remove numbers (but not words that contain numbers) and one-character tokens.
docs = [[token for token in doc if not token.isnumeric() and len(token) > 1]
        for doc in docs]

# Bi-grams

In [None]:
# Detect frequent two-word phrases and append them to each document.
from gensim.models import Phrases

# Only pairs that appear 20 times or more are eligible to become bigrams.
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # Tokens containing '_' are treated as bigrams. The list is fully built
    # before extend() runs, so the document is not mutated while being read.
    doc.extend([token for token in bigram[doc] if '_' in token])

# Initialize Gensim Dictionary and Filter Extremes (based on word frequency)

In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur in fewer than 5 documents, or in more than 50% of the documents.
dictionary.filter_extremes(no_below=5, no_above=0.50)

# Bag-of-words Vectorization 

In [None]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# Initial LDA Model (Gensim Standard Implementation)

In [None]:
# Train LDA model.

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
# Fixed seed so that Restart & Run All reproduces the same topics.
# NOTE(review): the hand-written topic labels further down were presumably
# assigned on an unseeded run -- re-check them against the seeded topics.
random_state = 42

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state
)

In [None]:
# Topics of the initial LDA model as reported by its own top_topics() on the
# corpus it was trained on.
top_topics = lda_model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# t[1] is read as the coherence score of each returned topic.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

# Rebinds the name `pprint` from the module (imported at the top of the
# notebook) to the function; later cells call pprint(...) directly.
from pprint import pprint
pprint(top_topics)

In [None]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Start the comparison table of the initial (untuned) models with the LDA row.
# The topic count comes from the variable used to train the model, not a literal.
lda_initial = {'Type': 'Initial LDA', 'Coherence Score': coherence_lda, 'Number of Topics': num_topics}
initial_models_df = pd.DataFrame(lda_initial, index=[0])
initial_models_df

In [None]:
def plot_lda_vis(lda_model, bow_corpus, dic):
    """Prepare a pyLDAvis visualisation for a gensim topic model.

    Enables pyLDAvis notebook rendering, then returns the object produced by
    ``pyLDAvis.gensim.prepare`` for the given model, bag-of-words corpus and
    dictionary.
    """
    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim.prepare(lda_model, bow_corpus, dic)

plot_lda_vis(lda_model, corpus, dictionary) 

In [None]:
%matplotlib inline

def wordcloud_topics_gensim(model, no_top_words=40, file_prefix=None):
    """Draw and save one word cloud per topic of ``model``.

    Parameters
    ----------
    model : object
        Topic model exposing ``num_topics`` and ``show_topic(topic, topn=...)``.
    no_top_words : int
        Number of top terms per topic used to build each cloud.
    file_prefix : str, optional
        Prefix of the saved PNG files (``<prefix>_topic<k>.png``). Defaults to
        ``<ClassName>_<num_topics>topics``.

    Each cloud is drawn on its own figure and written to the working directory.
    """
    if file_prefix is None:
        # The string form of a model object is not guaranteed to be a valid or
        # stable file name (it can embed characters such as '<' or a memory
        # address), so build the prefix from the class name and topic count.
        file_prefix = f'{type(model).__name__}_{model.num_topics}topics'
    for topic in range(model.num_topics):
        # term -> weight mapping for this topic
        size = dict(model.show_topic(topic, topn=no_top_words))
        wc = WordCloud(background_color="white", max_words=100, width=960, height=540)
        wc.generate_from_frequencies(size)
        plt.figure(figsize=(12,12))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")
        # if you don't want to save the word clouds, comment the next line
        plt.savefig(f'{file_prefix}_topic{topic}.png')
        
wordcloud_topics_gensim(lda_model)

In [None]:
# Top-20 (term, weight) pairs for every topic of the initial LDA model.
initial_lda_topics = [[(term, round(wt, 3))
               for term, wt in lda_model.show_topic(n, topn=20)]
                   for n in range(0, lda_model.num_topics)]

# None disables column truncation; the former value -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)
initial_lda_topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                              for topic in initial_lda_topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, lda_model.num_topics+1)]
                         )

# Hand-assigned labels, one per topic, in topic order.
initial_lda_topics_df['Label'] = ['Random', 'Community Health', 'Engineering / Biomedical', 'Disease / Illness', 'Humans / Culture', 'Camp/Childhood', 'Med Experiences', 'Academics', 'Random', 'Community Service']

# From here on the name refers to a pandas Styler (display only), not a DataFrame.
initial_lda_topics_df = (
    initial_lda_topics_df.style
    .set_properties(**{'text-align': 'left'})
    .set_properties(subset=['Terms per Topic'], **{'width': '500px'})
    .set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])])
)

initial_lda_topics_df

# Initial Non Negative Matrix Factorization Topic Model

In [None]:
#Non Negative Matrix Factorization Topic Modeling
# Same topic count and passes as the initial LDA model; seeded so that a fresh
# run of the notebook reproduces the same factorisation.
nmf_gensim = Nmf(corpus, num_topics=num_topics, id2word=id2word, passes=passes, random_state=42)

In [None]:
# Topics of the initial NMF model as reported by its own top_topics().
top_topics = nmf_gensim.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# num_topics is the global set for the initial models, which matches this model's 10 topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

# Re-import is harmless; `pprint` must name the function, not the module.
from pprint import pprint
pprint(top_topics)

In [None]:
cv = CoherenceModel(model=nmf_gensim, texts=docs, dictionary=dictionary, coherence='c_v')
cv_nnmf = cv.get_coherence()
print('\nCoherence Score: ', cv_nnmf)

In [None]:
# Add the NMF row to the comparison table of initial models.
nmf_initial = {'Type': 'Initial NMF', 'Coherence Score': cv_nnmf, 'Number of Topics': 10}
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
initial_models_df = pd.concat([initial_models_df, pd.DataFrame([nmf_initial])], ignore_index=True)
initial_models_df

In [None]:
wordcloud_topics_gensim(nmf_gensim)

In [None]:
# Top-20 (term, weight) pairs for every topic of the initial NMF model.
initial_nmf_topics = [[(term, round(wt, 3))
               for term, wt in nmf_gensim.show_topic(n, topn=20)]
                   for n in range(0, nmf_gensim.num_topics)]

# None disables column truncation; the former value -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)
initial_nmf_topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                              for topic in initial_nmf_topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, nmf_gensim.num_topics+1)]
                         )

# From here on the name refers to a pandas Styler (display only), not a DataFrame.
initial_nmf_topics_df = (
    initial_nmf_topics_df.style
    .set_properties(**{'text-align': 'left'})
    .set_properties(subset=['Terms per Topic'], **{'width': '500px'})
    .set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])])
)

initial_nmf_topics_df

# Initial MALLET-LDA Model 

In [None]:
# Relative path to the MALLET launcher script; it is resolved against the
# notebook's working directory.
mallet_path = 'mallet-2.0.8/bin/mallet' 
# Train a 10-topic MALLET LDA model on the same corpus and vocabulary.
# NOTE(review): gensim.models.wrappers appears to exist only in gensim 3.x --
# confirm the pinned gensim version before upgrading.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=id2word)


In [None]:
cv = CoherenceModel(model=ldamallet, texts=docs, dictionary=dictionary, coherence='c_v')
cv_mallet = cv.get_coherence()
print('\nCoherence Score: ', cv_mallet)

In [None]:
# Add the MALLET-LDA row to the comparison table of initial models.
mallet_initial = {'Type': 'Initial MALLET-LDA', 'Coherence Score': cv_mallet, 'Number of Topics': 10}
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
initial_models_df = pd.concat([initial_models_df, pd.DataFrame([mallet_initial])], ignore_index=True)
initial_models_df

In [None]:
wordcloud_topics_gensim(ldamallet)

In [None]:
# Convert the MALLET wrapper model into a gensim LDA model object, which is
# what the pyLDAvis call below (and top_topics() in the next cell) operate on.
mallet_lda_model=gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)

pyLDAvis.enable_notebook()
# sort_topics=False is passed so pyLDAvis does not reorder the topics.
vis = pyLDAvis.gensim.prepare(mallet_lda_model, corpus, dictionary, sort_topics=False)
vis

In [None]:
# Topics of the converted MALLET model as reported by its own top_topics().
top_topics = mallet_lda_model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# num_topics is the global set for the initial models, which matches this model's 10 topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

# Re-import is harmless; `pprint` must name the function, not the module.
from pprint import pprint
pprint(top_topics)

In [None]:
# Top-20 (term, weight) pairs for every topic of the initial MALLET-LDA model.
initial_mallet_topics = [[(term, round(wt, 3))
               for term, wt in mallet_lda_model.show_topic(n, topn=20)]
                   for n in range(0, mallet_lda_model.num_topics)]

# None disables column truncation; the former value -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)
initial_mallet_topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                              for topic in initial_mallet_topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, mallet_lda_model.num_topics+1)]
                         )

# From here on the name refers to a pandas Styler (display only), not a DataFrame.
initial_mallet_topics_df = (
    initial_mallet_topics_df.style
    .set_properties(**{'text-align': 'left'})
    .set_properties(subset=['Terms per Topic'], **{'width': '500px'})
    .set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])])
)

initial_mallet_topics_df

# Model Selection and Tuning for Number of Topics 

## Latent Dirichlet Allocation Model Selection 

In [None]:
from tqdm import tqdm

def lda_coherence_generator(corpus, id2word, chunksize=2000, texts=docs, alpha='auto', eta='auto', iterations=400, passes=20, eval_every=None, 
                                    start_topic_count=5, end_topic_count=15, step=1, random_state=None):
    """Train one gensim LDA model per topic count and score each with c_v coherence.

    Parameters
    ----------
    corpus : list
        Bag-of-words corpus to train on.
    id2word : mapping
        Id-to-token mapping passed to ``LdaModel``.
    chunksize, alpha, eta, iterations, passes, eval_every
        Forwarded unchanged to ``LdaModel``.
    texts : list of list of str
        Tokenised documents used for the coherence computation. The default is
        the ``docs`` object that exists when this function is defined.
    start_topic_count, end_topic_count, step : int
        Topic counts to try: ``range(start_topic_count, end_topic_count + 1, step)``.
    random_state : optional
        Forwarded to ``LdaModel``; ``None`` keeps the unseeded behaviour.

    Returns
    -------
    (list, list)
        The trained models and their coherence scores, in topic-count order.

    The coherence model uses the module-level ``dictionary``.
    """
    models = []
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count+1, step)):
        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         chunksize=chunksize,
                         alpha=alpha,
                         eta=eta,
                         iterations=iterations,
                         num_topics=topic_nums,
                         passes=passes,
                         eval_every=eval_every,
                         random_state=random_state)

        # Score against the caller-supplied texts rather than the global docs,
        # so the `texts` argument is actually honoured.
        coherence_model_lda = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_scores.append(coherence_model_lda.get_coherence())
        models.append(model)

    return models, coherence_scores

In [None]:
lda_models, coherence_scores_lda = lda_coherence_generator(corpus=corpus, id2word=id2word, start_topic_count=5, end_topic_count=30, step=1)


In [None]:
# One row per candidate topic count. range(5, 31) has to mirror the start/end
# topic counts passed to lda_coherence_generator so that the rows line up with
# coherence_scores_lda.
lda_coherence_df = pd.DataFrame({'Number of Topics': range(5, 31, 1),
                             'Coherence Score': np.round(coherence_scores_lda, 4)})
# sort_values keeps the original index labels, so a row's label is still its
# position in lda_models; later cells rely on that to fetch the matching model.
lda_coherence_df = lda_coherence_df.sort_values(by=['Coherence Score'], ascending=False).head(26)
# The captioned Styler is only displayed; lda_coherence_df itself stays a DataFrame.
lda_coherence_df.style.set_caption("LDA Models")

In [None]:
plt.style.use('fivethirtyeight')
%matplotlib inline

# Coherence score of each LDA model against its number of topics.
x_ax = range(5, 31, 1)
y_ax = coherence_scores_lda
plt.figure(figsize=(12, 6))
plt.plot(x_ax, y_ax, c='r')
# NOTE(review): fixed y-limits -- scores outside 0.25-0.35 will not be visible.
plt.ylim(.25, .35)
#plt.axhline(y=0.535, c='k', linestyle='--', linewidth=2)
# NOTE(review): this is set after plt.figure(...) was called, so it presumably
# only affects figures created later -- confirm the intended background.
plt.rcParams['figure.facecolor'] = 'white'
xl = plt.xlabel('Number of Topics')
yl = plt.ylabel('Coherence Score')
plt.title("LDA Coherence Scores as a Function of Number of Topics", fontweight="bold")
plt.savefig("LDA_Graph.png")

In [None]:
#best_lda_index = lda_coherence_df.index[lda_coherence_df['Coherence Score'].idxmax()]
print("The Best LDA model is:")

# Index label of the row with the highest coherence score. The labels are the
# original row positions of the coherence table, i.e. positions in lda_models.
best_lda_index_list = lda_coherence_df['Coherence Score'].nlargest(1).index.tolist()
# nlargest(1) yields exactly one label; take it directly instead of going
# through a string join and back to int.
best_lda_index = int(best_lda_index_list[0])

lda_coherence_df.loc[best_lda_index_list]

In [None]:
#Save best model details 
# Row of the coherence table for the selected model.
lda_model_best = lda_coherence_df.loc[best_lda_index]
# The same index value is used as a position in lda_models.
best_lda = lda_models[best_lda_index]
# 'Name of Model' stores a fixed string label, not the model object.
lda_best = {'Type': 'LDA', 'Coherence Score': lda_model_best['Coherence Score'], 'Number of Topics': lda_model_best['Number of Topics'], 'Name of Model': 'lda_model_best'}
# First row of the table that collects the best model of each method.
all_best_models_df = pd.DataFrame(lda_best, index=[0])

# Show best LDA model's topics 
def show_me_topics(model, corpus, num_topics=None):
    """Print a model's average topic coherence and pretty-print its topics.

    Parameters
    ----------
    model : object
        Topic model exposing ``top_topics(corpus)``; each returned item is
        indexed as ``item[1]`` to obtain that topic's coherence score.
    corpus : list
        Corpus passed to ``model.top_topics``.
    num_topics : number, optional
        Divisor for the average. When omitted, the number of topics returned
        by the model is used, so the average cannot be computed against a
        topic count that belongs to a different model.

    Returns
    -------
    None
        Output goes to stdout.
    """
    top_topics = model.top_topics(corpus)
    if num_topics is None:
        num_topics = len(top_topics)
    avg_topic_coherence = sum(t[1] for t in top_topics) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)
    pprint(top_topics)

#Show best model's topics 
show_me_topics(best_lda, corpus, num_topics=lda_model_best['Number of Topics'])

In [None]:
plot_lda_vis(best_lda, corpus, dictionary) 

In [None]:
# Top-20 (term, weight) pairs for every topic of the best LDA model.
best_lda_topics = [[(term, round(wt, 3))
               for term, wt in best_lda.show_topic(n, topn=20)]
                   for n in range(0, best_lda.num_topics)]

# None disables column truncation; the former value -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)
best_lda_topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                              for topic in best_lda_topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, best_lda.num_topics+1)]
                         )
# From here on the name refers to a pandas Styler (display only), not a DataFrame.
best_lda_topics_df = (
    best_lda_topics_df.style
    .set_properties(**{'text-align': 'left'})
    .set_properties(subset=['Terms per Topic'], **{'width': '500px'})
    .set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])])
)
best_lda_topics_df

In [None]:
# Pickle top LDA Model 

filename = 'best_LDA_model_april_27_2021.sav'
#pickle.dump(best_lda, open(filename, 'wb'))

## Non-Negative Matrix Factorization Model Selection 

In [None]:
def nnmf_coherence_generator(corpus, id2word, chunksize=2000, texts=docs, alpha='auto', eta='auto', iterations=400, passes=20, eval_every=None, 
                                    start_topic_count=5, end_topic_count=15, step=1, random_state=None):
    """Train one gensim NMF model per topic count and score each with c_v coherence.

    Parameters
    ----------
    corpus : list
        Bag-of-words corpus to train on.
    id2word : mapping
        Id-to-token mapping passed to ``Nmf``.
    passes : int
        Forwarded to ``Nmf``.
    texts : list of list of str
        Tokenised documents used for the coherence computation. The default is
        the ``docs`` object that exists when this function is defined.
    start_topic_count, end_topic_count, step : int
        Topic counts to try: ``range(start_topic_count, end_topic_count + 1, step)``.
    random_state : optional
        Forwarded to ``Nmf``; ``None`` keeps the unseeded behaviour.
    chunksize, alpha, eta, iterations, eval_every
        Accepted so the signature parallels ``lda_coherence_generator``; they
        are not passed to ``Nmf``.

    Returns
    -------
    (list, list)
        The trained models and their coherence scores, in topic-count order.

    The coherence model uses the module-level ``dictionary``.
    """
    models = []
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count+1, step)):
        model = Nmf(corpus, num_topics=topic_nums, id2word=id2word, passes=passes,
                    random_state=random_state)
        # Score against the caller-supplied texts rather than the global docs.
        cv = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_scores.append(cv.get_coherence())
        models.append(model)

    return models, coherence_scores

In [None]:
nmf_models, coherence_scores_nmf = nnmf_coherence_generator(corpus=corpus, id2word=id2word, start_topic_count=5, end_topic_count=30, step=1)

In [None]:
# One row per candidate topic count; range(5, 31) has to mirror the start/end
# topic counts passed to nnmf_coherence_generator.
nmf_coherence_df = pd.DataFrame({'Number of Topics': range(5, 31, 1),
                             'Coherence Score': np.round(coherence_scores_nmf, 4)})
# The sorted copy is displayed only; nmf_coherence_df keeps its original row
# order, so each index label equals the model's position in nmf_models.
nmf_coherence_df.sort_values(by=['Coherence Score'], ascending=False).head(26)

In [None]:
plt.style.use('fivethirtyeight')
%matplotlib inline

# Coherence score of each NMF model against its number of topics.
x_ax = range(5, 31, 1)
y_ax = coherence_scores_nmf
plt.figure(figsize=(12, 6))
plt.plot(x_ax, y_ax, c='r')
# NOTE(review): fixed y-limits -- scores outside 0.3-0.4 will not be visible.
plt.ylim(.3, .4)
#plt.axhline(y=0.4, c='k', linestyle='--', linewidth=2)
# NOTE(review): this is set after plt.figure(...) was called, so it presumably
# only affects figures created later -- confirm the intended background.
plt.rcParams['figure.facecolor'] = 'white'
xl = plt.xlabel('Number of Topics')
yl = plt.ylabel('Coherence Score')
plt.title("NMF Coherence Scores as a Function of Number of Topics", fontweight="bold")
plt.savefig("NMF_Graph.png")

In [None]:
#best_nmf_index = nmf_coherence_df.index[nmf_coherence_df['Coherence Score'].idxmax()]
#nmf_coherence_df.loc[best_nmf_index]

print("The Best NMF model is:")

# Index label of the row with the highest coherence score; the table has its
# default index, so the label is also the model's position in nmf_models.
best_nmf_index_list = nmf_coherence_df['Coherence Score'].nlargest(1).index.tolist()
# nlargest(1) yields exactly one label; take it directly instead of going
# through a string join and back to int.
best_nmf_index = int(best_nmf_index_list[0])

nmf_coherence_df.loc[best_nmf_index_list]

In [None]:
#Save best model details
nmf_model_best = nmf_coherence_df.loc[best_nmf_index]
best_nmf = nmf_models[best_nmf_index]
nmf_best = {'Type': 'NMF', 'Coherence Score': nmf_model_best['Coherence Score'], 'Number of Topics': nmf_model_best['Number of Topics'] , 'Name of Model': 'nmf_model_best'}
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
all_best_models_df = pd.concat([all_best_models_df, pd.DataFrame([nmf_best])], ignore_index=True)

#Show best model's topics
# Average over the selected model's own topic count; the global num_topics
# belongs to the initial 10-topic models and would skew the average.
show_me_topics(best_nmf, corpus, nmf_model_best['Number of Topics'])

In [None]:
# Top-20 (term, weight) pairs for every topic of the best NMF model.
best_nmf_topics = [[(term, round(wt, 3))
               for term, wt in best_nmf.show_topic(n, topn=20)]
                   for n in range(0, best_nmf.num_topics)]

# None disables column truncation; the former value -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)
best_nmf_topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                              for topic in best_nmf_topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, best_nmf.num_topics+1)]
                         )
# From here on the name refers to a pandas Styler (display only), not a DataFrame.
best_nmf_topics_df = (
    best_nmf_topics_df.style
    .set_properties(**{'text-align': 'left'})
    .set_properties(subset=['Terms per Topic'], **{'width': '500px'})
    .set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])])
)
best_nmf_topics_df

In [None]:
all_best_models_df

In [None]:
# Pickle top NMF Model 

filename = 'best_NMF_model_april_27_2021.sav'
#pickle.dump(best_nmf, open(filename, 'wb'))

## LDA Mallet Topic Model Selection 

In [None]:
def mallet_coherence_generator(corpus, id2word, chunksize=2000, texts=docs, alpha='auto', eta='auto', 
                               iterations=400, passes=20, eval_every=None, mallet_path='mallet-2.0.8/bin/mallet',
                               start_topic_count=5, end_topic_count=15, step=1):
    """Train one MALLET LDA model per topic count and score each with c_v coherence.

    Parameters
    ----------
    corpus : list
        Bag-of-words corpus to train on.
    id2word : mapping
        Id-to-token mapping passed to the MALLET wrapper.
    texts : list of list of str
        Tokenised documents used for the coherence computation. The default is
        the ``docs`` object that exists when this function is defined.
    mallet_path : str
        Path to the MALLET launcher.
    start_topic_count, end_topic_count, step : int
        Topic counts to try: ``range(start_topic_count, end_topic_count + 1, step)``.
    chunksize, alpha, eta, iterations, passes, eval_every
        Accepted so the signature parallels ``lda_coherence_generator``; they
        are not passed to the MALLET wrapper.

    Returns
    -------
    (list, list)
        The trained wrapper models and their coherence scores, in topic-count order.

    The coherence model uses the module-level ``dictionary``.
    """
    models = []
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count+1, step)):
        ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=topic_nums, id2word=id2word)
        # Score against the caller-supplied texts rather than the global docs.
        cv = CoherenceModel(model=ldamallet, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_scores.append(cv.get_coherence())
        models.append(ldamallet)

    return models, coherence_scores

In [None]:
mallet_models, coherence_scores_mallet = mallet_coherence_generator(corpus=corpus, id2word=id2word, start_topic_count=5, end_topic_count=30, step=1)

In [None]:
# One row per candidate topic count; range(5, 31) has to mirror the start/end
# topic counts passed to mallet_coherence_generator.
mallet_coherence_df = pd.DataFrame({'Number of Topics': range(5, 31, 1),
                             'Coherence Score': np.round(coherence_scores_mallet, 4)})
# The sorted copy is displayed only; mallet_coherence_df keeps its original row
# order, so each index label equals the model's position in mallet_models.
mallet_coherence_df.sort_values(by=['Coherence Score'], ascending=False).head(26)

In [None]:
plt.style.use('fivethirtyeight')
%matplotlib inline

# Coherence score of each MALLET-LDA model against its number of topics.
x_ax = range(5, 31, 1)
y_ax = coherence_scores_mallet
plt.figure(figsize=(12, 6))
plt.plot(x_ax, y_ax, c='r')
# NOTE(review): fixed y-limits -- scores outside 0.35-0.45 will not be visible.
plt.ylim(.35, .45)
# NOTE(review): this is set after plt.figure(...) was called, so it presumably
# only affects figures created later -- confirm the intended background.
plt.rcParams['figure.facecolor'] = 'white'
xl = plt.xlabel('Number of Topics')
yl = plt.ylabel('Coherence Score')
plt.title("MALLET-LDA Coherence Scores as a Function of Number of Topics", fontweight="bold")
# NOTE(review): the file name contains a space before "_Graph" -- confirm that is intended.
plt.savefig("MALLET_LDA _Graph.png")

In [None]:
# Find model with highest coherence score 
#best_mallet_index = mallet_coherence_df.index[mallet_coherence_df['Coherence Score'].idxmax()]
#mallet_coherence_df.loc[best_mallet_index]

print("The Best MALLET-LDA model is:")
# Index label of the row with the highest coherence score; the table has its
# default index, so the label is also the model's position in mallet_models.
best_mallet_index_list = mallet_coherence_df['Coherence Score'].nlargest(1).index.tolist()
# nlargest(1) yields exactly one label; take it directly instead of going
# through a string join and back to int.
best_mallet_index = int(best_mallet_index_list[0])

mallet_coherence_df.loc[best_mallet_index_list]

In [None]:
#Save best model details
mallet_model_best = mallet_coherence_df.loc[best_mallet_index]
best_mallet = mallet_models[best_mallet_index]
mallet_best = {'Type': 'MALLET-LDA', 'Coherence Score': mallet_model_best['Coherence Score'], 'Number of Topics': mallet_model_best['Number of Topics'] , 'Name of Model': 'mallet_model_best'}
# DataFrame.append was removed in pandas 2.0; pd.concat works on old and new versions.
all_best_models_df = pd.concat([all_best_models_df, pd.DataFrame([mallet_best])], ignore_index=True)

In [None]:
#convert to Gensim LDA model 
# show_me_topics calls top_topics() on the converted model, not on the wrapper.
best_mallet_lda_model=gensim.models.wrappers.ldamallet.malletmodel2ldamodel(best_mallet)

#Show best model's topics 
# The divisor is the topic count recorded for the selected model in the coherence table.
show_me_topics(best_mallet_lda_model, corpus, mallet_model_best['Number of Topics'] )

In [None]:
# Top-20 (term, weight) pairs for every topic of the best MALLET-LDA model.
best_mallet_topics = [[(term, round(wt, 3))
               for term, wt in best_mallet_lda_model.show_topic(n, topn=20)]
                   for n in range(0, best_mallet_lda_model.num_topics)]

# None disables column truncation; the former value -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)
best_mallet_topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                              for topic in best_mallet_topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, best_mallet_lda_model.num_topics+1)]
                         )
# From here on the name refers to a pandas Styler (display only), not a DataFrame.
best_mallet_topics_df = (
    best_mallet_topics_df.style
    .set_properties(**{'text-align': 'left'})
    .set_properties(subset=['Terms per Topic'], **{'width': '500px'})
    .set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])])
)
best_mallet_topics_df

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(best_mallet_lda_model, corpus, dictionary, sort_topics=False)
vis

In [None]:
# Pickle top MALLET Model 
# NOTE(review): index 21 is hard-coded and replaces the best_mallet that was
# selected programmatically from the coherence table; presumably it comes from
# one particular run -- confirm it still matches before relying on it.
best_mallet = mallet_models[21]
filename = 'best_MALLET_model_april_27_2021.sav'
# NOTE(review): if re-enabled, this dumps best_mallet_lda_model (converted
# earlier), not the best_mallet assigned just above.
#pickle.dump(best_mallet_lda_model, open(filename, 'wb'))

# NOTE(review): hard-coded index; a later cell reassigns mallet_choice_2_model
# to mallet_models[23].
mallet_choice_2_model = mallet_models[8]
file2name = 'MALLET_second_april_27_2021.sav'
# NOTE(review): if re-enabled, this would also dump best_mallet_lda_model
# rather than mallet_choice_2_model.
#pickle.dump(best_mallet_lda_model, open(file2name, 'wb'))

# Best Models Comparison and Final Model Selection 

In [None]:
plt.style.use('tableau-colorblind10')
%matplotlib inline

# Coherence curves of all three methods on one set of axes.
x_ax = range(5, 31, 1)
y_1 = coherence_scores_mallet
y_2 = coherence_scores_nmf
y_3 = coherence_scores_lda

# plot lines
plt.figure(figsize=(12, 6))
plt.plot(x_ax, y_1, label = "MALLET-LDA")
plt.plot(x_ax, y_2, label = "NMF")
plt.plot(x_ax, y_3, label = "LDA")
# NOTE(review): fixed y-limits -- scores outside 0.25-0.45 will not be visible.
plt.ylim(.25, .45)
plt.rcParams['figure.facecolor'] = 'white'
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# Single legend call; a bare plt.legend() before it was redundant because this
# call replaces the legend anyway.
plt.legend(loc=2, prop={'size': 15})
xl = plt.xlabel('Number of Topics', fontsize=15)
yl = plt.ylabel('Coherence Score', fontsize=15)
plt.title("Coherence Scores as a Function of Number of Topics", fontweight="bold", fontsize=20)
plt.savefig("All_MODELS_Graph2.png")

In [None]:
print("The top models for each method")
all_best_models_df

In [None]:
# idxmax already returns the index label of the best row, so it can be handed
# to .loc directly; looking it up again in .index only works by coincidence
# when labels equal positions.
best_model_index = all_best_models_df['Coherence Score'].idxmax()
print("The Best Model is:")
all_best_models_df.loc[best_model_index]

## Comparing Other Top MALLET Models 

### MALLET-LDA Model with Second Best Coherence Score

In [None]:
# NOTE(review): index 23 is hard-coded (presumably from one particular run of
# the coherence table) -- confirm it is still the second-best model.
mallet_choice_2_model = mallet_models[23]
#convert to Gensim LDA model
mallet_second_best_model=gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_choice_2_model)

#Show this model's topics
# Use the model's own topic count instead of a separate literal that has to be
# kept in sync with the index above.
show_me_topics(mallet_second_best_model, corpus, mallet_second_best_model.num_topics)

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(mallet_second_best_model, corpus, dictionary, sort_topics=False)
vis

### MALLET-LDA Model with Third Best Coherence Score

In [None]:
# NOTE(review): index 22 is hard-coded (presumably from one particular run of
# the coherence table) -- confirm it is still the third-best model.
mallet_choice_3_model = mallet_models[22]
#convert to Gensim LDA model
mallet_3_model=gensim.models.wrappers.ldamallet.malletmodel2ldamodel(mallet_choice_3_model)

#Show this model's topics
# Use the model's own topic count instead of a separate literal that has to be
# kept in sync with the index above.
show_me_topics(mallet_3_model, corpus, mallet_3_model.num_topics)

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(mallet_3_model, corpus, dictionary, sort_topics=False)
vis

# Final Model Summary 

In [None]:
# Top-20 (term, weight) pairs for every topic of the final MALLET-LDA model.
mallet_2_topics = [[(term, round(wt, 3))
               for term, wt in mallet_second_best_model.show_topic(n, topn=20)]
                   for n in range(0, mallet_second_best_model.num_topics)]

# None disables column truncation; the former value -1 is deprecated in pandas.
pd.set_option('display.max_colwidth', None)
mallet_2_topics_df = pd.DataFrame([', '.join([term for term, wt in topic])
                              for topic in mallet_2_topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, mallet_second_best_model.num_topics+1)]
                         )
# From here on the name refers to a pandas Styler (display only), not a DataFrame.
mallet_2_topics_df = (
    mallet_2_topics_df.style
    .set_properties(**{'text-align': 'left'})
    .set_properties(subset=['Terms per Topic'], **{'width': '500px'})
    .set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])])
)
mallet_2_topics_df

In [None]:
wordcloud_topics_gensim(mallet_second_best_model)

# Final Model Feature Extraction

In [None]:
# Build one dense topic-probability vector per essay: position k holds the
# probability of topic k under the second-best MALLET model.
num_topics = mallet_second_best_model.num_topics  # replaces the hard-coded 28
train_vecs = []
for i in range(len(essay_df.ESSAY)):
    top_topics = mallet_second_best_model.get_document_topics(corpus[i], minimum_probability=0.0)
    # Look probabilities up by topic id instead of by list position: gensim
    # applies a small positive floor even when minimum_probability=0.0, so a
    # topic can be missing from the returned list. Positional indexing would
    # then shift values into the wrong columns or raise IndexError.
    topic_probs = dict(top_topics)
    topic_vec = [topic_probs.get(topic_id, 0.0) for topic_id in range(num_topics)]
    #topic_vec.extend([essay_df.iloc[i].MED_AAMC_ID]) # removing from capstone deliverable for data privacy reasons
    #topic_vec.extend([essay_df.iloc[i].ESSAY]) # removing from capstone deliverable for data privacy reasons
    train_vecs.append(topic_vec)

In [None]:
# Column labels for the topic-probability features, listed in topic order
# (first label -> first column of train_vecs, and so on).
topic_labels = [
    'Humanities',
    'Doctor Relationship',
    'Summer Experience',
    'Decision Process',
    'Service',
    'Death / Suffering ',
    'Nursing Home Experience',
    'Healthcare Access & Equity',
    'Public Health',
    'Practice Medicine',
    'Observe / Shadow',
    'Anatomy Interest',
    'Athletics',
    'Physician Skills',
    'Impact',
    'Children / Childhood',
    'Moments',
    'Academics',
    'Culture / Travel',
    'Emergency Medicine',
    'Disease / Treatment',
    'Family Illness',
    'Rural Community',
    'Problem Solving',
    'Become Doctor',
    'Goals / Tenacity',
    'Lab Experience',
    'Biomedical Engineering',
]
# The 'AAMC_ID' and 'ESSAY' columns were removed from the capstone deliverable
# for data privacy reasons, so only the topic columns are labelled here.

# One row per essay, one column per topic.
feature_df = pd.DataFrame(train_vecs)
feature_df.columns = topic_labels

feature_df.head()

In [None]:
# Human-readable label for each topic, in topic order (label 1 -> topic id 0).
topic_labels = [
    'Humanities',
    'Doctor Relationship',
    'Summer Experience',
    'Decision Process',
    'Service',
    'Death / Suffering ',
    'Nursing Home Experience',
    'Healthcare Access & Equity',
    'Public Health',
    'Practice Medicine',
    'Observe / Shadow',
    'Anatomy Interest',
    'Athletics',
    'Physician Skills',
    'Impact',
    'Children / Childhood',
    'Moments',
    'Academics',
    'Culture / Travel',
    'Emergency Medicine',
    'Disease / Treatment',
    'Family Illness',
    'Rural Community',
    'Problem Solving',
    'Become Doctor',
    'Goals / Tenacity',
    'Lab Experience',
    'Biomedical Engineering',
]

# Topic distribution of every document under the second-best MALLET model.
tm_results = mallet_second_best_model[corpus]
# Keep only the highest-probability (topic_id, probability) pair per document.
# max() returns the first maximal pair, the same one the stable sort put first.
corpus_topics = [max(topics, key=lambda record: record[1])
                 for topics in tm_results]

# Comma-separated top-20 terms per topic id, read from the same model that
# produced the assignments. The previous lookup went through `topics_df`,
# which this section never builds, so it depended on whatever table an earlier
# cell happened to leave behind.
topic_terms = {
    topic_id: ', '.join(
        term for term, _ in mallet_second_best_model.show_topic(topic_id, topn=20))
    for topic_id in range(mallet_second_best_model.num_topics)
}

feature_df['Dominant Topic'] = [topic_id + 1 for topic_id, _ in corpus_topics]  # 1-based
feature_df['Contribution %'] = [round(prob * 100, 2) for _, prob in corpus_topics]
feature_df['Topic Desc'] = [topic_terms[topic_id] for topic_id, _ in corpus_topics]

# Map the 1-based dominant topic number to its label. This replaces a 28-entry
# np.select ladder whose implicit integer default does not mix with string
# choices on newer NumPy releases.
label_by_topic = dict(enumerate(topic_labels, start=1))
feature_df['Dominant Topic Label'] = feature_df['Dominant Topic'].map(label_by_topic)

#Hiding Output for data privacy Reasons 
#feature_df.head()

In [None]:
pd.set_option('display.max_colwidth', 200)

# Human-readable label for each topic, in topic order (label 1 -> topic 1).
topic_labels = [
    'Humanities',
    'Doctor Relationship',
    'Summer Experience',
    'Decision Process',
    'Service',
    'Death / Suffering ',
    'Nursing Home Experience',
    'Healthcare Access & Equity',
    'Public Health',
    'Practice Medicine',
    'Observe / Shadow',
    'Anatomy Interest',
    'Athletics',
    'Physician Skills',
    'Impact',
    'Children / Childhood',
    'Moments',
    'Academics',
    'Culture / Travel',
    'Emergency Medicine',
    'Disease / Treatment',
    'Family Illness',
    'Rural Community',
    'Problem Solving',
    'Become Doctor',
    'Goals / Tenacity',
    'Lab Experience',
    'Biomedical Engineering',
]

# Number of documents per dominant topic.
# NOTE(review): corpus_topic_df is not created in this section -- confirm it
# holds the assignments of the second-best MALLET model and not an earlier one.
# The nested-dict renaming form of .agg() that was used here was removed in
# pandas 1.0 (SpecificationError), so the two columns are built explicitly.
doc_counts = corpus_topic_df.groupby('Dominant Topic').size()
topic_stats_df = pd.DataFrame({'Doc Count': doc_counts,
                               '% Total Docs': doc_counts}).reset_index()
# Turn the raw count into a percentage of all essays, rounded to two decimals.
topic_stats_df['% Total Docs'] = topic_stats_df['% Total Docs'].apply(
    lambda doc_count: round((doc_count * 100) / len(essay_df.ESSAY), 2))

# Label lookup by 1-based topic number. Topic numbers outside 1..28 now become
# NaN (the old np.select ladder fell back to its integer default).
label_by_topic = dict(enumerate(topic_labels, start=1))
topic_stats_df['Dominant Topic Label'] = topic_stats_df['Dominant Topic'].map(label_by_topic)

# Describe each row by its own topic number, not by its row position: a topic
# that is dominant for no document has no row, and positional lookup would then
# attach the wrong terms to every following row. Terms are the model's top 20.
# NOTE(review): this replaces a lookup in `topics_df`, which this section never builds.
terms_by_topic = {
    topic_id + 1: ', '.join(
        term for term, _ in mallet_second_best_model.show_topic(topic_id, topn=20))
    for topic_id in range(mallet_second_best_model.num_topics)
}
topic_stats_df['Topic Desc'] = topic_stats_df['Dominant Topic'].map(terms_by_topic)

#Hiding Output for data privacy Reasons 
#topic_stats_df

## Reviewing Representative Documents 

In [None]:
#Hiding Output for Data Privacy Reasons 

def _top_contribution_row(topic_set):
    """Return the row with the largest 'Contribution %' within one topic's group."""
    ranked = topic_set.sort_values(by=['Contribution %'], ascending=False)
    return ranked.iloc[0]

# One representative row per dominant topic: the one with the highest contribution.
new_df = corpus_topic_df.groupby('Dominant Topic').apply(_top_contribution_row)

In [None]:
#Hiding Output for Data Privacy Reasons 
#new_df.to_csv('dominant_essay_topic.csv')