In [1]:
import pandas as pd
import numpy as np
import gensim
from TopicModeling_SupportFunctions import get_texts_and_corpus,lda_gridsearch,get_top_distinct_words_per_topic

Load Review Text:

In [2]:
NYTReviews = pd.read_csv("data/NYTData_wReviewText.csv")
NYTReviews.drop(["Unnamed: 0"],axis=1,inplace=True)
review_text=NYTReviews[["review_text"]]

Reference for use of gensim library:
https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

LDA Hyperparameter Tuning For Highest Coherence:

In [None]:
# Takes a long time to run (5hrs) on last try on my machine:
grid_search_params={
    'validation_set_corpus_pct':[0.75,1],
    'topics_range':[8,10,12,14],
    'alpha':["symmetric","asymmetric"],
    'eta':["symmetric","auto",None],
    'minimum_probability':[0.001,0.01,0.1],
    'bigram_min_count':[3,5,7]
    }

model_results=lda_gridsearch(review_text,grid_search_params)
model_results.to_csv("lda_model_outputs/lda_tuning_results_01.csv",index=False)

In [3]:
model_results=pd.read_csv("lda_model_outputs/lda_tuning_results_01.csv")
model_results.head()

Unnamed: 0,validation_set_corpus_pct,topics_range,alpha,eta,minimum_probability,bigram_min_count,coherence
0,0.75,8,symmetric,symmetric,0.001,3,0.240787
1,0.75,8,symmetric,symmetric,0.01,3,0.241541
2,0.75,8,symmetric,symmetric,0.1,3,0.241101
3,0.75,8,symmetric,auto,0.001,3,0.239596
4,0.75,8,symmetric,auto,0.01,3,0.239209


Build LDA Model with Parameters That Produced the Highest Coherence:

In [4]:
# Get Best Parameters from Grid Search:
best_params=model_results[model_results["coherence"]==model_results["coherence"].max()]
best_params.head()

Unnamed: 0,validation_set_corpus_pct,topics_range,alpha,eta,minimum_probability,bigram_min_count,coherence
353,0.75,14,asymmetric,symmetric,0.1,7,0.267368


In [5]:
# Build LDA model using the best results:
corpus,id2word,texts = get_texts_and_corpus(review_text,bigram_min_count=best_params["topics_range"].values[0])

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        num_topics=best_params["topics_range"].values[0], 
                                        random_state=42,
                                        chunksize=100,
                                        passes=10,
                                        alpha=best_params["alpha"].values[0],
                                        eta=best_params["eta"].values[0],
                                        minimum_probability=best_params["minimum_probability"].values[0])

Get Top Distinct Words within Each Topic

In [6]:
# Get Top Distinct Topics b/w selected topics:
top_distinct_words_per_topic=get_top_distinct_words_per_topic(lda_model,num_words_to_show=10)

In [8]:
top_distinct_words_per_topic_df = pd.DataFrame(top_distinct_words_per_topic)
top_distinct_words_per_topic_df = top_distinct_words_per_topic_df.transpose()
top_distinct_words_per_topic_df.columns = [f'Topic   #{x}' for x in np.arange(1,len(top_distinct_words_per_topic)+1)]
top_distinct_words_per_topic_df

Unnamed: 0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7,Topic #8,Topic #9,Topic #10,Topic #11,Topic #12,Topic #13,Topic #14
0,danger,grace,claire,ray,exist,disney,wake,luce,lionel,joy,simone,sean,salvador,shark
1,refugee,russian,creature,third,landscape,franchise,network,tesla,shult,classic,garrone,ernesto,almodovar,godzilla
2,complex,energy,dragon,emily,production,exactly,mcbride,nassar,tyler,roger,ride,brittany,glory,spark
3,live_action,dolemite,unusual,affect,moondog,toy,display,helen,hare,truth,specie,revue,diane,wallace
4,national,swift,soundtrack,gay,lesson,sequel,obscure,doctor,killing,hotel,drop,orna,beanpole,sweet
5,jump,troll,grim,convey,fine,marvel,purpose,slim,tower,spectacle,prince,thunder,kitchen,entry
6,information,abandon,storm,rap,survive,actually,fern,athlete,abigail,real_life,brown,bannon,eastwood,reach
7,narrator,singe,penguin,guide,lily,stake,slow,lunch,district,zombie,cam,leroy,lawyer,oscar_nominate
8,campaign,album,ice,truck,worry,sure,split,macgowan,desperation,collective,horse,jodi,finch,palestinian
9,policy,industry,bernadine,extend,connect,gloria,routine,dominic,eiffel,eve,domino,suit,naple,frank


In [None]:
top_distinct_words_per_topic_df.to_csv("lda_model_outputs/top10_words_per_LDA_topic.csv",index=False)

Get Top Topics For Each Review:

In [36]:
all_topics = lda_model.get_document_topics(lda_model[corpus], minimum_probability=0.0)
all_topics_csr = gensim.matutils.corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()
all_topics_df = pd.DataFrame(all_topics_numpy)
all_topics_df.columns = [f'Topic   #{x}' for x in np.arange(1,len(top_distinct_words_per_topic)+1)]

In [37]:
all_topics_df.sample(5)

Unnamed: 0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7,Topic #8,Topic #9,Topic #10,Topic #11,Topic #12,Topic #13,Topic #14
562,0.081347,0.063319,0.55068,0.044518,0.038765,0.03433,0.030806,0.027938,0.025559,0.023553,0.021839,0.020357,0.019064,0.017925
679,0.081795,0.063827,0.052709,0.044891,0.039092,0.529063,0.031066,0.028174,0.025775,0.023752,0.022023,0.020529,0.019225,0.018077
312,0.081019,0.56197,0.052306,0.044527,0.038772,0.034336,0.030811,0.027943,0.025563,0.023557,0.021842,0.020361,0.019067,0.017928
1550,0.288821,0.063872,0.052502,0.04468,0.038902,0.323527,0.030914,0.028036,0.025648,0.023635,0.021915,0.020429,0.019131,0.017988
622,0.579746,0.063455,0.052278,0.044504,0.038749,0.034314,0.030792,0.027925,0.025547,0.023542,0.021829,0.020348,0.019055,0.017917


In [41]:
# Join NYT Reviews with Associated Topic Relatedness Values
NYTReviews.join(all_topics_df,how="left").to_csv("lda_model_outputs/NYT_Reviews_w_TopicRelatedness.csv",index=False)

In [12]:
# Assign Topic Based On Topic with Highest Probability for Each Review:
all_topics_df['Topic_Assignment'] = all_topics_df[list(all_topics_df.columns[:])].idxmax(axis=1)
NYTReviews.join(all_topics_df[['Topic_Assignment']],how="left").to_csv("lda_model_outputs/NYT_Reviews_w_TopicAssignment.csv",index=False)