In [34]:
import pandas as pd
import numpy as np
import gensim
from TopicModeling_SupportFunctions import get_texts_and_corpus,lda_gridsearch,get_top_distinct_words_per_topic

Load Review Text:

In [35]:
NYTReviews = pd.read_csv("data/NYTData_wReviewText.csv",parse_dates=["publication_date","opening_date","date_updated"])
NYTReviews.drop(["Unnamed: 0"],axis=1,inplace=True)
NYTReviews=NYTReviews[NYTReviews["publication_date"]>="2020-01-01"] # Only Use 2020/2021 NYT Critic Choice Data
review_text=NYTReviews[["review_text"]]

Reference for use of gensim library:
https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

LDA Hyperparameter Tuning For Highest Coherence:

In [36]:
# Takes a long time to run (5hrs) on last try on my machine:
grid_search_params={
    'validation_set_corpus_pct':[0.75,1],
    'topics_range':[8,10,12,14],
    'alpha':["symmetric","asymmetric"],
    'eta':["symmetric","auto",None],
    'minimum_probability':[0.001,0.01,0.1],
    'bigram_min_count':[3,5,7]
    }

model_results=lda_gridsearch(review_text,grid_search_params)
model_results.to_csv("lda_model_outputs/lda_tuning_results_02.csv",index=False)

100%|██████████| 432/432 [2:50:20<00:00, 23.66s/it]  


In [37]:
model_results=pd.read_csv("lda_model_outputs/lda_tuning_results_02.csv")
model_results.head()

Unnamed: 0,validation_set_corpus_pct,topics_range,alpha,eta,minimum_probability,bigram_min_count,coherence
0,0.75,8,symmetric,symmetric,0.001,3,0.242466
1,0.75,8,symmetric,symmetric,0.01,3,0.242102
2,0.75,8,symmetric,symmetric,0.1,3,0.242012
3,0.75,8,symmetric,auto,0.001,3,0.24264
4,0.75,8,symmetric,auto,0.01,3,0.242308


Build LDA Model with Parameters That Produced the Highest Coherence:

In [38]:
# Get Best Parameters from Grid Search:
best_params=model_results[model_results["coherence"]==model_results["coherence"].max()]
best_params.head()

Unnamed: 0,validation_set_corpus_pct,topics_range,alpha,eta,minimum_probability,bigram_min_count,coherence
246,1.0,10,asymmetric,auto,0.001,5,0.266314


In [39]:
# Build LDA model using the best results:
corpus,id2word,texts = get_texts_and_corpus(review_text,bigram_min_count=best_params["topics_range"].values[0])

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        num_topics=best_params["topics_range"].values[0], 
                                        random_state=42,
                                        chunksize=100,
                                        passes=10,
                                        alpha=best_params["alpha"].values[0],
                                        eta=best_params["eta"].values[0],
                                        minimum_probability=best_params["minimum_probability"].values[0])

Get Top Distinct Words within Each Topic

In [40]:
# Get Top Distinct Topics b/w selected topics:
top_distinct_words_per_topic=get_top_distinct_words_per_topic(lda_model,num_words_to_show=10)

In [41]:
top_distinct_words_per_topic_df = pd.DataFrame(top_distinct_words_per_topic)
top_distinct_words_per_topic_df = top_distinct_words_per_topic_df.transpose()
top_distinct_words_per_topic_df.columns = [f'Topic   #{x}' for x in np.arange(1,len(top_distinct_words_per_topic)+1)]
top_distinct_words_per_topic_df

Unnamed: 0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7,Topic #8,Topic #9,Topic #10
0,bill,male,hair,holiday,maud,shark,cousin,animate,sport,bundy
1,reality,fantasy,fern,candyman,palmer,irene,whale,oscar_nominate,trilogy,rembrandt
2,push,singer,truffle,gawain,ali,patient,chef,program,franchise,ruth
3,stay,claim,card,gamhee,billy,clare,celebrity,entry,lean,cody
4,rock,else,influence,nanae,conflict,welcome,beastie,spirit,zombie,vivo
5,survivor,deal,ball,louis,patience,message,maori,wear,deena,sure
6,protest,wish,ottinger,beanpole,ailey,window,cartoon,bower,shadyside,tabrizi
7,national,central,schrader,eat,stutter,tend,makwa,burrow,reference,byrne
8,activist,stake,bloom,amin,mom,filmmake,aldo,letter,writer_director,alive
9,politic,fast,whole,rei,abigail,levy,puck,render,suspense,sign


In [42]:
top_distinct_words_per_topic_df.to_csv("lda_model_outputs/top10_words_per_LDA_topic_02.csv",index=False)

Get Top Topics For Each Review:

In [43]:
all_topics = lda_model.get_document_topics(lda_model[corpus], minimum_probability=0.0)
all_topics_csr = gensim.matutils.corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()
all_topics_df = pd.DataFrame(all_topics_numpy)
all_topics_df.columns = [f'Topic   #{x}' for x in np.arange(1,len(top_distinct_words_per_topic)+1)]

In [44]:
all_topics_df.sample(5)

Unnamed: 0,Topic #1,Topic #2,Topic #3,Topic #4,Topic #5,Topic #6,Topic #7,Topic #8,Topic #9,Topic #10
1353,0.102367,0.077976,0.062535,0.052367,0.045055,0.039535,0.53297,0.031754,0.028909,0.026532
905,0.599727,0.077858,0.06266,0.052445,0.045123,0.039593,0.035272,0.0318,0.028951,0.026571
1083,0.103618,0.077699,0.559725,0.052285,0.044983,0.039472,0.035163,0.031703,0.028863,0.02649
563,0.600472,0.078118,0.062533,0.052283,0.044969,0.039456,0.035149,0.03169,0.028851,0.026479
199,0.114312,0.078477,0.06276,0.052295,0.044997,0.039474,0.520628,0.031704,0.028863,0.02649


In [45]:
# Join NYT Reviews with Associated Topic Relatedness Values
NYTReviews.join(all_topics_df,how="left").to_csv("lda_model_outputs/NYT_Reviews_w_TopicRelatedness_02.csv",index=False)

In [46]:
# Assign Topic Based On Topic with Highest Probability for Each Review:
all_topics_df['Topic_Assignment'] = all_topics_df[list(all_topics_df.columns[:])].idxmax(axis=1)
NYTReviews.join(all_topics_df[['Topic_Assignment']],how="left").to_csv("lda_model_outputs/NYT_Reviews_w_TopicAssignment_02.csv",index=False)