In [1]:
import pandas as pd
import numpy as np
import gensim
from TopicModeling_SupportFunctions import get_texts_and_corpus,lda_gridsearch,get_top_distinct_words_per_topic

Load Plot Text:

In [2]:
omdb_df = pd.read_csv("data/joined_df.csv")
review_text=omdb_df[["Plot"]]
review_text.rename(columns={"Plot":"review_text"},inplace=True) # Just to match functions built for NYT reviews
review_text.fillna("NAN",inplace=True)

Reference for use of gensim library:
https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

LDA Hyperparameter Tuning For Highest Coherence:

In [3]:
# Takes a long time to run (5hrs) on last try on my machine:
grid_search_params={
    'validation_set_corpus_pct':[0.75,1],
    'topics_range':[8,10,12,14],
    'alpha':["symmetric","asymmetric"],
    'eta':["symmetric","auto",None],
    'minimum_probability':[0.001,0.01,0.1],
    'bigram_min_count':[3,5,7]
    }

model_results=lda_gridsearch(review_text,grid_search_params)
model_results.to_csv("lda_model_outputs/lda_tuning_results_03_plots.csv",index=False)

100%|██████████| 432/432 [1:47:18<00:00, 14.90s/it]


In [4]:
model_results=pd.read_csv("lda_model_outputs/lda_tuning_results_03_plots.csv")
model_results.head()

Unnamed: 0,validation_set_corpus_pct,topics_range,alpha,eta,minimum_probability,bigram_min_count,coherence
0,0.75,8,symmetric,symmetric,0.001,3,0.35094
1,0.75,8,symmetric,symmetric,0.01,3,0.360117
2,0.75,8,symmetric,symmetric,0.1,3,0.35257
3,0.75,8,symmetric,auto,0.001,3,0.364469
4,0.75,8,symmetric,auto,0.01,3,0.353576


Build LDA Model with Parameters That Produced the Highest Coherence:

In [5]:
# Get Best Parameters from Grid Search:
best_params=model_results[model_results["coherence"]==model_results["coherence"].max()]
best_params.head()

Unnamed: 0,validation_set_corpus_pct,topics_range,alpha,eta,minimum_probability,bigram_min_count,coherence
212,0.75,14,asymmetric,auto,0.1,5,0.407765


In [6]:
# Build LDA model using the best results:
corpus,id2word,texts = get_texts_and_corpus(review_text,bigram_min_count=best_params["topics_range"].values[0])

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        num_topics=best_params["topics_range"].values[0], 
                                        random_state=42,
                                        chunksize=100,
                                        passes=10,
                                        alpha=best_params["alpha"].values[0],
                                        eta=best_params["eta"].values[0],
                                        minimum_probability=best_params["minimum_probability"].values[0])

Get Top Distinct Words within Each Topic

In [7]:
# Get Top Distinct Topics b/w selected topics:
top_distinct_words_per_topic=get_top_distinct_words_per_topic(lda_model,num_words_to_show=10)

In [8]:
top_distinct_words_per_topic_df = pd.DataFrame(top_distinct_words_per_topic)
top_distinct_words_per_topic_df = top_distinct_words_per_topic_df.transpose()
top_distinct_words_per_topic_df.columns = [f'Plot_Topic #{x}' for x in np.arange(1,len(top_distinct_words_per_topic)+1)]
top_distinct_words_per_topic_df

Unnamed: 0,Plot_Topic #1,Plot_Topic #2,Plot_Topic #3,Plot_Topic #4,Plot_Topic #5,Plot_Topic #6,Plot_Topic #7,Plot_Topic #8,Plot_Topic #9,Plot_Topic #10,Plot_Topic #11,Plot_Topic #12,Plot_Topic #13,Plot_Topic #14
0,apartment,creative,spy,medium,soul,boyfriend,bill,length,speer,perry,open,gay,stutter,gang
1,farmer,traylor,exploration,quiet,nation,chip,luc,rein,pandemic,painting,yingye,voter,bolshevik,major
2,tape,offer,bond,list,album,wedding,popular,unprecedented,mad,military,strike,clarence,manage,agne
3,beauty,raise,wake,mind,singer,weed,internet,deep,unfinished,test,disease,assassin,business,faustin
4,half,cousin,control,sometimes,shimu,personality,violent,bee,wrong,special,navigate,sage,rugoff,xavier
5,red,worldwide,suddenly,cop,talent,road,teen,whale,describe,mark,holiday,nominate,marconi,focus
6,teenager,class,soviet,boss,german,stalin,seahorse,manuscript,indigenous,moment,succeed,immigrant,cheka,xavi
7,modern,iconic,law,investigate,violence,camp,domestic,germund,novel,korean,arrival,candidate,inner,remain
8,reality,inequality,winner,round,factory,curse,memory,research,security,hide,bloom,examine,busy,ago
9,pair,fashion,rescue,middle,letter,rule,avoid,scientist,surround,leonardo,yingying,support,uncover,peace


In [9]:
top_distinct_words_per_topic_df.to_csv("lda_model_outputs/top10_words_per_LDA_topic_03_plots.csv",index=False)

Get Top Topics For Each Review:

In [10]:
all_topics = lda_model.get_document_topics(lda_model[corpus], minimum_probability=0.0)
all_topics_csr = gensim.matutils.corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()
all_topics_df = pd.DataFrame(all_topics_numpy)
all_topics_df.columns = [f'Plot_Topic #{x}' for x in np.arange(1,len(top_distinct_words_per_topic)+1)]

In [11]:
all_topics_df.sample(5)

Unnamed: 0,Plot_Topic #1,Plot_Topic #2,Plot_Topic #3,Plot_Topic #4,Plot_Topic #5,Plot_Topic #6,Plot_Topic #7,Plot_Topic #8,Plot_Topic #9,Plot_Topic #10,Plot_Topic #11,Plot_Topic #12,Plot_Topic #13,Plot_Topic #14
476,0.080297,0.063306,0.052273,0.044518,0.038767,0.034332,0.030808,0.02794,0.02556,0.523008,0.02184,0.020359,0.019065,0.017927
1105,0.566576,0.070757,0.053601,0.045183,0.039324,0.034824,0.03125,0.02834,0.025927,0.023892,0.022153,0.02065,0.019339,0.018184
175,0.574094,0.065836,0.053065,0.044876,0.039062,0.034593,0.031042,0.028152,0.025755,0.023733,0.022006,0.020513,0.01921,0.018063
1177,0.083154,0.064718,0.052817,0.044737,0.038948,0.034492,0.030951,0.02807,0.025679,0.023664,0.021942,0.020453,0.512366,0.01801
274,0.573572,0.066078,0.053148,0.044905,0.039087,0.034615,0.031062,0.02817,0.025771,0.023748,0.02202,0.020527,0.019223,0.018074


In [12]:
# Join NYT Reviews with Associated Topic Relatedness Values
omdb_df.join(all_topics_df,how="left").to_csv("lda_model_outputs/Plots_w_TopicRelatedness_03_plots.csv",index=False)

In [13]:
# Assign Topic Based On Topic with Highest Probability for Each Review:
all_topics_df['Topic_Assignment'] = all_topics_df[list(all_topics_df.columns[:])].idxmax(axis=1)
omdb_df.join(all_topics_df[['Topic_Assignment']],how="left").to_csv("lda_model_outputs/Plots_w_TopicAssignment_03_plots.csv",index=False)