In [1]:
import pandas as pd
import numpy as np
import gensim
from TopicModeling_SupportFunctions import get_texts_and_corpus,lda_gridsearch,get_top_distinct_words_per_topic

Load Plot Text:

In [2]:
omdb_df = pd.read_csv("data/joined_df.csv")
review_text=omdb_df[["Plot"]]
review_text.rename(columns={"Plot":"review_text"},inplace=True) # Just to match functions built for NYT reviews
review_text.fillna("NAN",inplace=True)

Reference for use of gensim library:
https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

LDA Hyperparameter Tuning For Highest Coherence:

In [3]:
# Takes a long time to run (5hrs) on last try on my machine:
grid_search_params={
    'validation_set_corpus_pct':[0.75,1],
    'topics_range':[8,10,12,14],
    'alpha':["symmetric","asymmetric"],
    'eta':["symmetric","auto",None],
    'minimum_probability':[0.001,0.01,0.1],
    'bigram_min_count':[3,5,7]
    }

model_results=lda_gridsearch(review_text,grid_search_params)
model_results.to_csv("lda_model_outputs/Plots_lda_tuning_results_03_plots.csv",index=False)

100%|██████████| 432/432 [1:47:18<00:00, 14.90s/it]


In [5]:
model_results=pd.read_csv("lda_model_outputs/Plots_lda_tuning_results_03_plots.csv")
model_results.head()

Unnamed: 0,validation_set_corpus_pct,topics_range,alpha,eta,minimum_probability,bigram_min_count,coherence
0,0.75,8,symmetric,symmetric,0.001,3,0.35094
1,0.75,8,symmetric,symmetric,0.01,3,0.360117
2,0.75,8,symmetric,symmetric,0.1,3,0.35257
3,0.75,8,symmetric,auto,0.001,3,0.364469
4,0.75,8,symmetric,auto,0.01,3,0.353576


Build LDA Model with Parameters That Produced the Highest Coherence:

In [6]:
# Get Best Parameters from Grid Search:
best_params=model_results[model_results["coherence"]==model_results["coherence"].max()]
best_params.head()

Unnamed: 0,validation_set_corpus_pct,topics_range,alpha,eta,minimum_probability,bigram_min_count,coherence
212,0.75,14,asymmetric,auto,0.1,5,0.407765


In [7]:
# Build LDA model using the best results:
corpus,id2word,texts = get_texts_and_corpus(review_text,bigram_min_count=best_params["topics_range"].values[0])

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                        id2word=id2word,
                                        num_topics=best_params["topics_range"].values[0], 
                                        random_state=42,
                                        chunksize=100,
                                        passes=10,
                                        alpha=best_params["alpha"].values[0],
                                        eta=best_params["eta"].values[0],
                                        minimum_probability=best_params["minimum_probability"].values[0])

Get Top Distinct Words within Each Topic

In [8]:
# Get Top Distinct Topics b/w selected topics:
top_distinct_words_per_topic=get_top_distinct_words_per_topic(lda_model,num_words_to_show=10)

In [9]:
top_distinct_words_per_topic_df = pd.DataFrame(top_distinct_words_per_topic)
top_distinct_words_per_topic_df = top_distinct_words_per_topic_df.transpose()
top_distinct_words_per_topic_df.columns = [f'Plot_Topic #{x}' for x in np.arange(1,len(top_distinct_words_per_topic)+1)]
top_distinct_words_per_topic_df

Unnamed: 0,Plot_Topic #1,Plot_Topic #2,Plot_Topic #3,Plot_Topic #4,Plot_Topic #5,Plot_Topic #6,Plot_Topic #7,Plot_Topic #8,Plot_Topic #9,Plot_Topic #10,Plot_Topic #11,Plot_Topic #12,Plot_Topic #13,Plot_Topic #14
0,apartment,culture,spy,case,soul,drug,bill,length,speer,intimate,open,gay,stutter,gang
1,killer,land,exploration,grief,nation,boyfriend,popular,image,vera,perry,yingye,voter,bolshevik,major
2,global,embark,unexpected,medium,album,news,violent,video,director,painting,disease,remarkable,say,agne
3,large,traylor,path,quiet,singer,trap,teen,unprecedented,pandemic,capture,process,isis,manage,water
4,beauty,white,wake,write,little,chip,internet,watch,mad,care,strike,sage,rugoff,faustin
5,farmer,creative,bond,friendship,shimu,track,seahorse,rein,dog,military,navigate,nominate,marconi,xavier
6,revolutionary,today,soviet,high,german,hand,domestic,planet,unfinished,odd,holiday,candidate,cheka,xavi
7,teenager,offer,curve,list,violence,weed,memory,bee,original,special,succeed,immigrant,business,focus
8,pair,system,law,sometimes,talent,join,luc,deep,fiction,moment,arrival,clarence,inner,much
9,tape,raise,control,mind,factory,wedding,avoid,whale,word,test,bloom,examine,ultimately,remain


In [9]:
top_distinct_words_per_topic_df.to_csv("lda_model_outputs/Plots_top10_words_per_LDA_topic_03_plots.csv",index=False)

Get Top Topics For Each Review:

In [10]:
all_topics = lda_model.get_document_topics(lda_model[corpus], minimum_probability=0.0)
all_topics_csr = gensim.matutils.corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()
all_topics_df = pd.DataFrame(all_topics_numpy)
all_topics_df.columns = [f'Plot_Topic #{x}' for x in np.arange(1,len(top_distinct_words_per_topic)+1)]

In [11]:
all_topics_df.sample(5)

Unnamed: 0,Plot_Topic #1,Plot_Topic #2,Plot_Topic #3,Plot_Topic #4,Plot_Topic #5,Plot_Topic #6,Plot_Topic #7,Plot_Topic #8,Plot_Topic #9,Plot_Topic #10,Plot_Topic #11,Plot_Topic #12,Plot_Topic #13,Plot_Topic #14
719,0.08308,0.063936,0.54624,0.044824,0.03903,0.034566,0.031017,0.02813,0.025734,0.023714,0.021989,0.020497,0.019195,0.018048
1042,0.080285,0.063345,0.052269,0.044489,0.038741,0.034309,0.030787,0.027921,0.025543,0.523172,0.021826,0.020345,0.019053,0.017915
45,0.575935,0.065028,0.052833,0.044754,0.03896,0.034504,0.030962,0.028079,0.025688,0.023672,0.021949,0.02046,0.019161,0.018016
221,0.576066,0.065006,0.052775,0.044745,0.038954,0.034498,0.030957,0.028075,0.025684,0.023668,0.021946,0.020457,0.019157,0.018013
396,0.081336,0.063499,0.549902,0.044606,0.038842,0.034399,0.030868,0.027994,0.02561,0.0236,0.021883,0.020398,0.019102,0.017961


In [12]:
# Join NYT Reviews with Associated Topic Relatedness Values
omdb_df.join(all_topics_df,how="left").to_csv("lda_model_outputs/Plots_w_TopicRelatedness_03_plots.csv",index=False, encoding='utf-8-sig')

In [13]:
# Assign Topic Based On Topic with Highest Probability for Each Review:
all_topics_df['Topic_Assignment'] = all_topics_df[list(all_topics_df.columns[:])].idxmax(axis=1)
omdb_df.join(all_topics_df[['Topic_Assignment']],how="left").to_csv("lda_model_outputs/Plots_w_TopicAssignment_03_plots.csv",index=False, encoding='utf-8-sig')