In [117]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from math import isnan

import pickle

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from collections import ChainMap

import gensim.corpora as corpora
from gensim.models import LdaModel, CoherenceModel, LdaMulticore

import pyLDAvis
import pyLDAvis.gensim_models


def compute_coherence_values(dictionary, corpus, word_list, limit, start, step):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Candidate topic counts are ``range(start, limit, step)``, so ``limit`` itself
    is excluded.

    dictionary : id-to-word mapping handed to the LDA model and the coherence model
    corpus     : bag-of-words corpus the models are trained on
    word_list  : tokenized texts used by the coherence model
    limit, start, step : bounds of the topic-count search

    Returns a tuple ``(model_list, coherence_values)``; both lists follow the
    order of the candidate topic counts.
    """
    # training settings shared by every candidate model
    lda_settings = dict(workers = 8, chunksize = 1000, iterations = 50,
                        random_state = 20202020, passes = 50,
                        alpha = 'symmetric', eval_every = 10)

    trained_models = []
    scores = []

    for topic_count in range(start, limit, step):
        candidate = LdaMulticore(corpus = corpus, num_topics = topic_count,
                                 id2word = dictionary, **lda_settings)
        trained_models.append(candidate)

        scorer = CoherenceModel(model = candidate, texts = word_list,
                                dictionary = dictionary, coherence='c_v')
        scores.append(scorer.get_coherence())

    return trained_models, scores


def plot_coherence(x, y):
    """Show a line chart (with point markers) of coherence scores.

    x : values for the horizontal axis, labelled '# of topics'
    y : values for the vertical axis, labelled 'Coherence'
    """
    axis_titles = {'x' : '# of topics', 'y' : 'Coherence'}

    chart = px.line(x = x, y = y, title = 'Coherence scores', labels = axis_titles,
                    width = 750, height = 350)
    chart.update_traces(mode='markers+lines')
    chart.show()
    
    
def format_topics_sentences(ldamodel, corpus, texts):
    """Build a table with the dominant topic of every document.

    ldamodel : topic model; must support ``ldamodel[corpus]`` (one sequence of
               ``(topic_num, prop_topic)`` pairs per document) and
               ``ldamodel.show_topic(topic_num)`` (``(word, weight)`` pairs)
    corpus   : corpus passed to ``ldamodel[...]``
    texts    : original documents, in the same order as ``corpus``

    Returns a DataFrame with the columns 'Dominant_Topic', 'Perc_Contribution'
    (rounded to 4 decimals), 'Topic_Keywords' (comma-separated words of the
    dominant topic) and an unnamed last column (label 0) holding ``texts``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # keywords depend only on the topic, so look them up once per topic
    keywords_by_topic = {}

    # collect plain records and build the frame once; growing a DataFrame
    # row by row is quadratic and DataFrame.append no longer exists in pandas 2.x
    records = []
    for doc_topics in ldamodel[corpus]:
        if len(doc_topics) == 0:
            # keep a placeholder so later rows stay aligned with `texts`
            records.append((float('nan'), float('nan'), None))
            continue

        # dominant topic = highest proportion (first one wins on ties)
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))

        records.append((topic_num, round(prop_topic, 4), keywords_by_topic[topic_num]))

    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


def assign_documents_to_topics(model, corpus):
    """Return the topic proportions of every document as a wide table.

    model  : topic model; ``model[corpus]`` must yield, per document, a
             sequence of ``(topic_number, percentage)`` pairs
    corpus : corpus passed to ``model[...]``

    Returns a DataFrame with one row per document: 'doc_num' (position of the
    document in the corpus) followed by one 'topic_<n>' column per topic that
    occurs, ordered by topic number. A topic that is not reported for a
    document is left as NaN.
    """
    # one {topic_number: percentage} mapping per document, in corpus order
    records = [dict(doc_topics) for doc_topics in model[corpus]]

    # sort the topic numbers so the column order does not depend on which
    # topics the first documents happen to contain
    topic_ids = sorted({topic for record in records for topic in record})

    topics = pd.DataFrame(records, columns = topic_ids)
    topics.columns = ['topic_' + str(topic) for topic in topic_ids]
    topics.insert(0, 'doc_num', list(range(len(records))))

    return topics


def plot_topic_ratios(data, row_spacing = .05, col_spacing = .1, width = 950, col_wrap = 4):
    """Show a faceted bar chart of 'Topic_Ratio' per 'Dominant_Topic', one facet per 'speaker'.

    data        : frame with the columns 'Dominant_Topic', 'Topic_Ratio' and 'speaker'
    row_spacing : spacing between facet rows
    col_spacing : spacing between facet columns
    width       : figure width (the height is fixed at 950)
    col_wrap    : number of facets per row
    """
    # same tick settings for both axes: labels shown, axes not matched across facets
    tick_style = dict(showticklabels = True, matches = None, tickfont = {'size' : 10})

    chart = px.bar(
        data, x = 'Dominant_Topic', y = 'Topic_Ratio',
        facet_col = 'speaker', facet_col_wrap = col_wrap,
        facet_row_spacing = row_spacing, facet_col_spacing = col_spacing,
        title = 'Distribution of topics among top characters',
        labels = {'Topic_Ratio' : '', 'speaker' : 'Speaker'},
        width = width, height = 950,
    )
    chart.update_xaxes(**tick_style)
    chart.update_yaxes(**tick_style)

    def _keep_text_after_equals(annotation):
        # facet titles: keep only the part after the last '='
        annotation.update(text = annotation.text.split('=')[-1])

    chart.for_each_annotation(_keep_text_after_equals)
    chart.update_layout(showlegend = False)
    chart.show()

This notebook runs topic modeling on the lines of the top 23 characters (by line count) of The Office. Idea: find characters who talk similarly.

In [85]:
data = pd.read_csv('../../data/transcripts_cleaned.csv')
print(data.shape)

# reduce scope of analysis to the most frequent speakers (by number of lines)
N_TOP_CHARACTERS = 23
top_lines = data['speaker'].value_counts().head(N_TOP_CHARACTERS)
top_characters = top_lines.index

# report the number of speakers actually selected instead of a hard-coded figure
print('\nTop {} characters of the show are:\n'.format(len(top_characters)), top_characters.tolist())

# keep only the selected speakers' lines that have cleaned text;
# .copy() makes the result an independent frame, so the column assignment
# below does not write into a slice of the raw frame
data = data.loc[data['speaker'].isin(top_characters), ['speaker', 'cleaned_sw_rem_lem']]
data = data[data['cleaned_sw_rem_lem'].notnull()].copy()
print('\n', data.shape)

# names and frequent fill words
# NOTE: a multi-word name such as 'nellie bertram' can never equal a single
# token, which is why some name parts are listed again by hand below. The
# resulting vocabulary is intentionally left unchanged here, because the
# cached LDA models loaded further down are tied to it.
drop_words = [i.lower() for i in top_characters]
drop_words.extend(['well', 'right', 'good', 'thing', 'nellie', 'bertram', 'robert', 'california', 'yeah', 'say', 'really', 'need', 'guy', 'something'])
drop_word_set = set(drop_words)  # constant-time membership test per token

data['cleaned_sw_rem_lem'] = data['cleaned_sw_rem_lem'].apply(lambda x: ' '.join([word for word in x.split() if word not in drop_word_set]))

# merge speeches of same speaker into one
#data = data.groupby(['speaker'])['cleaned_sw_rem_lem'].apply(lambda x: ' '.join(x)).reset_index()
print('\n', data.shape)

data.head(3)

(54626, 8)

Top 35 characters of the show are:
 ['Michael', 'Dwight', 'Jim', 'Pam', 'Andy', 'Angela', 'Kevin', 'Erin', 'Oscar', 'Darryl', 'Ryan', 'Phyllis', 'Kelly', 'Toby', 'Jan', 'Stanley', 'Meredith', 'Holly', 'Nellie Bertram', 'David Wallace', 'Gabe', 'Robert California', 'Creed']

 (43286, 2)

 (43286, 2)


Unnamed: 0,speaker,cleaned_sw_rem_lem
0,Michael,quarterly look library
1,Jim,told close
2,Michael,master guidance saying grasshopper


Preprocess for LDA:
- remove words that are shorter than 3 characters
- remove words that have fewer than 3 unique characters

In [86]:
def _is_informative(token):
    """True for tokens with at least 3 characters, at least 3 of them distinct."""
    return len(token) >= 3 and len(set(token)) >= 3


data['cleaned_sw_rem_lem'] = data['cleaned_sw_rem_lem'].apply(
    lambda line: ' '.join(filter(_is_informative, line.split()))
)

In [87]:
# one list of tokens per line of dialogue
word_list = list(map(str.split, data['cleaned_sw_rem_lem']))

Create dictionaries and corpuses

In [88]:
# vocabulary built from all tokenized lines
word_dict = corpora.Dictionary(word_list)
# bag-of-words representation of every line, in the same order as word_list
word_corpus = list(map(word_dict.doc2bow, word_list))
print(word_dict)

Dictionary(15213 unique tokens: ['library', 'look', 'quarterly', 'close', 'told']...)


Create LDA model by finding optimal number of topics

In [91]:
# Topic counts to evaluate are range(START, LIMIT, STEP): LIMIT itself is excluded
LIMIT = 7
START = 2
STEP = 1

In [94]:
# model_list, coherence_values = compute_coherence_values(word_dict, word_corpus, word_list, 
#                                                         limit = LIMIT, start = START, step = STEP)

In [97]:
# with open('coherence_list.pkl', 'wb') as cl:   
#     pickle.dump(coherence_values, cl)
    
# with open('lda_model_list.pkl', 'wb') as ldal:   
#     pickle.dump(model_list, ldal)

In [98]:
# Load the cached search results; training all candidate models is expensive.
# NOTE: unpickling can execute arbitrary code, so only load cache files that
# were created by this notebook.
try:
    with open('coherence_list.pkl', 'rb') as cl:
        coherence_values = pickle.load(cl)

    with open('lda_model_list.pkl', 'rb') as ldal:
        model_list = pickle.load(ldal)
except FileNotFoundError:
    # no cache yet (e.g. fresh checkout): run the search once and store both
    # results so the two files always belong to the same run
    model_list, coherence_values = compute_coherence_values(word_dict, word_corpus, word_list,
                                                            limit = LIMIT, start = START, step = STEP)

    with open('coherence_list.pkl', 'wb') as cl:
        pickle.dump(coherence_values, cl)

    with open('lda_model_list.pkl', 'wb') as ldal:
        pickle.dump(model_list, ldal)

In [99]:
# candidate topic counts, matching the order of coherence_values
x = list(range(START, LIMIT, STEP))
plot_coherence(x, coherence_values)

Optimal model is at index 4 of `model_list`, which corresponds to 6 topics (the search starts at 2 topics)

In [100]:
# NOTE(review): 4 is a list index, not a topic count. If model_list was built
# with range(START, LIMIT, STEP) = 2..6, index 4 is the 6-topic model — confirm
# this is the intended choice.
lda_model = model_list[4]

In [101]:
# Inspect the highest-weighted words of the selected model's topics
lda_model.show_topics()

[(0,
  '0.031*"great" + 0.026*"sorry" + 0.026*"god" + 0.020*"work" + 0.012*"said" + 0.012*"new" + 0.009*"dunder" + 0.009*"mifflin" + 0.008*"question" + 0.007*"still"'),
 (1,
  '0.017*"love" + 0.014*"year" + 0.010*"people" + 0.010*"time" + 0.009*"paper" + 0.009*"never" + 0.009*"three" + 0.009*"job" + 0.008*"hello" + 0.007*"ever"'),
 (2,
  '0.023*"mean" + 0.019*"wait" + 0.019*"time" + 0.018*"alright" + 0.016*"call" + 0.014*"fine" + 0.013*"thanks" + 0.012*"help" + 0.011*"cool" + 0.010*"idea"'),
 (3,
  '0.020*"please" + 0.018*"talk" + 0.013*"friend" + 0.012*"name" + 0.011*"manager" + 0.010*"whoa" + 0.010*"room" + 0.009*"everybody" + 0.009*"anything" + 0.009*"second"'),
 (4,
  '0.028*"look" + 0.017*"thought" + 0.017*"stop" + 0.015*"tell" + 0.014*"day" + 0.009*"even" + 0.009*"night" + 0.008*"car" + 0.008*"happy" + 0.008*"everything"'),
 (5,
  '0.034*"thank" + 0.020*"sure" + 0.017*"nice" + 0.013*"baby" + 0.012*"feel" + 0.011*"maybe" + 0.010*"much" + 0.008*"hot" + 0.008*"party" + 0.008*"real"'

In [102]:
# Score the selected model on the full bag-of-words corpus
lda_model.log_perplexity(word_corpus)

-8.407778764997198

#### Visualize topics

In [103]:
# Prepare the pyLDAvis view of the selected model and display it as the cell output
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, word_corpus, word_dict)
vis

Check words and which topic they belong to

In [105]:
# One row per line of dialogue: dominant topic, its share, its keywords and the tokenized text
df_topic_sents_keywords = format_topics_sentences(ldamodel = lda_model, corpus = word_corpus, texts = word_list)

In [107]:
# Turn the row position into an explicit document number and give every
# column a descriptive name (renamed by label rather than by position)
df_dominant_topic = df_topic_sents_keywords.reset_index().rename(columns = {
    'index' : 'Document_No',
    'Perc_Contribution' : 'Topic_Perc_Contrib',
    'Topic_Keywords' : 'Keywords',
    0 : 'Text',
})

# attach the speaker of each line; the speaker index is reset so rows pair up
# by position. `axis` is passed by keyword because pandas 2.x no longer
# accepts it positionally in concat
df_dominant_topic = pd.concat([df_dominant_topic, data['speaker'].reset_index(drop = True)], axis = 1)
df_dominant_topic['Dominant_Topic'] = df_dominant_topic['Dominant_Topic'].astype(int)
df_dominant_topic.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,speaker
0,0,4,0.7903,"look, thought, stop, tell, day, even, night, c...","[quarterly, look, library]",Michael
1,1,4,0.3889,"look, thought, stop, tell, day, even, night, c...","[told, close]",Jim
2,2,5,0.6196,"thank, sure, nice, baby, feel, maybe, much, ho...","[master, guidance, saying, grasshopper]",Michael
3,3,0,0.72,"great, sorry, god, work, said, new, dunder, mi...","[actually, called]",Jim
4,4,4,0.722,"look, thought, stop, tell, day, even, night, c...","[show, done]",Michael


In [108]:
# number of lines per (speaker, dominant topic) pair
by_speaker_topic_dist = (
    df_dominant_topic
    .groupby(['speaker', 'Dominant_Topic'])['Document_No']
    .count()
    .reset_index()
)

# total number of lines per speaker
line_counts = by_speaker_topic_dist.groupby('speaker')['Document_No'].sum()

# share of each speaker's lines that falls into each dominant topic
by_speaker_topic_dist['Topic_Ratio'] = (
    by_speaker_topic_dist['Document_No'] / by_speaker_topic_dist['speaker'].map(line_counts)
)

# the raw count is no longer needed; `columns=` is used because pandas 2.x
# no longer accepts the axis as a positional argument to drop
by_speaker_topic_dist = by_speaker_topic_dist.drop(columns = ['Document_No'])

In [116]:
# One bar-chart panel per speaker: share of their lines per dominant topic
plot_topic_ratios(by_speaker_topic_dist)

In [124]:
# by_speaker_topic_dist.pivot_table(values = 'Topic_Ratio', index='speaker', columns='Dominant_Topic').T.corr()

In [113]:
# assign_documents_to_topics(lda_model, word_corpus)