In [58]:
import pandas as pd
import numpy as np
from math import isnan

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from collections import ChainMap

import gensim.corpora as corpora
from gensim.models import LdaModel, CoherenceModel


def assign_documents_to_topics(model, corpus):
    
    topic_lists = {}
    for i_num, i in enumerate(model[corpus]):
        character = top_characters[i_num]
        x = [{topic_number : percentage} for topic_number, percentage in i]
        topic_lists[character] = dict(ChainMap(*x))
        
    topics = pd.DataFrame(topic_lists.keys(), topic_lists.values()).reset_index().rename(columns = {'index' : 'topic_pct', 0 : 'speaker'})

    topics_normalized = pd.json_normalize(topics['topic_pct'])
    topics_normalized.columns = ['topic_' + str(i) for i in topics_normalized.columns]
    #topics_normalized.fillna(0, inplace = True)

    topics.drop('topic_pct', 1, inplace = True)

    topics_normalized.reset_index(inplace = True, drop = True)
    topics.reset_index(inplace = True, drop = True)

    return pd.concat([topics, topics_normalized],axis = 1)

def compute_coherence_values(dictionary, corpus, word_list, limit, start, step):

    coherence_values = []
    model_list = []
    
    for num_topics in range(start, limit, step):
        
        lda_model = LdaModel(corpus = corpus, num_topics = num_topics, id2word = dictionary, chunksize = 1000, iterations = 100,
                             random_state = 20202020, passes = 5, alpha = 'auto', eval_every = 1,) 
        model_list.append(lda_model)
        
        coherencemodel = CoherenceModel(model = lda_model, texts = word_list, dictionary = dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values


def plot_coherence(x, y):
    
    fig = px.line(x = x, y = y, title = 'Coherence scores', width = 750, height = 350,
                  labels = {'x' : '# of topics', 'y' : 'Coherence'})
    fig.update_traces(mode='markers+lines')
    fig.show()

Will run topic modeling on the top 35 characters (by line count) of The Office. Idea: find similarly talking people

In [2]:
data = pd.read_csv('../../data/transcripts_cleaned.csv')
print(data.shape)

# reduce scope of analysis to top 35 characters
top_lines = data['speaker'].value_counts().head(35)
top_characters = top_lines.index

print('\nTop 35 characters of the show are:\n', top_characters.tolist())

data = data[data['speaker'].isin(top_characters)]
data = data[['speaker', 'cleaned_sw_rem_lem']]
data = data[data['cleaned_sw_rem_lem'].notnull()]
print('\n', data.shape)

# names and frequent fill words
drop_words = [i.lower() for i in top_characters]
drop_words.extend(['well', 'right', 'good', 'thing', 'nellie', 'bertram', 'robert', 'california', 'yeah', 'say', 'really', 'need', 'guy', 'something'])

data['cleaned_sw_rem_lem'] = data['cleaned_sw_rem_lem'].apply(lambda x: ' '.join([word for word in x.split() if word not in drop_words]))

# merge speeches of same speaker into one
data = data.groupby(['speaker'])['cleaned_sw_rem_lem'].apply(lambda x: ' '.join(x)).reset_index()
print('\n', data.shape)

data.head(3)

(54626, 8)

Top 35 characters of the show are:
 ['Michael', 'Dwight', 'Jim', 'Pam', 'Andy', 'Angela', 'Kevin', 'Erin', 'Oscar', 'Darryl', 'Ryan', 'Phyllis', 'Kelly', 'Toby', 'Jan', 'Stanley', 'Meredith', 'Holly', 'Nellie Bertram', 'David Wallace', 'Gabe', 'Robert California', 'Creed', 'Karen', 'DeAngelo', 'Clark', 'Charles', 'Roy', 'Pete', 'Jo Bennett', 'Todd Packer', 'Donna', 'Carol', 'Katy', 'Val']

 (45149, 2)

 (35, 2)


Unnamed: 0,speaker,cleaned_sw_rem_lem
0,Andy,big tuna single pretty hot huh completely craz...
1,Angela,bet probably sure stop inappropriate ridiculou...
2,Carol,bill head condo association ready sign paper g...


Preprocess for LDA:
- remove words that are of length less than 3
- remove words that just 3 unique characters

In [24]:
data['cleaned_sw_rem_lem'] = data['cleaned_sw_rem_lem'].apply(lambda x: ' '.join([word for word in x.split() if (len(word) >= 3) and (len(set(word)) > 3)]))

In [25]:
word_list = [i.split() for i in data['cleaned_sw_rem_lem']]

Create dictionaries and corpuses

In [26]:
word_dict = corpora.Dictionary(word_list)
word_corpus = [word_dict.doc2bow(i) for i in word_list]
print(word_dict)

Dictionary(14269 unique tokens: ['aaron', 'abandon', 'abercrombie', 'ability', 'able']...)


Create LDA model by finding optimal number of topics

In [59]:
LIMIT = 15
START = 2
STEP = 1

model_list, coherence_values = compute_coherence_values(word_dict, word_corpus, word_list, 
                                                        limit = LIMIT, start = START, step = STEP)

In [60]:
x = [i for i in range(START, LIMIT, STEP)]
plot_coherence(x, coherence_values)

Optimal topic number is 4

In [61]:
lda_model = model_list[2]

In [62]:
lda_model.show_topics()

[(0,
  '0.008*"time" + 0.006*"people" + 0.006*"thank" + 0.005*"work" + 0.005*"please" + 0.005*"great" + 0.004*"little" + 0.004*"never" + 0.004*"office" + 0.004*"year"'),
 (1,
  '0.003*"time" + 0.003*"thank" + 0.003*"great" + 0.003*"work" + 0.002*"mean" + 0.002*"people" + 0.002*"sorry" + 0.002*"said" + 0.002*"wait" + 0.002*"please"'),
 (2,
  '0.003*"great" + 0.003*"time" + 0.003*"sorry" + 0.002*"little" + 0.002*"actually" + 0.002*"thanks" + 0.002*"love" + 0.002*"mean" + 0.002*"company" + 0.002*"please"'),
 (3,
  '0.008*"time" + 0.007*"mean" + 0.007*"great" + 0.006*"little" + 0.006*"sorry" + 0.006*"people" + 0.006*"thank" + 0.006*"love" + 0.005*"work" + 0.005*"maybe"')]

Check words and which topic they belong to

In [63]:
assign_documents_to_topics(lda_model, word_corpus)

Unnamed: 0,speaker,topic_3,topic_0,topic_1,topic_2
0,Michael,0.571336,0.428596,,
1,Dwight,0.999557,,,
2,Jim,0.970106,,0.020755,
3,Pam,0.047509,0.950343,,
4,Andy,0.997449,,,
5,Angela,0.998652,,,
6,Kevin,0.107956,0.891766,,
7,Erin,0.862126,0.018197,,0.119353
8,Oscar,,0.998,,
9,Darryl,0.742918,,0.243983,
