In [403]:
import pandas as pd
import numpy as np
from math import isnan

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from afinn import Afinn
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import tensorflow as tf
import tensorflow_hub as hub

from sklearn.decomposition import PCA

def vader_sentiment_scores(x):
    """Score a piece of text with the notebook-level VADER analyzer.

    Parameters
    ----------
    x : str
        Text passed to ``vader_sentiment.polarity_scores``.

    Returns
    -------
    tuple
        ``(neg, neu, pos, compound)`` picked out of the returned score dict.
    """
    # `vader_sentiment` is a module-level name resolved at call time, so it
    # only has to exist by the time this function is actually invoked.
    polarity = vader_sentiment.polarity_scores(x)
    return tuple(polarity[key] for key in ('neg', 'neu', 'pos', 'compound'))

def plot_episode_vectors(data = None):
    """Show a scatter plot of the 2-D episode embedding vectors.

    Parameters
    ----------
    data : DataFrame, optional
        Frame providing the columns used below: 'vector_dim_1', 'vector_dim_2',
        'season', 'episode' and 'title'. When omitted, the notebook-level
        ``embed_data`` is used.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Resolve the default at call time. A `data = embed_data` default is
    # evaluated once when the `def` runs, which raises NameError on a fresh
    # kernel and otherwise pins the plot to whatever frame existed back then.
    if data is None:
        data = embed_data

    fig = px.scatter(data, x = 'vector_dim_1', y = 'vector_dim_2', color = 'season', color_continuous_scale='solar',
                     hover_data = ['season', 'episode', 'title'],
                     title = 'Episode similarity based on transcript embedding vectors',
                     labels = {'season' : 'Season', 'episode' : 'Episode #',
                               'vector_dim_1' : '1st principal component', 'vector_dim_2' : '2nd principal component'},
                     width = 750, height = 450)
    # Larger, fully opaque markers with a thin black outline
    fig.update_traces(marker={'size' : 10, 'opacity' : 1, 'line' : {'width' : 1, 'color' : 'black'}})
    fig.show()
    
def plot_character_vectors(data = None):
    """Show a scatter plot of the 2-D character embedding vectors.

    Parameters
    ----------
    data : DataFrame, optional
        Frame providing the columns used below: 'vector_dim_1', 'vector_dim_2'
        and 'speaker'. When omitted, the notebook-level ``embed_data_char``
        is used.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Resolve the default at call time. A `data = embed_data_char` default is
    # evaluated once when the `def` runs, which raises NameError on a fresh
    # kernel and otherwise pins the plot to whatever frame existed back then.
    if data is None:
        data = embed_data_char

    fig = px.scatter(data, x = 'vector_dim_1', y = 'vector_dim_2', color_continuous_scale='solar',
                     hover_data = ['speaker'],
                     title = 'Similarly talking characters',
                     labels = {'season' : 'Season', 'speaker' : 'Speaker',
                               'vector_dim_1' : '1st principal component', 'vector_dim_2' : '2nd principal component'},
                     width = 750, height = 450)
    # Larger, fully opaque markers with a thin black outline; no legend
    fig.update_traces(marker={'size' : 12, 'opacity' : 1, 'line' : {'width' : 1, 'color' : 'black'}})
    fig.update_layout(showlegend=False)
    fig.show()

In [5]:
data = pd.read_csv('../../data/transcripts_cleaned.csv')
print(data.shape)

# Reduce the scope of the analysis to the 23 characters with the most lines
top_lines = data['speaker'].value_counts().head(23)
top_characters = top_lines.index
# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the frame that was read from disk
data = data[data['speaker'].isin(top_characters)].copy()
print(data.shape)

# speaking_to = whoever speaks next within the same scene. The last line of a
# scene has no next speaker, so it falls back to the speaker of the line just
# before it (the scene's before-last speaker); single-line scenes stay NaN.
# Both shifts are computed per scene in row order, which avoids the merge on
# groupby.nth(-2) (its result index changed in pandas 2.0), the positional
# `axis` argument to drop, and the chained in-place fillna.
# NOTE(review): grouping by 'scene' alone assumes scene ids are unique across
# episodes - TODO confirm, otherwise group by season/episode/scene.
speakers_by_scene = data.groupby('scene')['speaker']
data['speaking_to'] = speakers_by_scene.shift(-1).fillna(speakers_by_scene.shift(1))

# A character is not treated as talking to themselves
data.loc[data['speaker'] == data['speaking_to'], 'speaking_to'] = np.nan
print(data.shape)

data.head(3)

(54626, 8)
(48416, 8)
(48416, 9)


Unnamed: 0,season,episode,title,scene,speaker,line,cleaned,cleaned_sw_rem,speaking_to
0,1,1,Pilot,1,Michael,All right Jim. Your quarterlies look very good...,all right jim your quarterlies look very good ...,right jim quarterlies look good things library,Jim
1,1,1,Pilot,1,Jim,"Oh, I told you. I couldn't close it. So...",oh i told you i could not close it so,told close,Michael
2,1,1,Pilot,1,Michael,So you've come to the master for guidance? Is ...,so you have come to the master for guidance is...,master guidance saying grasshopper,Jim


### Sentiment on sentence level

1. AFINN: sums up the AFINN scores of the words to get a sentence-level sentiment - the range is not constrained
2. Polarity (VADER): gives negative / neutral / positive proportions plus a compound sentence score between -1 and 1

#### AFINN

In [171]:
# Keep only the columns needed for sentiment scoring, and only the lines that
# still have text after stop-word removal. Filtering first and copying last
# yields an independent frame, so the score columns added in later cells are
# not assignments on a slice (which can raise SettingWithCopyWarning).
sentiment_data = data.loc[
    data['cleaned_sw_rem'].notnull(),
    ['season', 'speaker', 'speaking_to', 'cleaned_sw_rem'],
].copy()

In [172]:
# AFINN scorer with its default settings; used via afinn.score(...) to score each line
afinn = Afinn()

In [173]:
# AFINN score for every line; a None entry is scored as 0 instead of being
# passed to the scorer.
sentiment_data['afinn_score'] = sentiment_data['cleaned_sw_rem'].apply(
    lambda text: 0 if text is None else afinn.score(str(text))
)

In [174]:
# Mean AFINN score per speaker, ascending
afinn_by_people = (
    sentiment_data
    .groupby(['speaker'])['afinn_score']
    .mean()
    .sort_values()
)

# Mean AFINN score per speaker and season, as a flat frame
afinn_by_people_by_season = (
    sentiment_data
    .groupby(['speaker', 'season'])['afinn_score']
    .mean()
    .reset_index()
)

# Mean AFINN score per (speaker, addressee) pair; within each speaker the
# highest-scoring addressee comes first
afinn_by_people_to_people = (
    sentiment_data
    .groupby(['speaker', 'speaking_to'])['afinn_score']
    .mean()
    .reset_index()
    .sort_values(['speaker', 'afinn_score'], ascending = [True, False])
)

#### Polarity

In [175]:
# Notebook-level VADER analyzer; vader_sentiment_scores() looks this name up when it is called
vader_sentiment = SentimentIntensityAnalyzer()

In [176]:
# Quick check on a sentence with mixed cues; the result is a dict with 'neg', 'neu', 'pos' and 'compound' entries
vader_sentiment.polarity_scores('not a great way to love someone, but an awesome way')

{'neg': 0.123, 'neu': 0.402, 'pos': 0.474, 'compound': 0.7966}

In [177]:
# One (neg, neu, pos, compound) tuple per line, expanded into four columns.
# The helper frame reuses sentiment_data's index so rows line up on assignment.
sentiment_data[['neg', 'neu', 'pos', 'compound_score']] = pd.DataFrame(
    [vader_sentiment_scores(str(text)) for text in sentiment_data['cleaned_sw_rem']],
    index = sentiment_data.index,
)

In [198]:
# Mean negative / positive / compound score per speaker, least negative first
polarity_by_people = (
    sentiment_data
    .groupby(['speaker'])[['neg', 'pos', 'compound_score']]
    .mean()
    .sort_values('neg')
)

# Per speaker and season: one flat frame for each of the three scores
polarity_by_people_by_season_compound = (
    sentiment_data
    .groupby(['speaker', 'season'])['compound_score']
    .mean()
    .reset_index()
)
polarity_by_people_by_season_neg = (
    sentiment_data
    .groupby(['speaker', 'season'])['neg']
    .mean()
    .reset_index()
)
polarity_by_people_by_season_pos = (
    sentiment_data
    .groupby(['speaker', 'season'])['pos']
    .mean()
    .reset_index()
)

# Mean compound score per (speaker, addressee) pair; within each speaker the
# highest-scoring addressee comes first
polarity_by_people_to_people = (
    sentiment_data
    .groupby(['speaker', 'speaking_to'])['compound_score']
    .mean()
    .reset_index()
    .sort_values(['speaker', 'compound_score'], ascending = [True, False])
)

### Sentence embeddings on episode transcripts --> find most similar episodes

In [303]:
# Episode identifiers plus the stop-word-removed text, without lines whose
# text is missing
embed_data = (
    data[['season', 'episode', 'title', 'cleaned_sw_rem']]
    .dropna(subset = ['cleaned_sw_rem'])
)

In [304]:
# Join all lines of an episode into a single space-separated transcript:
# one row per (season, episode, title)
embed_data = (
    embed_data
    .groupby(['season', 'episode', 'title'])['cleaned_sw_rem']
    .agg(' '.join)
    .rename('episode_transcript')
    .reset_index()
)

Get Universal Sentence Encoder from Google

In [244]:
# Universal Sentence Encoder (version 4) on TF Hub; hub.load returns the
# callable that later cells apply to arrays of text
module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
model = hub.load(module_url)

Encode episode transcripts as vectors

In [305]:
# Encode every episode transcript in one call, then wrap the result in a
# DataFrame (one row per episode, one column per embedding dimension).
# NOTE(review): in the head displayed below, most components sit at roughly
# +/-0.045, i.e. nearly the same magnitude everywhere. That may mean the
# encoder saturates on very long inputs - TODO confirm, e.g. by encoding
# shorter chunks and averaging.
sentence_vectors = model(embed_data['episode_transcript'].values)
sentence_vectors_df = pd.DataFrame(np.array(sentence_vectors))

In [306]:
# Check the embedding matrix: (rows, columns), then the first three rows
print(sentence_vectors_df.shape)
sentence_vectors_df.head(3)

(186, 512)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.045041,-0.045113,0.042466,-0.045113,-0.045113,-0.045113,0.045113,-0.045113,-0.045113,0.045113,...,-0.045113,-0.045113,-0.045113,0.045051,0.045113,0.045029,0.045113,-0.045113,-0.045113,0.045113
1,0.044479,-0.045271,-0.04527,-0.04523,-0.045271,-0.045271,0.044993,-0.045271,-0.045271,0.045271,...,-0.045271,-0.045271,-0.045271,0.033932,0.045271,0.045271,0.045271,-0.045271,-0.045271,0.0452
2,0.044651,-0.045381,0.039798,-0.04538,-0.045381,-0.045381,0.045381,-0.045381,-0.045381,0.045381,...,-0.045381,-0.045381,-0.045381,-0.045381,0.045381,0.04538,0.045381,-0.045381,-0.045381,0.045381


All 186 episodes have been turned into 512-dimensional vectors

#### Dimensionality reduction for visualization

In [341]:
# Project the episode vectors onto their first two principal components so
# they can be drawn in 2-D; random_state is fixed so re-runs use the same seed
pca = PCA(n_components = 2, random_state = 20202020)
sentence_vectors_reduced = pca.fit_transform(sentence_vectors_df)

In [342]:
# Store the two components next to the episode metadata; plot_episode_vectors
# reads them as the x / y columns
embed_data[['vector_dim_1', 'vector_dim_2']] = sentence_vectors_reduced

In [404]:
# Pass the frame explicitly so the plot always shows the current embed_data,
# independent of when the plotting function's definition cell was run
plot_episode_vectors(embed_data)

### Sentence embeddings on characters' lines --> find similarly talking people

This can also be approached by topic modeling (LDA)

In [405]:
# One row per speaker: all of their (non-missing) stop-word-removed lines
# joined into a single space-separated string
embed_data_char = (
    data[['season', 'speaker', 'cleaned_sw_rem']]
    .dropna(subset = ['cleaned_sw_rem'])
    .groupby(['speaker'])['cleaned_sw_rem']
    .agg(' '.join)
    .rename('speech')
    .reset_index()
)

# Encode each speaker's text with the sentence encoder loaded earlier
sentence_vectors_char = model(embed_data_char['speech'].values)
sentence_vectors_char_df = pd.DataFrame(np.array(sentence_vectors_char))

# Reduce to two principal components for plotting (note: this rebinds `pca`)
pca = PCA(n_components = 2, random_state = 42)
sentence_vectors_char_reduced = pca.fit_transform(sentence_vectors_char_df)

# plot_character_vectors reads these two columns as x / y
embed_data_char[['vector_dim_1', 'vector_dim_2']] = sentence_vectors_char_reduced

In [406]:
# Pass the frame explicitly so the plot always shows the current
# embed_data_char, independent of when the plotting function's definition
# cell was run
plot_character_vectors(embed_data_char)

### LDA topic modeling