In [55]:
major_characters = {
    'sokka', 'aang', 'katara', 'zuko', 'toph', 'iroh', 'zuko', 'azula', # 1000+ non-stopwords / 3000+ total words
    'zhao', 'jet', 'suki', 'hakoda', 'bumi',                            # 400+ non-stopwords / 1000+ total words
}
minor_characters = {
    'hama', 'ozai', 'guru pathik', 'mai', 'roku', 'long feng',          # 350+ non-stopwords
    'mechanist', 'piandao', 'warden', 'bato', 'ty lee',
    'jeong jeong', 'pakku', 'earth king', # 'zhang leader'
    'professor zei', 'joo dee', 'chong', 'yue',                         # 200+ non-stopwords
    'appa', 'momo'                                                      # :)
}
major_characters = minor_characters = None
aliases = {
    # 'fire lord ozai': 'ozai', 'ruko': 'roku', 'princess yue': 'yue', 'princess yue / moon spirit': 'yue', 'monk gyatso': 'gyatso',
    # 'themechanist': 'mechanist', 'the mechanist': 'mechanist', 'zuko/blue spirit': 'zuko', 'king bumi': 'bumi', 'master pakku': 'pakku'
}
word_substitutions = {
    # 'jeong jeong': 'jeong_jeong', 'long feng': 'long_feng', 'ty lee': 'ty_lee', 'joo dee': 'joo_dee', 'wan shi tong': 'wan_shi_tong', 
    # 'chit sang': 'chit_sang', 'ba sing se': 'ba_sing_se', 'twinkle toes': 'twinkletoes', 'twinkle-toes': 'twinkletoes', 'mum': 'mom',
    # 'dai li': 'dai_li', 'sozen': 'sozin', 'honour': 'honor'
}

In [58]:
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from functools import reduce
import re

def get_ep_character_bows(script_lines):
    """Return a set of bags of words for this episode as a mapping from characters to words to counts."""
    bow = defaultdict(Counter)
    for line in script_lines:
        try:
            speaker, line = line.split('::')
        except:
            print(f'line: <<{line}>>')
        speaker = speaker.lower().strip()
        speaker = aliases.get(speaker, speaker)
        line = line.lower()
        # Get all non-stop words
        stop_words = set(w for w in stopwords.words('english') 
            + ['us', 'get', 'like', 'thats', 'go', 'going', 'cant', 'oh', 'got', 'hey', 'would',
               'yeah', 'yeh', 'ya', 'uh', 'whats', 'could', 'shall', 'gonna', 'okay', 'one', 
               'something', 'may'] 
        )
        # Special cases, typically multi-word names treated as one word
        words = [w.replace('_', ' ') for w in
            re.findall(r'\w+', 
                reduce(lambda acc, k: acc.replace(k, word_substitutions[k]), word_substitutions.keys(), line)
            )]
        bow[speaker]['total_count'] += len(words)
        words = [w for w in words if w not in stop_words]
        bow[speaker].update(words)
    return bow

In [31]:
def get_ep_total_word_count(script_lines):
    return sum(len(re.findall(r'\w+', line.split('::')[1])) for line in script_lines)

In [32]:
def combine_bows(bows):
    """Merge a collection of bags of words into one. Does not modify the input."""
    combined_bow = Counter()
    for bow in bows:
        combined_bow.update(bow)
    return combined_bow

def combine_character_bows(character_bows):
    """Merge a collection of mappings from characters to bags of words into one. Does not modify the input."""
    all_characters = set(k for bow in character_bows for k in bow.keys())
    combined_bow_set = {c: combine_bows(bow_map[c] for bow_map in character_bows) for c in all_characters}
    return defaultdict(Counter, combined_bow_set)

In [33]:
# Note: this is a bad metric -- it just finds words that this character said once and nobody else ever said.
def get_words_by_bayes(character_bows):
    """
    Given a mapping from characters to bags of words (for an episode, season, etc), 
    return a mapping from each character to a list of pairs (w, p), where w is a word 
    and p is the probability that a random occurrence of the word was spoken by that character.
    The lists are sorted by descending probability.
    """
    # Compute the total number of occurrences for each word
    combined_bow = combine_bows(character_bows.values())
    # p(character | word) = [# times character says word] / [# occurrences of word]
    words_by_bayes = {
        c: sorted(
            [(w, character_bows[c][w] / combined_bow[w]) for w in combined_bow.keys()],
            key=lambda wp: wp[1],
            reverse=True
        ) for c in character_bows.keys()
    }
    return words_by_bayes

In [34]:
from math import log
# Since we have very few characters, we want to weight idf highly (with a lower base)
base = 1.2

# I experimented with these parameters but ultimately decided they were more harmful than helpful on such a small dataset
def get_words_by_tfidf(character_bows, character_set=None, sublinear_tf=False, filter_df_1=False):
    """
    Given a mapping from characters to bags of words (for an episode, season, etc), 
    return a mapping from each character to a list of pairs (w, s), where w is a word 
    and s is the tf-idf score of that word for that character. The lists are sorted 
    by descending score.
    """
    # If no explicit character set is passed, just use all characters
    character_set = character_set or character_bows.keys()
    # Compute the number of characters that said each word
    all_words = set(k for bow in character_bows.values() for k in bow.keys())
    n_chars = len(character_set)
    idf = {
        w: 1 + log(n_chars / (1 + df), base)
        for w in all_words if (df := sum(character_bows[c][w] > 0 for c in character_set)) > (1 if filter_df_1 else 0)
    }
    # tf-idf = tf(w, c) / (1 + log(N / (1 + df(w))))
    words_by_tfidf = {
        c: sorted([
                (w, ((1 + log(v)) if sublinear_tf else v) * idf[w]) 
                for w, v in character_bows[c].items() 
                if w != 'total_count' and (not filter_df_1 or w in idf)
            ],
            key=lambda ws: ws[1],
            reverse=True
        ) for c in character_set
    }
    return words_by_tfidf

In [35]:
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize

def get_character_sentence_sentiments(script_lines, character_set=None):
    """
    Return a mapping from each character to a list of all their sentences along with the
    computed sentiment scores (positive, negative, neutral) for each sentence.
    """
    sid = SentimentIntensityAnalyzer()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    char_sentence_sentiments = defaultdict(list)
    for line in script_lines:
        speaker, line = line.split('::')
        speaker = speaker.lower().strip()
        speaker = aliases.get(speaker, speaker)
        if character_set and speaker not in character_set:
            continue
        
        char_sentence_sentiments[speaker].extend(
            (sid.polarity_scores(sentence), sentence) for sentence in tokenizer.tokenize(line)
        )

    return char_sentence_sentiments

In [36]:
def merge_character_sentence_sentiments(target_map, added_map):
    """
    Given two maps from characters to lists of sentence sentiments, combine all lists with the same key.
    The first argument will be modified.
    """
    for char in added_map.keys():
        target_map[char].extend(added_map[char])

In [37]:
def get_character_sentiment_frequencies(char_sentence_sentiments):
    """
    Given a map from characters to lists of sentence sentiments, return a map
    from characters to lists of the frequency of positive, negative, and neutral
    sentiments.
    """
    sentiment_freqs = defaultdict(lambda: defaultdict(int))
    for char, sentiments in char_sentence_sentiments.items():
        for (scores, _) in sentiments:
            sentiment_freqs[char][
                'pos' if scores['compound'] > 0.05 else 'neg' if scores['compound'] < -0.05 else 'neu'
            ] += 1/len(sentiments)
    return sentiment_freqs

In [61]:
for ep in listdir('transcripts/'):
    lines =[]
    with open(f'transcripts/{ep}') as f:
        lines = f.readlines()
    lines = [l for l in lines if '::' in l]
    with open(f'transcripts/{ep}', 'w') as f:
        f.writelines(lines)

In [59]:
from os import listdir

season_bows_collection = defaultdict(list)
season_bows = {}
season_tfidf = {}
season_tfidf_minor = {}
season_bayes = {}
total_word_count = {}
character_sentence_sentiments = defaultdict(list)

episodes = [ep.replace('.txt', '') for ep in listdir('transcripts/')]
for season in range(1, 10):
    for episode in (e for e in episodes if f'0{season}x' in e):
        with open(f'transcripts/{episode}.txt') as f:
            script_lines = f.readlines()
            season_bows_collection[season].append(get_ep_character_bows(script_lines))
            merge_character_sentence_sentiments(character_sentence_sentiments, get_character_sentence_sentiments(script_lines))
            total_word_count[episode] = get_ep_total_word_count(script_lines)
    season_bows[season] = combine_character_bows(season_bows_collection[season])
    # It helps to compute tf-idf scores for major characters only (excluding minor characters) or
    # else depth-of-character words like "know", "want", "think" are weighted too highly
    season_tfidf[season] = get_words_by_tfidf(season_bows[season], character_set=major_characters)
    season_tfidf_minor[season] = get_words_by_tfidf(season_bows[season], character_set=None) #major_characters | minor_characters)
    season_bayes[season] = get_words_by_bayes(season_bows[season])
season_bows['all'] = combine_character_bows([bows for s in range(1, 10) for bows in season_bows_collection[s]])
season_tfidf['all'] = get_words_by_tfidf(season_bows['all'], character_set=major_characters)
season_tfidf_minor['all'] = get_words_by_tfidf(season_bows['all'], character_set=major_characters | minor_characters)
season_bayes['all'] = get_words_by_bayes(season_bows['all'])
character_sentiment_frequencies = get_character_sentiment_frequencies(character_sentence_sentiments)

line: <<In memory of Larry Einhorn.
>>


ValueError: not enough values to unpack (expected 2, got 1)

In [46]:
# Sort by total word count
count_sorted = sorted(
    ((char, count) 
        for char in season_bows['all'].keys()
        if (count := season_bows['all'][char]['total_count'])),
    key=lambda x: x[1],
    reverse=True
)[:30]

KeyError: 'all'

In [50]:
from json import dump

words_json = defaultdict(dict)
count_json = defaultdict(list)
sentiment_json = defaultdict(list)
for c in sorted(major_characters | minor_characters):
    ct = c.title()
    for season in ['all', 1, 2, 3]:
        top_ten = [
            w for (w, _) in (
                season_tfidf[season][c][:10] if c in major_characters else season_tfidf_minor[season][c][:10]
            )
        ]
        words_json[ct][season] = top_ten
    words_json[ct]['word_count'] = season_bows['all'][c]['total_count']
    words_json[ct]['neg_freq'] = character_sentiment_frequencies[c]['neg']
    words_json[ct]['neu_freq'] = character_sentiment_frequencies[c]['neu']
    words_json[ct]['pos_freq'] = character_sentiment_frequencies[c]['pos']
    words_json[ct]['net_freq'] = character_sentiment_frequencies[c]['pos'] - character_sentiment_frequencies[c]['neg']

for (c , wc) in count_sorted[:10]:
    for season in range(1, 4):
        count_json[f'S{season}'].append(season_bows[season][c]['total_count'])
    count_json['characters'].append(c.title())

# Arbitrarily selected list of characters to highlight for sentiment analysis
for c in ['ty lee', 'iroh', 'toph', 'aang', 'azula', 'katara', 'zuko', 'sokka', 'mai', 'long feng']:
    sentiment_json['characters'].append(c.title())
    sentiment_json['pos'].append(character_sentiment_frequencies[c]['pos'])
    sentiment_json['neg'].append(-character_sentiment_frequencies[c]['neg'])

with open('results.json', 'w') as f:
    dump([words_json, count_json, sentiment_json], f)



TypeError: unsupported operand type(s) for |: 'NoneType' and 'NoneType'

In [13]:
# Sort by non-stopword word count
sorted(
    ((char, count) 
        for char in season_bows['all'].keys()
        if (count := sum(season_bows['all'][char].values()) - season_bows['all'][char]['total_count'])),
    key=lambda x: x[1],
    reverse=True
)[:30]

[('sokka', 7666),
 ('aang', 7337),
 ('katara', 5985),
 ('zuko', 3587),
 ('iroh', 2317),
 ('toph', 2164),
 ('azula', 1517),
 ('zhao', 729),
 ('jet', 562),
 ('suki', 480),
 ('hakoda', 464),
 ('bumi', 458),
 ('hama', 383),
 ('guru pathik', 380),
 ('ozai', 380),
 ('roku', 349),
 ('mai', 345),
 ('long feng', 324),
 ('mechanist', 322),
 ('piandao', 303),
 ('warden', 294),
 ('bato', 275),
 ('ty lee', 267),
 ('pakku', 262),
 ('jeong jeong', 247),
 ('zhang leader', 231),
 ('earth king', 217),
 ('chong', 214),
 ('professor zei', 207),
 ('yue', 203)]