In [238]:
# Speakers with the most dialogue; the tier comments give rough word-count thresholds.
major_characters = {
    # 1000+ non-stopwords / 3000+ total words
    'sokka', 'aang', 'katara', 'zuko', 'toph', 'iroh', 'azula',
    # 400+ non-stopwords / 1000+ total words
    'zhao', 'jet', 'suki', 'hakoda', 'bumi',
}

# Secondary speakers, again grouped by approximate non-stopword count.
minor_characters = {
    # 350+ non-stopwords
    'hama', 'ozai', 'guru pathik', 'mai', 'roku', 'long feng',
    'the mechanist', 'piandao', 'warden', 'bato', 'ty lee',
    'jeong jeong', 'pakku', 'earth king',  # 'zhang leader'
    # 200+ non-stopwords
    'professor zei', 'joo dee', 'chong', 'yue',
    # :)
    'appa', 'momo',
}

# Alternate (or misspelled) speaker labels found in the transcripts -> canonical speaker name.
aliases = {
    'fire lord ozai': 'ozai',
    'ruko': 'roku',
    'princess yue': 'yue',
    'princess yue / moon spirit': 'yue',
    'monk gyatso': 'gyatso',
    'themechanist': 'the mechanist',
    'zuko/blue spirit': 'zuko',
    'king bumi': 'bumi',
    'master pakku': 'pakku',
}

# Multi-word phrases -> the single token they are collapsed to before tokenizing.
# Insertion order is the order in which the replacements are applied.
multiword_words = {
    'jeong jeong': 'jeong_jeong',
    'long feng': 'long_feng',
    'ty lee': 'ty_lee',
    'joo dee': 'joo_dee',
    'wan shi tong': 'wan_shi_tong',
    'twinkle toes': 'twinkletoes',
    'twinkle-toes': 'twinkletoes',
}

In [239]:
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from functools import reduce
import re

def get_ep_character_bows(script_lines):
    """
    Return a set of bags of words for this episode as a mapping from characters to words to counts.

    Each element of `script_lines` must look like ``speaker::dialogue``. The speaker is
    lower-cased, stripped and passed through `aliases`; the dialogue is lower-cased and
    phrases listed in `multiword_words` are kept together as a single word.

    Besides the non-stop-word counts, every character's Counter carries a
    'total_count' entry holding the number of words (stop words included) they spoke.

    Raises ValueError if a line contains no '::' separator.
    """
    # Built once per call: the stop-word set does not depend on the line being processed.
    stop_words = set(stopwords.words('english')) | {
        'us', 'get', 'like', 'thats', 'go', 'going', 'cant', 'yeh', 'oh', 'got', 'hey',
        'yeah', 'uh', 'whats', 'could', 'shall', 'gonna', 'okay', 'one',
    }
    bow = defaultdict(Counter)
    for line in script_lines:
        # Split on the first '::' only, so dialogue that itself contains '::' is kept whole.
        speaker, text = line.split('::', 1)
        speaker = speaker.lower().strip()
        speaker = aliases.get(speaker, speaker)
        text = text.lower()
        # Make a special case to treat multi-words as one word: join them with '_' so that
        # \w+ keeps them together, then turn the underscores back into spaces.
        for phrase, token in multiword_words.items():
            text = text.replace(phrase, token)
        words = [w.replace('_', ' ') for w in re.findall(r'\w+', text)]
        bow[speaker]['total_count'] += len(words)
        bow[speaker].update(w for w in words if w not in stop_words)
    return bow

In [240]:
def get_ep_total_word_count(script_lines):
    """
    Return the number of words of dialogue in an episode.

    Each line must look like ``speaker::dialogue``; only the dialogue part is counted,
    using runs of word characters (``\\w+``) as words. The line is split on the first
    '::' only, so dialogue that itself contains '::' is counted in full.

    Raises IndexError if a line contains no '::' separator.
    """
    return sum(len(re.findall(r'\w+', line.split('::', 1)[1])) for line in script_lines)

In [241]:
def combine_bows(bows):
    """
    Add up several bags of words into a single new Counter.

    The inputs are only read; the result is a fresh Counter.
    """
    totals = Counter()
    # Counter.update adds counts to existing entries instead of overwriting them.
    for word_counts in bows:
        totals.update(word_counts)
    return totals

def combine_character_bows(character_bows):
    """
    Merge a collection of mappings from characters to bags of words into one. Does not modify the input.

    `character_bows` may be any iterable (including a generator) of dicts or defaultdicts.
    Returns a new defaultdict(Counter) holding, for every character that appears in at
    least one input mapping, the sum of that character's bags.
    """
    combined = defaultdict(Counter)
    # Only visit keys that are actually present: indexing a defaultdict input with a
    # character it lacks would silently insert an empty Counter into the caller's data.
    for bow_map in character_bows:
        for character, bow in bow_map.items():
            combined[character].update(bow)
    return combined


In [242]:
# Note: this is a bad metric -- it just finds words that this character said once and nobody else ever said.
def get_words_by_bayes(character_bows):
    """
    Given a mapping from characters to bags of words (for an episode, season, etc),
    return a mapping from each character to a list of pairs (w, p), where w is a word
    and p is the probability that a random occurrence of the word was spoken by that character.
    The lists are sorted by descending probability.

    Every character's list covers every word spoken by anyone, so words the character
    never said appear with p == 0. The 'total_count' bookkeeping entry that
    get_ep_character_bows stores next to the words is not a word and is left out.
    """
    # Compute the total number of occurrences for each word. combine_bows returns a fresh
    # Counter, so dropping the bookkeeping key here does not touch the caller's data.
    combined_bow = combine_bows(character_bows.values())
    combined_bow.pop('total_count', None)
    # p(character | word) = [# times character says word] / [# occurrences of word]
    words_by_bayes = {
        c: sorted(
            # .get() so a missing word counts as 0; `total > 0` guards the division.
            [(w, bow.get(w, 0) / total) for w, total in combined_bow.items() if total > 0],
            key=lambda wp: wp[1],
            reverse=True
        ) for c, bow in character_bows.items()
    }
    return words_by_bayes

In [253]:
from math import log
# Since we have very few characters, we want to weight idf highly (with a lower base)
base = 1.2

# I experimented with these parameters but ultimately decided they were more harmful than helpful on such a small dataset
def get_words_by_tfidf(character_bows, character_set=None, sublinear_tf=False, filter_df_1=False):
    """
    Given a mapping from characters to bags of words (for an episode, season, etc),
    return a mapping from each character to a list of pairs (w, s), where w is a word
    and s is the tf-idf score of that word for that character. The lists are sorted
    by descending score.

    Parameters:
        character_bows: mapping character -> Counter of word counts. It is only read;
            characters in `character_set` that are missing from it get an empty list.
        character_set: the characters that act as "documents". Defaults to every
            character in `character_bows` when omitted or empty.
        sublinear_tf: use 1 + ln(count) instead of the raw count as term frequency.
        filter_df_1: drop words that only a single character in `character_set` says.

    Every key of a bag is scored, including the 'total_count' bookkeeping entry that
    get_ep_character_bows stores next to the real words.
    """
    # If no explicit character set is passed, just use all characters
    character_set = character_set or character_bows.keys()
    n_chars = len(character_set)
    # Look characters up with .get(): indexing a defaultdict with a character it lacks
    # would insert an empty bag into the caller's data as a side effect.
    bows_in_set = {c: character_bows.get(c, {}) for c in character_set}
    # Compute the number of characters (within character_set) that said each word
    doc_freq = {}
    for bow in bows_in_set.values():
        for w, count in bow.items():
            if count > 0:
                doc_freq[w] = doc_freq.get(w, 0) + 1
    min_df = 1 if filter_df_1 else 0
    idf = {
        w: 1 + log(n_chars / (1 + df), base)
        for w, df in doc_freq.items() if df > min_df
    }
    # tf-idf = tf(w, c) * (1 + log_base(N / (1 + df(w))))
    words_by_tfidf = {
        c: sorted([
                (w, ((1 + log(v)) if sublinear_tf else v) * idf[w])
                # Words without an idf entry were filtered out above (or have no positive count)
                for w, v in bow.items() if w in idf
            ],
            key=lambda ws: ws[1],
            reverse=True
        ) for c, bow in bows_in_set.items()
    }
    return words_by_tfidf

In [254]:
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize

def get_character_sentence_sentiments(script_lines):
    """
    Return a mapping from each character to a list of (scores, sentence) pairs, one per
    sentence they speak, where `scores` is the dict returned by VADER's polarity_scores
    for that sentence.

    Each line must look like ``speaker::dialogue``. The speaker is lower-cased, stripped
    and passed through `aliases`; lines spoken by anyone outside
    major_characters | minor_characters are skipped.

    Raises ValueError if a line contains no '::' separator.
    """
    sid = SentimentIntensityAnalyzer()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # The set of speakers we keep is the same for every line, so build it once.
    tracked_characters = major_characters | minor_characters

    char_sentence_sentiments = defaultdict(list)
    for line in script_lines:
        # Split on the first '::' only, so dialogue that itself contains '::' is kept whole.
        speaker, line = line.split('::', 1)
        speaker = speaker.lower().strip()
        speaker = aliases.get(speaker, speaker)
        if speaker not in tracked_characters:
            continue

        char_sentence_sentiments[speaker].extend(
            (sid.polarity_scores(sentence), sentence) for sentence in tokenizer.tokenize(line)
        )

    return char_sentence_sentiments

In [255]:
def merge_character_sentence_sentiments(target_map, added_map):
    """
    Append every character's sentence-sentiment list from `added_map` onto the list
    stored under the same character in `target_map`.

    `target_map` is updated in place and nothing is returned.
    """
    for character, new_sentiments in added_map.items():
        target_map[character].extend(new_sentiments)


In [256]:
def get_characters_and_sentiment_frequencies(char_sentence_sentiments):
    """
    Given a map from characters to lists of sentence sentiments, return a map
    from characters to the fraction of their sentences that are positive ('pos'),
    negative ('neg') and neutral ('neu').

    A sentence is 'pos' when its compound score is above 0.05, 'neg' when it is below
    -0.05 and 'neu' otherwise. Labels a character never receives are absent from their
    inner mapping (which is a defaultdict(int), so reading them yields 0). Characters
    with an empty sentence list get no entry.
    """
    sentiment_freqs = defaultdict(lambda: defaultdict(int))
    for char, sentiments in char_sentence_sentiments.items():
        # Count with integers first and divide once: repeatedly adding 1/len(sentiments)
        # accumulates floating-point error (e.g. ten neutral sentences would sum to 0.999...).
        label_counts = defaultdict(int)
        for (scores, _) in sentiments:
            compound = scores['compound']
            if compound > 0.05:
                label = 'pos'
            elif compound < -0.05:
                label = 'neg'
            else:
                label = 'neu'
            label_counts[label] += 1
        total = len(sentiments)
        for label, count in label_counts.items():
            sentiment_freqs[char][label] = count / total
    return sentiment_freqs

In [257]:
# Accumulators filled while walking every transcript.
season_bows_collection = defaultdict(list)    # season -> list of per-episode character bows
season_bows = {}                              # season or 'all' -> merged character bows
season_tfidf = {}                             # tf-idf over major characters only
season_tfidf_minor = {}                       # tf-idf over major + minor characters
season_bayes = {}
total_word_count = {}                         # episode id such as '101' -> words of dialogue
char_sentence_sentiments = defaultdict(list)

# NOTE(review): assumes seasons 1-3 each have transcripts numbered 01..20 -- confirm against the data.
for season in range(1, 4):
    for episode in range(1, 21):
        ep_num = f'{season}{episode:02d}'
        with open(f'transcripts/{ep_num}.txt') as f:
            script_lines = f.readlines()
        season_bows_collection[season].append(get_ep_character_bows(script_lines))
        merge_character_sentence_sentiments(
            char_sentence_sentiments,
            get_character_sentence_sentiments(script_lines),
        )
        total_word_count[ep_num] = get_ep_total_word_count(script_lines)
    season_bows[season] = combine_character_bows(season_bows_collection[season])

# Whole-series bags are merged from the per-episode bags of all three seasons.
season_bows['all'] = combine_character_bows(
    [bows for s in range(1, 4) for bows in season_bows_collection[s]]
)

# Derived statistics for each season and for the whole series.
# It helps to compute tf-idf scores for major characters only (excluding minor characters) or
# else depth-of-character words like "know", "want", "think" are weighted too highly
for scope, scope_bows in season_bows.items():
    season_tfidf[scope] = get_words_by_tfidf(scope_bows, character_set=major_characters)
    season_tfidf_minor[scope] = get_words_by_tfidf(
        scope_bows, character_set=major_characters | minor_characters
    )
    season_bayes[scope] = get_words_by_bayes(scope_bows)

4.80178401692393
7.379666948106317
2.577882931182385
6.379666948106316
2.577882931182385
4.80178401692393
2.0
3.0551242323191636


In [281]:
# Katara's highest tf-idf words over the whole series (entries 1..14 of the ranked list).
# NOTE(review): index 0 is skipped, presumably because it is the 'total_count' bookkeeping
# entry that every bag carries rather than a real word -- confirm.
season_tfidf['all']['katara'][1:15]

[('aang', 319.08412104554384),
 ('gran', 240.89990494774008),
 ('sokka', 206.0),
 ('toph', 150.84638444002218),
 ('waterbending', 81.63032828770682),
 ('appa', 72.18072207310678),
 ('nini', 64.96481471753644),
 ('mum', 64.96481471753644),
 ('haru', 63.81962418616323),
 ('momo', 63.30073702403216),
 ('mom', 59.34444096003015),
 ('pole', 59.34444096003015),
 ('mother', 59.34444096003015),
 ('earthbending', 59.34444096003015)]

In [262]:
# Spot check: how many times Zuko says "firebending" across all seasons.
season_bows['all']['zuko']['firebending']

17

In [278]:
# Every 20th (word, idf) pair in ascending idf order. This reads the module-level `idf`
# dict, which is built by the scratch cell that recomputes idf for the major characters,
# so that cell has to be run before this one.
sorted(idf.items(), key=lambda x: x[1])[::20]

[('little', 0.5609805604940379),
 ('nation', 0.5609805604940379),
 ('great', 1.0),
 ('free', 1.477241301136777),
 ('better', 1.477241301136777),
 ('anything', 1.477241301136777),
 ('call', 2.0),
 ('girl', 2.0),
 ('everything', 2.0),
 ('mission', 2.577882931182385),
 ('sea', 2.577882931182385),
 ('taking', 2.577882931182385),
 ('comet', 2.577882931182385),
 ('food', 3.223901085741545),
 ('idea', 3.223901085741545),
 ('happy', 3.223901085741545),
 ('move', 3.223901085741545),
 ('kyoshi', 3.223901085741545),
 ('rock', 3.95629606400201),
 ('technique', 3.95629606400201),
 ('middle', 3.95629606400201),
 ('perfect', 3.95629606400201),
 ('iroh', 3.95629606400201),
 ('decided', 3.95629606400201),
 ('fast', 3.95629606400201),
 ('comes', 3.95629606400201),
 ('figure', 4.80178401692393),
 ('tried', 4.80178401692393),
 ('dark', 4.80178401692393),
 ('sight', 4.80178401692393),
 ('stick', 4.80178401692393),
 ('busy', 4.80178401692393),
 ('taught', 4.80178401692393),
 ('shut', 4.80178401692393),
 ('b

In [265]:
# Scratch cell: rebuild, at module level, the idf table for the major characters over the
# whole series so that it can be inspected directly.
character_set = major_characters
character_bows = season_bows['all']
all_words = {word for bow in character_bows.values() for word in bow.keys()}
n_chars = len(character_set)
idf = {}
for w in all_words:
    # df = number of characters in character_set that said the word at least once
    df = sum(character_bows[c][w] > 0 for c in character_set)
    if df > 0:
        idf[w] = 1 + log(n_chars / (1 + df), base)

In [119]:
# Total words of dialogue across every episode that was loaded.
sum(total_word_count.values())

120138

In [121]:
# Spot check: how many times Sokka says "weirdness" across all seasons.
season_bows['all']['sokka']['weirdness']

1

In [122]:
# First 15 entries of Sokka's list ranked by p(character | word) over the whole series.
season_bayes['all']['sokka'][:15]

[('scare', 1.0),
 ('cookin', 1.0),
 ('weirdness', 1.0),
 ('steering', 1.0),
 ('waterbended', 1.0),
 ('screw', 1.0),
 ('freakish', 1.0),
 ('ewww', 1.0),
 ('aahh', 1.0),
 ('bolt', 1.0),
 ('sneezed', 1.0),
 ('beams', 1.0),
 ('snot', 1.0),
 ('uuuuugh', 1.0),
 ('stab', 1.0)]

In [123]:
# Inspect the (scores, sentence) pairs collected for Ty Lee.
char_sentence_sentiments['ty lee']

[({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, ' Azula!'),
 ({'neg': 0.0, 'neu': 0.598, 'pos': 0.402, 'compound': 0.6166},
  ' It is so good to see you!'),
 ({'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'compound': 0.296}, ' Oh yeah.'),
 ({'neg': 0.0, 'neu': 0.445, 'pos': 0.555, 'compound': 0.5777},
  'He was so funny.'),
 ({'neg': 0.0, 'neu': 0.323, 'pos': 0.677, 'compound': 0.6369},
  ' Oh...I...uh...would love to.'),
 ({'neg': 0.0, 'neu': 0.488, 'pos': 0.512, 'compound': 0.7425},
  "But the truth is, I'm really happy here."),
 ({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
  'I mean, my aura has never been pinker!'),
 ({'neg': 0.0, 'neu': 0.444, 'pos': 0.556, 'compound': 0.3612},
  ' Thank you, Azula.'),
 ({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
  ' Uh...yeah...sure...uh...of course...'),
 ({'neg': 0.347, 'neu': 0.653, 'pos': 0.0, 'compound': -0.5023},
  " I'm sorry Azula, but unfortunately there won't be a show tomorrow."),
 ({'neg': 0.0, 'neu': 0.784,

In [124]:
# Fraction of positive / negative / neutral sentences for every tracked character.
get_characters_and_sentiment_frequencies(char_sentence_sentiments)

defaultdict(<function __main__.get_characters_and_sentiment_frequencies.<locals>.<lambda>()>,
            {'sokka': defaultdict(int,
                         {'neu': 0.5657534246575499,
                          'neg': 0.20410958904109833,
                          'pos': 0.23013698630137325}),
             'katara': defaultdict(int,
                         {'neu': 0.5485395189003418,
                          'neg': 0.19802405498281725,
                          'pos': 0.2534364261168377}),
             'zuko': defaultdict(int,
                         {'neu': 0.5496872828352996,
                          'pos': 0.24252953439888933,
                          'neg': 0.20778318276581034}),
             'iroh': defaultdict(int,
                         {'neu': 0.43678160919540476,
                          'pos': 0.37356321839080653,
                          'neg': 0.1896551724137935}),
             'aang': defaultdict(int,
                         {'neu': 0.573233959818546,
          

In [125]:
# Suki's highest tf-idf words over the whole series (entries 1..14 of the ranked list).
# NOTE(review): index 0 is skipped, presumably because it is the 'total_count' bookkeeping
# entry that every bag carries rather than a real word -- confirm.
season_tfidf['all']['suki'][1:15]

[('sokka', 53.33576813857113),
 ('unagi', 51.94751862035726),
 ('warrior', 47.944778895405705),
 ('gondola', 41.91093093472921),
 ('safely', 40.21778375416927),
 ('sorry', 39.308849623729316),
 ('warden', 39.30194181625914),
 ('tickets', 38.786248210482114),
 ('mean', 37.49031541235173),
 ('guys', 37.203329527591016),
 ('big', 35.61735676579326),
 ('fly', 35.473951045376594),
 ('huh', 34.893509477357604),
 ('help', 34.79479532095062)]

In [126]:
# Rank speakers by total words spoken (stop words included) and show the top 40.
# Speakers whose 'total_count' is 0 are left out.
sorted(
    [
        (char, bow['total_count'])
        for char, bow in season_bows['all'].items()
        if bow['total_count']
    ],
    key=lambda pair: pair[1],
    reverse=True,
)[:40]

[('sokka', 19024),
 ('aang', 18846),
 ('katara', 15465),
 ('zuko', 9693),
 ('toph', 5675),
 ('iroh', 5437),
 ('azula', 3799),
 ('zhao', 1678),
 ('jet', 1444),
 ('suki', 1278),
 ('hakoda', 1105),
 ('bumi', 1045),
 ('ozai', 952),
 ('hama', 936),
 ('guru pathik', 862),
 ('mai', 844),
 ('roku', 806),
 ('long feng', 776),
 ('warden', 750),
 ('bato', 696),
 ('piandao', 689),
 ('the mechanist', 682),
 ('ty lee', 675),
 ('pakku', 587),
 ('yue', 578),
 ('jeong jeong', 562),
 ('chong', 533),
 ('earth king', 520),
 ('joo dee', 489),
 ('zhang leader', 469),
 ('old man', 456),
 ('dock', 454),
 ('professor zei', 439),
 ('teo', 392),
 ('chief arnook', 372),
 ('general fong', 370),
 ('aunt wu', 358),
 ('guard', 358),
 ('gan jin leader', 356),
 ('sun warrior chief', 344)]

In [127]:
# Rank speakers by non-stopword count and show the top 40. Each bag's values sum to
# (non-stopword occurrences + 'total_count'), so subtracting 'total_count' leaves the
# non-stopword occurrences. Speakers with a count of 0 are left out.
sorted(
    [
        (char, non_stop)
        for char, non_stop in (
            (char, sum(bow.values()) - bow['total_count'])
            for char, bow in season_bows['all'].items()
        )
        if non_stop
    ],
    key=lambda pair: pair[1],
    reverse=True,
)[:40]

[('sokka', 7787),
 ('aang', 7445),
 ('katara', 6061),
 ('zuko', 3650),
 ('iroh', 2355),
 ('toph', 2199),
 ('azula', 1552),
 ('zhao', 729),
 ('jet', 568),
 ('suki', 482),
 ('hakoda', 473),
 ('bumi', 464),
 ('ozai', 390),
 ('hama', 387),
 ('guru pathik', 381),
 ('mai', 351),
 ('roku', 350),
 ('long feng', 345),
 ('the mechanist', 327),
 ('piandao', 305),
 ('warden', 300),
 ('bato', 279),
 ('ty lee', 272),
 ('pakku', 265),
 ('jeong jeong', 250),
 ('zhang leader', 231),
 ('earth king', 229),
 ('chong', 216),
 ('joo dee', 212),
 ('professor zei', 211),
 ('yue', 205),
 ('old man', 189),
 ('dock', 187),
 ('chief arnook', 178),
 ('gan jin leader', 169),
 ('general fong', 160),
 ('shyu', 157),
 ('teo', 155),
 ('aunt wu', 152),
 ('sun warrior chief', 147)]