In [99]:
# Speaker groups used to filter the analysis. The trailing notes are kept from
# the original cell and record approximate non-stopword tallies.
important_characters = {
    'sokka', 'aang', 'katara', 'zuko', 'toph', 'iroh', 'azula',  # 1000+ non-stopwords
    'zhao', 'jet', 'suki', 'hakoda', 'bumi',                     # 400+ non-stopwords
}

minor_characters = {
    'hama', 'ozai', 'guru pathik', 'mai', 'roku', 'long feng',   # 350+ non-stopwords
    'the mechanist', 'piandao', 'warden', 'bato', 'ty lee',
    'jeong jeong', 'pakku', 'zhang leader', 'earth king',
    'professor zei', 'joo dee', 'chong', 'yue',                  # 200+ non-stopwords
    'appa', 'momo',                                              # :)
}

# Alternate speaker labels (already lower-cased) mapped to the canonical name.
aliases = {
    'fire lord ozai': 'ozai',
    'ruko': 'roku',
    'princess yue': 'yue',
    'princess yue / moon spirit': 'yue',
    'monk gyatso': 'gyatso',
    'themechanist': 'the mechanist',
    'zuko/blue spirit': 'zuko',
    'king bumi': 'bumi',
    'master pakku': 'pakku',
}

In [100]:
from collections import Counter, defaultdict
from nltk.corpus import stopwords
import re

def get_ep_character_bows(script_lines):
    """
    Build per-character bags of words for one episode.

    Each non-blank entry of `script_lines` must look like ``speaker::dialogue``.
    The speaker is lower-cased, stripped and passed through `aliases`; the
    dialogue is lower-cased and tokenised with the regex ``\\w+``.

    Returns a defaultdict mapping speaker -> Counter. Each Counter holds the
    counts of the speaker's non-stop-word tokens, plus the special key
    'total_count' with the number of ALL tokens (stop words included) the
    speaker said. Because 'total_count' lives in the same Counter, anything
    that iterates the bag sees it like an ordinary word.

    Whitespace-only lines are skipped. A non-blank line without '::' still
    raises ValueError, as before.
    """
    # Built once per call; the set does not depend on the line being processed.
    stop_words = set(stopwords.words('english')) | {
        'us', 'get', 'like', 'thats', 'go', 'going', 'cant', 'yeh', 'oh', 'got', 'hey',
        'yeah', 'uh', 'whats', 'could', 'shall', 'gonna', 'okay', 'one',
    }

    bow = defaultdict(Counter)
    for raw_line in script_lines:
        if not raw_line.strip():
            continue
        # maxsplit=1 keeps any '::' inside the dialogue as part of the dialogue.
        speaker, text = raw_line.split('::', 1)
        speaker = speaker.lower().strip()
        speaker = aliases.get(speaker, speaker)

        words = re.findall(r'\w+', text.lower())
        bow[speaker]['total_count'] += len(words)
        bow[speaker].update(w for w in words if w not in stop_words)
    return bow

In [101]:
def get_ep_total_word_count(script_lines):
    """
    Count every ``\\w+`` token of dialogue in an episode (stop words included).

    Each non-blank entry of `script_lines` must look like ``speaker::dialogue``;
    only the dialogue part is counted. Whitespace-only lines are skipped. A
    non-blank line without '::' still raises IndexError, as before.
    """
    total = 0
    for raw_line in script_lines:
        if not raw_line.strip():
            continue
        # maxsplit=1 so that a '::' inside the dialogue does not truncate it.
        dialogue = raw_line.split('::', 1)[1]
        total += len(re.findall(r'\w+', dialogue))
    return total

In [102]:
def combine_bows(bows):
    """
    Merge an iterable of bags of words into a single new Counter.

    Counts for the same word are added together. The input bags are not modified.
    """
    combined_bow = Counter()
    for bow in bows:
        combined_bow.update(bow)
    return combined_bow

def combine_character_bows(character_bows):
    """
    Merge an iterable of {character: bag of words} mappings into one.

    Returns a defaultdict(Counter) holding, for every character that appears in
    at least one input mapping, the sum of that character's bags.

    The inputs are not modified. Characters missing from a mapping are skipped
    with a membership test rather than indexed, because indexing a defaultdict
    would insert an empty bag into the caller's data and indexing a plain dict
    would raise KeyError.
    """
    # Materialise once: the collection is walked twice and may be a generator.
    bow_maps = list(character_bows)
    all_characters = {c for bow_map in bow_maps for c in bow_map}
    combined_bow_set = {
        c: combine_bows(bow_map[c] for bow_map in bow_maps if c in bow_map)
        for c in all_characters
    }
    return defaultdict(Counter, combined_bow_set)


In [103]:
# Note: this is a bad metric -- it just finds words that this character said once and nobody else ever said.
def get_words_by_bayes(character_bows):
    """
    Rank, for every character, all words by p(character | word).

    `character_bows` maps characters to bags of words (for an episode, a
    season, ...). The result maps each character to a list of (word, p) pairs
    covering every word in the combined vocabulary, where

        p = [# times the character says the word] / [# occurrences of the word]

    Each list is ordered from the highest p to the lowest.
    """
    # Occurrences of each word summed over all characters.
    totals = combine_bows(character_bows.values())

    ranked = {}
    for speaker in character_bows.keys():
        speaker_bow = character_bows[speaker]
        shares = [(word, speaker_bow[word] / totals[word]) for word in totals.keys()]
        shares.sort(key=lambda pair: pair[1], reverse=True)
        ranked[speaker] = shares
    return ranked

In [104]:
from math import log
# Since we have very few characters, we want to weight idf highly (with a lower base)
base = 1.2

def get_words_by_tfidf(character_bows, log_base=None):
    """
    Rank each character's words by a tf-idf style score.

    Parameters
    ----------
    character_bows : mapping of character -> bag of words (word -> count),
        for an episode, season, etc.
    log_base : logarithm base used for the idf term. Defaults to the
        module-level `base`, read at call time.

    Returns
    -------
    dict mapping each character to a list of (word, score) pairs, sorted by
    descending score, where

        df(w)       = number of characters whose count for w is > 0
        idf(w)      = 1 + log(N / (1 + df(w)), log_base)    (N = number of characters)
        score(w, c) = (1 + count(w, c)) * idf(w)

    Every key in a bag is scored, including bookkeeping keys such as the
    'total_count' entry that get_ep_character_bows stores next to the words.
    The input is not modified.
    """
    if log_base is None:
        log_base = base

    n_chars = len(character_bows)
    # Document frequency in one pass over the bags, instead of re-scanning
    # every character for every word.
    doc_freq = Counter(
        w for bow in character_bows.values() for w, count in bow.items() if count > 0
    )
    all_words = {w for bow in character_bows.values() for w in bow}
    idf = {w: 1 + log(n_chars / (1 + doc_freq[w]), log_base) for w in all_words}

    # score = (1 + tf(w, c)) * idf(w)
    words_by_tfidf = {
        c: sorted(
            [(w, (1 + v) * idf[w]) for w, v in bow.items()],
            key=lambda ws: ws[1],
            reverse=True
        ) for c, bow in character_bows.items()
    }
    return words_by_tfidf

In [105]:
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize

def get_character_sentence_sentiments(script_lines):
    """
    Score every sentence spoken by a tracked character in one episode.

    Each non-blank entry of `script_lines` must look like ``speaker::dialogue``.
    The speaker is lower-cased, stripped and passed through `aliases`; lines
    whose speaker is in neither `important_characters` nor `minor_characters`
    are ignored. The dialogue is split into sentences with the Punkt tokenizer
    and each sentence is scored with VADER.

    Returns a defaultdict(list) mapping each character to a list of
    (scores, sentence) tuples, where `scores` is whatever
    SentimentIntensityAnalyzer.polarity_scores returns for the sentence.

    Whitespace-only lines are skipped. A non-blank line without '::' still
    raises ValueError, as before.
    """
    sid = SentimentIntensityAnalyzer()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Built once per call instead of once per script line.
    tracked_characters = important_characters | minor_characters

    char_sentence_sentiments = defaultdict(list)
    for raw_line in script_lines:
        if not raw_line.strip():
            continue
        # maxsplit=1 keeps any '::' inside the dialogue as part of the dialogue.
        speaker, text = raw_line.split('::', 1)
        speaker = speaker.lower().strip()
        speaker = aliases.get(speaker, speaker)
        if speaker not in tracked_characters:
            continue

        char_sentence_sentiments[speaker].extend(
            (sid.polarity_scores(sentence), sentence) for sentence in tokenizer.tokenize(text)
        )

    return char_sentence_sentiments

In [106]:
def merge_character_sentence_sentiments(target_map, added_map):
    """
    Append every list in `added_map` to the list stored under the same key in `target_map`.

    Both arguments map characters to lists of sentence sentiments. `target_map`
    is modified in place and nothing is returned; `added_map` is left untouched.
    Characters not yet present in `target_map` get a fresh list, so the target
    may be a plain dict as well as a defaultdict(list).
    """
    for char, sentiments in added_map.items():
        target_map.setdefault(char, []).extend(sentiments)


In [107]:
def get_characters_and_sentiment_frequencies(char_sentence_sentiments):
    """
    Compute, per character, the fraction of sentences labelled pos / neg / neu.

    `char_sentence_sentiments` maps characters to lists of (scores, sentence)
    pairs, where `scores['compound']` is the sentence's compound score. A
    sentence is labelled 'pos' when compound > 0.05, 'neg' when
    compound < -0.05 and 'neu' otherwise.

    Returns a defaultdict mapping each character that has at least one sentence
    to a defaultdict(int) of label -> fraction. Labels that never occur for a
    character are absent from its mapping.
    """
    sentiment_freqs = defaultdict(lambda: defaultdict(int))
    for char, sentiments in char_sentence_sentiments.items():
        n_sentences = len(sentiments)
        if n_sentences == 0:
            # Nothing to count; do not create an entry for this character.
            continue

        label_counts = sentiment_freqs[char]
        for scores, _ in sentiments:
            compound = scores['compound']
            if compound > 0.05:
                label = 'pos'
            elif compound < -0.05:
                label = 'neg'
            else:
                label = 'neu'
            label_counts[label] += 1

        # Divide once per label instead of summing 1/n repeatedly, which
        # accumulates floating-point error (ten 0.1s do not add up to 1.0).
        for label in list(label_counts):
            label_counts[label] = label_counts[label] / n_sentences
    return sentiment_freqs

In [108]:
# Accumulators filled while walking the transcripts.
season_bows_collection = defaultdict(list)    # season -> list of per-episode character bags
season_bows, season_tfidf, season_bayes = {}, {}, {}
total_word_count = {}                         # episode number string -> token count
char_sentence_sentiments = defaultdict(list)  # character -> (scores, sentence) pairs, all episodes

for season in (1, 2, 3):
    for episode in range(1, 21):
        # Episode files are named <season><two-digit episode>.txt
        ep_num = f'{season}{episode:02d}'
        with open(f'transcripts/{ep_num}.txt') as f:
            script_lines = f.readlines()

        season_bows_collection[season].append(get_ep_character_bows(script_lines))
        merge_character_sentence_sentiments(
            char_sentence_sentiments,
            get_character_sentence_sentiments(script_lines),
        )
        total_word_count[ep_num] = get_ep_total_word_count(script_lines)

    # Per-season aggregates, computed once the season's episodes are in.
    season_bows[season] = combine_character_bows(season_bows_collection[season])
    season_tfidf[season] = get_words_by_tfidf(season_bows[season])
    season_bayes[season] = get_words_by_bayes(season_bows[season])

# Whole-series aggregates, stored under the key 'all'.
season_bows['all'] = combine_character_bows(
    [ep_bows for n in (1, 2, 3) for ep_bows in season_bows_collection[n]]
)
season_tfidf['all'] = get_words_by_tfidf(season_bows['all'])
season_bayes['all'] = get_words_by_bayes(season_bows['all'])

In [109]:
# Total number of \w+ dialogue tokens over all processed episodes (stop words included).
sum(total_word_count.values())

120138

In [119]:
# Spot check: Sokka's count for the word 'weirdness' over all seasons.
season_bows['all']['sokka']['weirdness']

1

In [111]:
# First 15 entries of Sokka's all-season ranking by p(character | word).
season_bayes['all']['sokka'][:15]

[('scare', 1.0),
 ('cookin', 1.0),
 ('weirdness', 1.0),
 ('steering', 1.0),
 ('waterbended', 1.0),
 ('screw', 1.0),
 ('freakish', 1.0),
 ('ewww', 1.0),
 ('aahh', 1.0),
 ('bolt', 1.0),
 ('sneezed', 1.0),
 ('beams', 1.0),
 ('snot', 1.0),
 ('uuuuugh', 1.0),
 ('stab', 1.0)]

In [22]:
# Every Ty Lee sentence collected so far, paired with its polarity scores.
char_sentence_sentiments['ty lee']

[({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, ' Azula!'),
 ({'neg': 0.0, 'neu': 0.598, 'pos': 0.402, 'compound': 0.6166},
  ' It is so good to see you!'),
 ({'neg': 0.0, 'neu': 0.312, 'pos': 0.688, 'compound': 0.296}, ' Oh yeah.'),
 ({'neg': 0.0, 'neu': 0.445, 'pos': 0.555, 'compound': 0.5777},
  'He was so funny.'),
 ({'neg': 0.0, 'neu': 0.323, 'pos': 0.677, 'compound': 0.6369},
  ' Oh...I...uh...would love to.'),
 ({'neg': 0.0, 'neu': 0.488, 'pos': 0.512, 'compound': 0.7425},
  "But the truth is, I'm really happy here."),
 ({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
  'I mean, my aura has never been pinker!'),
 ({'neg': 0.0, 'neu': 0.444, 'pos': 0.556, 'compound': 0.3612},
  ' Thank you, Azula.'),
 ({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
  ' Uh...yeah...sure...uh...of course...'),
 ({'neg': 0.347, 'neu': 0.653, 'pos': 0.0, 'compound': -0.5023},
  " I'm sorry Azula, but unfortunately there won't be a show tomorrow."),
 ({'neg': 0.0, 'neu': 0.784,

In [23]:
# Per-character fraction of sentences labelled pos / neg / neu.
get_characters_and_sentiment_frequencies(char_sentence_sentiments)

defaultdict(<function __main__.get_characters_and_sentiment_frequencies.<locals>.<lambda>()>,
            {'sokka': defaultdict(int,
                         {'neu': 0.5657534246575499,
                          'neg': 0.20410958904109833,
                          'pos': 0.23013698630137325}),
             'katara': defaultdict(int,
                         {'neu': 0.5485395189003418,
                          'neg': 0.19802405498281725,
                          'pos': 0.2534364261168377}),
             'zuko': defaultdict(int,
                         {'neu': 0.5496872828352996,
                          'pos': 0.24252953439888933,
                          'neg': 0.20778318276581034}),
             'iroh': defaultdict(int,
                         {'neu': 0.43678160919540476,
                          'pos': 0.37356321839080653,
                          'neg': 0.1896551724137935}),
             'aang': defaultdict(int,
                         {'neu': 0.573233959818546,
          

In [55]:
# Entries 2-15 of Zuko's season-3 tf-idf ranking.
# NOTE(review): index 0 is skipped -- presumably it is the 'total_count' bookkeeping
# key that get_ep_character_bows stores alongside the words; confirm.
season_tfidf[3]['zuko'][1:15]

[('father', 95.04460042309827),
 ('uncle', 79.8199937280543),
 ('mai', 68.43211230463076),
 ('raiders', 58.964814717536434),
 ('stone', 49.13734559794703),
 ('changed', 45.62140820308717),
 ('dragon', 45.62140820308717),
 ('dragons', 45.62140820308717),
 ('history', 42.17979571865833),
 ('meeting', 42.17979571865833),
 ('firebending', 41.38814489602815),
 ('killed', 39.30987647835762),
 ('accepted', 39.30987647835762),
 ('sages', 39.30987647835762)]

In [None]:
# Entries 2-15 of Suki's all-season tf-idf ranking.
# NOTE(review): index 0 is skipped -- presumably it is the 'total_count' bookkeeping
# key that get_ep_character_bows stores alongside the words; confirm.
season_tfidf['all']['suki'][1:15]

In [None]:
# Speakers ranked by their 'total_count' tally over all seasons, largest first;
# speakers whose tally is zero are left out.
sorted(
    [
        (char, count)
        for char in season_bows['all']
        if (count := season_bows['all'][char]['total_count'])
    ],
    key=lambda pair: pair[1],
    reverse=True,
)