In [905]:
# Speakers whose lines are analysed; anyone not listed here is ignored.
# Names are lower-case because speaker names are lower-cased before lookup.
important_characters = [
    'sokka', 'aang', 'katara', 'zuko', 'toph', 'iroh', 'azula',         # 1000+ non-stopwords
    'zhao', 'jet', 'suki', 'hakoda', 'bumi',                            # 400+ non-stopwords
    'hama', 'ozai', 'guru pathik', 'mai', 'roku', 'long feng',          # 350+ non-stopwords
    'the mechanist', 'piandao', 'warden', 'bato', 'ty lee',
    # 'jeong jeong', 'pakku', 'zhang leader', 'earth king',
    # 'professor zei', 'joo dee', 'chong', 'yue',                         # 200+ non-stopwords
    # 'appa', 'momo'                                                      # :)
]

# Alternate spellings / titles found in the transcripts, mapped to a canonical name.
# A canonical name that is not in important_characters is still filtered out.
aliases = {
    'fire lord ozai': 'ozai',
    'ruko': 'roku',
    'princess yue': 'yue',
    'princess yue / moon spirit': 'yue',
    'monk gyatso': 'gyatso',
    'themechanist': 'the mechanist',
    'zuko/blue spirit': 'zuko',
    'king bumi': 'bumi',
    'master pakku': 'pakku',
}

In [906]:
from collections import Counter, defaultdict
from nltk.corpus import stopwords
import re

def get_ep_character_bows(script_lines):
    """
    Build per-character bags of words for one episode.

    Each element of `script_lines` is expected to look like ``speaker::dialogue``.
    The speaker is lower-cased, stripped and passed through `aliases`; lines whose
    speaker is not in `important_characters` are ignored.

    Returns a defaultdict mapping each character to a Counter of the non-stop-words
    they spoke. Each Counter also carries the special key 'total_count', holding the
    number of words (stop-words included) spoken by that character.

    Blank lines are skipped. Raises ValueError for a non-blank line without '::'.
    """
    # The stop-word set does not depend on the line, so build it once per call
    # instead of once per script line.
    stop_words = set(stopwords.words('english'))
    stop_words.update([
        'us', 'get', 'like', 'thats', 'go', 'going', 'cant', 'yeh', 'oh', 'got', 'hey',
        'yeah', 'uh', 'whats', 'could', 'shall', 'gonna', 'okay', 'one',
    ])

    bow = defaultdict(Counter)
    for script_line in script_lines:
        if not script_line.strip():
            continue
        # Split on the first '::' only, so a '::' inside the dialogue is kept as text.
        speaker, separator, text = script_line.partition('::')
        if not separator:
            raise ValueError(f"expected 'speaker::dialogue', got {script_line!r}")
        speaker = speaker.lower().strip()
        speaker = aliases.get(speaker, speaker)
        if speaker not in important_characters:
            continue
        words = re.findall(r'\w+', text.lower())
        # Counted before stop-word removal, so this is the character's full word count.
        bow[speaker]['total_count'] += len(words)
        bow[speaker].update(w for w in words if w not in stop_words)
    return bow

In [907]:
def combine_bows(bows):
    """Sum several bags of words into a fresh Counter, leaving every input bag untouched."""
    merged = Counter()
    add_counts = merged.update
    for single_bag in bows:
        add_counts(single_bag)
    return merged

def combine_character_bows(character_bows):
    """
    Merge a collection of mappings from characters to bags of words into one.

    `character_bows` may be any iterable (including a generator) of mappings from
    character name to a bag of words. Returns a new defaultdict(Counter) whose
    entry for each character is the sum of that character's bags across all inputs.

    Does not modify the input: mappings are only read through .items(), so a
    defaultdict input never gains empty entries for characters it did not contain.
    """
    combined = defaultdict(Counter)
    for bow_map in character_bows:
        for character, bow in bow_map.items():
            combined[character].update(bow)
    return combined


In [908]:
def get_words_by_bayes(character_bows):
    """
    Rank, for every character, how strongly each word is associated with them.

    `character_bows` maps characters to bags of words (for an episode, season, etc).
    The result maps each character to a list of (word, p) pairs covering every word
    in the combined vocabulary, where

        p = [# times the character says the word] / [# occurrences of the word]

    i.e. the probability that a random occurrence of the word was spoken by that
    character. Each list is ordered from highest to lowest p.
    """
    # Total occurrences of each word across all characters.
    word_totals = combine_bows(character_bows.values())

    def ranked_shares(bow):
        shares = [(word, bow[word] / total) for word, total in word_totals.items()]
        shares.sort(key=lambda pair: pair[1], reverse=True)
        return shares

    return {character: ranked_shares(bow) for character, bow in character_bows.items()}

In [927]:
from math import log
# Since we have very few characters, we want to weight idf highly (with a lower base)
base = 1.3

def get_words_by_tfidf(character_bows, log_base=None):
    """
    Given a mapping from characters to bags of words (for an episode, season, etc),
    return a mapping from each character to a list of pairs (w, s), where w is a word
    and s is the tf-idf score of that word for that character. The lists are sorted
    by descending score.

    The score is

        s(w, c) = (1 + tf(w, c)) * idf(w)
        idf(w)  = 1 + log_b(N / (1 + df(w)))

    where tf(w, c) is the count of w in c's bag, N is the number of characters and
    df(w) is the number of characters whose count for w is positive. Because of the
    "+ 1" in the denominator, idf(w) drops below 1 when every character uses w and
    can become negative when N is small.

    `log_base` is the logarithm base b; when omitted, the module-level `base` is used.
    Every key in a bag is scored, including any bookkeeping key stored next to the words.
    """
    if log_base is None:
        log_base = base
    n_chars = len(character_bows)

    # Count, in one pass, how many characters said each word. Keys with a
    # non-positive count are still registered (with df 0) so they get an idf.
    doc_freq = Counter()
    for bow in character_bows.values():
        for w, v in bow.items():
            doc_freq[w] += 1 if v > 0 else 0

    idf = {w: 1 + log(n_chars / (1 + df), log_base) for w, df in doc_freq.items()}

    # score = (1 + tf(w, c)) * idf(w)
    words_by_tfidf = {
        c: sorted(
            [(w, (1 + v) * idf[w]) for w, v in bow.items()],
            key=lambda ws: ws[1],
            reverse=True
        ) for c, bow in character_bows.items()
    }
    return words_by_tfidf

In [928]:
# season number -> list of per-episode character bags, in episode order
season_bows_collection = defaultdict(list)
# season number (or 'all') -> merged character bags / tf-idf rankings
season_bows = {}
season_tfidf = {}

# Seasons 1-3 with episodes 1-20 each are hard-coded.
# NOTE(review): confirm this matches the files present under transcripts/.
for season in range(1, 4):
    for episode in range(1, 21):
        ep_num = f'{season}{str(episode).zfill(2)}'
        with open(f'transcripts/{ep_num}.txt') as f:
            episode_bows = get_ep_character_bows(f.readlines())
        season_bows_collection[season].append(episode_bows)
    season_bows[season] = combine_character_bows(season_bows_collection[season])
    season_tfidf[season] = get_words_by_tfidf(season_bows[season])

# The whole-series entry is merged from the individual episodes of every season.
all_episode_bows = []
for season in range(1, 4):
    all_episode_bows.extend(season_bows_collection[season])
season_bows['all'] = combine_character_bows(all_episode_bows)
season_tfidf['all'] = get_words_by_tfidf(season_bows['all'])

In [929]:
# Top-ranked words for Zuko in season 1. The slice starts at 1, dropping the first
# entry -- presumably the 'total_count' bookkeeping key that get_ep_character_bows
# stores in the same Counter as the words (TODO confirm).
season_tfidf[1]['zuko'][1:15]

[('uncle', 256.8371058658668),
 ('avatar', 55.51531380622062),
 ('ship', 47.54313991570178),
 ('zhao', 42.95382034794852),
 ('hiding', 35.67182025914817),
 ('trail', 35.67182025914817),
 ('helmsman', 34.719168628118204),
 ('need', 30.55015405801113),
 ('capture', 30.189326806092055),
 ('lose', 30.189326806092055),
 ('safety', 28.537456207318538),
 ('underestimated', 26.03937647108865),
 ('throne', 26.03937647108865),
 ('sealed', 26.03937647108865)]

In [937]:
# Top-ranked words for Suki across all seasons. The slice starts at 1, dropping the
# first entry -- presumably the 'total_count' bookkeeping key that
# get_ep_character_bows stores in the same Counter as the words (TODO confirm).
season_tfidf['all']['suki'][1:15]

[('unagi', 35.05425857283129),
 ('sokka', 34.920915662359135),
 ('appa', 32.03346819537735),
 ('warrior', 25.047772179468616),
 ('gondola', 23.001197857789794),
 ('throwing', 20.617985496815475),
 ('uniform', 20.617985496815475),
 ('threads', 20.617985496815475),
 ('symbolizes', 20.617985496815475),
 ('veins', 20.617985496815475),
 ('represents', 20.617985496815475),
 ('loosen', 20.617985496815475),
 ('crabby', 20.617985496815475),
 ('sleeveless', 20.617985496815475)]

In [882]:
# Characters ranked by their 'total_count' entry, largest first; characters whose
# count is zero or missing are left out.
sorted(
    [
        (char, count)
        for char, bow in season_bows['all'].items()
        if (count := bow['total_count'])
    ],
    key=lambda pair: pair[1],
    reverse=True,
)

[('sokka', 19028),
 ('aang', 18858),
 ('katara', 15472),
 ('zuko', 9694),
 ('toph', 5677),
 ('iroh', 5437),
 ('azula', 3808),
 ('zhao', 1679),
 ('jet', 1444),
 ('suki', 1278),
 ('hakoda', 1105),
 ('bumi', 1045)]