In [87]:
from collections import Counter, defaultdict
import re

def get_ep_character_bows(script_lines):
    """
    Build one bag of words per speaker for a single episode.

    Each non-blank entry of `script_lines` must look like 'speaker::dialogue'.
    Speaker names are lower-cased and stripped of surrounding whitespace.
    Dialogue is lower-cased, apostrophes are removed (so "it's" is counted as
    "its"), and the rest is split into runs of word characters.

    Blank (whitespace-only) lines are skipped, and only the first '::' separates
    the speaker from the dialogue, so dialogue that itself contains '::' is kept.

    Returns:
        defaultdict(Counter) mapping speaker -> word -> count.

    Raises:
        ValueError: if a non-blank line contains no '::' separator.
    """
    bow = defaultdict(Counter)
    for raw_line in script_lines:
        # Transcript files may contain empty lines; they carry no dialogue.
        if not raw_line.strip():
            continue
        # maxsplit=1: anything after the first '::' belongs to the dialogue.
        speaker, dialogue = raw_line.split('::', 1)
        speaker = speaker.lower().strip()
        # Drop apostrophes first so contractions stay a single token.
        dialogue = dialogue.replace('\'', '').lower()
        # Punctuation (including parentheses) is discarded; words that appear
        # inside parentheses are still counted.
        words = re.findall(r'\w+', dialogue)
        bow[speaker].update(words)
    return bow

In [88]:
def combine_bows(bows):
    """Sum several bags of words into a single new Counter, leaving the inputs untouched."""
    total_counts = Counter()
    # Counter.update adds to existing counts instead of overwriting them.
    accumulate = total_counts.update
    for word_counts in bows:
        accumulate(word_counts)
    return total_counts

def combine_character_bows(character_bows):
    """
    Merge a collection of character -> bag-of-words mappings into one.

    `character_bows` is any iterable (a list, a generator, ...) of mappings from
    character name to a bag of words. For every character that appears in at
    least one mapping, the result holds the sum of that character's word counts.

    The inputs are not modified: they are only read through .items(), so a
    defaultdict input does not get empty entries inserted for characters it
    lacks, and plain dict inputs with differing key sets are accepted.

    Returns:
        defaultdict(Counter) mapping character -> word -> total count.
    """
    combined = defaultdict(Counter)
    # Single pass, so one-shot iterables such as generators work too.
    for bow_map in character_bows:
        for character, bow in bow_map.items():
            combined[character].update(bow)
    return combined


In [89]:
def get_words_by_bayes(character_bows):
    """
    Rank every word by how strongly it is associated with each character.

    `character_bows` maps characters to bags of words (for an episode, a season, etc).
    The result maps each character to a list of (word, probability) pairs, where
    probability is the share of all occurrences of that word that were spoken by
    the character. Every list covers every word in the combined vocabulary and is
    ordered from highest to lowest probability.
    """
    # Total number of occurrences of each word across all characters.
    word_totals = combine_bows(character_bows.values())

    def ranked_words(character):
        # p(character | word) = [# times character says word] / [# occurrences of word]
        scored = [
            (word, character_bows[character][word] / word_totals[word])
            for word in word_totals.keys()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored

    return {character: ranked_words(character) for character in character_bows.keys()}

In [90]:
# Build one character -> bag-of-words mapping per season 1 episode.
# Transcripts are read from transcripts/101.txt through transcripts/120.txt.
s1_bows_collection = []
for episode in range(1, 21):
    ep_num = f'1{str(episode).zfill(2)}'
    with open(f'transcripts/{ep_num}.txt') as f:
        episode_bows = get_ep_character_bows(f.readlines())
    s1_bows_collection.append(episode_bows)

# Merge the per-episode mappings, then rank every word for every character.
s1_bows = combine_character_bows(s1_bows_collection)
s1_wbb = get_words_by_bayes(s1_bows)

In [94]:
# Speaker names are lower-cased by get_ep_character_bows, so the key is 'aang';
# s1_wbb is a plain dict, so looking up 'Aang' would raise KeyError.
s1_wbb['aang']

 1.0),
 ('whales', 1.0),
 ('tooth', 1.0),
 ('comforting', 1.0),
 ('skins', 1.0),
 ('eel', 1.0),
 ('messenger', 1.0),
 ('trinket', 1.0),
 ('exhibits', 1.0),
 ('jugglers', 1.0),
 ('magicians', 1.0),
 ('thisd', 1.0),
 ('crowd', 1.0),
 ('coulda', 1.0),
 ('fireworks', 1.0),
 ('willing', 1.0),
 ('ookay', 1.0),
 ('squat', 1.0),
 ('fingertips', 1.0),
 ('patient', 1.0),
 ('instruction', 1.0),
 ('juggler', 1.0),
 ('iiiit', 1.0),
 ('firebend', 1.0),
 ('warmed', 1.0),
 ('ahoy', 1.0),
 ('sloppy', 1.0),
 ('grandpa', 1.0),
 ('championships', 1.0),
 ('polo', 1.0),
 ('ruined', 1.0),
 ('gliders', 1.0),
 ('teos', 1.0),
 ('nightmare', 1.0),
 ('deals', 1.0),
 ('retreating', 1.0),
 ('hermit', 1.0),
 ('crab', 1.0),
 ('poophead', 1.0),
 ('sinking', 1.0),
 ('floating', 1.0),
 ('showing', 1.0),
 ('dozen', 1.0),
 ('grass', 1.0),
 ('tranquil', 1.0),
 ('bout', 1.0),
 ('disturb', 1.0),
 ('whew', 1.0),
 ('curly', 1.0),
 ('tailed', 1.0),
 ('including', 1.0),
 ('chaos', 1.0),
 ('buddy', 0.875),
 ('ride', 0.81818181818