In [59]:
# Characters whose dialogue is turned into bags of words.
important_characters = {
    # 1000+ non-stopwords
    'sokka', 'aang', 'katara', 'zuko', 'toph', 'iroh', 'azula',
    # 400+ non-stopwords
    'zhao', 'jet', 'suki', 'hakoda', 'bumi',
}

# Additional characters that are only included in the sentence-sentiment analysis.
minor_characters = {
    # 350+ non-stopwords
    'hama', 'ozai', 'guru pathik', 'mai', 'roku', 'long feng',

    'the mechanist', 'piandao', 'warden', 'bato', 'ty lee',
    'jeong jeong', 'pakku', 'zhang leader', 'earth king',

    # 200+ non-stopwords
    'professor zei', 'joo dee', 'chong', 'yue',

    # :)
    'appa', 'momo',
}

# Alternate speaker labels found in the transcripts -> canonical character name.
aliases = {
    'fire lord ozai': 'ozai',
    'ruko': 'roku',
    'princess yue': 'yue',
    'princess yue / moon spirit': 'yue',
    'monk gyatso': 'gyatso',
    'themechanist': 'the mechanist',
    'zuko/blue spirit': 'zuko',
    'king bumi': 'bumi',
    'master pakku': 'pakku',
}

In [60]:
from collections import Counter, defaultdict
from nltk.corpus import stopwords
import re

def get_ep_character_bows(script_lines):
    """
    Return a set of bags of words for this episode as a mapping from characters to words to counts.

    Each element of ``script_lines`` is expected to look like ``"<speaker>::<dialogue>"``.
    The speaker is lower-cased, stripped and mapped through ``aliases``; lines whose
    speaker is not in ``important_characters`` are skipped, as are lines that contain
    no ``'::'`` separator at all.

    Besides the non-stop-word counts, every character's Counter carries a
    ``'total_count'`` entry holding the number of word tokens (stop words included)
    spoken by that character.

    Returns a ``defaultdict(Counter)`` keyed by character name.
    """
    # Built once per call: the stop-word set does not depend on the individual line.
    stop_words = set(stopwords.words('english')) | {
        'us', 'get', 'like', 'thats', 'go', 'going', 'cant', 'yeh', 'oh', 'got', 'hey',
        'yeah', 'uh', 'whats', 'could', 'shall', 'gonna', 'okay', 'one',
    }
    # NOTE(review): r'\w+' splits contractions at the apostrophe ("that's" -> "that", "s"),
    # so entries such as 'thats' only match if the transcript omits the apostrophe.

    bow = defaultdict(Counter)
    for script_line in script_lines:
        # partition() splits on the first '::' only, so dialogue that itself contains
        # '::' is kept intact, and a line without a separator no longer raises.
        speaker, separator, text = script_line.partition('::')
        if not separator:
            continue
        speaker = speaker.lower().strip()
        speaker = aliases.get(speaker, speaker)
        if speaker not in important_characters:
            continue
        words = re.findall(r'\w+', text.lower())
        bow[speaker]['total_count'] += len(words)
        # Only non-stop words are counted individually
        bow[speaker].update(w for w in words if w not in stop_words)
    return bow

In [61]:
def combine_bows(bows):
    """
    Add up several bags of words into a single new Counter.

    The bags passed in are only read; the returned Counter is a fresh object.
    """
    merged = Counter()
    for single_bow in bows:
        merged.update(single_bow)
    return merged

def combine_character_bows(character_bows):
    """
    Merge a collection of mappings from characters to bags of words into one. Does not modify the input.

    ``character_bows`` may be any iterable (including a generator) of mappings
    ``character -> Counter``; plain dicts and defaultdicts are both accepted.
    Returns a new ``defaultdict(Counter)`` holding, for every character that appears
    in at least one mapping, the sum of that character's bags.
    """
    combined = defaultdict(Counter)
    # Single pass over the inputs. Iterating .items() (instead of indexing every map
    # with every character) avoids inserting empty Counters into defaultdict inputs
    # and avoids a KeyError when a plain dict lacks a character.
    for bow_map in character_bows:
        for character, bow in bow_map.items():
            combined[character].update(bow)
    return combined


In [62]:
def get_words_by_bayes(character_bows):
    """
    Rank, for every character, all words by how strongly they point to that character.

    ``character_bows`` maps characters to bags of words (for an episode, season, etc).
    The result maps each character to a list of pairs (w, p) covering every word in
    the combined vocabulary, where

        p = p(character | word) = [# times character says word] / [# occurrences of word]

    Each list is ordered from highest to lowest p. Every key of the bags is treated
    as a word, so any bookkeeping entry stored in them is ranked as well.
    """
    # Occurrences of each word summed over all characters
    word_totals = combine_bows(character_bows.values())

    ranking = {}
    for character in character_bows.keys():
        spoken = character_bows[character]
        shares = [(word, spoken[word] / word_totals[word]) for word in word_totals.keys()]
        shares.sort(key=lambda pair: pair[1], reverse=True)
        ranking[character] = shares
    return ranking

In [63]:
from math import log
# Since we have very few characters, we want to weight idf highly (with a lower base)
base = 1.2

def get_words_by_tfidf(character_bows, log_base=None):
    """
    Given a mapping from characters to bags of words (for an episode, season, etc),
    return a mapping from each character to a list of pairs (w, s), where w is a word
    and s is the tf-idf score of that word for that character. The lists are sorted
    by descending score.

    The score is computed as

        s(w, c) = (1 + tf(w, c)) * idf(w)
        idf(w)  = 1 + log_b(N / (1 + df(w)))

    where tf(w, c) is the count of w in c's bag, N is the number of characters,
    df(w) is the number of characters whose count for w is positive, and b is the
    logarithm base.

    ``log_base`` overrides the logarithm base; when omitted, the module-level
    ``base`` is used. Every key of a bag is scored like a word.
    """
    if log_base is None:
        log_base = base
    n_chars = len(character_bows)

    # df(w): number of characters that said each word, gathered in one pass.
    # Words that only ever appear with a non-positive count still get an entry (df = 0).
    doc_freq = {}
    for bow in character_bows.values():
        for w, v in bow.items():
            doc_freq[w] = doc_freq.get(w, 0) + (v > 0)

    idf = {w: 1 + log(n_chars / (1 + df), log_base) for w, df in doc_freq.items()}

    # tf-idf = (1 + tf(w, c)) * (1 + log(N / (1 + df(w))))
    words_by_tfidf = {
        c: sorted(
            [(w, (1 + v) * idf[w]) for w, v in bow.items()],
            key=lambda ws: ws[1],
            reverse=True
        ) for c, bow in character_bows.items()
    }
    return words_by_tfidf

In [71]:
import nltk.data
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import sentiment
from nltk import word_tokenize

def get_character_sentence_sentiments(script_lines):
    """
    Return a mapping from each character to a list of ``(scores, sentence)`` pairs, one
    per sentence they speak.

    ``scores`` is whatever ``SentimentIntensityAnalyzer.polarity_scores`` returns for the
    sentence (a dict with 'neg', 'neu', 'pos' and 'compound' entries); ``sentence`` is
    the sentence text as produced by the punkt tokenizer.

    Each element of ``script_lines`` is expected to look like ``"<speaker>::<dialogue>"``.
    The speaker is lower-cased, stripped and mapped through ``aliases``; only speakers in
    ``important_characters`` or ``minor_characters`` are kept. Lines without a ``'::'``
    separator are skipped.
    """
    sid = SentimentIntensityAnalyzer()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    # Computed once instead of once per script line
    tracked_characters = important_characters | minor_characters

    char_sentence_sentiments = defaultdict(list)
    for script_line in script_lines:
        # partition() splits on the first '::' only and never raises, so dialogue
        # containing '::' and separator-less lines are both handled.
        speaker, separator, text = script_line.partition('::')
        if not separator:
            continue
        speaker = speaker.lower().strip()
        speaker = aliases.get(speaker, speaker)
        if speaker not in tracked_characters:
            continue

        char_sentence_sentiments[speaker].extend(
            (sid.polarity_scores(sentence), sentence) for sentence in tokenizer.tokenize(text)
        )

    return char_sentence_sentiments

In [72]:
def merge_character_sentence_sentiments(target_map, added_map):
    """
    Given two maps from characters to lists of sentence sentiments, combine all lists with the same key.
    The first argument will be modified; the second is only read.

    ``target_map`` may be a plain dict or a defaultdict: characters it does not contain
    yet get a new list. Returns None.
    """
    for char, sentiments in added_map.items():
        # setdefault() creates the list on first sight, so target_map need not be a defaultdict
        target_map.setdefault(char, []).extend(sentiments)


In [73]:
def get_characters_by_net_sentiment(char_sentence_sentiments, threshold=0.05):
    """
    Given a map from characters to lists of sentence sentiments, sort all characters descending
    by their net positive sentiment, where positive sentences contribute +1/N, negative sentences
    contribute -1/N, and neutral sentences contribute 0 (N being the character's sentence count).

    A sentence is positive when its 'compound' score is strictly above ``threshold`` and
    negative when it is strictly below ``-threshold``; everything in between is neutral.

    Returns a list of ``(character, net_sentiment)`` pairs. Characters with an empty
    sentence list are left out of the result.
    """
    net_sentiments = {}
    for char, sentiments in char_sentence_sentiments.items():
        if not sentiments:
            # Nothing to average; such characters never appeared in the ranking.
            continue
        # Count in integers and divide once: repeatedly adding 1/N accumulates
        # floating-point error (e.g. ten times 0.1 does not sum to exactly 1.0).
        balance = 0
        for scores, _ in sentiments:
            compound = scores['compound']
            if compound > threshold:
                balance += 1
            elif compound < -threshold:
                balance -= 1
        net_sentiments[char] = balance / len(sentiments)
    return sorted(net_sentiments.items(), key=lambda x: x[1], reverse=True)

In [74]:
# Accumulators filled while walking every transcript:
#   season_bows_collection   season -> list of per-episode character bags of words
#   season_bows              season (or 'all') -> merged character bags of words
#   season_tfidf             season (or 'all') -> tf-idf word rankings per character
#   char_sentence_sentiments character -> scored sentences across all episodes read
char_sentence_sentiments = defaultdict(list)
season_bows_collection = defaultdict(list)
season_bows, season_tfidf = {}, {}

for season in range(1, 4):
    for episode in range(1, 21):
        # Transcript files are named <season><two-digit episode>.txt
        ep_num = f'{season}{episode:02d}'
        with open(f'transcripts/{ep_num}.txt') as f:
            script_lines = f.readlines()
        # The file is already closed here; only the lines read from it are processed.
        season_bows_collection[season].append(get_ep_character_bows(script_lines))
        merge_character_sentence_sentiments(
            char_sentence_sentiments,
            get_character_sentence_sentiments(script_lines),
        )
    # Season-level aggregates become available as soon as the season's episodes are read
    season_bows[season] = combine_character_bows(season_bows_collection[season])
    season_tfidf[season] = get_words_by_tfidf(season_bows[season])

# Series-wide aggregates, built from every episode of every season
season_bows['all'] = combine_character_bows(
    [ep_bows for s in range(1, 4) for ep_bows in season_bows_collection[s]]
)
season_tfidf['all'] = get_words_by_tfidf(season_bows['all'])

In [68]:
# Inspect the (scores, sentence) pairs collected for a single character.
char_sentence_sentiments['ty lee']

[({'neg': 0.137, 'neu': 0.668, 'pos': 0.194, 'compound': 0.2363},
  " Just because you're destined to save the world, don't expect any special treatment."),
 ({'neg': 0.0, 'neu': 0.656, 'pos': 0.344, 'compound': 0.4939},
  ' If you want to relax, then I suggest visiting a tropical island.'),
 ({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
  "If not, I'll see you both at sunrise."),
 ({'neg': 0.0, 'neu': 0.256, 'pos': 0.744, 'compound': 0.4404}, 'Good night.'),
 ({'neg': 0.176, 'neu': 0.64, 'pos': 0.184, 'compound': 0.0258},
  " No, please, march right in, I'm not concentrating or anything."),
 ({'neg': 0.565, 'neu': 0.435, 'pos': 0.0, 'compound': -0.0772},
  " I'm sorry."),
 ({'neg': 0.483, 'neu': 0.517, 'pos': 0.0, 'compound': -0.4215},
  "I think there's been a misunderstanding."),
 ({'neg': 0.0, 'neu': 0.686, 'pos': 0.314, 'compound': 0.4939},
  "You didn't tell me your friend was a girl."),
 ({'neg': 0.219, 'neu': 0.781, 'pos': 0.0, 'compound': -0.4215},
  'In our tribe, i

In [69]:
# Rank every character that has scored sentences from most to least net-positive.
get_characters_by_net_sentiment(char_sentence_sentiments)

[('ty lee', 0.38383838383838403),
 ('pakku', 0.3552631578947366),
 ('professor zei', 0.2622950819672131),
 ('iroh', 0.18390804597701185),
 ('bumi', 0.17333333333333326),
 ('piandao', 0.1645569620253165),
 ('guru pathik', 0.15463917525773196),
 ('yue', 0.15217391304347822),
 ('joo dee', 0.13793103448275865),
 ('suki', 0.11398963730569951),
 ('roku', 0.1111111111111111),
 ('toph', 0.10926365795724474),
 ('aang', 0.1059624108878811),
 ('chong', 0.1044776119402985),
 ('bato', 0.09090909090909093),
 ('earth king', 0.09090909090909091),
 ('hama', 0.08080808080808081),
 ('azula', 0.07707910750507099),
 ('hakoda', 0.05797101449275362),
 ('katara', 0.055412371134020484),
 ('ozai', 0.052173913043478265),
 ('zuko', 0.034746351633078515),
 ('sokka', 0.02602739726027394),
 ('jet', 0.019801980198019802),
 ('momo', 0.0),
 ('zhao', -0.0179372197309417),
 ('warden', -0.01818181818181818),
 ('mai', -0.023076923076923078),
 ('jeong jeong', -0.07317073170731707),
 ('long feng', -0.07368421052631578),
 ('t

In [55]:
# Highest-scoring tf-idf words for Zuko in season 3.
# The slice starts at 1 to drop the top entry — presumably the 'total_count'
# bookkeeping key that get_ep_character_bows stores next to the real words; confirm.
season_tfidf[3]['zuko'][1:15]

[('father', 95.04460042309827),
 ('uncle', 79.8199937280543),
 ('mai', 68.43211230463076),
 ('raiders', 58.964814717536434),
 ('stone', 49.13734559794703),
 ('changed', 45.62140820308717),
 ('dragon', 45.62140820308717),
 ('dragons', 45.62140820308717),
 ('history', 42.17979571865833),
 ('meeting', 42.17979571865833),
 ('firebending', 41.38814489602815),
 ('killed', 39.30987647835762),
 ('accepted', 39.30987647835762),
 ('sages', 39.30987647835762)]

In [None]:
# Highest-scoring tf-idf words for Suki across all seasons.
# The slice starts at 1 to drop the top entry — presumably the 'total_count'
# bookkeeping key that get_ep_character_bows stores next to the real words; confirm.
season_tfidf['all']['suki'][1:15]

In [None]:
# Characters ordered by their overall 'total_count' (word tokens spoken, stop words
# included), largest first; characters whose count is zero are left out.
sorted(
    ((char, count)
        for char, char_bow in season_bows['all'].items()
        if (count := char_bow['total_count'])),
    key=lambda pair: pair[1],
    reverse=True
)