# Frasier Text Analysis

I recently stumbled on [this post](https://www.reddit.com/r/dataisbeautiful/comments/8a4gbr/the_office_characters_most_distinguishing_words_oc/) on /r/dataisbeautiful. I'm a big _Frasier_ fan, so I was determined to do similar analysis, but for _Frasier_ characters.

## Getting the dialogue

Luckily for me, there's a magnificent site [here](http://www.kacl780.net/frasier/transcripts/) which has transcripts of ALL the _Frasier_ episodes! First, let's get the links to all the episodes from the [home page](http://www.kacl780.net/frasier/transcripts/). We use BeautifulSoup to get the HTML as a nested data structure, and pull all the links that have "episode" in them.

In [3]:
import urllib.request
from bs4 import BeautifulSoup

def url_to_soup(url):
    fp = urllib.request.urlopen(url)
    html_str = fp.read()
    fp.close()
    return BeautifulSoup(html_str, 'html.parser') 

main_page_soup = url_to_soup("http://www.kacl780.net/frasier/transcripts/")
urls = [a.get('href') for a in main_page_soup.find_all('a')]
urls = ['http://www.kacl780.net' + u for u in urls if 'episode' in u]

print('ep 1 transcript URL: ' + urls[0])
print('number of episodes: ' + str(len(urls)))

ep 1 transcript URL: http://www.kacl780.net/frasier/transcripts/season_1/episode_1/the_good_son.html
number of episodes: 264


Now, let's parse out the dialogue from each of the transcripts. 

The transcript is formatted such that each character's lines are prefaced by their name in __bold__. I thus find all the elements with tag 'b', and get all the text in between. Of course, there is some bolded text that isn't a character's lines, but that will be filtered out later.

In [4]:
from string import punctuation
import re
from collections import defaultdict

# given a string S:
# (1) remove all punctuation
# (2) strip trailing whitespace
# (3) lowercase all the letters 
def clean_string(S):
    return re.sub('['+punctuation+']', '', S).strip().lower()

main_characters = set(['frasier', 'niles', 'martin', 'roz', 'daphne'])
d_num_lines = defaultdict(int)
d_num_words = defaultdict(int)
d_overall_word_count = defaultdict(int)

# given a bold tag bt:
# return dialogue line as list of cleaned words    
def get_dialogue_following_bold_tag(bt):
    if bt.string == None:
        return []
    words_in_line = []
    ns = bt.next_sibling
    while ns != None and ns.name != 'b':
        # dialogue has no tag associated with it
        # other cues (like stage directions and tone) are italicized with tag i
        if ns.name == None:
            words_in_line += clean_string(ns.string).split()
        # move onto the next sibling
        ns = ns.next_sibling
    return words_in_line

def url_to_lines_dict(url):
    soup = url_to_soup(url)
    bold_tags = soup.find_all('b')
    character_d = defaultdict(dict)
    for bt in bold_tags:
        if bt.string == None:
            continue
        character = clean_string(bt.string)
        if character not in main_characters:
            continue
        words_in_line = get_dialogue_following_bold_tag(bt)
        d_character_word_count = character_d[character]
        for word in words_in_line:
            d_overall_word_count[word] += 1
            if word not in d_character_word_count:
                d_character_word_count[word] = 0
            d_character_word_count[word] += 1
        character_d[character] = d_character_word_count
        d_num_words[character] += len(words_in_line)
        d_num_lines[character] += 1
    return character_d

d = url_to_lines_dict(urls[1])
print(len(d['frasier']))
print(d_num_lines, d_num_words)
# number of times anyone says Frasier
print(d_overall_word_count['frasier'])

539
defaultdict(<class 'int'>, {'daphne': 14, 'frasier': 85, 'martin': 47, 'roz': 9, 'niles': 10}) defaultdict(<class 'int'>, {'daphne': 319, 'frasier': 1530, 'martin': 536, 'roz': 146, 'niles': 132})
6


In [132]:
d = defaultdict(dict)
for u in urls:
    for character, word_count_d in url_to_lines_dict(u).items():
        character_word_count_d = d[character]
        for word, count in word_count_d.items():
            if word not in character_word_count_d:
                character_word_count_d[word] = 0
            character_word_count_d[word] += count

# number of times Frasier says 'Niles'
print(d['frasier']['niles'])
print(len(d['frasier']))

2167
15796


In [143]:
from nltk.corpus import stopwords

stop_words = set(clean_string(sw) for sw in stopwords.words('english'))
print(stop_words)

for character, word_count_d in d.items():
    for sw in stop_words:
        if sw in word_count_d:
            del word_count_d[sw]

{'where', 'aren', 'wont', 'further', 'between', 'once', 'couldnt', 'own', 'ours', 'does', 'mustnt', 'before', 'my', 'itself', 'youll', 'they', 'such', 'until', 'won', 'being', 'hadn', 'am', 'so', 'only', 'in', 'hasnt', 'had', 'against', 'you', 'isn', 'most', 'again', 'haven', 'couldn', 'if', 'as', 'doesn', 'shan', 'both', 'have', 'then', 'all', 'youre', 'same', 'those', 'i', 'through', 'is', 'did', 'wasnt', 'doesnt', 'any', 'm', 'below', 'why', 'hasn', 'how', 'o', 'youve', 'himself', 'are', 'with', 'from', 'ma', 'your', 'nor', 've', 'will', 'shes', 'too', 'yourself', 'into', 'because', 'were', 'him', 'than', 'after', 'out', 'doing', 'mightnt', 'not', 'about', 'or', 'whom', 'these', 's', 'of', 'be', 'd', 'weren', 't', 'herself', 'shant', 'theirs', 'me', 'this', 'it', 'he', 'themselves', 'myself', 'needn', 'her', 'should', 'hadnt', 'do', 're', 'wouldn', 'been', 'down', 'over', 'shouldn', 'youd', 'while', 'arent', 'no', 'few', 'its', 'which', 'under', 'for', 'can', 'each', 'when', 'the', 

In [153]:
total_words = sum(d_num_words.values())
print(total_words)

for c in main_characters:
    print(d_num_words[c], d_num_lines[c])
    sorted_x = sorted(d[c].items(), key=lambda kv: kv[1])
    ten_top_values = sorted_x[-10:]
    print(c, ten_top_values)
# can delete top values as alternative to stop words

647631
56876 5012
roz [('right', 210), ('one', 212), ('go', 217), ('really', 220), ('like', 239), ('get', 250), ('know', 406), ('well', 455), ('im', 464), ('frasier', 510)]
97849 7966
martin [('thats', 415), ('got', 439), ('like', 483), ('right', 507), ('get', 511), ('hey', 514), ('yeah', 652), ('im', 709), ('know', 797), ('well', 1120)]
66500 5531
daphne [('ill', 234), ('yes', 238), ('get', 255), ('right', 290), ('like', 313), ('know', 379), ('dr', 484), ('im', 560), ('crane', 564), ('well', 581)]
295372 21892
frasier [('like', 959), ('thats', 959), ('see', 1004), ('roz', 1312), ('right', 1669), ('dad', 1731), ('yes', 2093), ('niles', 2167), ('know', 2295), ('im', 2487)]
131034 10670
niles [('going', 434), ('one', 436), ('right', 505), ('daphne', 531), ('dad', 621), ('yes', 652), ('know', 654), ('frasier', 947), ('im', 1072), ('well', 1393)]


In [157]:
import heapq

total_lines = sum(d_num_lines.values())

for c in main_characters:
    heap = []
    weight = d_num_words[c] / total_words
    weight2 = d_num_lines[c] / total_lines
    for word, count in d[c].items():
        word_freq_perc = (count**3) * weight / d_overall_word_count[word]
        if len(heap) < 10:
            heapq.heappush(heap, (word_freq_perc, count, word))
        else:
            wfp_low, c_low, w_low = heapq.heappop(heap)
            if wfp < word_freq_perc:
                wfp_low, c_low, w_low = word_freq_perc, count, word
            heapq.heappush(heap, (wfp_low, c_low, w_low))
    print(c)
    for elt in heap:
        print(elt[1], elt[2])
    print('--------------------')

roz
62 alice
239 like
208 yeah
455 well
250 get
510 frasier
220 really
464 im
406 know
194 hey
--------------------
martin
37 ronee
507 right
511 get
439 got
483 like
652 yeah
797 know
1120 well
709 im
514 hey
--------------------
daphne
30 simon
255 get
313 like
234 ill
290 right
560 im
484 dr
564 crane
581 well
379 know
--------------------
frasier
30 ronee
959 thats
1312 roz
1731 dad
1004 see
2093 yes
2295 know
2487 im
2167 niles
1669 right
--------------------
niles
67 mel
434 going
652 yes
621 dad
654 know
1072 im
1393 well
947 frasier
531 daphne
343 maris
--------------------
