# Frasier Analysis

I recently stumbled on [this post](https://www.reddit.com/r/dataisbeautiful/comments/8a4gbr/the_office_characters_most_distinguishing_words_oc/) on /r/dataisbeautiful, and, as I'm a big _Frasier_ fan, was determined to do similar analysis but for _Frasier_ characters. Luckily, there's a magnificent site [here](http://www.kacl780.net/frasier/transcripts/) which has transcripts of ALL _Frasier_ episodes. First, let's get all the URLs to the transcripts.

I next use Beautiful Soup to convert the string html_str to an HTML tree.

In [48]:
import urllib.request
from bs4 import BeautifulSoup

def url_to_soup(url):
    fp = urllib.request.urlopen(url)
    html_str = fp.read()
    fp.close()
    return BeautifulSoup(html_str, 'html.parser') 

main_page_soup = url_to_soup("http://www.kacl780.net/frasier/transcripts/")
urls = [a.get('href') for a in main_page_soup.find_all('a')]
urls = ['http://www.kacl780.net' + u for u in urls if 'episode' in u]

print(urls)

['http://www.kacl780.net/frasier/transcripts/season_1/episode_1/the_good_son.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episode_2/space_quest.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episode_3/dinner_at_eight.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episode_4/i_hate_frasier_crane.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episode_5/heres_looking_at_you.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episode_6/the_crucible.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episode_7/call_me_irresponsible.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episode_8/beloved_infidel.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episode_9/selling_out.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episode_10/oops.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episode_11/death_becomes_him.html', 'http://www.kacl780.net/frasier/transcripts/season_1/episo

The transcript is formatted such that each character's lines are prefaced by their name in __bold__. I thus find all the elements with tag 'b', and get all the text in between. Of course, there is some bolded text that isn't a character's lines, but that will be filtered out later.

In [131]:
from string import punctuation
import re
from collections import defaultdict

def clean_string(S):
    return re.sub('['+punctuation+']', '', S).strip().lower()

main_characters = set(['frasier', 'niles', 'martin', 'roz', 'daphne'])
d_num_lines = defaultdict(int)
d_num_words = defaultdict(int)
d_overall_word_count = defaultdict(int)

def url_to_lines_dict(url):
    soup = url_to_soup(url)
    bold_tags = soup.find_all('b')
    character_d = defaultdict(dict)
    for bt in bold_tags:
        if bt.string == None:
            continue
        character = clean_string(bt.string)
        if character not in main_characters:
            continue
        #print(character)
        ns = bt.next_sibling
        words_in_line = []
        # keep iterating until we hit the next b tag
        while ns != None and ns.name != 'b':
            # dialogue has name == None, while other cues (stage directions, tone) are italicized
            if ns.name == None:
                words_in_line += clean_string(ns.string).split()
            ns = ns.next_sibling
        d_character_word_count = character_d[character]
        for word in words_in_line:
            d_overall_word_count[word] += 1
            if word not in d_character_word_count:
                d_character_word_count[word] = 0
            d_character_word_count[word] += 1
        character_d[character] = d_character_word_count
        d_num_words[character] += len(words_in_line)
        d_num_lines[character] += 1
    return character_d

d = url_to_lines_dict(urls[1])
print(len(d['frasier']))
print(d_num_lines, d_num_words)
# number of times anyone says Frasier
print(d_overall_word_count['frasier'])

539
defaultdict(<class 'int'>, {'daphne': 14, 'frasier': 85, 'martin': 47, 'roz': 9, 'niles': 10}) defaultdict(<class 'int'>, {'daphne': 319, 'frasier': 1530, 'martin': 536, 'roz': 146, 'niles': 132})
6


In [132]:
d = defaultdict(dict)
for u in urls:
    for character, word_count_d in url_to_lines_dict(u).items():
        character_word_count_d = d[character]
        for word, count in word_count_d.items():
            if word not in character_word_count_d:
                character_word_count_d[word] = 0
            character_word_count_d[word] += count

# number of times Frasier says 'Niles'
print(d['frasier']['niles'])
print(len(d['frasier']))

2167
15796


In [143]:
from nltk.corpus import stopwords

stop_words = set(clean_string(sw) for sw in stopwords.words('english'))
print(stop_words)

for character, word_count_d in d.items():
    for sw in stop_words:
        if sw in word_count_d:
            del word_count_d[sw]

{'where', 'aren', 'wont', 'further', 'between', 'once', 'couldnt', 'own', 'ours', 'does', 'mustnt', 'before', 'my', 'itself', 'youll', 'they', 'such', 'until', 'won', 'being', 'hadn', 'am', 'so', 'only', 'in', 'hasnt', 'had', 'against', 'you', 'isn', 'most', 'again', 'haven', 'couldn', 'if', 'as', 'doesn', 'shan', 'both', 'have', 'then', 'all', 'youre', 'same', 'those', 'i', 'through', 'is', 'did', 'wasnt', 'doesnt', 'any', 'm', 'below', 'why', 'hasn', 'how', 'o', 'youve', 'himself', 'are', 'with', 'from', 'ma', 'your', 'nor', 've', 'will', 'shes', 'too', 'yourself', 'into', 'because', 'were', 'him', 'than', 'after', 'out', 'doing', 'mightnt', 'not', 'about', 'or', 'whom', 'these', 's', 'of', 'be', 'd', 'weren', 't', 'herself', 'shant', 'theirs', 'me', 'this', 'it', 'he', 'themselves', 'myself', 'needn', 'her', 'should', 'hadnt', 'do', 're', 'wouldn', 'been', 'down', 'over', 'shouldn', 'youd', 'while', 'arent', 'no', 'few', 'its', 'which', 'under', 'for', 'can', 'each', 'when', 'the', 

In [145]:
total_words = sum(d_num_words.values())
print(total_words)

for c in main_characters:
    sorted_x = sorted(d[c].items(), key=lambda kv: kv[1])
    ten_top_values = sorted_x[-10:]
    print(c, ten_top_values)
# can delete top values as alternative to stop words

647631
roz [('right', 210), ('one', 212), ('go', 217), ('really', 220), ('like', 239), ('get', 250), ('know', 406), ('well', 455), ('im', 464), ('frasier', 510)]
martin [('thats', 415), ('got', 439), ('like', 483), ('right', 507), ('get', 511), ('hey', 514), ('yeah', 652), ('im', 709), ('know', 797), ('well', 1120)]
daphne [('ill', 234), ('yes', 238), ('get', 255), ('right', 290), ('like', 313), ('know', 379), ('dr', 484), ('im', 560), ('crane', 564), ('well', 581)]
frasier [('like', 959), ('thats', 959), ('see', 1004), ('roz', 1312), ('right', 1669), ('dad', 1731), ('yes', 2093), ('niles', 2167), ('know', 2295), ('im', 2487)]
niles [('going', 434), ('one', 436), ('right', 505), ('daphne', 531), ('dad', 621), ('yes', 652), ('know', 654), ('frasier', 947), ('im', 1072), ('well', 1393)]


In [151]:
import heapq

total_lines = sum(d_num_lines.values())

for c in main_characters:
    heap = []
    weight = d_num_words[c] / total_words
    weight2 = d_num_lines[c] / total_lines
    for word, count in d[c].items():
        word_freq_perc = (count**2) * weight2 / d_overall_word_count[word]
        if len(heap) < 10:
            heapq.heappush(heap, (word_freq_perc, count, word))
        else:
            wfp_low, c_low, w_low = heapq.heappop(heap)
            if wfp < word_freq_perc:
                wfp_low, c_low, w_low = word_freq_perc, count, word
            heapq.heappush(heap, (wfp_low, c_low, w_low))
    print(c)
    for elt in heap:
        print(elt[1], elt[2])
    print('--------------------')

roz
1 russell
2 kirkland
14 four
83 yes
14 cut
59 crane
46 dr
67 line
26 news
11 thirty
--------------------
martin
483 like
179 guys
797 know
439 got
511 get
1120 well
514 hey
221 eddie
652 yeah
147 fras
--------------------
daphne
12 caught
4 biscuit
28 hand
121 thank
27 moon
560 im
99 nice
175 hello
564 crane
484 dr
--------------------
frasier
120 kenny
643 please
1312 roz
2295 know
1004 see
1731 dad
2093 yes
2487 im
2167 niles
1669 right
--------------------
niles
436 one
654 know
531 daphne
621 dad
434 going
947 frasier
343 maris
1072 im
1393 well
652 yes
--------------------
