## John Wick script: word frequencies in John's dialogue

In [40]:
import requests, re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [20]:
#URL of the John Wick screenplay page on IMSDb; this is the page requested and parsed below
site = 'https://www.imsdb.com/scripts/John-Wick.html'

In [21]:
#accumulators filled in by later cells:
#  lines_to_send -> raw dialogue lines pulled from the script
#  words_to_send -> individual cleaned words taken from those lines
lines_to_send, words_to_send = [], []

In [22]:
#get the website, parse it with beautiful soup:
#  - timeout: without one, requests waits indefinitely and a stalled server hangs the kernel
#  - raise_for_status: fail loudly on a 4xx/5xx instead of silently parsing an error page
r = requests.get(site, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [23]:
#the script body lives inside the page's <pre>...</pre> element(s);
#stringify whatever find_all returns and cut it into one entry per line
full_script = str(soup.find_all("pre")).split('\n')

In [24]:
#squash every run of spaces down to a single space (the script is heavily padded);
#slice assignment keeps this an in-place update of the same list object
full_script[:] = [re.sub(' +', ' ', raw_line) for raw_line in full_script]

In [25]:
def extract_dialogue(script_lines, cue='<b> JOHN'):
    """Collect the dialogue lines that follow each speaker cue in the script.

    Parameters
    ----------
    script_lines : list of str
        The script split into lines, with runs of spaces already collapsed.
    cue : str
        Substring that marks a speaker heading; the dialogue starts on the
        line right below it.

    Returns
    -------
    list of str
        Dialogue lines in script order, with '</b>' and '\\r' removed.

    Every line under a cue is classified the same way, so a line that follows
    a skipped parenthetical is still checked for "blank" and "direction", and
    the scan stops at the end of the script instead of raising IndexError.
    """
    #any of these spellings of an empty line ends the current speech
    blank_lines = {' \r', '\r', '</b> \r', '</b>\r'}

    dialogue = []
    for cue_index, text in enumerate(script_lines):
        if cue not in text:
            continue

        #the dialogue is one line below the cue, so start scanning there
        target = cue_index + 1
        while target < len(script_lines):
            current = script_lines[target]

            #stop on the first blank line
            if current in blank_lines:
                break

            #skip directions such as (WHISPERS) and anything carrying a <b>
            #(something screwy is going on); keep everything else
            if '(' not in current and '<b>' not in current:
                dialogue.append(current.replace('</b>', '').replace('\r', ''))

            target += 1

    return dialogue


#Keanu's lines: everything spoken under a '<b> JOHN' heading
lines_to_send.extend(extract_dialogue(full_script))
        

In [26]:
#what our lines look like... (each extracted line still carries its leading space)
lines_to_send

[' This is John.',
 ' Ok.',
 ' ...it had to be you...',
 " ...be seein' ya'...",
 " Be seein' you.",
 ' Yes?',
 ' Oh. Sorry.',
 ' Well played, Norma.',
 ' Well played.',
 " So... you gotta' name?",
 ' Moose.',
 ' Seriously?',
 ' All right, then...',
 ' ...Moose, it is.',
 " I'm up, I'm up.",
 " That oughta' do it.",
 " Wanna' try it out?",
 ' Good girl, Moose. Good girl.',
 ' Thanks.',
 " It ain't for sale, kid.",
 " Maybe so... but I don't.",
 ' Come on, then.',
 ' Good night, Moose.',
 ' Do you need to go out?',
 ' So could I, it would seem...',
 " What's gotten into y-",
 ' Moose!',
 ' ...Moose...',
 ' Moose...',
 ' Hello, Aurelio.',
 ' Have you seen my car?',
 ' Where is it?',
 ' Thank you.',
 ' Aurelio...',
 ' ...they killed my dog.',
 ' Aren\'t "they" always...',
 ' Where can I find Iosef Tarasov?',
 ' Thanks.',
 " Evenin', Ed.",
 ' No...just sorting out a few things',
 ' with the Russian mob.',
 ' Thanks, Ed... but you still owe me.',
 ' Good night, Ed.',
 ' Yeah, that was me.',

In [27]:
#punctuation deleted from every token before it is counted
PUNCTUATION_TO_DROP = str.maketrans('', '', '?!.,"')


def clean_words(line):
    """Split one dialogue line into cleaned, lower-cased words.

    Punctuation (? ! . , ") is removed first and the result lower-cased.
    A token is then dropped when nothing is left of it (e.g. a bare '...')
    or when it starts or ends with '-', which marks an interrupted word.

    Parameters
    ----------
    line : str
        A single line of dialogue.

    Returns
    -------
    list of str
        The surviving words, in their original order.
    """
    words = []
    #split() with no argument also discards the empty strings that
    #leading or repeated whitespace would otherwise produce
    for item in line.split():
        item = item.translate(PUNCTUATION_TO_DROP).lower()

        #nothing left after cleaning, or an interrupted word: ditch it
        if not item or item.startswith('-') or item.endswith('-'):
            continue

        words.append(item)
    return words


#split lines_to_send into words_to_send
for line in lines_to_send:
    words_to_send.extend(clean_words(line))

In [28]:
#here's what our words look like... (lower-cased, punctuation stripped, apostrophes kept)
words_to_send

['this',
 'is',
 'john',
 'ok',
 'it',
 'had',
 'to',
 'be',
 'you',
 'be',
 "seein'",
 "ya'",
 'be',
 "seein'",
 'you',
 'yes',
 'oh',
 'sorry',
 'well',
 'played',
 'norma',
 'well',
 'played',
 'so',
 'you',
 "gotta'",
 'name',
 'moose',
 'seriously',
 'all',
 'right',
 'then',
 'moose',
 'it',
 'is',
 "i'm",
 'up',
 "i'm",
 'up',
 'that',
 "oughta'",
 'do',
 'it',
 "wanna'",
 'try',
 'it',
 'out',
 'good',
 'girl',
 'moose',
 'good',
 'girl',
 'thanks',
 'it',
 "ain't",
 'for',
 'sale',
 'kid',
 'maybe',
 'so',
 'but',
 'i',
 "don't",
 'come',
 'on',
 'then',
 'good',
 'night',
 'moose',
 'do',
 'you',
 'need',
 'to',
 'go',
 'out',
 'so',
 'could',
 'i',
 'it',
 'would',
 'seem',
 "what's",
 'gotten',
 'into',
 'moose',
 'moose',
 'moose',
 'hello',
 'aurelio',
 'have',
 'you',
 'seen',
 'my',
 'car',
 'where',
 'is',
 'it',
 'thank',
 'you',
 'aurelio',
 'they',
 'killed',
 'my',
 'dog',
 "aren't",
 'they',
 'always',
 'where',
 'can',
 'i',
 'find',
 'iosef',
 'tarasov',
 'thank

In [29]:
#turn the word list into a numpy array so it can be sorted and counted with numpy
test_array = np.array(words_to_send)

In [30]:
#sort the words in place (this mutates test_array rather than returning a copy)
test_array.sort()

In [31]:
#how many distinct words does John say?
np.unique(test_array).size

334

In [44]:
#tally every distinct word: np.unique hands back the words and their counts as parallel arrays
unique, counts = np.unique(test_array, return_counts=True)

#pair them up as rows of [word, count]; the count column is cast to int in a later cell
frequencies = np.asarray((unique, counts)).transpose()

In [48]:
#wrap the [word, count] rows in a DataFrame; the columns get the default labels 0 and 1 here
df = pd.DataFrame(frequencies)

In [53]:
#give the two positional columns readable names
df = df.rename(columns={0: 'word', 1: 'count'})

In [64]:
#make the count column numeric so it sorts by value
df = df.astype({'count': 'int32'})

In [67]:
#most frequent words first
df = df.sort_values(by='count', ascending=False)

In [73]:
#show the word/count table
df

Unnamed: 0,word,count
329,you,33
133,i,24
0,a,22
145,it,18
199,no,16
...,...,...
132,hurting,1
138,imagine,1
140,into,1
142,irish,1
