In [1]:
# Pin the version this notebook was run against so a re-run installs the same converter.
!pip install html2text==2018.1.9

Collecting html2text
  Downloading https://files.pythonhosted.org/packages/16/20/de2b458ef434713053dd83209a03a5431ebe0527c8e14d9ae7838ff67d8a/html2text-2018.1.9-py3-none-any.whl
Installing collected packages: html2text
Successfully installed html2text-2018.1.9


In [0]:
import re
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

from tqdm import tqdm, tqdm_notebook
import html2text

from itertools import chain
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

import textwrap

In [3]:
# Fetch the index page that links to every episode script.
# The timeout keeps a stalled server from hanging the notebook indefinitely.
resp = get('http://www.seinfeldscripts.com/seinfeld-scripts.html', headers={"content-type":"text/html", "User-Agent": "Mozilla Firefox"}, timeout=30)
resp.close()
# Fail here on an HTTP error instead of parsing an error page further down.
resp.raise_for_status()
resp

<Response [200]>

In [0]:
# Parse the downloaded index page with the built-in HTML parser.
html = BeautifulSoup(resp.content, features='html.parser')

In [5]:
# Collect (name, url) for every link inside the second table of the index page,
# then split the pairs into two parallel tuples.
episode_links = html.select('table:nth-of-type(2) a')
name_url_pairs = [
  (
    link.text.replace('\n',''),
    'http://seinfeldscripts.com/{}'.format(link.attrs['href'].strip()),
  )
  for link in episode_links
]
episode_names, episode_urls = zip(*name_url_pairs)
len(episode_urls)

180

In [6]:
# Download the raw HTML of every episode page, in the same order as
# episode_urls, so the results stay aligned with episode_names.
all_episodes_raw_htmls = list()
for i, url in enumerate(tqdm_notebook(episode_urls)):
  # closing() releases the connection whether or not the status check raises,
  # and never touches a response left over from a previous iteration.
  with closing(get(url, headers={'content-type': 'text/html;', 'User-Agent': 'Mozilla Firefox'}, timeout=30)) as resp:
    if not resp.ok:
      # Stop rather than store an error page as a script or skip an entry,
      # either of which would corrupt or misalign the corpus.
      raise RuntimeError(f"Failed at episode number {i} ({url}): HTTP {resp.status_code}")
    all_episodes_raw_htmls.append(resp.content)

HBox(children=(IntProgress(value=0, max=180), HTML(value='')))




In [0]:
# Build the HTML-to-text converter and switch on its ignore_* options
# for emphasis, images, links and tables.
h = html2text.HTML2Text()
for option in ('ignore_emphasis', 'ignore_images', 'ignore_links', 'ignore_tables'):
  setattr(h, option, True)

In [8]:
# Convert each downloaded page to text and split it into a list of lines
# (one list per episode).
raw_texts = [
  h.handle(page_bytes.decode()).strip().split('\n')
  for page_bytes in tqdm_notebook(all_episodes_raw_htmls)
]

HBox(children=(IntProgress(value=0, max=180), HTML(value='')))




In [11]:
# Count how often each distinct line occurs across all episodes.
c = Counter(chain.from_iterable(raw_texts))
c.most_common(10)

[('', 62594),
 ('  ', 21522),
 ('    ', 369),
 ('JERRY', 344),
 ('(Scene ends)', 258),
 ('ELAINE', 256),
 ('KRAMER', 238),
 ('New scene.  ', 210),
 ('GEORGE', 185),
 ('Looking for a great gift idea for the holidays?  ', 182)]

In [12]:
# Summary statistics of the per-line occurrence counts.
ser = pd.Series([*c.values()])
ser.describe()

count    84805.000000
mean         2.189317
std        227.322782
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max      62594.000000
dtype: float64

In [13]:
# Inspect lines that occur between 4 and 9 times and whose lower-cased,
# stripped form does not start with letters followed by ": ".
{
  line
  for line, n_occurrences in c.items()
  if 3 < n_occurrences < 10
  and not re.match('([a-z])+: ', line.lower().strip())
}

{'"Well,',
 '% Opening monologue',
 "'ting' of a microwave.",
 '(Enter Kramer)',
 '(George enters)',
 '(George enters)  ',
 '(Kramer enters the apartment)',
 '(Kramer walks in)',
 '(Scene ends.)',
 '(Scene: At Elaine\x92s apartment)',
 '(Scene: At Jerry\x92s apartment)',
 "(Scene: Jerry's Apartment)  ",
 '(commercial)',
 '(end of flashback)',
 "(hurriedly) Why d'you wear the same dress all the time? Hello.",
 "(listens) Oh really? Well you can't break up with me over the phone.",
 '(phone rings)',
 '(pointedly) By myself.',
 '*knock, knock, knock*',
 '.',
 '...',
 '<Elaine enters>',
 '<Spellchecked and reformatted by Mike "The News Guy">  ',
 '<break>',
 'A kid leans out the window of a parked Volvo.',
 'A member of staff approaches.',
 'A sandwich of epic proportions. He hums to himself as he piles sliced',
 'ACT ONE',
 'ALLISON',
 'AMANDA',
 'Advertisement',
 'Along the sidewalk comes a happy-looking Newman, pedalling the Schwinn',
 'Around beautiful name, it is also a living tribute

In [0]:
# Any line text seen more than 50 times across the corpus is treated as boilerplate.
# NOTE(review): the recorded most_common output shows bare speaker lines such as
# 'JERRY' occurring more than 50 times, so they are removed too - confirm that is intended.
header_and_footer_lines = {
  line for line, n_occurrences in c.items() if n_occurrences > 50
}

# Prefixes compared against the line exactly as written (case-sensitive).
_DROP_PREFIXES = (
  '[End]', '#', '=', 'Episode', '%', 'Written by', "Directed by",
  "Broadcasted:", "Stars:", "Jason Alexander",
)
# Prefixes compared against the lower-cased line.
_DROP_PREFIXES_LOWER = ('end of show', 'originally aired', "[transcribed by")
# Substrings that disqualify a line wherever they occur in it.
_DROP_SUBSTRINGS = ('.........', '---------')

def keep_line(line):
  """Decide whether a script line stays in the corpus.

  Returns False when the line is one of `header_and_footer_lines`, contains
  'spell check' (case-insensitive), starts with one of the known metadata
  prefixes, or contains a long run of dots or dashes; returns True otherwise.
  The line is tested as given, without stripping surrounding whitespace.
  """
  if line in header_and_footer_lines:
    return False
  lowered = line.lower()
  if 'spell check' in lowered or lowered.startswith(_DROP_PREFIXES_LOWER):
    return False
  if line.startswith(_DROP_PREFIXES):
    return False
  return not any(marker in line for marker in _DROP_SUBSTRINGS)

# Drop boilerplate lines from every episode, keeping the per-episode grouping.
raw_texts_filt = [
  list(filter(keep_line, episode_lines)) for episode_lines in raw_texts
]


In [0]:
# Collapse each episode's remaining lines into one space-separated string.
episode_texts = list(map(' '.join, raw_texts_filt))

Speaker tags are capitalized inconsistently: sometimes the lines read "JERRY:..." and sometimes "Jerry:...". Let's normalize them all to standard capitalization (e.g. "Jerry:").

In [0]:
# Capitalize every run of letters that is immediately followed by a colon,
# so "JERRY:" and "jerry:" both become "Jerry:".
# NOTE(review): the pattern matches any such word, not only speaker names.
def _capitalize_tag(match):
  return match.group(0).capitalize()

_tag_pattern = re.compile(r'([A-Za-z]+:)')
episode_texts = [_tag_pattern.sub(_capitalize_tag, text) for text in episode_texts]

In [67]:
# Preview one cleaned episode, wrapped to 130 columns.
print(textwrap.fill(episode_texts[16], 130))

LaBiosa (as Antonio). [Setting: Night club] Jerry: I'm not a foodie. I don't, "Oh, this is too rare. Oh, it's too salty." Just eat
it and shut up. I'll eat anywhere, whatever they're having. I have eaten rotten rolls off of room service trays in hotel hallways.
I have. It's not a joke. This is my life. I don't know, somebody left it. Why would someone poison a roll, and leave it in a
hallway for some comic coming down at two o' clock in the morning? Why would they do that? Sometimes you go to a nice restaurant,
they put the check in a little book. What is this? The story of the bill? "Once upon a time, there were some very hungry people.."
What is this? A little gold tassle hanging down? Am I graduating from the restaurant? What is this about? [Setting: A restaurant]
(Jerry, George, and Elaine are all eating at an Italian restaurant. George hasn't eaten anything) Elaine: Do you want some of
mine? Jerry: Take some of mine. George: Why do I get pesto? Why do I think I'll like it? I keep 

In [68]:
# Mount Google Drive at /gdrive (Colab-only); the corpus files below are written under it.
from google.colab import drive
drive.mount('/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
import os

In [0]:
# Directory on the mounted drive that receives one text file per episode.
DATA_DIR = '/gdrive/My Drive/projects/seinfeldvision/corpus'
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(DATA_DIR, exist_ok=True)

In [71]:
# Write one text file per episode, named after the episode.
# zip() stops at the shorter input, so check the two sequences are in step first.
assert len(episode_names) == len(episode_texts), "episode names and texts are misaligned"
# zip() has no length, so pass the total explicitly for a real progress bar.
for episode_name, episode_text in tqdm_notebook(zip(episode_names, episode_texts), total=len(episode_texts)):
  # Explicit encoding so the output does not depend on the platform default.
  with open(os.path.join(DATA_DIR, f'{episode_name}.txt'), 'wt', encoding='utf-8') as f:
    f.write(episode_text)
len(os.listdir(DATA_DIR))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




180

In [0]:
# Write the whole corpus to a single file, one episode per line.
# Explicit encoding so the output does not depend on the platform default.
with open('/gdrive/My Drive/projects/seinfeldvision/whole_corpus.txt', 'wt', encoding='utf-8') as f:
  for t in episode_texts:
    f.write(t)
    f.write('\n')