In [14]:
import re
import os
import csv
import pandas as pd

In [2]:
from bs4 import BeautifulSoup
import urllib.request

In [5]:
# Decade labels "1950s" .. "2010s", in chronological order.
decades = [str(year) + 's' for year in range(1950, 2020, 10)]
# Filled later: decade label -> DataFrame of that decade's shows.
decades_dict = dict()

In [4]:
# Takes in one of the decade CSV show lists and returns it as a formatted dataframe
def csvToDataframe(filename):
    """Load a headerless decade show-list CSV into a DataFrame.

    :param filename: path of the CSV (anything ``pandas.read_csv`` accepts)
    :return: DataFrame with the columns title, start, end, wiki_url and
             transcript_url, assigned in that order to the CSV's fields
    """
    column_names = ['title', 'start', 'end', 'wiki_url', 'transcript_url']
    return pd.read_csv(filename, names=column_names, header=None)

In [7]:
# Read each decade's show list from "src/<year>_shows.csv" into decades_dict,
# reporting every file as soon as it has been loaded.
for decade in decades:
    year = decade[:-1]  # drop the trailing "s": "1950s" -> "1950"
    path = "src/" + year + "_shows.csv"
    decades_dict.update({decade: csvToDataframe(path)})
    print(path + " has been processed")

src/1950_shows.csv has been processed
src/1960_shows.csv has been processed
src/1970_shows.csv has been processed
src/1980_shows.csv has been processed
src/1990_shows.csv has been processed
src/2000_shows.csv has been processed
src/2010_shows.csv has been processed


In [9]:
print(decades_dict)

{'1950s':          title      start       end  \
0  I Love Lucy  Oct. 1951  May 1957   

                                    wiki_url  \
0  https://en.wikipedia.org/wiki/I_Love_Lucy   

                                      transcript_url  
0  https://www.springfieldspringfield.co.uk/episo...  , '1960s':                 title      start        end  \
0     The Flintstones  Sep. 1960  Apr. 1966   
1           Bewitched  Sep. 1964  Mar. 1972   
2   The Addams Family  Sep. 1964  Apr. 1966   
3        The Munsters  Sep. 1964   May 1966   
4           Get Smart  Sep. 1965   May 1970   
5  I Dream of Jeannie  Sep. 1965   May 1970   

                                            wiki_url  \
0      https://en.wikipedia.org/wiki/The_Flintstones   
1            https://en.wikipedia.org/wiki/Bewitched   
2  https://en.wikipedia.org/wiki/The_Addams_Famil...   
3         https://en.wikipedia.org/wiki/The_Munsters   
4            https://en.wikipedia.org/wiki/Get_Smart   
5   https://en.wikipedia.org

In [11]:
# Flatten the show titles of every decade, in decade order, into one list.
shows = [
    row['title']
    for decade in decades
    for _, row in decades_dict[decade].iterrows()
]

In [13]:
print(len(shows))

43


In [112]:
# -*- coding: utf-8 -*-
import re
# Regex building blocks for split_into_sentences. They are raw strings so that
# regex escapes such as \s are not parsed as (invalid) Python string escapes,
# which raises a SyntaxWarning on recent Python versions.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|edu|me)"
digits = r"([0-9])"
multiple_dots = r'\.{2,}'

def split_into_sentences(text) -> list:
    """
    Split the text into sentences.

    Periods that do not end a sentence (titles, single initials, acronyms,
    decimals, website suffixes, ellipses) are temporarily rewritten to the
    marker "<prd>"; every real sentence end gets a "<stop>" marker, on which
    the text is finally split.

    If the text contains substrings "<prd>" or "<stop>", they would lead 
    to incorrect splitting because they are used as markers for splitting.

    :param text: text to be split into sentences
    :type text: str

    :return: list of sentences, each stripped of surrounding whitespace;
             an empty trailing piece is dropped
    :rtype: list[str]
    """
    # Pad so that patterns anchored on a surrounding space also match at the edges.
    text = " " + text + "  "
    text = text.replace("\n", " ")
    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = re.sub(digits + "[.]" + digits, r"\1<prd>\2", text)
    # An ellipsis keeps all of its dots and then ends the sentence.
    text = re.sub(multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a typical sentence starter does end the sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    # Move terminal punctuation outside a closing quote so the quote stays
    # attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")
    # Every remaining terminator is a real sentence end.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    # Restore the protected periods.
    text = text.replace("<prd>", ".")
    sentences = [s.strip() for s in text.split("<stop>")]
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences

In [114]:
def clean(txt):
    """Strip non-dialogue noise from a raw transcript.

    Steps, in order:
      1. collapse runs of spaces into a single space;
      2. delete every span that opens with "(" or "[" and closes with ")" or
         "]" on the same line (the match never crosses a newline);
      3. delete all-caps words immediately followed by a colon
         (presumably speaker labels such as "JERRY:" - mixed-case labels
         are left in place);
      4. drop empty lines and trim the result.

    Lines are re-joined with "\n" rather than ``os.linesep``: the text stays
    in memory and later steps only treat "\n" as a line break, so a
    platform-specific "\r\n" would leave stray "\r" characters behind.

    :param txt: raw transcript text
    :return: cleaned transcript, lines separated by "\n"
    """
    collapsed = re.sub(r' +', ' ', txt)
    without_brackets = re.sub(r"[\(\[].*?[\)\]]", "", collapsed)
    without_labels = re.sub(r'\b[A-Z]+\b\:', '', without_brackets)
    kept_lines = [line for line in without_labels.splitlines() if line]
    return "\n".join(kept_lines).strip()

In [17]:
# Scratch check of the SEID layout "sNNeNN": the season digits sit at [1:3],
# the episode digits run from index 4 to the end.
seid = "s01e14"
season_part, episode_part = seid[1:3], seid[4:]
print("s = " + season_part)
print("e = " + episode_part)

s = 01
e = 14


In [47]:
# For a dataframe with the columns 'title' and 'transcript_url'
# Processes all shows in the dataframe, getting their episode urls, SEID's, and respective season and episode numbers
# Returns this data as ONE combined dataframe (one row per episode, the show's title repeated on each of its rows)
# (this df will be used for populating each season's corpus, which will be placed inside the respective decade folder)
# (so this should be organized by decade)
def getEpisodes(df):
    """Collect the episode links of every show listed in ``df``.

    For each row, the page at ``transcript_url`` is downloaded and every
    anchor whose href starts with "view_episode_scripts" is recorded.

    :param df: DataFrame with at least the columns 'title' and 'transcript_url'
    :return: DataFrame with the columns title, season, episode, url, seid and
             a fresh 0..n-1 index. ``seid`` is the last six characters of the
             href (expected to look like "s01e14"); ``season`` is seid[1:3]
             and ``episode`` is seid[4:], both kept as strings.
    """
    columns = ['title', 'season', 'episode', 'url', 'seid']
    # Rows are gathered in a plain list and turned into a frame once at the
    # end: DataFrame.append is gone from current pandas and copied the whole
    # frame on every call.
    records = []
    for _, row in df.iterrows():
        curr_show, curr_url = row['title'], row['transcript_url']
        # The soup is built while the response is open; `with` then closes it.
        with urllib.request.urlopen(curr_url) as page:
            soup = BeautifulSoup(page, "lxml")

        for link in soup.find_all('a'):
            route = link.get('href')
            # Anchors without an href give None; skip them instead of
            # failing on None.startswith.
            if not route or not route.startswith("view_episode_scripts"):
                continue
            seid = route[-6:]
            records.append({
                'title': curr_show,
                'season': seid[1:3],
                'episode': seid[4:],
                'url': route,
                'seid': seid,
            })
    return pd.DataFrame(records, columns=columns)
        
    # return pd.DataFrame({'season': s_list, 'episode': e_list, 'url': url_list, 'seid': seid_list})

In [48]:
episodes_1950 = getEpisodes(decades_dict['1950s'])

In [49]:
episodes_1950.sample(5)

Unnamed: 0,title,season,episode,url,seid
130,I Love Lucy,5,2,view_episode_scripts.php?tv-show=i-love-lucy-1...,s05e02
16,I Love Lucy,1,16,view_episode_scripts.php?tv-show=i-love-lucy-1...,s01e16
95,I Love Lucy,3,28,view_episode_scripts.php?tv-show=i-love-lucy-1...,s03e28
176,I Love Lucy,6,22,view_episode_scripts.php?tv-show=i-love-lucy-1...,s06e22
10,I Love Lucy,1,10,view_episode_scripts.php?tv-show=i-love-lucy-1...,s01e10


In [50]:
# Scrape the episode listings of the remaining decades, one dataframe each.
# Every call downloads the transcript index page of each show in that decade,
# so this cell needs network access.
episodes_1960 = getEpisodes(decades_dict['1960s'])
episodes_1970 = getEpisodes(decades_dict['1970s'])
episodes_1980 = getEpisodes(decades_dict['1980s'])
episodes_1990 = getEpisodes(decades_dict['1990s'])
episodes_2000 = getEpisodes(decades_dict['2000s'])
episodes_2010 = getEpisodes(decades_dict['2010s'])

In [51]:
# Tag each frame with a `name` attribute equal to its variable name; the
# export loop reads it back as `d.name` to build the CSV filename.
# NOTE(review): this is an ad-hoc Python attribute, not a column - presumably
# it does not carry over to derived frames (e.g. .copy()); confirm before
# relying on it elsewhere.
episodes_1950.name = 'episodes_1950'
episodes_1960.name = 'episodes_1960'
episodes_1970.name = 'episodes_1970'
episodes_1980.name = 'episodes_1980'
episodes_1990.name = 'episodes_1990'
episodes_2000.name = 'episodes_2000'
episodes_2010.name = 'episodes_2010'

In [52]:
# All per-decade episode frames, in chronological order.
to_csv = [episodes_1950, episodes_1960, episodes_1970, episodes_1980, episodes_1990, episodes_2000, episodes_2010]

# Write each frame to "<name>.csv" (relative path), using the `name`
# attribute previously attached to the frame.
for d in to_csv:
    filename = d.name + '.csv'
    # Called with pandas defaults, so the row index is written as well.
    d.to_csv(filename)

In [None]:
# def get transcripts():

#base = "https://www.springfieldspringfield.co.uk/"
        # curr_show, curr_url = row['title'], base + row['transcript_url']
        # response = urllib.request.urlopen(curr_url)
        # soup = BeautifulSoup(response, 'html.parser')
        # raw_txt = soup.find("div", {"class":"scrolling-script-container"}).get_text()
        # transcript = clean(raw_txt)

In [53]:
# Work on a copy so the scraped 1960s frame itself is left as it is.
copy_1960 = episodes_1960.copy()

In [56]:
# One group per (show title, season) pair; group keys are (title, season) tuples.
grouped_1960 = copy_1960.groupby(['title', 'season'])

In [57]:
grouped_1960['episode'].count()

title               season
Bewitched           01        36
                    02        38
                    03        33
                    04        33
                    05        30
                    06        30
                    07        28
                    08        26
Get Smart           01        30
                    02        30
                    03        26
                    04        26
                    05        26
I Dream of Jeannie  01        30
                    02        31
                    03        26
                    04        26
                    05        26
The Addams Family   01        34
                    02        30
The Flintstones     01        25
The Munsters        01        38
Name: episode, dtype: int64

In [58]:
grouped_1960.head()

Unnamed: 0,title,season,episode,url,seid
0,The Flintstones,01,01,view_episode_scripts.php?tv-show=the-flintston...,s01e01
1,The Flintstones,01,02,view_episode_scripts.php?tv-show=the-flintston...,s01e02
2,The Flintstones,01,03,view_episode_scripts.php?tv-show=the-flintston...,s01e03
3,The Flintstones,01,04,view_episode_scripts.php?tv-show=the-flintston...,s01e04
4,The Flintstones,01,05,view_episode_scripts.php?tv-show=the-flintston...,s01e05
...,...,...,...,...,...
632,I Dream of Jeannie,05,01,view_episode_scripts.php?tv-show=i-dream-of-je...,s05e01
633,I Dream of Jeannie,05,02,view_episode_scripts.php?tv-show=i-dream-of-je...,s05e02
634,I Dream of Jeannie,05,03,view_episode_scripts.php?tv-show=i-dream-of-je...,s05e03
635,I Dream of Jeannie,05,04,view_episode_scripts.php?tv-show=i-dream-of-je...,s05e04


In [62]:
grouped_1960.first() 

Unnamed: 0_level_0,Unnamed: 1_level_0,episode,url,seid
title,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bewitched,1,1,view_episode_scripts.php?tv-show=bewitched-196...,s01e01
Bewitched,2,1,view_episode_scripts.php?tv-show=bewitched-196...,s02e01
Bewitched,3,1,view_episode_scripts.php?tv-show=bewitched-196...,s03e01
Bewitched,4,1,view_episode_scripts.php?tv-show=bewitched-196...,s04e01
Bewitched,5,1,view_episode_scripts.php?tv-show=bewitched-196...,s05e01
Bewitched,6,1,view_episode_scripts.php?tv-show=bewitched-196...,s06e01
Bewitched,7,1,view_episode_scripts.php?tv-show=bewitched-196...,s07e01
Bewitched,8,1,view_episode_scripts.php?tv-show=bewitched-196...,s08e01
Get Smart,1,1,view_episode_scripts.php?tv-show=get-smart-196...,s01e01
Get Smart,2,1,view_episode_scripts.php?tv-show=get-smart-196...,s02e01


In [90]:
# for idx, group in grouped_1960
grouped_1960.get_group(('The Flintstones', '01'))

Unnamed: 0,title,season,episode,url,seid
0,The Flintstones,1,1,view_episode_scripts.php?tv-show=the-flintston...,s01e01
1,The Flintstones,1,2,view_episode_scripts.php?tv-show=the-flintston...,s01e02
2,The Flintstones,1,3,view_episode_scripts.php?tv-show=the-flintston...,s01e03
3,The Flintstones,1,4,view_episode_scripts.php?tv-show=the-flintston...,s01e04
4,The Flintstones,1,5,view_episode_scripts.php?tv-show=the-flintston...,s01e05
5,The Flintstones,1,6,view_episode_scripts.php?tv-show=the-flintston...,s01e06
6,The Flintstones,1,7,view_episode_scripts.php?tv-show=the-flintston...,s01e07
7,The Flintstones,1,8,view_episode_scripts.php?tv-show=the-flintston...,s01e08
8,The Flintstones,1,9,view_episode_scripts.php?tv-show=the-flintston...,s01e09
9,The Flintstones,1,10,view_episode_scripts.php?tv-show=the-flintston...,s01e10


In [91]:
grouped_1960.groups

{('Bewitched', '01'): [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60], ('Bewitched', '02'): [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98], ('Bewitched', '03'): [99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131], ('Bewitched', '04'): [132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164], ('Bewitched', '05'): [165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194], ('Bewitched', '06'): [195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 21

In [93]:
# Walk every (title, season) group and print the stored url of each episode row.
# The values are site-relative routes ("view_episode_scripts.php?..."), not full URLs.
for grp_idx, grp in grouped_1960:
    for row_idx, row in grp.iterrows():
        print(row['url'])

view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e01
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e02
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e03
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e04
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e05
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e06
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e07
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e08
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e09
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e10
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e11
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e12
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e13
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e14
view_episode_scripts.php?tv-show=bewitched-1964&episode=s01e15
view_episode_scripts.php?tv-show=bewitched-1964&episode

In [108]:
# grp_idx is the (title, season) key of the group; print its title part,
# which therefore repeats once per season of a show.
for grp_idx, grp in grouped_1960:
    print(grp_idx[0])

Bewitched
Bewitched
Bewitched
Bewitched
Bewitched
Bewitched
Bewitched
Bewitched
Get Smart
Get Smart
Get Smart
Get Smart
Get Smart
I Dream of Jeannie
I Dream of Jeannie
I Dream of Jeannie
I Dream of Jeannie
I Dream of Jeannie
The Addams Family
The Addams Family
The Flintstones
The Munsters


In [110]:
# Opening part of the per-episode start marker; the writer appends the episode number and "}".
a_mark = "{START OF EPISODE "
# Opening part of the per-episode end marker; completed the same way.
z_mark = "{END OF EPISODE "
# Site root that the relative episode routes are appended to.
base = "https://www.springfieldspringfield.co.uk/"

In [111]:
def getScript(url):
    """Download one episode page and return its transcript as sentences.

    The text of the page's ``<div class="scrolling-script-container">`` is
    passed through ``clean`` and then ``split_into_sentences``.

    :param url: absolute URL of the episode transcript page
    :return: list of sentence strings (already split - callers must not
             split the result again)
    :raises ValueError: if the page contains no transcript container
    """
    # The soup is built while the response is open; `with` then closes it.
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page, 'html.parser')
    container = soup.find("div", {"class":"scrolling-script-container"})
    if container is None:
        # Fail with a message naming the page rather than an AttributeError on None.
        raise ValueError("no transcript container found at " + url)
    cleaned = clean(container.get_text())
    return split_into_sentences(cleaned)

In [118]:
# Looping through each season of TV in our 1960s df and writing one corpus
# file per season: "<show_title_with_underscores>_season<NN>.txt"
for grp_idx, grp in grouped_1960:
    title_raw, season = grp_idx[0], grp_idx[1]
    title_clean = re.sub(r' ', '_', title_raw.lower())
    filepath = title_clean + '_season' + season + '.txt'
    # `with` closes the file even when a download fails part-way through
    with open(filepath, 'w') as f:
        for row_idx, row in grp.iterrows():
            # add episode start marker
            f.write(a_mark + row['episode'] + "}" + "\n")
            # getScript already returns the transcript split into sentences;
            # splitting that list a second time raises a TypeError
            lines = getScript(base + row['url'])
            # one sentence per line, no newline after the last one
            f.write("\n".join(lines))
            # add episode end marker (no trailing newline: the next episode's
            # start marker follows it directly on the same line)
            f.write("\n" + z_mark + row['episode'] + "}")

In [201]:
# Load one written season corpus back into memory as a single string.
with open('./get_smart_season03.txt') as f:
    season_corpus = f.read()

In [261]:
# Matches one whole episode: the start marker, the transcript (captured, lazily,
# and across newlines thanks to DOTALL) and the end marker.
# Only two-digit episode numbers are matched (\d{2}).
pattern = re.compile(r'\{START OF EPISODE \d{2}\}(.+?)\{END OF EPISODE \d{2}\}', re.DOTALL)

In [258]:
# Split the corpus on whole episodes (re.S is the short name of re.DOTALL).
# Because the regex has a capture group, re.split keeps every captured transcript:
# odd indices hold the episode texts, even indices hold whatever lies before,
# between and after the marked episodes.
episodes = re.split(r'\{START OF EPISODE \d{2}\}(.+?)\{END OF EPISODE \d{2}\}', season_corpus, flags=re.S)

In [259]:
len(episodes)
# print(episodes)

53

In [263]:
episodes[51]
# print(season_corpus[50:100])

'\n-  -  Oh, so it\'s you, Bodecker.\nSmart, you\'re not supposed to say, "Oh, so it\'s you, Bodecker".\n- First, the password.\n- Right.\n"A wet duck only flies at midnight".\n"The night is the devil\'s playground".\n"When the swallows come back to Campobello, there\'ll be bluebirds over the white cliffs of Dover".\n- Okay, Smart.\n- Oh, so it\'s you, Bodecker.\n- No.\n- "No"?\nYou\'re always getting me and Bodecker mixed up.\nI\'m Rosencrantz.\n- Oh, so it\'s you, Rosencrantz.\n- No.\nYou\'re supposed to use my code name on this case.\n- What\'s your code name?\n- Bodecker.\n- Bodecker.\n- Do you have them?\nYes, I have them, but they\'re for your eyes alone.\nThey\'re top secret.\nHere are your sealed orders.\nBodecker: The envelope wasn\'t sealed.\nWell, my orders were that the envelope wasn\'t to be sealed.\n- Can I see your orders?\n- That\'s impossible.\n- Why?\n- Because my orders are sealed.\nThis is an odd assignment.\nWe\'re to break into this mansion and take a picture of a

In [264]:
# Read a second season corpus as one string.
with open('./the_addams_family_season02.txt') as fam:
    addams_corpus = fam.read()

# Split it with the compiled episode pattern; the pattern's capture group makes
# re.split return the episode transcripts at the odd indices of the list.
addams = re.split(pattern, addams_corpus)

In [265]:
len(addams)

61

In [266]:
addams[59]

"\nThey're creepy and they're kooky Mysterious and spooky They're altogether ooky The Addams family The house is a museum When people come to see 'em They really are a scream The Addams family Neat.\nSweet.\nPetite.\nSo get a witch's shawl on A broomstick you can crawl on We're gonna pay a call on The Addams family Marvelous idea, Morticia, putting this old saw together again.\nI thought it would be nice for Uncle Fester.\nThis is fun.\nMakes me all goose-bumpy.\nIt's always so exciting seeing how these games turn out.\nJust about another foot to go, old man.\nZounds.\n- The electricity's gone off.\n- Sure!\nI knew something would spoil it.\nNow, now, Uncle Fester, it only missed you by a hair.\nNo matter, I'll get another fuse.\nNo, forget it.\nI'm out of the mood now.\nI'll be up in my tree house.\nGomez, darling, wasn't that the doorbell?\nI believe it was.\nThat means the electricity is okay.\nIt must have been Uncle Fester who blew a fuse.\nI'll have Pugsley bring him a fresh one.