In [23]:
import re
import os
import csv
import pandas as pd
from scripts import cleaner
from scripts import entro

In [4]:
decades = ['1950s', '1960s', '1970s', '1980s', '1990s', '2000s', '2010s']

In [5]:
meta = {}
for decade in decades:
    path = "metadata/episodes_" + decade[:-1] + ".csv"
    meta[decade] = pd.read_csv(path, header=0, converters={'season': str, 'episode': str})

In [6]:
for decade in decades:
    curr_df = meta[decade]
    meta[decade] = curr_df.groupby(['title', 'season'])

In [3]:
pattern_all = re.compile(r'\{START OF EPISODE \d{1,2}\}(.+?)\{END OF EPISODE \d{1,2}\}', re.DOTALL)
pattern_50s = re.compile(r'\{START OF EPISODE \d{2}\}(.+?)\{END OF EPISODE \d{2}\}', re.DOTALL)

In [7]:
def obtainConditionalEntropy(decade):
    conditional = []
    for grp_idx, grp in meta[decade]:
        t_raw, season = grp_idx[0], grp_idx[1]
        if int(season) < 7:
            t_clean = re.sub(r' ', '_', t_raw.lower())
            path = "transcripts/" + decade + "/" + t_clean + "_season" + str(season) + ".txt"
            with open(path) as file:
                corpus = file.read()
            if decade == '1950s':
                episodes = re.split(pattern_50s, corpus)
            else:
                episodes = re.split(pattern_all, corpus)
            for i in range(1, len(episodes) - 1, 2):
                c = entro.conditional_entropy(episodes[i])
                idx = int((i - 1)/2)
                data = (t_raw, season, idx, c)
                conditional.append(data)
    return conditional

In [8]:
conditional_1950 = obtainConditionalEntropy('1950s')
conditional_1960 = obtainConditionalEntropy('1960s')

In [9]:
conditional_1950[-5:]

[('I Love Lucy', '06', 13, 0.0004910038802611387),
 ('I Love Lucy', '06', 14, 0.00024458230678343874),
 ('I Love Lucy', '06', 15, 0.0008600719151500931),
 ('I Love Lucy', '06', 16, 0.0007101439624727885),
 ('I Love Lucy', '06', 17, 0.001079135561577803)]

## test

In [11]:
episodes = []

for grp_idx, grp in meta['2010s']:
    t_raw, season = grp_idx[0], grp_idx[1]
    if int(season) < 7:
        t_clean = re.sub(r' ', '_', t_raw.lower())
        path = "transcripts/" + decade + "/" + t_clean + "_season" + str(season) + ".txt"

        with open(path) as file:
            corpus = file.read()

        if decade == '1950s':
            episodes = re.split(pattern_50s, corpus)
        else:
            episodes = re.split(pattern_all, corpus)

In [12]:
episodes[1]

'\n1 - - Michael designs the new neighborhood.\nThe Bad Place gets to choose the four new humans.\nThe other residents will be Janet Babies.\nYou know, if we pooled our resources, we could be done in two shakes of a lamb\'s Derek.\nThanks, Derek.\nWhen you fail, the four humans will be tortured for all eternity.\nBut guess who\'s going to be doing the torturing?\nWe built a Michael suit.\nI can\'t do it.\nIt\'s too scary.\nHi, John, I\'m the architect.\nCome on in.\nThe Bad Place didn\'t pick the worst people.\nThey picked the people who would be the worst for us.\nYou need to erase my memory and reboot me.\nHi, Chidi, I\'m Eleanor.\nCome on in.\nWow!\nJust wow!\nIt\'s perfect.\nEverything is just perfect.\nI am so, so happy.\nI\'m glad you like it.\nIt\'s so quaint it feels cozy, but also vibrant and limitless.\nI can\'t believe how how utterly, completely, 100% perfect everything is You know, let\'s keep this moving.\nOh, yeah, sure.\nHere is your new home, complete with two of your 

In [13]:
for token in episodes[1]:
    print(token)



1
 
-
 
-
 
M
i
c
h
a
e
l
 
d
e
s
i
g
n
s
 
t
h
e
 
n
e
w
 
n
e
i
g
h
b
o
r
h
o
o
d
.


T
h
e
 
B
a
d
 
P
l
a
c
e
 
g
e
t
s
 
t
o
 
c
h
o
o
s
e
 
t
h
e
 
f
o
u
r
 
n
e
w
 
h
u
m
a
n
s
.


T
h
e
 
o
t
h
e
r
 
r
e
s
i
d
e
n
t
s
 
w
i
l
l
 
b
e
 
J
a
n
e
t
 
B
a
b
i
e
s
.


Y
o
u
 
k
n
o
w
,
 
i
f
 
w
e
 
p
o
o
l
e
d
 
o
u
r
 
r
e
s
o
u
r
c
e
s
,
 
w
e
 
c
o
u
l
d
 
b
e
 
d
o
n
e
 
i
n
 
t
w
o
 
s
h
a
k
e
s
 
o
f
 
a
 
l
a
m
b
'
s
 
D
e
r
e
k
.


T
h
a
n
k
s
,
 
D
e
r
e
k
.


W
h
e
n
 
y
o
u
 
f
a
i
l
,
 
t
h
e
 
f
o
u
r
 
h
u
m
a
n
s
 
w
i
l
l
 
b
e
 
t
o
r
t
u
r
e
d
 
f
o
r
 
a
l
l
 
e
t
e
r
n
i
t
y
.


B
u
t
 
g
u
e
s
s
 
w
h
o
'
s
 
g
o
i
n
g
 
t
o
 
b
e
 
d
o
i
n
g
 
t
h
e
 
t
o
r
t
u
r
i
n
g
?


W
e
 
b
u
i
l
t
 
a
 
M
i
c
h
a
e
l
 
s
u
i
t
.


I
 
c
a
n
'
t
 
d
o
 
i
t
.


I
t
'
s
 
t
o
o
 
s
c
a
r
y
.


H
i
,
 
J
o
h
n
,
 
I
'
m
 
t
h
e
 
a
r
c
h
i
t
e
c
t
.


C
o
m
e
 
o
n
 
i
n
.


T
h
e
 
B
a
d
 
P
l
a
c
e
 
d
i
d
n
'
t
 
p
i
c
k
 
t
h
e
 
w
o
r
s
t
 
p
e
o
p
l
e
.


T
h
e
y


In [14]:
with open('test.txt') as file:
    corp = file.read()

In [16]:
for token in corp[:len(corp)-2]:
    print(token)

M
i
c
h
a
e
l


d
e
s
i
g
n
s


t
h
e


n
e
w


n
e
i
g
h
b
o
r
h
o
o
d


.




T
h
e


B
a
d


P
l
a
c
e


g
e
t
s


t
o


c
h
o
o
s
e


t
h
e


f
o
u
r


n
e
w


h
u
m
a
n
s


.




T
h
e


o
t
h
e
r


r
e
s
i
d
e
n
t
s


w
i
l
l


b
e


J
a
n
e
t


B
a
b
i
e
s
.




Y
o
u


k
n
o
w


,


i
f


w
e


p
o
o
l
e
d


o
u
r


r
e
s
o
u
r
c
e
s


,


w
e


c
o
u
l
d


b
e


d
o
n
e


i
n


t
w
o


s
h
a
k
e
s


o
f


a


l
a
m
b


'


s


D
e
r
e
k


.




T
h
a
n
k
s


,


D
e
r
e
k
.




W
h
e
n


y
o
u


f
a
i
l


,


t
h
e


f
o
u
r


h
u
m
a
n
s


w
i
l
l


b
e


t
o
r
t
u
r
e
d


f
o
r


a
l
l


e
t
e
r
n
i
t
y


In [17]:
n = []
n = entro.ngram_list(corp, 2)

In [18]:
n[-5:]

['ni', 'it', 'ty', 'y\n', '\n.']

## experiment

In [25]:
def obtainExperimentalEntropy(decade):
    conditional = []
    for grp_idx, grp in meta[decade]:
        t_raw, season = grp_idx[0], grp_idx[1]
        if int(season) < 7:
            t_clean = re.sub(r' ', '_', t_raw.lower())
            path = "transcripts/" + decade + "/" + t_clean + "_season" + str(season) + ".txt"
            with open(path) as file:
                corpus = file.read()
            if decade == '1950s':
                episodes = re.split(pattern_50s, corpus)
            else:
                episodes = re.split(pattern_all, corpus)
            for i in range(1, len(episodes) - 1, 2):
                c = entro.experimental_entropy(episodes[i])
                idx = int((i - 1)/2)
                data = (t_raw, season, idx, c)
                conditional.append(data)
    return conditional

In [26]:
experiment_1950 = obtainExperimentalEntropy('1950s')

AttributeError: module 'scripts.entro' has no attribute 'experimental_entropy'