In [80]:
import pandas as pd
import os
from IPython.display import display

import string
import re
import itertools
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/julie.fisher/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Grab and store the data
def read_script(file_path):
    corpus = ''
    with open(file_path, 'r', encoding='latin-1') as l:
        for line in l:
            if (re.match('[^\d+]', line)
               ) and (re.match('^(?!\s*$).+', line)
                      ) and not (re.match('(.*www.*)|(.*http:*)', line)
                                ) and not (re.match('Sync and correct*', line)):
                line = re.sub('</?i>|</?font.*>', '', line)
                corpus = corpus + ' ' + line
    return corpus

def load_files_to_dict(file_path, return_dict):    
    for thing in os.scandir(file_path):
        if thing.is_dir():
            new_path = os.path.join(file_path, thing.name)
            new_dict = return_dict[thing.name] = {}
            load_files_to_dict(new_path, new_dict)
        elif thing.is_file:
            return_dict[thing.name] = read_script(f'{file_path}/{thing.name}')
    return return_dict

def convert_dict_df(script_dict):
    return pd.DataFrame.from_dict(script_dict, orient='index').reset_index().rename(columns={'index':'script_name', 0:'corpus'})

In [4]:
# Clean the text and create ngrams
def punct_tokens(df, text_col):
    newline_list = '\t\r\n'
    remove_newline = str.maketrans(' ', ' ', newline_list)
    punct_list = string.punctuation + '-‘_”'
    nopunct = str.maketrans('', '', punct_list)
    df['no_punct_tokens'] = df[text_col].fillna("").str.lower().str.translate(remove_newline).str.translate(nopunct).str.split()
    return df

def create_ngrams(df):
    stop = nltk.corpus.stopwords.words('english')
    df['unigrams'] = df['no_punct_tokens'].apply(lambda x: [item for item in x if item not in stop])
    df['bigrams'] = df['unigrams'].apply(lambda x:(list(nltk.bigrams(x))))
    df['trigrams'] = df['unigrams'].apply(lambda x:(list(nltk.trigrams(x))))
    return df

def create_ngram_df(script_dict, text_col):
    df = convert_dict_df(script_dict)
    df = punct_tokens(df, text_col)
    df = create_ngrams(df)
    return df

In [5]:
# Get frequency counts and normalized frequency
def frequency_ct(ngram_list):
    freq_dict = {}
    for ngram in ngram_list:
        if ngram not in freq_dict:
            freq_dict[ngram] = 1
        else:
            freq_dict[ngram] +=1
    return freq_dict

def dict_to_df(freq_dict, gram_name, corpus_name):
    if (type(gram_name)==str) and (type(corpus_name)==str):
        pass
    else:
        print('gram and corpus variables must be strings')
    freq_colname = corpus_name+'_frequency'
    df = pd.DataFrame.from_dict(freq_dict, orient='index'
                               ).reset_index().rename(columns={'index':gram_name, 0:freq_colname}
                                                     ).sort_values(freq_colname, ascending=False)
    return df

def normalized_freq(freq_df, corpus_name):
    freq_col_name = corpus_name + '_frequency'
    norm_col_name = corpus_name + '_norm_freq'
    total_ct = freq_df[freq_col_name].sum()
    freq_df[norm_col_name] = freq_df[freq_col_name]/total_ct
    return freq_df

def create_frequencies(ngram_list, gram_name, corpus_name):
    freq_dict = frequency_ct(ngram_list)
    freq_df = dict_to_df(freq_dict, gram_name, corpus_name)
    freq_df = normalized_freq(freq_df, corpus_name)
    return freq_df

In [None]:
# Compare test corpus to authentic corpus and rank corpora

def combine_test_authentic(test_freq_dict, authentic_freq):
    compare_dict = {}
    for script_group in list(test_freq_dict.keys()):
        df = test_freq_dict[script_group].merge(authentic_freq, on='unigram', how='outer').fillna(0)
        freq_cols = df.columns[df.columns.str.contains('norm_freq')]
        df['norm_freq_ratio'] = df.loc[(df[freq_cols[0]]!=0) & (df[freq_cols[1]]!=0), freq_cols[0]] / df.loc[(df[freq_cols[0]]!=0) & (df[freq_cols[1]]!=0), freq_cols[1]]
        compare_dict[script_group] = df
    return compare_dict

def get_ranking(compare_dict):
    results = pd.DataFrame(columns = ['script', 'high_ratio', 'low_ratio'])
    for script_group in compare_dict.keys():
        results = results.append(
            {'script':script_group,
             'high_ratio':compare_dict[script_group].sort_values('norm_freq_ratio', ascending=False).head(50)['norm_freq_ratio'].sum(),
             'low_ratio':compare_dict[script_group].sort_values('norm_freq_ratio').head(50)['norm_freq_ratio'].sum()
            }, ignore_index=True)
        results['combined_score'] = results['high_ratio'] - results['low_ratio']
        results = results.sort_values('combined_score')
        results['rank'] = range(1, 1+len(results))
        return results

# Read in file

The standard and very basic way to read in a file is to use `open()`. When using this function I'll read in the file line by line. This allows me to remove lines that aren't pertinent to the analysis before having to store them in memory.

In [6]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt'
with open(test_file, 'r') as l:
    for line in l:
        print(line, end='')

0
00:00:01,000 --> 00:00:04,000
Downloaded From www.AllSubs.org

1
00:00:00,105 --> 00:00:05,000
Shared by http://DJJ.HOME.SAPO.PT/

1
00:00:11,378 --> 00:00:13,880
You're traveling
through another dimension-

2
00:00:13,915 --> 00:00:17,017
a dimension not only of sight
and sound, but of mind,

3
00:00:17,052 --> 00:00:18,885
a journey into a wondrous land

4
00:00:18,920 --> 00:00:20,770
whose boundaries
are that of imagination.

5
00:00:20,805 --> 00:00:23,273
Your next stop,
the twilight zone.

6
00:00:57,057 --> 00:00:58,308
She's all set,
mr. Radin.

7
00:00:58,343 --> 00:01:00,176
How about the
sound system?

8
00:01:00,211 --> 00:01:01,428
You check that out?

9
00:01:01,463 --> 00:01:02,679
She's all
ready to go.

10
00:01:02,714 --> 00:01:05,815
I don't know where you
got your sound effects

11
00:01:05,850 --> 00:01:08,318
but you'd swear
a bomb was exploding.

12
00:01:08,935 --> 00:01:10,186
I mean a big bomb.

13
00:01:10,221 --> 00:01:12,689
That's precisely the way
it's

# Remove lines/characters

In the case of the provided scripts, the directions list the following things to remove:

- Any line that contains only numeric characters ('0', '1', ... '9')
- Any line that contains the string '-->'
- Blank lines
- The following chracters/character groups:
    - '\<i>'
    - '\</i>'
    - '\<font color=#00FF00>'
    - '\<font color=#00FFFF>'
    - '\<font color="#00ff00">'
    - '\<font color="#ff0000">'
    - '\</font>'
    - '#'
    - '-'
    - '('
    - ')'
    - '\xe2\x99\xaa'
    - 'www.AllSubs.org'
    - 'http://DJJ.HOME.SAPO.PT/'
    - 'Downloaded'
    - 'Shared'
    - 'Sync'
    - 'www.addic7ed.com'
    - 'n17t01'
    - '"'
    - '\n'
    
I'll be using the regex python library `re` to handle finding these things. Most of the regex patterns and logic used in this notebook can be found on the cheatsheet of [regexr.com](https://regexr.com/).
    
Looking at the initial sample I loaded in, the reason for the first three items is easily apparent;

- Each line block is numbered
- The second line of the block is two timestamps separated by '-->'
    - A quick internet search reveals that the `srt` file format is one of the most common file formats for subtitling and/or captioning. The timestamps are the start and stop times of the lines that follow within the line block
- There is an empty line between each line block

In [7]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt'
with open(test_file, 'r') as l:
    for line in l:
        print(line, end='')

0
00:00:01,000 --> 00:00:04,000
Downloaded From www.AllSubs.org

1
00:00:00,105 --> 00:00:05,000
Shared by http://DJJ.HOME.SAPO.PT/

1
00:00:11,378 --> 00:00:13,880
You're traveling
through another dimension-

2
00:00:13,915 --> 00:00:17,017
a dimension not only of sight
and sound, but of mind,

3
00:00:17,052 --> 00:00:18,885
a journey into a wondrous land

4
00:00:18,920 --> 00:00:20,770
whose boundaries
are that of imagination.

5
00:00:20,805 --> 00:00:23,273
Your next stop,
the twilight zone.

6
00:00:57,057 --> 00:00:58,308
She's all set,
mr. Radin.

7
00:00:58,343 --> 00:01:00,176
How about the
sound system?

8
00:01:00,211 --> 00:01:01,428
You check that out?

9
00:01:01,463 --> 00:01:02,679
She's all
ready to go.

10
00:01:02,714 --> 00:01:05,815
I don't know where you
got your sound effects

11
00:01:05,850 --> 00:01:08,318
but you'd swear
a bomb was exploding.

12
00:01:08,935 --> 00:01:10,186
I mean a big bomb.

13
00:01:10,221 --> 00:01:12,689
That's precisely the way
it's

Fortunately, the first two things can be identified using a single line of regex. The regex pattern '\d+' looks for patterns that only include digits (i.e. no letters).

In [8]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt'
with open(test_file, 'r') as l:
    for line in l:
        if re.match('\d+', line):
            print(line, end='')

0
00:00:01,000 --> 00:00:04,000
1
00:00:00,105 --> 00:00:05,000
1
00:00:11,378 --> 00:00:13,880
2
00:00:13,915 --> 00:00:17,017
3
00:00:17,052 --> 00:00:18,885
4
00:00:18,920 --> 00:00:20,770
5
00:00:20,805 --> 00:00:23,273
6
00:00:57,057 --> 00:00:58,308
7
00:00:58,343 --> 00:01:00,176
8
00:01:00,211 --> 00:01:01,428
9
00:01:01,463 --> 00:01:02,679
10
00:01:02,714 --> 00:01:05,815
11
00:01:05,850 --> 00:01:08,318
12
00:01:08,935 --> 00:01:10,186
13
00:01:10,221 --> 00:01:12,689
14
00:01:33,843 --> 00:01:35,712
15
00:01:35,747 --> 00:01:36,963
16
00:01:36,998 --> 00:01:38,848
17
00:01:38,883 --> 00:01:40,717
18
00:01:41,351 --> 00:01:43,219
19
00:01:43,254 --> 00:01:45,105
20
00:01:45,140 --> 00:01:46,356
21
00:01:46,391 --> 00:01:49,476
22
00:01:49,511 --> 00:01:52,612
23
00:01:52,647 --> 00:01:55,115
24
00:01:55,150 --> 00:01:56,366
25
00:01:56,401 --> 00:01:58,868
26
00:01:58,903 --> 00:02:00,120
27
00:02:00,155 --> 00:02:01,988
28
00:02:02,023 --> 00:02:03,873
29
00:02:03,908 --> 0

Now that I've verified that the regex pattern only identifies the lines I'm interested in, I can use it to remove those lines by returning the ones that don't match the criteria.

In [9]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt'
with open(test_file, 'r') as l:
    for line in l:
        if not re.match('\d+', line):
            print(line, end='')

Downloaded From www.AllSubs.org

Shared by http://DJJ.HOME.SAPO.PT/

You're traveling
through another dimension-

a dimension not only of sight
and sound, but of mind,

a journey into a wondrous land

whose boundaries
are that of imagination.

Your next stop,
the twilight zone.

She's all set,
mr. Radin.

How about the
sound system?

You check that out?

She's all
ready to go.

I don't know where you
got your sound effects

but you'd swear
a bomb was exploding.

I mean a big bomb.

That's precisely the way
it's supposed to sound.

That about do it,
mr. Radin?

That about does it.

You got quite
a setup here.

This part of
the illusion too?

No, this room is
not an illusion.

I venture to guess

that it's the best
designed bomb shelter

on the face of the
earth- who knows?

The hydrogen bomb
is not an illusion.

But tonight it's
for gags, huh?

Something of the sort.

A practical joke,
let's say.

You can say
that again.

When they start
those sound effects

and that stuff
on the screen

Alternately, instead of excluding the lines using `not` as part of the `if` statement, I can also exclude those lines using the regex pattern by changing it to a set that includes the `^` not character. 

In [10]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt'
with open(test_file, 'r') as l:
    for line in l:
        if re.match('[^\d+]', line):
            print(line, end='')

Downloaded From www.AllSubs.org

Shared by http://DJJ.HOME.SAPO.PT/

You're traveling
through another dimension-

a dimension not only of sight
and sound, but of mind,

a journey into a wondrous land

whose boundaries
are that of imagination.

Your next stop,
the twilight zone.

She's all set,
mr. Radin.

How about the
sound system?

You check that out?

She's all
ready to go.

I don't know where you
got your sound effects

but you'd swear
a bomb was exploding.

I mean a big bomb.

That's precisely the way
it's supposed to sound.

That about do it,
mr. Radin?

That about does it.

You got quite
a setup here.

This part of
the illusion too?

No, this room is
not an illusion.

I venture to guess

that it's the best
designed bomb shelter

on the face of the
earth- who knows?

The hydrogen bomb
is not an illusion.

But tonight it's
for gags, huh?

Something of the sort.

A practical joke,
let's say.

You can say
that again.

When they start
those sound effects

and that stuff
on the screen

With those lines removed, it's easy to see why "Downloaded", "www.AllSubs.org", "Shared", and "http://DJJ.HOME.SAPO.PT/" are on the list of character groups to remove. However, blank lines are thrid on the list, so I'm going to get rid of those first.

I admit, I had to search the internet for the [regex pattern to remove blank lines](https://www.codegrepper.com/code-examples/javascript/regex+empty+line). The pattern is the unintuitive `^(?!\s*$).+`.

- `^` indicates the start of the string (when included as part of a set it means `not`, see the digit removal above for an example)
- `()` denotes a group
- `?!` is a negative lookahead 
- `\s` identified white space
- `*` matches 0 or more of the charcter it follows
- `$` indicates the end of the string
- `.` is a wildcard, it matches any character except for newline
- `+` matches 1 or more of the character it follows

Testing it on the sample file, I can confirm it does exactly what's needed.

In [11]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt'
with open(test_file, 'r') as l:
    for line in l:
        if re.match('^(?!\s*$).+', line):
            print(line, end='')

0
00:00:01,000 --> 00:00:04,000
Downloaded From www.AllSubs.org
1
00:00:00,105 --> 00:00:05,000
Shared by http://DJJ.HOME.SAPO.PT/
1
00:00:11,378 --> 00:00:13,880
You're traveling
through another dimension-
2
00:00:13,915 --> 00:00:17,017
a dimension not only of sight
and sound, but of mind,
3
00:00:17,052 --> 00:00:18,885
a journey into a wondrous land
4
00:00:18,920 --> 00:00:20,770
whose boundaries
are that of imagination.
5
00:00:20,805 --> 00:00:23,273
Your next stop,
the twilight zone.
6
00:00:57,057 --> 00:00:58,308
She's all set,
mr. Radin.
7
00:00:58,343 --> 00:01:00,176
How about the
sound system?
8
00:01:00,211 --> 00:01:01,428
You check that out?
9
00:01:01,463 --> 00:01:02,679
She's all
ready to go.
10
00:01:02,714 --> 00:01:05,815
I don't know where you
got your sound effects
11
00:01:05,850 --> 00:01:08,318
but you'd swear
a bomb was exploding.
12
00:01:08,935 --> 00:01:10,186
I mean a big bomb.
13
00:01:10,221 --> 00:01:12,689
That's precisely the way
it's supposed to s

Now that I've got the first three big items knocked off the list, I can start addressing the special characters/character groups to remove.

I'm going to start with the ones jumping out at me: "Downloaded", "www.AllSubs.org", "Shared", and "http://DJJ.HOME.SAPO.PT/".

A quick internet search shows that the internet wasn't really a thing until 1969, and then only in its infancy. The world wide web (www) wasn't invented until 1989 and not released to the public until 1993. All of which means that any lines including "www." or "http:" are metadata that I need to remove (if that isn't true, then the script has bigger problems that incorrect vocabulary).

In [12]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt'
with open(test_file, 'r') as l:
    for line in l:
        if re.match('(.*www.*)|(.*http:*)', line):
            print(line, end='')

Downloaded From www.AllSubs.org
Shared by http://DJJ.HOME.SAPO.PT/
Downloaded From www.AllSubs.org


This single regex line knocks quite a few items off the list:

- 'www.AllSubs.org'
- 'http://DJJ.HOME.SAPO.PT/'
- 'Downloaded'
- 'Shared'
- 'www.addic7ed.com'

Knocking out 5 items at once feels good, so next I'll tackle another big chunk that can be easily identified using regex:

- '\<i>'
- '\</i>'
- '\<font color=#00FF00>'
- '\<font color=#00FFFF>'
- '\<font color="#00ff00">'
- '\<font color="#ff0000">'
- '\</font>'

However, to see any of them, I have to switch my sample file. I switched to one of the Pan Am scripts. Looking at the raw file, the `<i>` pattern is readily available in the first line block.

In [13]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/21st-century/Pan_Am/Pan.Am.S01E08.srt'
with open(test_file, 'r') as l:
    for line in l:
        print(line, end='')

﻿1
00:00:01,461 --> 00:00:02,729
<i>Previously on "Pan Am"...</i>

2
00:00:02,796 --> 00:00:06,064
Let's keep it in New York,
Ginny. Monte Carlo was a lark.

3
00:00:06,348 --> 00:00:08,949
It's likely to be
a long trip.

4
00:00:09,016 --> 00:00:10,083
I'm doing what I must.

5
00:00:10,151 --> 00:00:12,419
Good-bye, Kate.

6
00:00:12,486 --> 00:00:15,555
How old were you
when the Nazis occupied France?

7
00:00:15,622 --> 00:00:17,257
I was 3.

8
00:00:17,925 --> 00:00:19,692
I'm flying.

9
00:00:19,760 --> 00:00:21,461
<i>I</i> am in charge in the air,

10
00:00:21,528 --> 00:00:24,764
and what <i>I</i> say
and what <i>I</i> do is the law.

11
00:00:24,832 --> 00:00:26,466
And a sky God is born!

12
00:00:26,533 --> 00:00:28,601
Hey, call it
whatever you want.

13
00:00:28,669 --> 00:00:31,103
You either accept it
or you need to get another jet.

14
00:00:33,282 --> 00:00:36,182
<i>Attention passengers
of Flight 296 to London, Heathrow.</i>

15
00:00:36,276 --> 00:00:39,411
That hur

In [14]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/21st-century/Pan_Am/Pan.Am.S01E08.srt'
with open(test_file, 'r') as l:
    for line in l:
        if re.match('</?i>|</?font.*>', line):
            print(line, end='')

<i>Previously on "Pan Am"...</i>
<i>I</i> am in charge in the air,
<i>Attention passengers
<i>L'aéroport est fermé.</i>
<i>Nous ne pouvons pas...</i>
<i>Port-au-Prince,</i>
<i>nous avons un passager mourant.</i>
<i>Vous n'avez pas l'autorisation.</i>
<i>- Il n'y a personne</i>
<i>à la tour de contrôle</i>
<i>pour vous guider.</i>
<i>Il doit y avoir quelqu'un!</i>
<i>C'est une urgence.</i>
<i>Visuelle pour la piste dix.</i>
<i>We'll be back</i>
<i>on our way to Caracas.</i>
<i>Cabin crew, please prepare</i>
<i>for landing.</i>
<i>Nous allon d'atterrir.</i>
<i>Ou sont les lumieres?</i>
<i>Docteur?</i>
<i>Les lignes sont coupees.</i>
<i>Il n'y a plus d'electricite</i>
<i>sur l'ile.</i>
<i>Nous avons un homme tres malade</i>
<i>a bord. Ou trouver un docteur?</i>
<i>Petionville est le village</i>
<i>le plus pres avec un docteur,</i>
<i>mais les routes</i>
<i>sont trop dangereuses.</i>
<i>C'est une tuerie.</i>
<i>Americains?</i>
<i>Merci.</i>
<i>Avez-vous</i>
<i>de la nourriture?</i>
<i>N'av

Unfortunately, there are no examples of `<font>`. But if the generic pattern works for `<i>` it should also work for `<font>`.

Unlike the lines before which were entirely metadata, the words in these lines should be included in the analysis. As such, I don't want to remove the lines entirely, just the specific group of characters I specify. To do this, I switch from `re.match` to `re.sub`. 

In [15]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/21st-century/Pan_Am/Pan.Am.S01E08.srt'
with open(test_file, 'r') as l:
    for line in l:
        line = re.sub('</?i>|</?font.*>', '', line)
        print(line, end='')

﻿1
00:00:01,461 --> 00:00:02,729
Previously on "Pan Am"...

2
00:00:02,796 --> 00:00:06,064
Let's keep it in New York,
Ginny. Monte Carlo was a lark.

3
00:00:06,348 --> 00:00:08,949
It's likely to be
a long trip.

4
00:00:09,016 --> 00:00:10,083
I'm doing what I must.

5
00:00:10,151 --> 00:00:12,419
Good-bye, Kate.

6
00:00:12,486 --> 00:00:15,555
How old were you
when the Nazis occupied France?

7
00:00:15,622 --> 00:00:17,257
I was 3.

8
00:00:17,925 --> 00:00:19,692
I'm flying.

9
00:00:19,760 --> 00:00:21,461
I am in charge in the air,

10
00:00:21,528 --> 00:00:24,764
and what I say
and what I do is the law.

11
00:00:24,832 --> 00:00:26,466
And a sky God is born!

12
00:00:26,533 --> 00:00:28,601
Hey, call it
whatever you want.

13
00:00:28,669 --> 00:00:31,103
You either accept it
or you need to get another jet.

14
00:00:33,282 --> 00:00:36,182
Attention passengers
of Flight 296 to London, Heathrow.

15
00:00:36,276 --> 00:00:39,411
That hurricane's stepping
on everything hea

Voilà, another 7 items knocked off the list.

Of the original items from the characters/character group list, I'm left with:

- '#'
- '-'
- '('
- ')'
- '\xe2\x99\xaa'
- 'Sync'
- 'n17t01'
- '"'
- '\n'

Of these, most are special characters (#, -, (, ), \n, "). I'm going to handle these in another part of the analysis where I can remove all special characters at the same time (also including characters like !, ., ?, etc).

The '\xe2\x99\xaa' item is a character encoding problem. This homework lesson must have been created before improved character encoding handling because this series of characters didn't show up in my earilier unigram analysis. I'm going to assume the 'latin-1' encoding required to process all the files (default 'utf-8' handles most, but not all) appropriately handles this.

That leaves me with  'Sync' and 'n17t01'. I wasn't able to find the file(s) that include 'n17t01', but the Pan Am sample has 'Sync'.

In [16]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/21st-century/Pan_Am/Pan.Am.S01E08.srt'
with open(test_file, 'r') as l:
    for line in l:
        if re.match('Sync', line):
            print(line, end='')

Sync and corrected by dr.jackson
Sync and corrected by dr.jackson


Since this appears to be metadata, I'm going to remove the whole line so that the other words aren't included in the analysis either. A quick internet search indicates that 'n17t01' also tends to be included in a line 'Sync and corrections by n17t01'.

To ensure I don't remove all lines with 'sync', inadventently removing lines that should be analyzed, I'll search for a partial phrase. The pattern 'Sync and correct*' should match both "Sync and corrected by" and "Sync and corrections by", which should remove all the 'sync' lines I don't want while leaving any lines that include 'sync' as part of the text.

In [17]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/21st-century/Pan_Am/Pan.Am.S01E08.srt'
with open(test_file, 'r') as l:
    for line in l:
        if re.match('Sync and correct*', line):
            print(line, end='')

Sync and corrected by dr.jackson
Sync and corrected by dr.jackson


This concludes the pattern matching section. Now all I have to do is put it all together:

In [18]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt'
with open(test_file, 'r') as l:
    for line in l:
        if (re.match('[^\d+]', line)
           ) and (re.match('^(?!\s*$).+', line)
                  ) and not (re.match('(.*www.*)|(.*http:*)', line)
                            ) and not (re.match('Sync and correct*', line)):
            line = re.sub('</?i>|</?font.*>', '', line)
            print(line, end='')

You're traveling
through another dimension-
a dimension not only of sight
and sound, but of mind,
a journey into a wondrous land
whose boundaries
are that of imagination.
Your next stop,
the twilight zone.
She's all set,
mr. Radin.
How about the
sound system?
You check that out?
She's all
ready to go.
I don't know where you
got your sound effects
but you'd swear
a bomb was exploding.
I mean a big bomb.
That's precisely the way
it's supposed to sound.
That about do it,
mr. Radin?
That about does it.
You got quite
a setup here.
This part of
the illusion too?
No, this room is
not an illusion.
I venture to guess
that it's the best
designed bomb shelter
on the face of the
earth- who knows?
The hydrogen bomb
is not an illusion.
But tonight it's
for gags, huh?
Something of the sort.
A practical joke,
let's say.
You can say
that again.
When they start
those sound effects
and that stuff
on the screen
you'd swear the world
was getting blasted.
That's the idea.
I have three guests
coming this eve

In [19]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/21st-century/Pan_Am/Pan.Am.S01E08.srt'
with open(test_file, 'r') as l:
    for line in l:
        if not (re.match('[^\d+]', line)
           ) and (re.match('^(?!\s*$).+', line)
                  ) and not (re.match('(.*www.*)|(.*http:*)', line)
                            ) and not (re.match('Sync and correct*', line)):
            line = re.sub('</?i>|</?font.*>', '', line)
            print(line, end='')

00:00:01,461 --> 00:00:02,729
2
00:00:02,796 --> 00:00:06,064
3
00:00:06,348 --> 00:00:08,949
4
00:00:09,016 --> 00:00:10,083
5
00:00:10,151 --> 00:00:12,419
6
00:00:12,486 --> 00:00:15,555
7
00:00:15,622 --> 00:00:17,257
8
00:00:17,925 --> 00:00:19,692
9
00:00:19,760 --> 00:00:21,461
10
00:00:21,528 --> 00:00:24,764
11
00:00:24,832 --> 00:00:26,466
12
00:00:26,533 --> 00:00:28,601
13
00:00:28,669 --> 00:00:31,103
14
00:00:33,282 --> 00:00:36,182
15
00:00:36,276 --> 00:00:39,411
16
00:00:39,480 --> 00:00:40,412
17
00:00:40,480 --> 00:00:41,379
18
00:00:41,447 --> 00:00:43,482
19
00:00:43,549 --> 00:00:45,283
20
00:00:45,351 --> 00:00:47,018
21
00:00:47,086 --> 00:00:48,987
22
00:00:49,054 --> 00:00:50,889
23
00:00:50,957 --> 00:00:52,557
24
00:00:53,759 --> 00:00:56,927
25
00:00:56,995 --> 00:00:57,928
26
00:00:57,996 --> 00:00:58,929
27
00:01:00,164 --> 00:01:02,566
28
00:01:02,634 --> 00:01:04,468
29
00:01:04,536 --> 00:01:06,871
30
00:01:06,938 --> 00:01:08,405
31
00:01:08,473 --> 0

Interestingly, the first numeric only line in the Pam Am file wasn't removed. Since the regex pattern handles all the others correctly, I'm going to leave this artifact for the time being. The `\d+` pattern doesn't include newline characters `\n`, which is the only explaination I can think of for it to remain while the others have been removed.

# Store data

Now that I can read in only the lines I want, I need to store the results in some kind of data structure. The structures I'd consider for this exercise are lists, concatenated strings, a dictionary, or a dataframe.

Having read through the full homework instructions, the bigram processing is supposed to treat all lines within a script as one corpus, but not lines between scripts. So Twilight episode 1 has to be processed separately from Twilight episode 2. This means each script must be processed into n-grams before combining all the results to grab n-gram frequencies.

For the first part (treat all lines within a script as one corpus), a concatenated string is the best option. This allows me to do all processing (like removing special characters, transforming all words to lowercase,  removing stopwords, and transforming to the appropriate n-gram) at the same time.

The second part (don't create n-grams between scripts), means I have to store each script's corpus separately, then combine them once they've been transformed into n-grams.

My prefered method of doing this is to use a dictionary. Although not my favorite when I need to look up or sort by both the key and the value (i.e. I won't be using dictionaries for the frequency/analysis part of this homework), the dictionary is a very versitle data structure that can hold other data structures. For this exercise, I'll store the individual script corpora as values in the dictionary.

One of the major benefits of this method is the ability to name another data structure for reference (in this case the corpus from the Dr Strangelove script would be named "Dr.Strangelove". However, the majority of the script names are a problem (I refuse to type something as long as "The Twilight Zone - 2x03 - Nervous Man in a Four Dollar Room" to access a particular item), but if I automate the handling, I shouldn't have to manually enter the script name. However, if I need to know the name of a particular script, I can still look it up.

The first step of this is to store the lines as a single text corpus.

In [20]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt'
corpus = ''
with open(test_file, 'r', encoding='latin-1') as l:
    for line in l:
        if (re.match('[^\d+]', line)
           ) and (re.match('^(?!\s*$).+', line)
                  ) and not (re.match('(.*www.*)|(.*http:*)', line)
                            ) and not (re.match('Sync and correct*', line)):
            line = re.sub('</?i>|</?font.*>', '', line)
            corpus = corpus + ' ' + line
corpus[:1000]

" You're traveling\n through another dimension-\n a dimension not only of sight\n and sound, but of mind,\n a journey into a wondrous land\n whose boundaries\n are that of imagination.\n Your next stop,\n the twilight zone.\n She's all set,\n mr. Radin.\n How about the\n sound system?\n You check that out?\n She's all\n ready to go.\n I don't know where you\n got your sound effects\n but you'd swear\n a bomb was exploding.\n I mean a big bomb.\n That's precisely the way\n it's supposed to sound.\n That about do it,\n mr. Radin?\n That about does it.\n You got quite\n a setup here.\n This part of\n the illusion too?\n No, this room is\n not an illusion.\n I venture to guess\n that it's the best\n designed bomb shelter\n on the face of the\n earth- who knows?\n The hydrogen bomb\n is not an illusion.\n But tonight it's\n for gags, huh?\n Something of the sort.\n A practical joke,\n let's say.\n You can say\n that again.\n When they start\n those sound effects\n and that stuff\n on the sc

This is the same code from before, but instead of printing the line, we store it in a string. Make note of the extra space I add when adding a new line to the string. If this is left out, the lines run into each other, which will negatively impact the analysis.

For example, as can be seen below, instead of getting 'traveling' and 'through' as separate words, then would be combined once the '\n' character is removed: 'travelingthrough'.

In [21]:
test_file = '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt'
corpus = ''
with open(test_file, 'r', encoding='latin-1') as l:
    for line in l:
        if (re.match('[^\d+]', line)
           ) and (re.match('^(?!\s*$).+', line)
                  ) and not (re.match('(.*www.*)|(.*http:*)', line)
                            ) and not (re.match('Sync and correct*', line)):
            line = re.sub('</?i>|</?font.*>', '', line)
            corpus = corpus + line
corpus[:1000]

"You're traveling\nthrough another dimension-\na dimension not only of sight\nand sound, but of mind,\na journey into a wondrous land\nwhose boundaries\nare that of imagination.\nYour next stop,\nthe twilight zone.\nShe's all set,\nmr. Radin.\nHow about the\nsound system?\nYou check that out?\nShe's all\nready to go.\nI don't know where you\ngot your sound effects\nbut you'd swear\na bomb was exploding.\nI mean a big bomb.\nThat's precisely the way\nit's supposed to sound.\nThat about do it,\nmr. Radin?\nThat about does it.\nYou got quite\na setup here.\nThis part of\nthe illusion too?\nNo, this room is\nnot an illusion.\nI venture to guess\nthat it's the best\ndesigned bomb shelter\non the face of the\nearth- who knows?\nThe hydrogen bomb\nis not an illusion.\nBut tonight it's\nfor gags, huh?\nSomething of the sort.\nA practical joke,\nlet's say.\nYou can say\nthat again.\nWhen they start\nthose sound effects\nand that stuff\non the screen\nyou'd swear the world\nwas getting blasted.\

This processing needs to be done to every single file provided by the instructor. The best way to do this is to turn the code into a function. Now that I have all of the pieces to process the text, I'm ready to create my first function.

*Side note*. This function is different than the one I created in the earlier homework8 notebooks (I hadn't read the full homework assignment and understood the downstream reprecussions discussed above re: unigram vs bigram handling. Also, I was trying to complete the assignment without using the special character and stopword handling from the nltk library).

In [22]:
def read_script(file_path):
    corpus = ''
    with open(file_path, 'r', encoding='latin-1') as l:
        for line in l:
            if (re.match('[^\d+]', line)
               ) and (re.match('^(?!\s*$).+', line)
                      ) and not (re.match('(.*www.*)|(.*http:*)', line)
                                ) and not (re.match('Sync and correct*', line)):
                line = re.sub('</?i>|</?font.*>', '', line)
                corpus = corpus + ' ' + line
    return corpus

# Load all files in a directory

Next I need the ability to load all files within a directory.

My original understanding of the assignment was that I was supposed to compare all scripts in the '1960s' directory with all the scripts in the '21st-century' directory. However, having re-read the later portion of the assignment, my understanding is that the purpose is to compare all scripts in the '1960s' directory individually against the different shows/movie in the '21st-century' directory.

Listing out all files within a directory is pretty straight forward with the `os` library.

- `scandir` scans the directory
- `path.join` joins parts of the file path so that handling is operating system agnostic
    - Mac uses '/'
    - PC uses '\'
- `getcwd` gets the current working directory
    - This allows the file structure to be independent of the upper file structure
    - This independence allows the .py script to be transferable between computers and/or users

In [23]:
# file_path = 
for thing in os.scandir(os.path.join(os.getcwd(), 'homework8', '1960s')):
    print(thing.name)

The Twilight Zone - 3x17 - One More Pallbearer.srt
The Twilight Zone - 3x05 - A Game of Pool.srt
The Twilight Zone - 2x03 - Nervous Man in a Four Dollar Room.srt
The Twilight Zone - 4x05 - Mute.srt
The Twilight Zone - 3x04 - The Passersby.srt
The Twilight Zone - s05e01 - In Praise of Pip.srt
The Twilight Zone - 4x13 - The New Exhibit.srt
The Twilight Zone - 3x21 - Kick the Can.srt
The Twilight Zone - 3x30 - Hocus-Pocus and Frisby.srt
The Twilight Zone - 3x16 - Nothing in the Dark.srt
The Twilight Zone - s05e12 - Ninety Years Without Slumbering.srt
The Twilight Zone - 2x08 - The Lateness of the Hour.srt
The Twilight Zone - 4x11 - The Parallel.srt
The Twilight Zone - s05e35 - The Fear.srt
The Twilight Zone - 2x09 - The Trouble with Templeton.srt
The Twilight Zone - s05e09 - Probe 7, Over and Out.srt
The Twilight Zone - s05e03 - Nightmare at 20,000 Feet.srt
The Twilight Zone - 4x16 - On Thursday We Leave for Home.srt
The Twilight Zone - 3x36 - Cavender Is Coming.srt
The Twilight Zone - 4x

To load the data for the 1960s scripts I can do something like this:

In [24]:
file_path = os.path.join(os.getcwd(), 'homework8', '1960s')
test_dict = {}
for thing in os.scandir(file_path):
    test_dict[thing.name] = read_script(f'{file_path}/{thing.name}')

Now I verify that the data loaded as expected: name of the script as the key, concatenated string as the value (yes, I absolutely copy and pasted the name of the first script to get its corpus).

In [25]:
list(test_dict.keys())[:5]

['The Twilight Zone - 3x17 - One More Pallbearer.srt',
 'The Twilight Zone - 3x05 - A Game of Pool.srt',
 'The Twilight Zone - 2x03 - Nervous Man in a Four Dollar Room.srt',
 'The Twilight Zone - 4x05 - Mute.srt',
 'The Twilight Zone - 3x04 - The Passersby.srt']

In [26]:
test_dict['The Twilight Zone - 3x17 - One More Pallbearer.srt'][:500]

" You're traveling\n through another dimension-\n a dimension not only of sight\n and sound, but of mind,\n a journey into a wondrous land\n whose boundaries\n are that of imagination.\n Your next stop,\n the twilight zone.\n She's all set,\n mr. Radin.\n How about the\n sound system?\n You check that out?\n She's all\n ready to go.\n I don't know where you\n got your sound effects\n but you'd swear\n a bomb was exploding.\n I mean a big bomb.\n That's precisely the way\n it's supposed to sound.\n That about do it,\n mr"

The challenge is in the '21st-century' directory, it contains four directories. The homework instructions say to accept a directory and just load the files within it (so I'd have to run the script four times to analyze all the corpora), but since I'm going off script on an extended detour to improve the analysis, I want to be able to run this thing once and get the analyses for all four test corpora.

What I need is a nested dictionary.

Psudo code:
>{</br>
>Mad_Men:{script_1: corpus_1, script_2: corpus_2, ..., script_n: corpus_n},</br>
>Pan_Am: {script_1: corpus_1, script_2: corpus_2, ..., script_n: corpus_n},</br>
The_Kennedys: {script_1: corpus_1, script_2: corpus_2, ..., script_n: corpus_n},</br>
X-Men_First_Class: {script1: corpus1}</br>
>}

In my previous solution, I used recursion to work through any subdirectories until I grabbed the file paths for all files the specified directory and its subdirectories then returned that as a list.

In [27]:
def list_filepaths(append_list, file_path = os.getcwd()):
    for thing in os.scandir(file_path):
        if thing.is_dir():
            new_path = os.path.join(file_path, thing.name)
            list_filepaths(append_list, new_path)
        elif thing.is_file:
            append_list.append(f'{file_path}/{thing.name}')
    return append_list

I improved upon that concept when I added the `read_script` function to the example code:

```
file_path = os.path.join(os.getcwd(), 'homework8', '1960s')
test_dict = {}
for thing in os.scandir(file_path):
    test_dict[thing.name] = read_script(f'{file_path}/{thing.name}')
```

A few minor edits to the original recursive function gives me what I want. [This StackOverflow answer](https://stackoverflow.com/a/48382262) gave me the key I needed to recursively create the dictionary structure: add the dictionary as a parameter, then create a new level to feed to the recursion:

In [32]:
def load_files_to_dict(file_path, return_dict):    
    for thing in os.scandir(file_path):
        if thing.is_dir():
            new_path = os.path.join(file_path, thing.name)
            new_dict = return_dict[thing.name] = {}
            load_files_to_dict(new_path, new_dict)
        elif thing.is_file:
            return_dict[thing.name] = read_script(f'{file_path}/{thing.name}')
    return return_dict

In [29]:
file_path = os.path.join(os.getcwd(), 'homework8', '1960s')
unilayer_dict = load_files_to_dict(file_path, {})

In [30]:
print(list(unilayer_dict.keys())[:5]) # First 5 scripts in the 1960s directory
print('\n')
print(list(unilayer_dict.keys())[0], 'sample text:') # Name of the first script in the list
print(unilayer_dict[list(unilayer_dict.keys())[0]][:500]) # 500 character sample of the first script

['The Twilight Zone - 3x17 - One More Pallbearer.srt', 'The Twilight Zone - 3x05 - A Game of Pool.srt', 'The Twilight Zone - 2x03 - Nervous Man in a Four Dollar Room.srt', 'The Twilight Zone - 4x05 - Mute.srt', 'The Twilight Zone - 3x04 - The Passersby.srt']


The Twilight Zone - 3x17 - One More Pallbearer.srt sample text:
 You're traveling
 through another dimension-
 a dimension not only of sight
 and sound, but of mind,
 a journey into a wondrous land
 whose boundaries
 are that of imagination.
 Your next stop,
 the twilight zone.
 She's all set,
 mr. Radin.
 How about the
 sound system?
 You check that out?
 She's all
 ready to go.
 I don't know where you
 got your sound effects
 but you'd swear
 a bomb was exploding.
 I mean a big bomb.
 That's precisely the way
 it's supposed to sound.
 That about do it,
 mr


In [33]:
file_path = os.path.join(os.getcwd(), 'homework8', '21st-century')
bilayer_dict = load_files_to_dict(file_path, {})

In [34]:
print(list(bilayer_dict.keys())) # List of the subdirectories in the 21st-century directory
print(list(bilayer_dict.keys())[0]) # Name of the first subdirectory in the list
print(list(bilayer_dict[list(bilayer_dict.keys())[0]].keys())) # List of the scripts of the first subdirectory within the subdirectory list
print('\n')
print(list(bilayer_dict[list(bilayer_dict.keys())[0]].keys())[0], 'sample text')
print(bilayer_dict[list(bilayer_dict.keys())[0]][list(bilayer_dict[list(bilayer_dict.keys())[0]].keys())[0]][:500]) # 500 character sample of the script

['Pan_Am', 'Mad_Men', 'X-Men_First_Class', 'The_Kennedys']
Pan_Am
['Pan.Am.S01E09.srt', 'Pan.Am.S01E08.srt', 'Pan.Am.S01E05.srt', 'Pan.Am.S01E11.srt', 'Pan.Am.S01E10.srt', 'Pan.Am.S01E04.srt', 'Pan.Am.S01E12.srt', 'Pan.Am.S01E06.srt', 'Pan.Am.S01E07.srt', 'Pan.Am.S01E13.srt', 'Pan.Am.S01E03.srt', 'Pan.Am.S01E02.srt', 'Pan.Am.S01E14.srt', 'Pan.Am.S01E01.srt']


Pan.Am.S01E09.srt sample text
 Previously on "Pan Am"...
 Look, I get to see the world,
 Sam.
 When was the last time
 you left the village?
 I don't need to see the world
 to change it.
 - Marry me!
 - I can't say yes now.
 Pan Am stewardess can travell all
 around the world without suspicion.
 You volunteered for this.
 They will let you out.
 Are you going my way?
 Sometimes the stars align.
 You're different
 from other girls.
 Thank you.
 And democracy is not perfect.
 You're casting a shadow,
 Kate.
 I take it you miss


# Clean data and split

Now that the files are read in and stored in a data structure that's easily accessible, I need to clean up the corpora and split then into n-grams.

Fortunately for me, I've worked on an NLP problem previously, so I can re-use some of the code.

First I convert the dictionary to a dataframe so I can process everything at once. The conversion from the dictionary structure to the dataframe structure is a bit long, so I threw it into helper function `convert_dict_df`.

Then I get rid of the punctuation in helper function `punct_tokens`:

- `newline_list`: variables; lists tabs (`\t`) and blank lines (`\r`: carriage return, `\n`: new line) for removal
- `punct_list`: variable; lists the following special characters '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'  for removal, plus a few that I've found useful in the past (previously I've also had to remove emojis)
- `str.translate`: method; removes the items in variable created with `str.maketrans` methods for new lines and punctuation

Next I remove the stopwords with the handy list provided in the nltk library and turn everything into n-grams using the helper function `create_ngrams`.

Lastly, I put everything together into a single function `create_ngram_df`.

In [35]:
def convert_dict_df(script_dict):
    return pd.DataFrame.from_dict(script_dict, orient='index').reset_index().rename(columns={'index':'script_name', 0:'corpus'})

def punct_tokens(df, text_col):
    newline_list = '\t\r\n'
    remove_newline = str.maketrans(' ', ' ', newline_list)
    punct_list = string.punctuation + '-‘_”'
    nopunct = str.maketrans('', '', punct_list)
    df['no_punct_tokens'] = df[text_col].fillna("").str.lower().str.translate(remove_newline).str.translate(nopunct).str.split()
    return df

def create_ngrams(df):
    stop = nltk.corpus.stopwords.words('english')
    df['unigrams'] = df['no_punct_tokens'].apply(lambda x: [item for item in x if item not in stop])
    df['bigrams'] = df['unigrams'].apply(lambda x:(list(nltk.bigrams(x))))
    df['trigrams'] = df['unigrams'].apply(lambda x:(list(nltk.trigrams(x))))
    return df

def create_ngram_df(script_dict, text_col):
    df = convert_dict_df(script_dict)
    df = punct_tokens(df, text_col)
    df = create_ngrams(df)
    return df

In [36]:
authentic_ngram_df = create_ngram_df(unilayer_dict, 'corpus')
authentic_ngram_df

Unnamed: 0,script_name,corpus,no_punct_tokens,unigrams,bigrams,trigrams
0,The Twilight Zone - 3x17 - One More Pallbearer...,You're traveling\n through another dimension-...,"[youre, traveling, through, another, dimension...","[youre, traveling, another, dimension, dimensi...","[(youre, traveling), (traveling, another), (an...","[(youre, traveling, another), (traveling, anot..."
1,The Twilight Zone - 3x05 - A Game of Pool.srt,You're traveling\n through another dimension-...,"[youre, traveling, through, another, dimension...","[youre, traveling, another, dimension, dimensi...","[(youre, traveling), (traveling, another), (an...","[(youre, traveling, another), (traveling, anot..."
2,The Twilight Zone - 2x03 - Nervous Man in a Fo...,You're traveling\n through another dimension-...,"[youre, traveling, through, another, dimension...","[youre, traveling, another, dimension, dimensi...","[(youre, traveling), (traveling, another), (an...","[(youre, traveling, another), (traveling, anot..."
3,The Twilight Zone - 4x05 - Mute.srt,You unlock this door\n with the key of imagin...,"[you, unlock, this, door, with, the, key, of, ...","[unlock, door, key, imagination, beyond, anoth...","[(unlock, door), (door, key), (key, imaginatio...","[(unlock, door, key), (door, key, imagination)..."
4,The Twilight Zone - 3x04 - The Passersby.srt,You're traveling\n through another dimension-...,"[youre, traveling, through, another, dimension...","[youre, traveling, another, dimension, dimensi...","[(youre, traveling), (traveling, another), (an...","[(youre, traveling, another), (traveling, anot..."
...,...,...,...,...,...,...
116,The Twilight Zone - s05e36 - The Bewitchin' Po...,You unlock this door\n with the key of imagin...,"[you, unlock, this, door, with, the, key, of, ...","[unlock, door, key, imagination, beyond, anoth...","[(unlock, door), (door, key), (key, imaginatio...","[(unlock, door, key), (door, key, imagination)..."
117,The Twilight Zone - 3x03 - The Shelter.srt,You're traveling\n through another dimension-...,"[youre, traveling, through, another, dimension...","[youre, traveling, another, dimension, dimensi...","[(youre, traveling), (traveling, another), (an...","[(youre, traveling, another), (traveling, anot..."
118,The Twilight Zone - s05e21 - Spur of the Momen...,You unlock this door\n with the key of imagin...,"[you, unlock, this, door, with, the, key, of, ...","[unlock, door, key, imagination, beyond, anoth...","[(unlock, door), (door, key), (key, imaginatio...","[(unlock, door, key), (door, key, imagination)..."
119,The Twilight Zone - 2x29 - The Obsolete Man.srt,You're traveling\n through another dimension\...,"[youre, traveling, through, another, dimension...","[youre, traveling, another, dimension, dimensi...","[(youre, traveling), (traveling, another), (an...","[(youre, traveling, another), (traveling, anot..."


I appreciate this data structure because if there is anything that doesn't make sense later in the analysis, I can search for it and track it back to the source, i.e. as long as I can find it in the designated n-gram column, I can see what the corpus looked like in the original form (the full concatenated string), after removal of punctuation, after removal of the stopwords, and converted to n-grams as well as being able to track it back to the script it came from because I have the script name.

To handle the multiple corpora of the 21st century scripts, I retained the dictionary-holding-another-data-structure set up. The name of each grouping ('Pan-Am', 'Mad_Med', 'The_Kennedys', 'X-Men_First_Class') is a key and the dataframe is the value. Using this, I can continue to reap the benefits of my functions, while keeping the groups, and their individual analyses, separate.

In [37]:
test_ngram_dict = {}
for script_group in list(bilayer_dict.keys()):
    test_ngram_dict[script_group] = create_ngram_df(bilayer_dict[script_group], 'corpus')

In [38]:
test_ngram_dict['Pan_Am']

Unnamed: 0,script_name,corpus,no_punct_tokens,unigrams,bigrams,trigrams
0,Pan.Am.S01E09.srt,"Previously on ""Pan Am""...\n Look, I get to se...","[previously, on, pan, am, look, i, get, to, se...","[previously, pan, look, get, see, world, sam, ...","[(previously, pan), (pan, look), (look, get), ...","[(previously, pan, look), (pan, look, get), (l..."
1,Pan.Am.S01E08.srt,"ï»¿1\n Previously on ""Pan Am""...\n Let's keep...","[ï»¿1, previously, on, pan, am, lets, keep, it...","[ï»¿1, previously, pan, lets, keep, new, york,...","[(ï»¿1, previously), (previously, pan), (pan, ...","[(ï»¿1, previously, pan), (previously, pan, le..."
2,Pan.Am.S01E05.srt,"Previously on ""Pan Am""...\n What do you think...","[previously, on, pan, am, what, do, you, think...","[previously, pan, think, youre, ran, away, wed...","[(previously, pan), (pan, think), (think, your...","[(previously, pan, think), (pan, think, youre)..."
3,Pan.Am.S01E11.srt,"Previously on ""Pan Am"".\n MI6 will want answe...","[previously, on, pan, am, mi6, will, want, ans...","[previously, pan, mi6, want, answers, take, li...","[(previously, pan), (pan, mi6), (mi6, want), (...","[(previously, pan, mi6), (pan, mi6, want), (mi..."
4,Pan.Am.S01E10.srt,"Previously on ""Pan Am""...\n I bet you've got ...","[previously, on, pan, am, i, bet, youve, got, ...","[previously, pan, bet, youve, got, surprises, ...","[(previously, pan), (pan, bet), (bet, youve), ...","[(previously, pan, bet), (pan, bet, youve), (b..."
5,Pan.Am.S01E04.srt,"Previously on ""Pan Am""...\n - You're gonna me...","[previously, on, pan, am, youre, gonna, meet, ...","[previously, pan, youre, gonna, meet, kennedy,...","[(previously, pan), (pan, youre), (youre, gonn...","[(previously, pan, youre), (pan, youre, gonna)..."
6,Pan.Am.S01E12.srt,"Previously on ""Pan Am"".\n We'd like to move y...","[previously, on, pan, am, wed, like, to, move,...","[previously, pan, wed, like, move, courier, ag...","[(previously, pan), (pan, wed), (wed, like), (...","[(previously, pan, wed), (pan, wed, like), (we..."
7,Pan.Am.S01E06.srt,"Previously on ""Pan Am"".\n Why don't you came ...","[previously, on, pan, am, why, dont, you, came...","[previously, pan, dont, came, fog, captain, af...","[(previously, pan), (pan, dont), (dont, came),...","[(previously, pan, dont), (pan, dont, came), (..."
8,Pan.Am.S01E07.srt,"Previously on ""Pan Am""...\n You smell like wh...","[previously, on, pan, am, you, smell, like, wh...","[previously, pan, smell, like, whiskey, cigare...","[(previously, pan), (pan, smell), (smell, like...","[(previously, pan, smell), (pan, smell, like),..."
9,Pan.Am.S01E13.srt,"Previously on ""Pan Am""...\n Let's keep it in ...","[previously, on, pan, am, lets, keep, it, in, ...","[previously, pan, lets, keep, new, york, ginny...","[(previously, pan), (pan, lets), (lets, keep),...","[(previously, pan, lets), (pan, lets, keep), (..."


# Frequencies

Counting words is a common sample problem and can probably be considered the 'hello world' of NLP. When putting it into a dictionary data structure, the concept isn't difficult:

- For each word (or in our case, n-gram) in the corpus
- Insert the word if it's not there (the dictionary key)
- Add 1 to the count (which is the stored value)

The question is, how to apply this general concept to my specific use case.

The n-grams have already been created, so I don't have to worry about longer n-grams (the bigrams, and I threw in trigrams because why not?) spilling from one scrip to another. Which means I can concatenate all the n-grams of a specific category together (i.e. I don't want to combine unigrams with bigrams, just all the unigrams with each other).

I already know I want to use the n-grams as my index/sort column, which means I'll need to create a separate dataframe for each set of frequencies. This both simplifies and complicates the process, since I won't be able to just add on to the same dataframe anymore.

The `frequency_ct` and `dict_to_df` functions that I created in the previous solution to the homework still work. The only new aspect is that I need to put all the n-gram lists together. My initial thought was to use `list.expand`, but that would require looping through every row of the dataframe, which isn't the most optimized solution.

Fortunately, there is an easy alternative: it's easily accomplished by using the `sum` method on the column as specified on this [StackOverflow answer](https://stackoverflow.com/a/42909969).

In [39]:
def frequency_ct(ngram_list):
    freq_dict = {}
    for ngram in ngram_list:
        if ngram not in freq_dict:
            freq_dict[ngram] = 1
        else:
            freq_dict[ngram] +=1
    return freq_dict

def dict_to_df(freq_dict, gram_name, corpus_name):
    if (type(gram_name)==str) and (type(corpus_name)==str):
        pass
    else:
        print('gram and corpus variables must be strings')
    freq_colname = corpus_name+'_frequency'
    df = pd.DataFrame.from_dict(freq_dict, orient='index'
                               ).reset_index().rename(columns={'index':gram_name, 0:freq_colname}
                                                     ).sort_values(freq_colname, ascending=False)
    return df

def create_frequencies(ngram_list, gram_name, corpus_name):
    freq_dict = frequency_ct(ngram_list)
    freq_df = dict_to_df(freq_dict, gram_name, corpus_name)
    return freq_df

In [40]:
authentic_uni_freq = create_frequencies(authentic_ngram_df['unigrams'].sum(), 'unigram', 'authentic')
authentic_uni_freq

Unnamed: 0,unigram,authentic_frequency
206,well,2272
25,dont,2199
175,im,1988
26,know,1777
19,mr,1604
...,...,...
9877,fringe,1
9873,upfront,1
9872,afterregion,1
9871,facet,1


The `corpus_name` variable is important for later analysis. I'll need to compare the authentic corpus which was written in the 1960s about the 1960s to each of the corpora written in the 21st century about the 1960s. With the flow I've established, I'll need to merge dataframes to complete the analysis. This is most easily accomplished when the non-join-on columns have different names.

Example: If I join two dataframes with column names = `['unigram', 'frequency']` I'll end up with a single dataframe with the column names = `['unigram', 'x-frequency', 'y-frequency']`. I find these `x` and `y` prefixes less than informative and prefer to name my columns during processing.

But why stop my function at just the frequency? I also need normalized frequencies. Normalized frequencies level the playing field of straight counts when comparing corpora. With simple counts, a larger corpus will have n-grams with larger counts simply because there are more words overall than a smaller corpus. It doesn't necessarily reflect any relevant comparison.

Also, the homework problem requires getting ratios of the normalized frequencies later in the analysis.

In [41]:
def normalized_freq(freq_df, corpus_name):
    freq_col_name = corpus_name + '_frequency'
    norm_col_name = corpus_name + '_norm_freq'
    total_ct = freq_df[freq_col_name].sum()
    freq_df[norm_col_name] = freq_df[freq_col_name]/total_ct
    return freq_df

def create_frequencies(ngram_list, gram_name, corpus_name):
    freq_dict = frequency_ct(ngram_list)
    freq_df = dict_to_df(freq_dict, gram_name, corpus_name)
    freq_df = normalized_freq(freq_df, corpus_name)
    return freq_df

In [42]:
authentic_uni_freq = create_frequencies(authentic_ngram_df['unigrams'].sum(), 'unigram', 'authentic')
authentic_uni_freq

Unnamed: 0,unigram,authentic_frequency,authentic_norm_freq
206,well,2272,0.012132
25,dont,2199,0.011742
175,im,1988,0.010616
26,know,1777,0.009489
19,mr,1604,0.008565
...,...,...,...
9877,fringe,1,0.000005
9873,upfront,1,0.000005
9872,afterregion,1,0.000005
9871,facet,1,0.000005


In [43]:
test_uni_freq_dict = {}
for script_group in list(test_ngram_dict.keys()):
    test_uni_freq_dict[script_group] = create_frequencies(test_ngram_dict[script_group]['unigrams'].sum(), 'unigram', script_group)

In [44]:
test_uni_freq_dict['Mad_Men']

Unnamed: 0,unigram,Mad_Men_frequency,Mad_Men_norm_freq
136,dont,2606,0.017869
39,im,2574,0.017650
216,know,2442,0.016745
5,want,1638,0.011232
29,well,1574,0.010793
...,...,...,...
7605,jima,1,0.000007
7604,iwo,1,0.000007
7603,chandelier,1,0.000007
7601,suffocate,1,0.000007


# Compare corpora



The last piece of this homework challenge is to compare the authentic corpus (wrtten regarding the 1960s and penned in the 1960s) vs the four test corpora (written regarding the 1960s but not penned until the 21st century).

To compare anything to anything, first I need to combine the different dataframes holding my test corpora with the authentic corpus. I decided to do this by merging the values for the authentic data into each of the dataframes holding the values for the test data.

In [64]:
compare_dict = {}
for script_group in list(test_uni_freq_dict.keys()):
    compare_dict[script_group] = test_uni_freq_dict[script_group].merge(authentic_uni_freq, on='unigram', how='outer').fillna(0)

In [65]:
compare_dict['Pan_Am']

Unnamed: 0,unigram,Pan_Am_frequency,Pan_Am_norm_freq,authentic_frequency,authentic_norm_freq
0,im,489.0,0.015189,1988.0,0.010616
1,oh,407.0,0.012642,1580.0,0.008437
2,dont,379.0,0.011772,2199.0,0.011742
3,well,373.0,0.011586,2272.0,0.012132
4,know,323.0,0.010033,1777.0,0.009489
...,...,...,...,...,...
18005,fringe,0.0,0.000000,1.0,0.000005
18006,upfront,0.0,0.000000,1.0,0.000005
18007,afterregion,0.0,0.000000,1.0,0.000005
18008,facet,0.0,0.000000,1.0,0.000005


The equation I implemented in the previous solution to this homework was:

```
df['norm_freq_ratio'] = df.loc[(df['imitation_norm_freq'] != 0
                               ) & (df['authentic_norm_freq'] != 0), 'imitation_norm_freq'
                              ]/df.loc[(df['imitation_norm_freq'] != 0
                                       ) & (df['authentic_norm_freq'] != 0), 'authentic_norm_freq']
```

In order to implement this in the various dataframes, I'll need a way to identify the appropriate columns, regardless of which dataframe I'm working with. This can be done by looking for 'norm_freq' in the column names - which will pull out the normalized frequency for both the authentic and test data.

In [52]:
[compare_dict['Pan_Am'].columns[compare_dict['Pan_Am'].columns.str.contains('norm_freq')]]

[Index(['Pan_Am_norm_freq', 'authentic_norm_freq'], dtype='object')]

Referencing the dataframe by the dictionary and script group name is getting rather tedious, so I can just set the dictionary/script name as the dataframe I'm working with. This has a much cleaner appearance and, more importantly, is easier to read. Regardless of how good (or not) code is, it's much more common to have to read code in order to improve, maintain, update, or repair it than write it. My philosophy is to make code as easy to read as possible, so that my future self can decipher what I was thinking when I wrote it the first time around.

In [66]:
test = compare_dict['Pan_Am']
test_cols = test.columns[test.columns.str.contains('norm_freq')]
test_cols

Index(['Pan_Am_norm_freq', 'authentic_norm_freq'], dtype='object')

Now I can update my code to the more readable version. Since I use the test dataframe as the left and the authentic dataframe as the right in the join, I can count on the fact that the test:authentic columns will always be in the same order.

As an added bonus, I only have to write to the dictionary once instead of the initial write, then the update with the new columns.

In [78]:
compare_dict = {}
for script_group in list(test_uni_freq_dict.keys()):
    df = test_uni_freq_dict[script_group].merge(authentic_uni_freq, on='unigram', how='outer').fillna(0)
    freq_cols = df.columns[df.columns.str.contains('norm_freq')]
    df['norm_freq_ratio'] = df.loc[(df[freq_cols[0]]!=0) & (df[freq_cols[1]]!=0), freq_cols[0]] / df.loc[(df[freq_cols[0]]!=0) & (df[freq_cols[1]]!=0), freq_cols[1]]
    compare_dict[script_group] = df

In [79]:
compare_dict['Pan_Am']

Unnamed: 0,unigram,Pan_Am_frequency,Pan_Am_norm_freq,authentic_frequency,authentic_norm_freq,norm_freq_ratio
0,im,489.0,0.015189,1988.0,0.010616,1.430801
1,oh,407.0,0.012642,1580.0,0.008437,1.498387
2,dont,379.0,0.011772,2199.0,0.011742,1.002538
3,well,373.0,0.011586,2272.0,0.012132,0.954965
4,know,323.0,0.010033,1777.0,0.009489,1.057309
...,...,...,...,...,...,...
18005,fringe,0.0,0.000000,1.0,0.000005,
18006,upfront,0.0,0.000000,1.0,0.000005,
18007,afterregion,0.0,0.000000,1.0,0.000005,
18008,facet,0.0,0.000000,1.0,0.000005,


## High Ratios

High ratios for the normalized frequency show unigrams that were used commonly in the 21st-century scripts, but were extremely rare (but present) in 1960s scripts.

In [82]:
for script_group in compare_dict.keys():
    print(script_group)
    display(compare_dict[script_group].sort_values('norm_freq_ratio', ascending=False).head(50))
    print('\n')

Pan_Am


Unnamed: 0,unigram,Pan_Am_frequency,Pan_Am_norm_freq,authentic_frequency,authentic_norm_freq,norm_freq_ratio
51,dean,87.0,0.002702,1.0,5e-06,506.064637
18,pan,160.0,0.00497,3.0,1.6e-05,310.231195
162,amanda,32.0,0.000994,1.0,5e-06,186.138717
89,stewardess,54.0,0.001677,2.0,1.1e-05,157.054543
197,teddy,27.0,0.000839,1.0,5e-06,157.054543
281,stewardesses,19.0,0.00059,1.0,5e-06,110.519863
364,ryan,15.0,0.000466,1.0,5e-06,87.252524
456,cia,13.0,0.000404,1.0,5e-06,75.618854
452,ich,13.0,0.000404,1.0,5e-06,75.618854
491,monte,12.0,0.000373,1.0,5e-06,69.802019




Mad_Men


Unnamed: 0,unigram,Mad_Men_frequency,Mad_Men_norm_freq,authentic_frequency,authentic_norm_freq,norm_freq_ratio
138,sterling,170.0,0.001166,2.0,1.1e-05,109.15141
172,sally,143.0,0.000981,2.0,1.1e-05,91.815598
54,draper,365.0,0.002503,6.0,3.2e-05,78.118166
238,jesus,108.0,0.000741,2.0,1.1e-05,69.343249
553,francis,42.0,0.000288,1.0,5e-06,53.933638
317,clients,74.0,0.000507,2.0,1.1e-05,47.512967
187,joan,134.0,0.000919,4.0,2.1e-05,43.018497
195,betty,128.0,0.000878,4.0,2.1e-05,41.092295
435,jimmy,55.0,0.000377,2.0,1.1e-05,35.313691
457,ken,52.0,0.000357,2.0,1.1e-05,33.38749




X-Men_First_Class


Unnamed: 0,unigram,X-Men_First_Class_frequency,X-Men_First_Class_norm_freq,authentic_frequency,authentic_norm_freq,norm_freq_ratio
79,cia,10.0,0.002281,1.0,5e-06,427.076397
171,commands,5.0,0.00114,1.0,5e-06,213.538198
103,cuba,9.0,0.002052,2.0,1.1e-05,192.184379
210,sebastian,4.0,0.000912,1.0,5e-06,170.830559
211,shaws,4.0,0.000912,1.0,5e-06,170.830559
364,x,3.0,0.000684,1.0,5e-06,128.122919
326,presentation,3.0,0.000684,1.0,5e-06,128.122919
284,moscow,3.0,0.000684,1.0,5e-06,128.122919
264,threat,3.0,0.000684,1.0,5e-06,128.122919
370,homo,3.0,0.000684,1.0,5e-06,128.122919




The_Kennedys


Unnamed: 0,unigram,The_Kennedys_frequency,The_Kennedys_norm_freq,authentic_frequency,authentic_norm_freq,norm_freq_ratio
16,bobby,112.0,0.006192,3.0,1.6e-05,386.54975
86,khrushchev,30.0,0.001659,1.0,5e-06,310.620335
103,sighs,25.0,0.001382,1.0,5e-06,258.850279
165,rosemary,18.0,0.000995,1.0,5e-06,186.372201
12,kennedy,128.0,0.007077,9.0,4.8e-05,147.257048
101,cuba,25.0,0.001382,2.0,1.1e-05,129.42514
37,ii,60.0,0.003317,5.0,2.7e-05,124.248134
298,election,11.0,0.000608,1.0,5e-06,113.894123
163,ethel,18.0,0.000995,2.0,1.1e-05,93.186101
399,elected,8.0,0.000442,1.0,5e-06,82.832089






In [123]:
high_score_results = pd.DataFrame(columns = ['script', 'score'])
for script_group in compare_dict.keys():
    high_score_results = high_score_results.append(
        {'script':script_group,
         'score':compare_dict[script_group].sort_values('norm_freq_ratio', ascending=False).head(50)['norm_freq_ratio'].sum()
        }, ignore_index=True)
display(high_score_results.sort_values('score'))
print('Best performing corpus (lowest score) {}'.format(high_score_results.iloc[high_score_results['score'].idxmin(), 0]))
print('Worst performing corpus (highest score) {}'.format(high_score_results.iloc[high_score_results['score'].idxmax(), 0]))

Unnamed: 0,script,score
1,Mad_Men,1456.975643
0,Pan_Am,3336.81119
3,The_Kennedys,3980.829683
2,X-Men_First_Class,4282.152672


Best performing corpus (lowest score) Mad_Men
Worst performing corpus (highest score) X-Men_First_Class


## Low Ratios

Low ratios for the normalized frequency show unigrams that were used commonly in 1960, but were rare in the 21st-century scripts.

In [83]:
for script_group in compare_dict.keys():
    print(script_group)
    display(compare_dict[script_group].sort_values('norm_freq_ratio').head(50))
    print('\n')

Pan_Am


Unnamed: 0,unigram,Pan_Am_frequency,Pan_Am_norm_freq,authentic_frequency,authentic_norm_freq,norm_freq_ratio
5337,honey,1.0,3.1e-05,152.0,0.000812,0.038269
3797,imagination,1.0,3.1e-05,138.0,0.000737,0.042151
3761,ship,1.0,3.1e-05,137.0,0.000732,0.042459
3260,human,1.0,3.1e-05,101.0,0.000539,0.057592
3247,major,1.0,3.1e-05,76.0,0.000406,0.076537
4218,machine,1.0,3.1e-05,75.0,0.0004,0.077558
3352,radio,1.0,3.1e-05,74.0,0.000395,0.078606
3118,jerry,1.0,3.1e-05,69.0,0.000368,0.084302
4146,shadow,1.0,3.1e-05,68.0,0.000363,0.085542
3089,martin,1.0,3.1e-05,66.0,0.000352,0.088134




Mad_Men


Unnamed: 0,unigram,Mad_Men_frequency,Mad_Men_norm_freq,authentic_frequency,authentic_norm_freq,norm_freq_ratio
3985,twilight,3.0,2.1e-05,499.0,0.002665,0.00772
3764,zone,4.0,2.7e-05,506.0,0.002702,0.010151
12149,doc,1.0,7e-06,57.0,0.000304,0.022529
3302,captain,4.0,2.7e-05,208.0,0.001111,0.024695
10348,commander,1.0,7e-06,52.0,0.000278,0.024695
9954,emma,1.0,7e-06,48.0,0.000256,0.026753
10925,ace,1.0,7e-06,47.0,0.000251,0.027322
10578,schmidt,1.0,7e-06,46.0,0.000246,0.027916
7970,base,1.0,7e-06,45.0,0.00024,0.028536
4642,sight,3.0,2.1e-05,131.0,0.0007,0.029408




X-Men_First_Class


Unnamed: 0,unigram,X-Men_First_Class_frequency,X-Men_First_Class_norm_freq,authentic_frequency,authentic_norm_freq,norm_freq_ratio
212,mr,4.0,0.000912,1604.0,0.008565,0.106503
1516,boy,1.0,0.000228,311.0,0.001661,0.137324
1532,away,1.0,0.000228,305.0,0.001629,0.140025
874,hear,1.0,0.000228,297.0,0.001586,0.143797
782,long,1.0,0.000228,293.0,0.001565,0.14576
405,old,2.0,0.000456,526.0,0.002809,0.162386
1132,minute,1.0,0.000228,217.0,0.001159,0.196809
986,captain,1.0,0.000228,208.0,0.001111,0.205325
1374,room,1.0,0.000228,201.0,0.001073,0.212476
479,night,2.0,0.000456,394.0,0.002104,0.21679




The_Kennedys


Unnamed: 0,unigram,The_Kennedys_frequency,The_Kennedys_norm_freq,authentic_frequency,authentic_norm_freq,norm_freq_ratio
1774,zone,2.0,0.000111,506.0,0.002702,0.040925
1874,earth,1.0,5.5e-05,194.0,0.001036,0.053371
2646,game,1.0,5.5e-05,123.0,0.000657,0.084179
3250,guess,1.0,5.5e-05,111.0,0.000593,0.093279
3314,kill,1.0,5.5e-05,106.0,0.000566,0.097679
1328,sound,2.0,0.000111,205.0,0.001095,0.101015
2802,hot,1.0,5.5e-05,101.0,0.000539,0.102515
2239,key,1.0,5.5e-05,97.0,0.000518,0.106742
2959,space,1.0,5.5e-05,90.0,0.000481,0.115045
2352,ought,1.0,5.5e-05,88.0,0.00047,0.117659






In [122]:
low_score_results = pd.DataFrame(columns = ['script', 'score'])
for script_group in compare_dict.keys():
    low_score_results = low_score_results.append(
        {'script':script_group,
         'score':compare_dict[script_group].sort_values('norm_freq_ratio').head(50)['norm_freq_ratio'].sum()
        }, ignore_index=True)
display(low_score_results.sort_values('score', ascending=False))
print('Best performing corpus (highest score) {}'.format(low_score_results.iloc[low_score_results['score'].idxmax(), 0]))
print('Worst performing corpus (lowest score) {}'.format(low_score_results.iloc[low_score_results['score'].idxmin(), 0]))

Unnamed: 0,script,score
2,X-Men_First_Class,13.255571
3,The_Kennedys,7.791826
0,Pan_Am,6.533376
1,Mad_Men,2.309133


Best performing corpus (highest score) X-Men_First_Class
Worst performing corpus (lowest score) Mad_Men


# Ranking

The scores returned both as top and bottom normalized frequency ratios are bad things:

- The 50 highest ratios are words that were used frequently in the 21st century scripts, but were rare in the 1960s
- the 50 lowest ratios are words that were used frequently in the 1960s, but showed up rarely in the 21st century scripts

In the high ratios set, the higher the ratio, the further the script is from the authentic corpus. In the low ratios set, the higher the ratio, the closer the script is to the authentic corpus. So to get my ranking, I'm going to subtract the low ratio from the high ratio. The script corpora will then be sorted from lowest (best) to highest (worst) score.

In [127]:
results = pd.DataFrame(columns = ['script', 'high_ratio', 'low_ratio'])
for script_group in compare_dict.keys():
    results = results.append(
        {'script':script_group,
         'high_ratio':compare_dict[script_group].sort_values('norm_freq_ratio', ascending=False).head(50)['norm_freq_ratio'].sum(),
         'low_ratio':compare_dict[script_group].sort_values('norm_freq_ratio').head(50)['norm_freq_ratio'].sum()
        }, ignore_index=True)
    results['combined_score'] = results['high_ratio'] - results['low_ratio']
    results = results.sort_values('combined_score')
    results['rank'] = range(1, 1+len(results))
display(results)

Unnamed: 0,script,high_ratio,low_ratio,combined_score,rank
0,Mad_Men,1456.975643,2.309133,1454.66651,1
1,Pan_Am,3336.81119,6.533376,3330.277814,2
3,The_Kennedys,3980.829683,7.791826,3973.037857,3
2,X-Men_First_Class,4282.152672,13.255571,4268.897101,4


# Caveats

There are several problems with this exercise and the solution.

## Corpus data processing

The biggest initial problem for me was the fact that punctuation wasn't removed, the n-grams were case sensitive, and stopwords weren't removed. The first two mean that words aren't counted appropriately, especially when they're prone to different capitalizations and uses with punctuation. For example, in the initial solution I noticed 'daddy' written several ways. Here are several ways 'daddy' could be included in a script

- Daddy.
- daddy.
- Daddy
- daddy
- Daddy!
- daddy!

This is six iterations for a single word which should all be counded together.

The last point, stopwords weren't removed, means that there's a lot of meaningless noise; Words like 'the', 'a', 'an', 'of', 'for', etc remain in the analysis.

## Pronouns

Related to proper counting and stopwords are proper nouns. In a script or novel, the names of the characters of the story will show up a disporportionate amount of the time. With a large enough corpus this becomes moot because names common to the era will naturally show up more than modern names. However, these corpora aren't large enough for this averaging of character names. The same is true for place names. The location the script is set has a higher likelihood of being mentioned.

## Ratio impact

As can be seen in the final results dataframe, the high ratios have a much larger impact on my ranking than the lower numbers. This means that including words that were rare in the 1960s has a much bigger impact on the ranking than excluding words that were common.

### Repetition

The authentic 1960s corpus includes many, many The Twilight Zone episodes. Most, if not all, of The Twilight Zone episodes start with the same introduction. This means that words like 'traveling', 'another', 'dimension', 'sight', 'sound', 'mind', and 'journey' are disproportionately represented. An improvement to the analysis would be to account for and remove this repetition so that it's only represented once in the frequencies.