In [1]:
import pandas as pd
import os
import re

In [50]:
def list_filepaths(append_list, file_path = os.getcwd()):
    for thing in os.scandir(file_path):
        if thing.is_dir():
            new_path = os.path.join(file_path, thing.name)
            list_filepaths(append_list, new_path)
        elif thing.is_file:
            append_list.append(f'{file_path}/{thing.name}')
    return append_list

def read_script(file_path):
    word_list = []
    with open(file_path, 'r', encoding='latin-1') as l:
        for line in l:
            if (re.match('^(?!\s*$).+', line) # blank lines
               ) and (re.match('[^\d+]', line) # all digits, i.e. no letters
                     ) and not (re.match('(Downloaded From www.*)', line)
                               ) and not (re.match('(Shared by http:*)', line)):
                line = re.sub('\n|</?i>|</?font.*>|#|-|(|)|"', '', line)
                word_list.append(line)
    return word_list

def combine_then_split(line_dict):
    word_list = []
    for key in line_dict.keys():
        word_list.extend(line_dict[key])
    combo_string = ' '.join(word_list)
    unigram_list = combo_string.split()
    return unigram_list

def frequency_ct(unigram_list):
    freq_dict = {}
    for word in unigram_list:
        if word not in freq_dict:
            freq_dict[word] = 1
        else:
            freq_dict[word] +=1
    return freq_dict

In [66]:
def dict_to_df(freq_dict, gram, corpus):
    if (type(gram)==str) and (type(corpus)==str):
        pass
    else:
        print('gram and corpus variables must be strings')
    freq_colname = corpus+'_frequency'
    df = pd.DataFrame.from_dict(freq_dict, orient='index'
                               ).reset_index().rename(columns={'index':gram, 0:freq_colname}
                                                     ).sort_values(freq_colname, ascending=False)
    return df

In [4]:
authentic_60s = []
list_filepaths(authentic_60s, os.path.join(os.getcwd(), 'homework8', '1960s'))
authentic_60s[:2]

['/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x17 - One More Pallbearer.srt',
 '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/1960s/The Twilight Zone - 3x05 - A Game of Pool.srt']

In [5]:
authentic_dict = {}
for i, file_path in enumerate(authentic_60s):
    authentic_dict[i] = read_script(file_path)

In [21]:
authentic_unigrams = combine_then_split(authentic_dict)

In [51]:
authentic_frequency = frequency_ct(authentic_unigrams)

In [67]:
authentic_df = dict_to_df(authentic_frequency, 'unigram', 'authentic')
authentic_df.head()

Unnamed: 0,unigram,authentic_frequency
26,the,10819
42,to,8486
48,you,8478
5,a,8455
8,of,6166


In [6]:
imitation_60s = []
list_filepaths(imitation_60s, os.path.join(os.getcwd(), 'homework8', '21st-century'))
imitation_60s[:2]

['/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/21st-century/Pan_Am/Pan.Am.S01E09.srt',
 '/Users/julie.fisher/Documents/datascience_diaries/continuing_education/classes/1_cse140/homework8/21st-century/Pan_Am/Pan.Am.S01E08.srt']

In [7]:
imitation_dict = {}
for i, file_path in enumerate(imitation_60s):
    imitation_dict[i] = read_script(file_path)

In [23]:
imitation_unigrams = combine_then_split(imitation_dict)

In [55]:
imitation_frequency = frequency_ct(imitation_unigrams)

In [68]:
imitation_df = dict_to_df(imitation_frequency, 'unigram', 'imitation')
imitation_df.head()

Unnamed: 0,unigram,imitation_frequency
5,I,14127
9,the,10520
7,to,10503
16,you,10129
66,a,8161


In [71]:
df = authentic_df.merge(imitation_df, on='unigram', how='outer').fillna(0)
df.head()

Unnamed: 0,unigram,authentic_frequency,imitation_frequency
0,the,10819.0,10520.0
1,to,8486.0,10503.0
2,you,8478.0,10129.0
3,a,8455.0,8161.0
4,of,6166.0,4499.0


Your program should print:

- the 50 most frequent words (1-grams) in the test corpus that do not appear in the authentic corpus, and
- the 50 most frequent words (1-grams) in the authentic corpus that do not appear in the test corpus.
- Print the words in order from most to least frequent, or in lexicographic order if they are equally common.

1-grams: Test vs. Authentic
- (('Trudy',), 30)
- (('Heinz',), 22)
- (("Hilton's",), 22)
- ... Insert more data here ...
- (('Jaguar',), 18)

1-grams: Authentic vs. Test
- (('creator',), 58)
- (('unlock',), 58)
- (('Rod',), 29)
- ... Insert more data here ...
- (('santa',), 23)

- www.addic7ed.com
- (
- Sync
- n17t01

In [77]:
# the 50 most frequent words (1-grams) in the test corpus that do not appear in the authentic corpus
df[df['authentic_frequency']==0].sort_values('unigram').sort_values('imitation_frequency', ascending=False).head(50)

Unnamed: 0,unigram,authentic_frequency,imitation_frequency
33449,âª,0.0,529.0
33450,Don.,0.0,191.0
33451,Pan,0.0,164.0
33452,II,0.0,117.0
33453,Sterling,0.0,110.0
33454,Draper,0.0,106.0
33455,(,0.0,104.0
33456,Kennedy,0.0,84.0
33457,www.addic7ed.com,0.0,74.0
33458,Sync,0.0,73.0


In [79]:
# the 50 most frequent words (1-grams) in the authentic corpus that do not appear in the test corpus
df[df['imitation_frequency']==0].sort_values('unigram', key=lambda col: col.str.lower()).sort_values('authentic_frequency', ascending=False).head(50)

Unnamed: 0,unigram,authentic_frequency,imitation_frequency
151,dimension,340.0,0.0
188,i've,259.0,0.0
338,i...,139.0,0.0
405,"sound,",111.0,0.0
433,"zone,",98.0,0.0
483,Serling.,85.0,0.0
576,wondrous,70.0,0.0
638,Amen,62.0,0.0
701,Rod,55.0,0.0
700,"serling,",55.0,0.0


Augment your program so that it computes, for each word in each corpus, that word's normalized frequency. The normalized frequency of a word is equal to the number of times the word occurs in a corpus, divided by the total number of words in the corpus (including duplicate words). For example, in a corpus of 10,000 words, if a word appears 42 times, its normalized frequency would be .0042.

Now, for each word that appears in both corpora, compute the ratio of its test-corpus normalized frequency to its authentic-corpus normalized frequency. Use this ratio to rank 1-grams, much like you did for frequency earlier.

For example, suppose the word "Dog" appears 20 times in the test corpus of 1000 words, and 250 times in the authentic corpus of 10000. The test-corpus normalized frequency of "Dog" is 20/1000, or 0.02. The authentic-corpus normalized frequency of "Dog" is 250/10000, or 0.025. The ratio of test-corpus normalized frequency to authentic-corpus normalized frequency is 0.02/0.025, which equals 0.8.

In [84]:
auth_count = df['authentic_frequency'].sum()
imitation_count = df['imitation_frequency'].sum()
df['authentic_norm_freq'] = df['authentic_frequency']/auth_count
df['imitation_norm_freq'] = df['imitation_frequency']/imitation_count
df

Unnamed: 0,unigram,authentic_frequency,imitation_frequency,authentic_norm_freq,imitation_norm_freq,norm_freq_ratio
0,the,10819.0,10520.0,0.029926,0.026366,0.881038
1,to,8486.0,10503.0,0.023473,0.026323,1.121440
2,you,8478.0,10129.0,0.023450,0.025386,1.082527
3,a,8455.0,8161.0,0.023387,0.020453,0.874572
4,of,6166.0,4499.0,0.017055,0.011276,0.661117
...,...,...,...,...,...,...
52666,12%.,0.0,1.0,0.000000,0.000003,inf
52667,203,0.0,1.0,0.000000,0.000003,inf
52668,chattel.,0.0,1.0,0.000000,0.000003,inf
52669,"useless,",0.0,1.0,0.000000,0.000003,inf


In [85]:
# df['norm_freq_ratio'] = df['imitation_norm_freq']/df['authentic_norm_freq']

df['norm_freq_ratio'] = df.loc[(df['imitation_norm_freq'] != 0
                               ) & (df['authentic_norm_freq'] != 0), 'imitation_norm_freq'
                              ]/df.loc[(df['imitation_norm_freq'] != 0
                                       ) & (df['authentic_norm_freq'] != 0), 'authentic_norm_freq']

In [87]:
df.sort_values('norm_freq_ratio', ascending=False).head(50)

Unnamed: 0,unigram,authentic_frequency,imitation_frequency,authentic_norm_freq,imitation_norm_freq,norm_freq_ratio
30060,God.,1.0,93.0,3e-06,0.000233,84.265291
25016,Roger,1.0,93.0,3e-06,0.000233,84.265291
8381,Don,3.0,186.0,8e-06,0.000466,56.176861
20432,Don?,1.0,56.0,3e-06,0.00014,50.74039
23503,Sally,1.0,52.0,3e-06,0.00013,47.116077
30000,Kennedy.,1.0,46.0,3e-06,0.000115,41.679606
31716,Lucky,1.0,46.0,3e-06,0.000115,41.679606
24144,Betty.,1.0,46.0,3e-06,0.000115,41.679606
9067,American,3.0,133.0,8e-06,0.000333,40.169476
24288,Joe.,1.0,41.0,3e-06,0.000103,37.149214


In [88]:
df.sort_values('norm_freq_ratio').head(50)

Unnamed: 0,unigram,authentic_frequency,imitation_frequency,authentic_norm_freq,imitation_norm_freq,norm_freq_ratio
7,i,4450.0,4.0,0.012309,1e-05,0.000814
145,i'll,355.0,1.0,0.000982,3e-06,0.002552
147,zone.,354.0,1.0,0.000979,3e-06,0.00256
113,twilight,481.0,2.0,0.00133,5e-06,0.003767
58,i'm,819.0,7.0,0.002265,1.8e-05,0.007744
294,i'd,162.0,2.0,0.000448,5e-06,0.011186
304,?,157.0,2.0,0.000434,5e-06,0.011542
544,journey,73.0,1.0,0.000202,3e-06,0.012412
593,sight,68.0,1.0,0.000188,3e-06,0.013325
658,boundaries,60.0,1.0,0.000166,3e-06,0.015101


## Bigrams

Bigrams do not span between files. For example, if file1.srt ends with the line “The End.”, and file2.srt begins with the line “Hello World!”, the bigram ('End.', 'Hello') should not be included in your analysis.
Bigrams span between subtitle text lines in the same file. For example, the subtitle text lines 'This is not an exercise.' and 'Not an exercise?' appear one after the other in the file Dr.Strangelove.srt. The bigram spanning these lines, ('exercise.', 'Not'), should be included in your analysis.
Your program should discard every bigram that contains a word that appears in only one corpus.

### Caveat

I won't be coding this out. Instead, I'm going to do more sophisticated data processing, use the uni/bi/trigram processing available through a library like nltk, and look at the more interesting results (which will have punctuation removed - "Daddy!" and "Daddy." will be counted as the same word, will be case insensitive - "Daddy" and "daddy" will be the same, and will have stop words removed - "the", "a", "for", etc)

If I were to do the bigrams here is how I'd code it:

1. Make a list of the words that don't appear in one corpus or the other from the unigram analysis
2. Process the data with the list_filepaths and read_script functions
3. Alter the combine_then_split function to:
    - combine the words from a single script
    - remove any words from the list in step 1
    - split word list into unigrams
    - Use a for loop and enumerate() to go through the list by index and join the index-word and the word after it. Example: "The brown dog wore red" -> "The brown", "brown dog", "dog wore", "wore red"
    - put all pairs into a list so that the output of combine_then_split is a bigram_list
4. Process the bigram_list with frequency_ct
5. All further steps would be the same as those used for the unigram analysis