In [1]:
import pandas as pd
import os
from IPython.display import display

import string
import re
import itertools
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/julie.fisher/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Grab and store the data
def read_script(file_path):
    corpus = ''
    with open(file_path, 'r', encoding='latin-1') as l:
        for line in l:
            if (re.match('[^\d+]', line)
               ) and (re.match('^(?!\s*$).+', line)
                      ) and not (re.match('(.*www.*)|(.*http:*)', line)
                                ) and not (re.match('Sync and correct*', line)):
                line = re.sub('</?i>|</?font.*>', '', line)
                corpus = corpus + ' ' + line
    return corpus

def load_files_to_dict(file_path, return_dict):    
    for thing in os.scandir(file_path):
        if thing.is_dir():
            new_path = os.path.join(file_path, thing.name)
            new_dict = return_dict[thing.name] = {}
            load_files_to_dict(new_path, new_dict)
        elif thing.is_file:
            return_dict[thing.name] = read_script(f'{file_path}/{thing.name}')
    return return_dict

In [12]:
def convert_dict_df(script_dict):
    return pd.DataFrame.from_dict(script_dict, orient='index').reset_index().rename(columns={'index':'script_name', 0:'corpus'})

# Clean the text and create ngrams
def punct_tokens(df, text_col):
    newline_list = '\t\r\n'
    remove_newline = str.maketrans(' ', ' ', newline_list)
    punct_list = string.punctuation + '-‘_”'
    nopunct = str.maketrans('', '', punct_list)
    df['no_punct_tokens'] = df[text_col].fillna("").str.lower().str.translate(remove_newline).str.translate(nopunct).str.split()
    return df

def create_ngrams(df):
    stop = nltk.corpus.stopwords.words('english')
    df['unigrams'] = df['no_punct_tokens'].apply(lambda x: [item for item in x if item not in stop])
    df['bigrams'] = df['unigrams'].apply(lambda x:(list(nltk.bigrams(x))))
    df['trigrams'] = df['unigrams'].apply(lambda x:(list(nltk.trigrams(x))))
    return df

def create_ngram_df(script_dict, text_col):
    df = convert_dict_df(script_dict)
    df = punct_tokens(df, text_col)
    df = create_ngrams(df)
    return df

In [4]:
# Get frequency counts and normalized frequency
def frequency_ct(ngram_list):
    freq_dict = {}
    for ngram in ngram_list:
        if ngram not in freq_dict:
            freq_dict[ngram] = 1
        else:
            freq_dict[ngram] +=1
    return freq_dict

def dict_to_df(freq_dict, gram_name, corpus_name):
    if (type(gram_name)==str) and (type(corpus_name)==str):
        pass
    else:
        print('gram and corpus variables must be strings')
    freq_colname = corpus_name+'_frequency'
    df = pd.DataFrame.from_dict(freq_dict, orient='index'
                               ).reset_index().rename(columns={'index':gram_name, 0:freq_colname}
                                                     ).sort_values(freq_colname, ascending=False)
    return df

def normalized_freq(freq_df, corpus_name):
    freq_col_name = corpus_name + '_frequency'
    norm_col_name = corpus_name + '_norm_freq'
    total_ct = freq_df[freq_col_name].sum()
    freq_df[norm_col_name] = freq_df[freq_col_name]/total_ct
    return freq_df

def create_frequencies(ngram_list, gram_name, corpus_name):
    freq_dict = frequency_ct(ngram_list)
    freq_df = dict_to_df(freq_dict, gram_name, corpus_name)
    freq_df = normalized_freq(freq_df, corpus_name)
    return freq_df

In [42]:
# Compare test corpus to authentic corpus and rank corpora

def combine_test_authentic(test_freq_dict, authentic_freq, ngram):
    compare_dict = {}
    for script_group in list(test_freq_dict.keys()):
        df = test_freq_dict[script_group].merge(authentic_freq, on=ngram, how='outer').fillna(0)
        freq_cols = df.columns[df.columns.str.contains('norm_freq')]
        df['norm_freq_ratio'] = df.loc[(df[freq_cols[0]]!=0) & (df[freq_cols[1]]!=0), freq_cols[0]] / df.loc[(df[freq_cols[0]]!=0) & (df[freq_cols[1]]!=0), freq_cols[1]]
        compare_dict[script_group] = df
    return compare_dict

def get_ranking(compare_dict):
    results = pd.DataFrame(columns = ['script', 'high_ratio', 'low_ratio'])
    for script_group in list(compare_dict.keys()):
        results = results.append(
            {'script':script_group,
             'high_ratio':compare_dict[script_group].sort_values('norm_freq_ratio', ascending=False).head(50)['norm_freq_ratio'].sum(),
             'low_ratio':compare_dict[script_group].sort_values('norm_freq_ratio').head(50)['norm_freq_ratio'].sum()
            }, ignore_index=True)
        results['combined_score'] = results['high_ratio'] - results['low_ratio']
        results = results.sort_values('combined_score')
        results['rank'] = range(1, 1+len(results))
    return results
    
def read_and_rank(test_freq_dict, authentic_freq, ngram):
    compare_dict = combine_test_authentic(test_freq_dict, authentic_freq, ngram)
    results = get_ranking(compare_dict)
    return results

In [43]:
def create_ngram_dfs(authentic_file_path, test_file_path, text_col='corpus'):
    raw_auth_dict = load_files_to_dict(authentic_file_path, {})
    raw_test_dict = load_files_to_dict(test_file_path, {})

    auth_ngram_df = create_ngram_df(raw_auth_dict, text_col)
    test_ngram_dict = {}
    for script_group in list(raw_test_dict.keys()):
        test_ngram_dict[script_group] = create_ngram_df(raw_test_dict[script_group], text_col)

    return auth_ngram_df, test_ngram_dict

def rank_by_ngram(auth_ngram_df, test_ngram_dict, ngram):
    ngram_col = f'{ngram}s'
    auth_freq = create_frequencies(auth_ngram_df[ngram_col].sum(), ngram, 'authentic')
    test_freq_dict = {}
    for script_group in list(test_ngram_dict.keys()):
        test_freq_dict[script_group] = create_frequencies(test_ngram_dict[script_group][ngram_col].sum(), ngram, script_group)
    results = read_and_rank(test_freq_dict, auth_freq, ngram)
    return results

In [None]:
auth_file_path = os.path.join(os.getcwd(), 'homework8', '1960s')
test_file_path = os.path.join(os.getcwd(), 'homework8', '21st-century')

auth_ngram_df, test_ngram_dict = create_ngram_dfs(auth_file_path, test_file_path, 'corpus')

uni_results = rank_by_ngram(auth_ngram_df, test_ngram_dict, 'unigram')
bi_results = rank_by_ngram(auth_ngram_df, test_ngram_dict, 'bigram')
tri_results = rank_by_ngram(auth_ngram_df, test_ngram_dict, 'trigram')

In [45]:
uni_results

Unnamed: 0,script,high_ratio,low_ratio,combined_score,rank
0,Mad_Men,1456.975643,2.309133,1454.66651,1
1,Pan_Am,3336.81119,6.533376,3330.277814,2
3,The_Kennedys,3980.829683,7.791826,3973.037857,3
2,X-Men_First_Class,4282.152672,13.255571,4268.897101,4


In [46]:
bi_results

Unnamed: 0,script,high_ratio,low_ratio,combined_score,rank
0,Mad_Men,766.474429,4.291584,762.182845,1
1,Pan_Am,1203.830335,9.976971,1193.853364,2
3,The_Kennedys,2110.924941,16.62274,2094.302201,3
2,X-Men_First_Class,2938.479927,56.719535,2881.760392,4


In [47]:
tri_results

Unnamed: 0,script,high_ratio,low_ratio,combined_score,rank
0,Mad_Men,387.439022,12.212672,375.22635,1
1,Pan_Am,592.097186,49.928053,542.169133,2
3,The_Kennedys,931.480826,105.833998,825.646827,3
2,X-Men_First_Class,2218.939539,1318.225023,900.714517,4
