In [None]:
# Import packages, metadata

import pickle
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Load the per-text metadata table. Later cells read its PUBLISHER, FILE_PATH
# and AUTHOR columns, so the CSV has to provide all three.
meta_df = pd.read_csv('metadata-rh-np.csv')

In [None]:
# Add Column for Classes

# Binary target used by every later cell: False for rows whose PUBLISHER is
# exactly "RANDOM HOUSE", True for every other row.
is_random_house = meta_df['PUBLISHER'] == "RANDOM HOUSE"
meta_df['CLASS'] = ~is_random_house

In [None]:
# Import Texts

# One full-text string per metadata row, appended in the row order of meta_df,
# so position i of text_list belongs to the i-th row of meta_df.
text_list = []

# NOTE(review): open() is called without an encoding argument, so the platform
# default encoding is used — confirm it matches the encoding of the corpus files.
for fpath in meta_df['FILE_PATH']:
    with open(fpath, 'r') as file_in:
        text = file_in.read()
        text_list.append(text)

In [None]:
# Process Texts into DTM

# Count every token in every text with the vectorizer's default settings.
cv = CountVectorizer()
dtm = cv.fit_transform(text_list)

# get_feature_names() was removed from scikit-learn in version 1.2; use its
# replacement when the installed version has it and fall back otherwise, so the
# cell runs on both old and new installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Dense documents-by-tokens table. It gets a default 0..n-1 row index; later
# cells select its rows with meta_df index labels, which assumes meta_df also
# carries a default integer index.
dtm_df = pd.DataFrame(dtm.toarray(), columns = feature_names)

In [None]:
# Additional Processing Functions

def top_n_features(dtm_, top_n = 5000):
    """Return the labels of the columns with the largest totals.

    Parameters
    ----------
    dtm_ : DataFrame whose columns are summed (one total per column).
    top_n : maximum number of column labels to return.

    Returns
    -------
    list of column labels, ordered from the largest column total downwards.
    """
    totals_ascending = dtm_.sum().sort_values()
    # Walk the ascending ranking backwards so the largest totals come first.
    ranked_tokens = totals_ascending.index[::-1]
    return ranked_tokens[:top_n].tolist()

def normalize(train_dtm_, test_dtm_):
    """Turn raw counts into z-scored relative frequencies.

    Each row is first divided by its own total, giving relative frequencies.
    Both tables are then centred and scaled column by column using the mean
    and standard deviation of the *training* table only, so no statistic of
    the test table enters the transformation.

    Degenerate inputs are handled instead of producing NaN/inf:
    * a row whose total is zero stays all-zero rather than becoming 0/0;
    * a column whose training standard deviation is zero or undefined
      (constant column, or a single training row) is scaled by 1, so it is
      only centred.

    Parameters
    ----------
    train_dtm_ : DataFrame of counts used to estimate mean and std.
    test_dtm_ : DataFrame of counts with the same columns as train_dtm_.

    Returns
    -------
    (train, test) : the two transformed DataFrames.
    """
    def _relative_freq(counts):
        # Per-row totals; a zero total is replaced by 1 so the row stays at 0.
        row_totals = counts.sum(axis=1)
        row_totals = row_totals.where(row_totals > 0, 1)
        return counts.div(row_totals, axis = 'rows')

    train_dtm_ = _relative_freq(train_dtm_)
    test_dtm_ = _relative_freq(test_dtm_)

    train_mean, train_std = train_dtm_.mean(), train_dtm_.std()
    # `std > 0` is False for both 0 and NaN, so either case falls back to 1.
    train_std = train_std.where(train_std > 0, 1.0)

    train_dtm_ = ( train_dtm_ - train_mean ) / train_std
    test_dtm_ = ( test_dtm_ - train_mean ) / train_std

    return train_dtm_, test_dtm_

In [None]:
# Bootstrap Model

def author_max(meta_, thresh = 2):
    """Randomly cap the number of rows kept per author within each class.

    For the CLASS == True rows and then for the CLASS == False rows, authors
    are visited in order of first appearance and at most `thresh` of each
    author's rows are drawn without replacement via np.random.choice.

    Parameters
    ----------
    meta_ : DataFrame with a boolean 'CLASS' column and an 'AUTHOR' column.
    thresh : maximum number of rows kept per author and class.

    Returns
    -------
    DataFrame holding the selected rows of meta_, CLASS == True rows first.
    """
    in_np_class = meta_['CLASS']

    def _capped_labels(class_mask):
        # Index labels of up to `thresh` randomly chosen rows per author.
        kept = []
        for author in meta_[class_mask]['AUTHOR'].unique():
            author_rows = meta_[ class_mask & (meta_['AUTHOR'] == author) ].index.tolist()
            n_keep = min( thresh, len(author_rows) )
            kept.extend( np.random.choice( author_rows, n_keep, replace = False ) )
        return kept

    # Draw for the True class before the False class so the random stream is
    # consumed in the same order on every call.
    np_sub_index = _capped_labels(in_np_class)
    rh_sub_index = _capped_labels(~in_np_class)

    return meta_.loc[np_sub_index + rh_sub_index]

def bootstrap_set(meta_):
    """Draw one bootstrap training sample and a class-balanced held-out set.

    Steps:
    1. author_max() caps the number of rows per author in each class.
    2. From each class, as many rows as the CLASS == True class contains are
       drawn with replacement; together they form the training sample.
    3. Rows whose author appears nowhere in the training sample form the
       held-out pools, one per class.
    4. The larger held-out pool is randomly down-sampled (without
       replacement) to the size of the smaller one, so the held-out set is
       balanced. The original code always down-sampled the CLASS == False
       pool and raised ValueError when that pool was the smaller one; the
       result is unchanged whenever that pool is at least as large.

    Parameters
    ----------
    meta_ : DataFrame with a boolean 'CLASS' column and an 'AUTHOR' column.

    Returns
    -------
    (sample_index, oos_index) : two lists of meta_ index labels. The first
    may contain repeats (bootstrap draws); in both lists the CLASS == True
    labels come first.
    """
    meta_ = author_max(meta_)
    class_size = meta_[meta_['CLASS']].shape[0]

    sample_np_index = meta_[meta_['CLASS']].sample(class_size, replace = True).index.tolist()
    sample_np_auths = meta_.loc[sample_np_index]['AUTHOR'].unique()

    sample_rh_index = meta_[~meta_['CLASS']].sample(class_size, replace = True).index.tolist()
    sample_rh_auths = meta_.loc[sample_rh_index]['AUTHOR'].unique()

    # An author seen in training, in either class, is excluded from the held-out pools.
    sample_all_auths = list(sample_np_auths) + list(sample_rh_auths)
    unseen_author = ~meta_['AUTHOR'].isin(sample_all_auths)

    oos_np_index = meta_[( meta_['CLASS']) & unseen_author].index.tolist()
    oos_rh_index = meta_[(~meta_['CLASS']) & unseen_author].index.tolist()

    # Balance the held-out classes by shrinking whichever pool is larger.
    # Sampling without replacement cannot return more items than the pool holds.
    if len(oos_np_index) <= len(oos_rh_index):
        oos_rh_index = list(np.random.choice(oos_rh_index, size = len(oos_np_index), replace = False))
    else:
        oos_np_index = list(np.random.choice(oos_np_index, size = len(oos_rh_index), replace = False))

    return sample_np_index + sample_rh_index, oos_np_index + oos_rh_index

In [None]:
# From CV, set optimal parameters

# opt_vocab is the vocabulary size handed to top_n_features(top_n=...) and
# opt_reg is the value handed to LogisticRegression(C=...) in the bootstrap loop.
opt_vocab, opt_reg = 5000, 0.001

In [None]:
boot_iters = 2000

# Every out-of-sample probability a text receives, keyed by its meta_df index label.
prediction_by_index = {index_:[] for index_ in meta_df.index.tolist()}
# One out-of-sample F1 score per bootstrap iteration.
f1s = []
# coefs[token] has one slot per iteration, pre-filled with 0, so iterations in
# which the token was not in the vocabulary count as a zero coefficient.
# coef_count[token] is the number of iterations in which the token was in the vocabulary.
coefs, coef_count = {}, {}

# NOTE(review): no random seed is set in the visible cells, so every run draws
# different bootstrap samples — confirm whether that is intended.
for k in range(boot_iters):
    sample_index, oos_index = bootstrap_set(meta_df)
    sample_targets = meta_df.loc[sample_index]['CLASS'].astype(int)
    oos_targets = meta_df.loc[oos_index]['CLASS'].astype(int)
    # Rows of dtm_df are looked up with meta_df index labels.
    sample_dtm, oos_dtm = dtm_df.loc[sample_index], dtm_df.loc[oos_index]

    # Vocabulary and scaling statistics come from the training sample only.
    sample_vocab = top_n_features(sample_dtm, top_n = opt_vocab)
    sample_dtm, oos_dtm = sample_dtm[sample_vocab], oos_dtm[sample_vocab]
    sample_dtm, oos_dtm = normalize(sample_dtm, oos_dtm)

    lr = LogisticRegression(C = opt_reg)
    lr.fit(sample_dtm, sample_targets)
    # Column 1 of predict_proba is the probability of class 1 (CLASS == True).
    oos_predictions = lr.predict_proba(oos_dtm)[:,1]

    f1s.append( f1_score(oos_targets,oos_predictions > 0.5))

    for m,index_ in enumerate(oos_index):
        prediction_by_index[index_].append( oos_predictions[m] )

    for n,token in enumerate(sample_vocab):
        # Explicit first-sighting check instead of a bare `except:`, which would
        # also catch KeyboardInterrupt and unrelated errors and then wipe the
        # coefficient history already collected for this token.
        if token not in coefs:
            coef_count[token] = 0
            coefs[token] = [0] * boot_iters
        coef_count[token] += 1
        coefs[token][k] = lr.coef_[0][n]

In [None]:
# Data Analysis

def get_percentile(list_of_floats, pct):
    """Return the element at fractional position `pct` of the sorted values.

    The values are sorted ascending and the element at position
    int(pct * len(values)) is returned; no interpolation is performed.
    The position is clamped to the valid range, so pct == 1.0 returns the
    maximum (it used to raise IndexError) and a negative pct returns the
    minimum (it used to wrap around to the end of the list).

    Parameters
    ----------
    list_of_floats : sized iterable of mutually comparable numbers.
    pct : fraction, expected to lie in [0, 1].

    Returns
    -------
    The selected element, or np.nan when the input is empty.
    """
    size = len(list_of_floats)
    if size == 0:
        return np.nan

    ordered = sorted(list_of_floats)
    position = min( max( int(pct*size), 0 ), size - 1 )
    return ordered[position]

In [None]:
# Accuracy Report

# Displayed as a tuple: the mean out-of-sample F1 over all bootstrap iterations,
# then the scores found at the 2.5% and 97.5% positions of the sorted F1 list.
np.mean(f1s), get_percentile(f1s, 0.025), get_percentile(f1s, 0.975)

In [None]:
# Export Predictions

# Summarise each text's collected out-of-sample probabilities into three new
# meta_df columns: the mean and the values at the 2.5% / 97.5% positions.
# A text that was never held out has an empty list and gets NaN in each column.
row_labels = meta_df.index.tolist()
prediction_summaries = (
    ('Predicted Probability', np.mean),
    ('Lower (95%-Interval)', lambda values: get_percentile(values, 0.025)),
    ('Upper (95%-Interval)', lambda values: get_percentile(values, 0.975)),
)

for column_name, summarise in prediction_summaries:
    meta_df[column_name] = [summarise(prediction_by_index[label]) for label in row_labels]

# Write the metadata together with the new columns; the row index is not exported.
meta_df.to_csv('baseline-predictions.csv', index = False)

In [None]:
# Keep at most opt_vocab tokens, preferring those that entered the vocabulary
# in the most bootstrap iterations (ties keep their first-seen order).
token_list = sorted(coef_count, key = coef_count.get, reverse = True)[:opt_vocab]

# Per token: the mean coefficient over all iteration slots and the values at
# the 2.5% / 97.5% positions of its sorted coefficients. Slots of iterations in
# which the token was absent hold 0 and are included in all three statistics.
coef_df = pd.DataFrame({
    'Token': token_list,
    'Regression Coefficient': [ np.mean(coefs[key_]) for key_ in token_list ],
    'Lower (95%-Interval)': [ get_percentile(coefs[key_], 0.025) for key_ in token_list ],
    'Upper (95%-Interval)': [ get_percentile(coefs[key_], 0.975) for key_ in token_list ],
})

# Largest mean coefficient first, then export without the row index.
coef_df = coef_df.sort_values('Regression Coefficient', ascending = False)
coef_df.to_csv('baseline-model.csv', index = False)