# Install Packages & Data

## We use Python 3.13.1.

If you run into dependency conflicts, install Python 3.13.1, create a fresh virtual environment, and select it as the Python kernel.

In [None]:
%pip install numpy pandas scikit-learn seaborn matplotlib nltk 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Extract the prepared corpus archive "raw.zip" into the "raw" directory
# (both paths are relative to the current working directory)
import shutil
shutil.unpack_archive(filename = "raw.zip", extract_dir = "raw")

In [132]:
import numpy as np
import pandas as pd
import os
import collections
import re

# Burrows Delta Model

### The web scraper, data preprocessing, and model training take several hours to run to completion. We provide this streamlined notebook so the grader can train & run a small version of our model.

In [3]:
class BurrowsDelta():
    def __init__(self, N, normalize = None):
        # number words in feature vocab 
        self.N = N
        # simple token preprocessing function to investigate effectiveness of fast to compute heursistics 
        if normalize is not None:
            self.normalize = normalize

    # stream tokens of a book from a src file using a generator
    def stream_book(self, src):
        with open(src, "r") as f:
            for l in f:
                for t in l.split():
                    yield t
    
    # if the book's text is already loaded in memory we can just return the tokens with a generator
    def load_book(self, src):
        for t in src.split():
            yield t
    
    # helper function to decide whether to stream or load
    def read_book(self, src):
        if os.path.isfile(src):
            return self.stream_book(src)
        return self.load_book(src)

    # fit the burrows delta to the record's dataframe  
    def fit(self, records):
        # create copy of records to prevent propogation of mutations
        X = records \
            .copy() \
            .reset_index(drop = True)

        # count the frequency of words at the collection and per document leevel.
        C = collections.Counter()
        DF = {}
        for r in X.itertuples(index = False):
            c = collections.Counter()
            for t in self.read_book(r.src):
                t = self.normalize(t)
                c[t] += 1
                C[t] += 1
            DF[r.id] = c
        
        # select feature vocab as N most frequent words in collection
        self.F = [ w for w, _ in C.most_common(self.N) ]
        
        # compute the document feature vectors
        DV = []
        for r in X.itertuples(index = False):
            c = DF[r.id]
            dv = np.array([ c[w] for w in self.F ]) / c.total()
            DV.append(dv)
        X["dv"] = DV
        # compute the mean's and standard deviations of the document features, stacked as a matrix
        FM = np.stack(DV)
        self.m = FM.mean(axis = 0)
        self.s = FM.std(axis = 0, ddof = 1)
        # compute the z-transform of the document vectors.
        X["z"] = X.dv.apply(self.Z)

        # compute the author centroids stacked as a matirx
        self.A = X.groupby("author").z.mean()
        self.AM = np.stack(self.A)

    # Z-transform for measuring shapes of distributions
    def Z(self, x):
        return (x - self.m) / self.s
    
    # compute feature vector for new text
    def fv(self, text):
        c = collections.Counter(text.split())
        return np.array([ c[w] for w in self.F ]) / c.total()

    # default burrow's metric
    def burrows(self, fv):
        z = self.Z(fv)
        return np.abs(self.AM - z).mean(axis = 1)

    # make a prediction by taking the author with the minimum delta metric to a text 
    def predict(self, text, delta = None):
        if delta is None:
            delta = self.burrows
        fv = self.fv(text)
        D = delta(fv)
        idx = np.argmin(D)
        return self.A.index[idx]

In [None]:
# The project root is the parent of the current working directory;
# the records table lives in its "data" subdirectory.
PROJECT_DIR = os.path.split(os.path.abspath(""))[0]
DATA_DIR = os.path.join(PROJECT_DIR, "data")

# Map a record (anything indexable by "author" and "id") to the relative
# path of its text file: raw/<author>_<id>.txt
def record_book_path(r):
    filename = "{}_{}.txt".format(r["author"], r["id"])
    return os.path.join("raw", filename)

# Read the records table (malformed CSV lines are reported as warnings), then
# add a "src" column holding each book's text-file path so it can be streamed
records_csv = os.path.join(DATA_DIR, "records.csv")
records = pd.read_csv(records_csv, on_bad_lines = "warn")
records["src"] = records.apply(record_book_path, axis = 1)

## Our final report explains why no preprocessing is used here. In short, model performance is negatively affected by preprocessing because it removes properties of linguistic style, e.g. casing.

## The streamlined notebook uses a small feature set to show the robustness of the overall model formulation.

In [None]:
# hyper parameters
NUM_FEATURES = 500 # try 30,000
TRAIN_SPLIT_PROPORTION = .9
RANDOM_SEED = 0 # fixes the shuffle so reruns reproduce the same split; use None for a fresh random split

# test train split
# NOTE: despite the names, X is the training subset of the records and
# y is the held-out test subset (every record not sampled into X)
X = records.sample(frac = TRAIN_SPLIT_PROPORTION, random_state = RANDOM_SEED)
y = records.drop(X.index)

# fit model on raw unprocessed book texts
model = BurrowsDelta(N = NUM_FEATURES)
model.fit(X)

In [None]:
# Fraction of the test records whose author the model predicts correctly.
# Each record's text is read in full from its "src" file and classified with
# model.predict(text, delta); delta = None leaves the metric choice to the model.
def evaluate_accuracy(model, test, delta = None):
    hits = []
    for row in test.itertuples(index = False):
        with open(row.src, "r") as book:
            text = book.read()
        hits.append(row.author == model.predict(text, delta))
    return sum(hits) / len(test)

# procedure to evaluate a model accuracy under different metric functions
def testing_procedure(model, test):
    def cosine_delta(fv):
        z = model.Z(fv)
        return 1 - model.AM.dot(z) / (np.linalg.norm(model.AM, axis = 1) * np.linalg.norm(z))

    deltas = [
        model.burrows, # default  
        cosine_delta 
    ]

    print(f"{"random chance: ":<30} accuracy = {1 / len(model.A):.4f}")

    for delta in deltas:
        name = delta.__name__
        accuracy = evaluate_accuracy(model, test, delta)
        print(f"{name:<30} accuracy =  {accuracy:.4f}")

In [None]:
# evaluate the fitted model's accuracy on the held-out records `y`
# (prints one line for random chance and one per delta metric)
testing_procedure(model, y)

In [None]:
# Sweep the feature-count hyperparameter (N = 10, 60, ..., 510) for the
# standard burrows metric: refit on X and report accuracy on y for each N.
# NOTE: this rebinds the global `model`; after the loop it is the N = 510 model.
feature_counts = range(10, 510 + 1, 50)
for n in feature_counts:
    model = BurrowsDelta(N = n)
    model.fit(X)
    accuracy = evaluate_accuracy(model, y)
    print("N = {}, accuracy = {:.4f}".format(n, accuracy))