In [12]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import re

# Input/output locations and feature-count setting for the pipeline.
INPUT_FILE = "collection_stage_2.csv"
TOP_N_WORDS = 500  # number of most frequent words used as features
OUTPUT_FILE = "cleaned_data.csv"

# Rewrite the raw file so that every data row has exactly three
# comma-separated fields. Only the first two commas of a line are treated as
# delimiters; any further commas belong to the free-text third field and are
# removed so that pandas can parse the result.
# NOTE(review): the unpacking below assumes the columns are
# author, book, sequence (later code reads df['author'] and df['sequence']).
with open(INPUT_FILE, "r", encoding="utf-8") as infile, \
        open(OUTPUT_FILE, "w", encoding="utf-8") as outfile:
    # The header line is copied through unchanged.
    header = infile.readline()
    outfile.write(header)

    # Data rows start on physical line 2 (line 1 is the header).
    for line_num, line in enumerate(infile, start=2):
        stripped = line.strip()
        parts = stripped.split(",", 2)  # split only on the first 2 commas
        if len(parts) == 3:
            author, book, sequence = parts
            sequence_cleaned = sequence.replace(",", "")  # drop commas inside the text
            outfile.write(f"{author},{book},{sequence_cleaned}\n")
        else:
            # Rows with fewer than three fields (including blank lines) are
            # reported and left out of the cleaned file.
            print(f"skip line {line_num}: {stripped}")

# Load the file that was just written. Using OUTPUT_FILE (rather than a
# repeated string literal) keeps the reader in sync with the writer when the
# constant is changed.
df = pd.read_csv(OUTPUT_FILE)


# Keep only the rows whose 'sequence' value is present (not NaN/None).
has_sequence = df['sequence'].notna()
df = df.loc[has_sequence]

# Comma stripping, so the text cannot be confused with CSV delimiters.
def clean_text(text):
    """Return *text* with every comma removed; all other characters are kept."""
    return "".join(text.split(","))

# Strip commas from every sequence, element by element.
df['sequence'] = df['sequence'].map(clean_text)

def tokenize(text):
    """Split *text* (a str) on whitespace and return the lower-cased tokens as a list."""
    return list(map(str.lower, text.split()))

# Build the vocabulary: the TOP_N_WORDS most frequent lower-cased tokens
# across all sequences currently in df, plus a word -> column-index lookup.
# NOTE(review): at this point df has not yet been restricted to the top
# authors or split into train/test, so every row contributes to the counts.
all_tokens = [token for text in df['sequence'] for token in tokenize(text)]

word_freq = Counter(all_tokens)
top_entries = word_freq.most_common(TOP_N_WORDS)
vocab = [entry[0] for entry in top_entries]
vocab_idx = dict(zip(vocab, range(len(vocab))))



# Frequency vector
def extract_freq_vector(text, vocab_idx):
    """Return the relative-frequency vector of *text* over a vocabulary.

    Args:
        text: String to vectorise; it is tokenised with ``tokenize``.
        vocab_idx: Mapping from word to its position in the output vector.

    Returns:
        A 1-D float array of length ``len(vocab_idx)``. Each in-vocabulary
        token's count is stored at its mapped position and the vector is then
        divided by its own sum, so the entries add up to 1. Tokens that are
        not in ``vocab_idx`` are ignored; if no token is in the vocabulary
        the vector stays all zeros.
    """
    vec = np.zeros(len(vocab_idx))
    for token, occurrences in Counter(tokenize(text)).items():
        if token not in vocab_idx:
            continue
        vec[vocab_idx[token]] = occurrences

    # Normalise by the in-vocabulary total; skip when nothing matched to
    # avoid dividing by zero.
    total = vec.sum()
    if total > 0:
        vec /= total
    return vec
    
TOP_N_AUTHORS = 20

# Number of rows contributed by each author.
author_counts = df['author'].value_counts()

# The TOP_N_AUTHORS authors with the most rows.
top_authors = author_counts.nlargest(TOP_N_AUTHORS).index.tolist()

print(f"Selected top {TOP_N_AUTHORS} authors: {top_authors}")

# Restrict df to rows written by one of the selected authors.
by_top_author = df['author'].isin(top_authors)
df = df.loc[by_top_author]

# 70/30 train/test split, stratified on author, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['author'],
    random_state=42,
)

# Centroids
# train_df is a row subset of df produced by train_test_split. Work on an
# explicit copy so that adding the 'vector' column below can never write
# into, or warn about writing into, data shared with df.
train_df = train_df.copy()
train_df['vector'] = train_df['sequence'].apply(
    lambda t: extract_freq_vector(t, vocab_idx)
)

# One centroid per author: the element-wise mean of the frequency vectors of
# that author's training rows. The vectors are handed to np.vstack as a plain
# list, giving an (n_rows, n_features) matrix before averaging over rows.
author_centroids = {
    author: np.vstack(group['vector'].tolist()).mean(axis=0)
    for author, group in train_df.groupby('author')
}

# Classification
def classify(text):
    """Return the author whose centroid is most similar to *text*.

    The text is comma-stripped with ``clean_text``, turned into a frequency
    vector with ``extract_freq_vector`` over the module-level ``vocab_idx``,
    and compared by cosine similarity against every centroid in the
    module-level ``author_centroids``.

    Args:
        text: The raw string to attribute.

    Returns:
        The key of ``author_centroids`` with the highest similarity. On ties
        the author that comes first in ``author_centroids`` wins.

    Raises:
        ValueError: If ``author_centroids`` is empty.
    """
    vec = extract_freq_vector(clean_text(text), vocab_idx)

    # Stack the centroids in dict order so that row i belongs to authors[i];
    # a single similarity call then scores the text against all authors at
    # once instead of one library call per author.
    authors = list(author_centroids)
    centroid_matrix = np.vstack([author_centroids[a] for a in authors])
    sims = cosine_similarity(vec.reshape(1, -1), centroid_matrix)[0]

    # argmax returns the first maximal position, i.e. the earliest author.
    return authors[int(np.argmax(sims))]

# Baseline accuracy: word-frequency centroids + cosine similarity only.
# No Burrows' Delta and no text distortion.
correct = 0
total = 0
for true_author, sequence in zip(test_df['author'], test_df['sequence']):
    correct += (classify(sequence) == true_author)
    total += 1

# Share of test rows whose predicted author equals the true author.
accuracy = correct / total
print(f"\nbaseline accuracy: {accuracy:.4f}")
#feel free to edit

Selected top 20 authors: [9248, 898, 65, 59, 761, 37, 1961, 838, 54, 6202, 708, 2858, 30, 23, 34, 120, 79, 220, 7862, 125]
Test Doc 1468: True=54 | Pred=54
Test Doc 853: True=220 | Pred=898
Test Doc 229: True=2858 | Pred=23
Test Doc 925: True=59 | Pred=23
Test Doc 1437: True=9248 | Pred=34
Test Doc 659: True=125 | Pred=125
Test Doc 43: True=37 | Pred=23
Test Doc 899: True=708 | Pred=898
Test Doc 419: True=30 | Pred=125
Test Doc 1150: True=898 | Pred=898
Test Doc 1107: True=761 | Pred=898
Test Doc 418: True=30 | Pred=838
Test Doc 420: True=30 | Pred=120
Test Doc 846: True=220 | Pred=220
Test Doc 1435: True=9248 | Pred=898
Test Doc 1465: True=34 | Pred=34
Test Doc 6: True=65 | Pred=65
Test Doc 1313: True=7862 | Pred=7862
Test Doc 1451: True=34 | Pred=2858
Test Doc 45: True=37 | Pred=23
Test Doc 1153: True=898 | Pred=898
Test Doc 314: True=1961 | Pred=838
Test Doc 1097: True=761 | Pred=761
Test Doc 53: True=37 | Pred=37
Test Doc 1297: True=7862 | Pred=7862
Test Doc 1476: True=54 | Pred=54