# importing required libraries

In [7]:

import torch
import os
import pyarrow.parquet as pa
import pickle
from tqdm import tqdm


# text stuff
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding
from tokenizers import Tokenizer
from transformers import AutoTokenizer

# mathy stuff
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Display the current working directory. The data and output paths used in this
# notebook are relative, so they are resolved against this directory.
os.getcwd()
# If the directory shown is not the project root, uncomment and adjust:
# os.chdir("../../")

'/Users/jiayilow/Desktop/CASA/Dissertation/mapping_reform'

# loading in NCSE v2.0 data
This data was downloaded from <https://rdr.ucl.ac.uk/articles/dataset/NCSE_v2_0_A_Dataset_of_OCR-Processed_19th_Century_English_Newspapers/28381610>.

In [None]:
# note: change file path if necessary

# Map each publication title to its raw parquet file. Every file lives in
# data/ncse_raw/ and shares the "_issue_PDF_files.parquet" suffix, so only the
# file stem differs per publication.
parquet_files = {
    title: f"data/ncse_raw/{stem}_issue_PDF_files.parquet"
    for title, stem in (
        ("The English Woman's Journal", "English_Womans_Journal"),
        ("The Tomahawk", "Tomahawk"),
        ("The Leader", "Leader"),
        ("The Monthly Repository", "Monthly_Repository"),
        ("The Northern Star", "Northern_Star"),
        ("The Publisher's Circular", "Publishers_Circular"),
    )
}


# setup and functions for text pre-processing

In [8]:

# NOTE(review): clean_df declares its own min_words / token_limit defaults, so
# the two values below only take effect where they are passed in explicitly.
min_words = 5     # minimum number of words an article must contain
token_limit = 384 # mpnet truncates text past 384 tokens by default
# Model identifier, used here to load the tokenizer and later to load the
# SentenceTransformer that produces the embeddings.
model = "sentence-transformers/all-mpnet-base-v2"
# Module-level tokenizer; split_by_token_limit reads this global.
tokenizer = AutoTokenizer.from_pretrained(model)

# %%
# whole buncha functions for preprocessing

def fix_encoding_errors(text):
    """Repair common mis-decoded UTF-8 punctuation and flatten newlines.

    Each garbled sequence below is swapped for the dash or curly quote it
    stands for, and every newline becomes a single space. Substitutions are
    applied one after another in the order listed.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The text with all listed sequences replaced.
    """
    substitutions = (
        ('‚Äî', '—'),
        ('‚Äú', '“'),
        ('‚Äù', '”'),
        ('â€œ', '“'),
        ('â€\x9d', '”'),
        ('â€˜', '‘'),
        ('â€™', '’'),
        ('â€“', '–'),
        ('\n', ' '),
    )
    cleaned = text
    for garbled, intended in substitutions:
        cleaned = cleaned.replace(garbled, intended)
    return cleaned
    
def split_by_token_limit(text, token_limit=384, tok=None):
    """Split text into consecutive chunks that each fit the model's token limit.

    The text is tokenised WITHOUT special tokens so that every offset maps to
    real characters. Tokenising with special tokens makes the trailing special
    token report a (0, 0) offset, which collapses the last window to an empty
    string and silently drops the end of every long article.

    Room for the special tokens is reserved in each chunk, because the
    embedding model adds them again when the chunk is encoded. Without this,
    chunks overshoot the limit and get truncated.

    Parameters
    ----------
    text : str
        Text to split. Empty or whitespace-only text yields an empty list.
    token_limit : int, default 384
        Maximum sequence length of the model, special tokens included.
    tok : tokenizer, optional
        Tokenizer to use; defaults to the module-level ``tokenizer``. It must
        be callable with ``add_special_tokens`` / ``return_offsets_mapping``
        and provide ``num_special_tokens_to_add``.

    Returns
    -------
    list[str]
        Non-empty, whitespace-stripped chunks in document order. Text that
        already fits is returned as a single stripped chunk.

    Raises
    ------
    ValueError
        If ``token_limit`` leaves no room for at least one real token.
    """
    # input validation
    if not text or not text.strip():
        return []

    tok = tokenizer if tok is None else tok

    # Real tokens allowed per chunk once the special tokens are accounted for.
    budget = token_limit - tok.num_special_tokens_to_add(pair=False)
    if budget < 1:
        raise ValueError(
            f"token_limit={token_limit} leaves no room for text after special tokens."
        )

    encoding = tok(
        text,
        padding=False,
        truncation=False,
        add_special_tokens=False,
        return_offsets_mapping=True,
    )
    offsets = encoding["offset_mapping"]

    # Short text: nothing to split.
    if len(offsets) <= budget:
        return [text.strip()]

    chunks = []
    for start in range(0, len(offsets), budget):
        window = offsets[start:start + budget]
        # Slice from the first character of the first token in the window to
        # the last character of the last token in the window.
        chunk = text[window[0][0]:window[-1][1]].strip()

        # keeping only non-empty chunks
        if chunk:
            chunks.append(chunk)

    return chunks

def clean_df(df_raw, 
             publication_name='unknown_publication', 
             output_path='preprocessed_articles.parquet',
            min_words=5,
            token_limit=384):
    """Preprocess one publication's raw table and save the result as parquet.

    Steps: keep rows whose ``class`` is ``'text'``; clean the content with
    ``fix_encoding_errors``; drop articles with fewer than ``min_words`` words;
    drop duplicate content; split each article with ``split_by_token_limit``;
    derive ``pub_date`` from the ``YYYY-MM-DD`` pattern in ``issue_id``; add a
    ``publication`` column; write the table to ``output_path``.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Raw table with at least ``class``, ``issue_id`` and ``content`` columns.
    publication_name : str
        Value written to the ``publication`` column.
    output_path : str
        Destination parquet file.
    min_words : int
        Articles with fewer words than this are dropped.
    token_limit : int
        Token limit forwarded to ``split_by_token_limit``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The preprocessed chunks (``issue_id``, ``content``, ``pub_date``,
        ``publication``) and the articles dropped for being too short.
    """
    # 1. Filter only 'text' rows
    df = df_raw.loc[df_raw['class'] == 'text', ['issue_id', 'content']].copy()

    # 2. Clean UTF-8/regex artifacts
    df['content'] = df['content'].apply(fix_encoding_errors)

    # 3. Keep only articles with at least min_words words
    content_lens = df['content'].str.count(r'\w+')
    short_articles = df[content_lens < min_words]
    df = df[content_lens >= min_words]
    print(f"{len(short_articles)} articles dropped - fewer than {min_words} words.")

    # 4. Drop duplicate content
    pre_duplicatedrop = len(df)
    df = df.drop_duplicates(subset='content')
    post_duplicatedrop = len(df)
    print(f"{pre_duplicatedrop - post_duplicatedrop} duplicate articles removed.")

    # 5. Split by token count
    split_articles = []
    problematic_articles = []
    for row in df.itertuples():
        idx = row.Index
        try:
            chunks = split_by_token_limit(row.content, token_limit)
        except Exception as e:
            # Best effort: one bad article should not abort the whole publication.
            print(f"Error processing article {idx}: {e}")
            problematic_articles.append(idx)
            continue
        split_articles.extend(
            {'issue_id': row.issue_id, 'content': chunk} for chunk in chunks
        )

    if problematic_articles:
        # Surface the skipped articles instead of discarding the list silently.
        print(f"{len(problematic_articles)} articles could not be split and were skipped "
              f"(first indices: {problematic_articles[:10]}).")

    # Naming the columns keeps the steps below working when nothing survived;
    # a DataFrame built from an empty list would have no 'issue_id' column.
    df_split = pd.DataFrame(split_articles, columns=['issue_id', 'content'])

    # 6. Create column from date published
    # i found that there's a misformatted date, '1852-31-07', in The Leader's file. fixing this below:
    df_split['issue_id'] = df_split['issue_id'].str.replace('1852-31-07', '1852-07-31', regex=False)

    # Dates that do not parse become NaT (errors='coerce').
    df_split['pub_date'] = pd.to_datetime(
        df_split['issue_id'].str.extract(r'(\d{4}-\d{2}-\d{2})')[0],
        format='%Y-%m-%d', errors='coerce'
    )

    # 7. Add publication column
    df_split['publication'] = publication_name

    # Save as Parquet
    df_split.to_parquet(output_path, index=False)
    print(f"Saved {len(df_split)} preprocessed articles to {output_path}.")

    return df_split.reset_index(drop=True), short_articles.reset_index(drop=True)


# preprocessing publications and saving them locally

In [10]:
output_folder = "output_data/preprocessed_data/"
if not os.path.exists(output_folder):
    print(f"Creating directory to store preprocessed articles: {output_folder}")
    # exist_ok guards against the folder appearing between the check and the call
    os.makedirs(output_folder, exist_ok=True)
else:
    print("Directory to store preprocessed articles already exists.")


# dict to store preprocessed publications
cleaned_dfs = {}
# NOTE: only filled for publications processed in this run; publications loaded
# from an existing preprocessed file have no entry here.
short_articles_dfs = {}

for pub_name, path in parquet_files.items():
    flat_name = pub_name.lower().replace(" ", "_").replace("'", "")
    output_path = f"{output_folder}{flat_name}_preprocessed.parquet"

    if os.path.exists(output_path):
        print(f"Preprocessed data file already exists for {pub_name}. Loading it...")
        cleaned_df = pa.read_table(output_path).to_pandas()
        cleaned_dfs[pub_name] = cleaned_df

    else:
        print(f"Loading and processing: {pub_name}")

        # read and process the original data; pass the notebook-level settings
        # so that editing min_words / token_limit in the config cell takes effect
        raw_df = pa.read_table(path).to_pandas()
        cleaned_df, short_df = clean_df(
            raw_df,
            publication_name=pub_name,
            output_path=output_path,
            min_words=min_words,
            token_limit=token_limit,
        )

        cleaned_dfs[pub_name] = cleaned_df
        short_articles_dfs[pub_name] = short_df


Directory to store preprocessed articles already exists.
Preprocessed data file already exists for The English Woman's Journal. Loading it...
Preprocessed data file already exists for The Tomahawk. Loading it...
Preprocessed data file already exists for The Leader. Loading it...
Preprocessed data file already exists for The Monthly Repository. Loading it...
Preprocessed data file already exists for The Northern Star. Loading it...
Preprocessed data file already exists for The Publisher's Circular. Loading it...


In [11]:

# Bind each publication's preprocessed DataFrame to a short variable name.
# The order of names on the left matches the order of titles on the right.
ewj, thawk, leader, monrepo, star, circ = (
    cleaned_dfs[title]
    for title in (
        "The English Woman's Journal",
        "The Tomahawk",
        "The Leader",
        "The Monthly Repository",
        "The Northern Star",
        "The Publisher's Circular",
    )
)

# generating and saving embeddings

In [12]:

# directory to store embeddings 
embedding_dir = "output_data/article_embeddings/embeddings_mpnet"

if not os.path.exists(embedding_dir):
    print(f"Creating directory to store embeddings: {embedding_dir}")
    os.makedirs(embedding_dir, exist_ok=True)
else:
    print(f"Directory for embeddings already exists: {embedding_dir}")

# Loaded on first use and then shared across publications, so the model is
# built at most once and not at all when every embedding file already exists.
embedding_model = None

for pub, pub_df in cleaned_dfs.items():
    filename = pub.lower().replace(" ", "_").replace("'", "").replace(".","") + ".pkl"
    filepath = os.path.join(embedding_dir, filename)

    if os.path.exists(filepath):
        print(f"Pre-computed embeddings already exist for {pub}.")
        continue

    if embedding_model is None:
        embedding_model = SentenceTransformer(model)

    print(f"Generating vector embeddings for {pub}...")
    pub_text = pub_df["content"].tolist()
    pub_date = pub_df["pub_date"].tolist()
    pub_vec = embedding_model.encode(pub_text, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
    print(f"Vector array of shape {pub_vec.shape} obtained.")
    print(f"Saving embeddings for {pub} to {filepath}...")

    # Write to a temporary file and rename it afterwards. The check above only
    # tests for existence, so an interrupted dump must not leave a truncated
    # file under the final name.
    # NOTE(review): the tensor is pickled on whatever device encode() returned
    # it on; confirm that files stay loadable on machines without that device.
    tmp_filepath = filepath + ".tmp"
    with open(tmp_filepath, "wb") as f:
        pickle.dump({
            "articles": pub_text,
            "dates": pub_date,
            "embeddings": pub_vec},
            f)
    os.replace(tmp_filepath, filepath)


Directory for embeddings already exists: output_data/article_embeddings/embeddings_mpnet
Pre-computed embeddings already exist for The English Woman's Journal.
Pre-computed embeddings already exist for The Tomahawk.
Pre-computed embeddings already exist for The Leader.
Pre-computed embeddings already exist for The Monthly Repository.
Pre-computed embeddings already exist for The Northern Star.
Pre-computed embeddings already exist for The Publisher's Circular.
