In [None]:
!pip install datasets  # Install the Hugging Face `datasets` package (no version is pinned here)



In [None]:
# Imports and Environment Setup

# Standard library
import os   # File paths and environment variables
import json # Parsing JSON-lines data files
import time # Wall-clock timing of epochs
from datetime import datetime

# PyTorch and Hugging Face
import torch
import logging
from transformers import (
    DistilBertTokenizerFast,   # Fast tokenizer
    DistilBertForMultipleChoice,  # Model for multiple-choice QA
    Trainer,  # Trainer API
    TrainingArguments,  # Container for training hyperparameters
    TrainerCallback,  # Base class for custom callbacks
    DataCollatorForMultipleChoice # Batching helper for multiple-choice inputs (not referenced in the cells shown)
)

# Hugging Face datasets
from datasets import Dataset

# Evaluation
from sklearn.metrics import accuracy_score

# Colab-specific
from google.colab import drive

# Progress bars
from tqdm import tqdm

# CUDA + tokenizer environment setup
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Debugging aid: makes CUDA errors surface at the failing call (presumably slows training; consider removing for long runs)
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Turns off parallelism in the Hugging Face tokenizers library (presumably to silence its fork warning; it does not suppress other warnings)

drive.mount('/content/drive', force_remount=True)

# Log directory setup: every run gets its own timestamped folder on the mounted Drive
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_dir = f"/content/drive/MyDrive/training_logs_{timestamp}"
os.makedirs(log_dir, exist_ok=True)
# Shared plain-text log that the helper functions below append to
summary_log_path = os.path.join(log_dir, "summary.log")

print(f"Logs and checkpoints will be saved in: {log_dir}")




Mounted at /content/drive
Logs and checkpoints will be saved in: /content/drive/MyDrive/training_logs_20250507_062203


In [None]:
# Helper Functions

# Purpose
# - Load and filter cloze-style question data for Hugging Face training
# - Enforce constraints on candidate visibility and context length
# - Convert JSON lines into a Hugging Face Dataset


def _find_whole_token(text, token):
    """Return the offset of the first whitespace-delimited occurrence of ``token`` in ``text``, or -1."""
    search_from = 0
    while True:
        hit = text.find(token, search_from)
        if hit == -1:
            return -1
        hit_end = hit + len(token)
        starts_clean = hit == 0 or text[hit - 1].isspace()
        ends_clean = hit_end == len(text) or text[hit_end].isspace()
        if starts_clean and ends_clean:
            return hit
        # Substring of a longer token (e.g. "@entity1" inside "@entity12"): keep looking.
        search_from = hit + 1


def _complete_tokens(text, start, end):
    """Return the set of whitespace-delimited tokens that lie entirely inside ``text[start:end]``."""
    window = text[start:end]
    tokens = window.split()
    # A token cut in half by the window edge is not really visible, so drop it.
    if tokens and start > 0 and not text[start - 1].isspace() and not window[0].isspace():
        tokens = tokens[1:]
    if tokens and end < len(text) and not text[end].isspace() and not window[-1].isspace():
        tokens = tokens[:-1]
    return set(tokens)


def load_hf_data(json_path, max_samples=None, truncate=True, max_tokens=384):
    """Load a JSON-lines cloze dataset, filter it, and return it as a Hugging Face Dataset.

    Each non-blank line must be a JSON object with "context", "question" and "label"
    keys. Candidates are the unique whitespace-delimited tokens of the context that
    start with "@entity", sorted alphabetically.

    An example is kept only if
      1. its label is one of the candidates,
      2. (when ``truncate`` is true) the approximate token count of window + question
         does not exceed ``max_tokens``, and
      3. at least 5 candidates appear as whole tokens inside the context window.

    Entities are matched as whole tokens, so "@entity1" is no longer treated as
    present just because "@entity12" is.

    Args:
        json_path: Path to the JSON-lines file.
        max_samples: Stop once this many examples have been accepted (None/0 = no cap).
        truncate: If true, keep a window of up to 250 characters on either side of the
            first whole-token occurrence of the label; otherwise keep the full context.
        max_tokens: Budget for the approximate token check (only applied when truncating).

    Returns:
        ``Dataset`` with columns "context" (the window), "question", "candidates"
        (all candidates of the full context) and "label" (index into "candidates").

    Side effects:
        Prints a filtering summary and appends it to ``summary_log_path``.
    """
    valid = []                 # Accepted examples
    total_seen = 0             # Examples read before stopping
    excluded_total = 0         # Examples rejected by any filter
    excluded_few_visible = 0   # Examples rejected for too few visible candidates

    # Stream the file so a small `max_samples` does not require parsing every line.
    with open(json_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # Tolerate blank lines (e.g. a trailing newline at end of file)

            ex = json.loads(line)
            total_seen += 1
            context = ex["context"]
            question = ex["question"]
            correct = ex["label"]

            # Extract all unique entity markers from the article
            candidates = sorted({w for w in context.split() if w.startswith("@entity")})

            # Filter 1: skip if the correct label is not among the candidates
            if correct not in candidates:
                excluded_total += 1
                continue

            # Truncate around the correct answer (located as a whole token)
            if truncate:
                correct_idx = _find_whole_token(context, correct)
                if correct_idx != -1:
                    win_start = max(0, correct_idx - 250)
                    win_end = min(len(context), correct_idx + 250)
                else:
                    # Defensive fallback; not expected because `correct` came from context.split()
                    win_start, win_end = 0, min(len(context), 600)
            else:
                win_start, win_end = 0, len(context)
            context_window = context[win_start:win_end]

            # Approximate token limit check.
            # NOTE(review): this divides *character* counts by 1.3, so a full 500-character
            # window (500 // 1.3 == 384) plus any question of 2+ characters always exceeds
            # the default budget of 384. Left unchanged so filtered datasets stay comparable
            # with earlier runs; confirm whether a word-based estimate was intended.
            approx_tokens = len(context_window) // 1.3 + len(question) // 1.3
            if truncate and approx_tokens > max_tokens:
                excluded_total += 1
                continue

            # Filter 2: require at least 5 candidates visible as whole tokens in the window
            window_tokens = _complete_tokens(context, win_start, win_end)
            visible_entities = [e for e in candidates if e in window_tokens]
            if len(visible_entities) < 5:
                excluded_total += 1
                excluded_few_visible += 1
                continue

            # NOTE(review): "candidates" holds every entity of the full article, not only
            # the visible ones; confirm this is the intended answer set.
            valid.append({
                "context": context_window,
                "question": question,
                "candidates": candidates,
                "label": candidates.index(correct)
            })

            if max_samples and len(valid) >= max_samples:
                break

    # Logging and filtering summary
    pct_total = (excluded_total / float(total_seen)) * 100 if total_seen else 0
    pct_visible = (excluded_few_visible / float(total_seen)) * 100 if total_seen else 0

    log_msg = (
        f"[FILTERING] Loaded {len(valid)} / {total_seen} examples "
        f"({excluded_total} excluded, {pct_total:.2f}% filtered total — "
        f"{excluded_few_visible} excluded for <5 visible entities, {pct_visible:.2f}%)"
    )
    print(log_msg)
    with open(summary_log_path, "a") as f:
        f.write(log_msg + "\n")

    return Dataset.from_list(valid)



In [None]:
# Preprocessing Function

# Purpose:
# Format a single QA example into the input structure expected by DistilBERT multiple choice

def format_example(example, tokenizer, max_len=384):
    """Turn one filtered QA example into tokenized multiple-choice inputs.

    The context is clipped to the words within 100 positions of the first
    occurrence of the correct entity. Every candidate is then paired with the
    question as the first segment and the clipped context as the second.

    Args:
        example: Mapping with "context", "question", "candidates" and "label"
            (index of the correct candidate).
        tokenizer: Callable invoked with the two segment lists plus
            truncation/padding/max_length/return_tensors keyword arguments.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", or ``None`` when the
        label index is out of range or the correct entity is not a word of the context.
    """
    context = example["context"]
    question = example["question"]
    candidates = example["candidates"]
    label = example["label"]  # Index of the correct answer

    try:
        gold_entity = candidates[label]
        words = context.split()
        center = words.index(gold_entity)
    except (ValueError, IndexError):
        return None

    # Clip the context to a fixed radius (in words) around the answer
    radius = 100
    lo = center - radius
    if lo < 0:
        lo = 0
    hi = center + radius
    if hi > len(words):
        hi = len(words)
    truncated_context = " ".join(words[lo:hi])

    # First segment: question + candidate; second segment: the shared context
    first_segments = []
    for cand in candidates:
        first_segments.append(f"[Q] {question} [SEP] {cand}")
    second_segments = [truncated_context] * len(candidates)

    encoded = tokenizer(
        first_segments,
        second_segments,
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_tensors="pt",
    )

    result = {}
    result["input_ids"] = encoded["input_ids"]
    result["attention_mask"] = encoded["attention_mask"]
    result["labels"] = torch.tensor(label)
    return result


# Purpose:
# Pads examples that have different numbers of answer choices
# Ensures all samples in a batch have the same number of choices

class DynamicDataCollator:
    """Batch multiple-choice features whose number of choices differs per example.

    Every example is extended with all-zero rows (token ids and attention mask)
    until it has as many choices as the largest example in the batch.
    """

    def __init__(self, tokenizer):
        # Stored for the caller's convenience; __call__ does not use it.
        self.tokenizer = tokenizer

    def __call__(self, features):
        """Collate a list of feature dicts into stacked, choice-padded tensors.

        Args:
            features: Dicts with "input_ids" and "attention_mask" (one row per
                choice, given as lists or tensors) and a scalar "labels".

        Returns:
            Dict with "input_ids" and "attention_mask" stacked over the batch and
            "labels" as a 1-D tensor.
        """
        def stack_rows(rows):
            # Rows read back from a Dataset arrive as plain lists; tensors pass through.
            return torch.stack([torch.tensor(r) if isinstance(r, list) else r for r in rows])

        def append_zero_rows(block, missing):
            filler = torch.zeros((missing, block.shape[1]), dtype=torch.long)
            return torch.cat([block, filler])

        # Gather each field across the batch
        ids_per_example = [stack_rows(f["input_ids"]) for f in features]
        masks_per_example = [stack_rows(f["attention_mask"]) for f in features]
        labels = torch.tensor([f["labels"] for f in features])

        # Largest number of choices present in this batch
        widest = max(block.shape[0] for block in ids_per_example)

        batch_ids, batch_masks = [], []
        for ids, masks in zip(ids_per_example, masks_per_example):
            missing = widest - ids.shape[0]
            batch_ids.append(append_zero_rows(ids, missing))
            batch_masks.append(append_zero_rows(masks, missing))

        return {
            "input_ids": torch.stack(batch_ids),
            "attention_mask": torch.stack(batch_masks),
            "labels": labels,
        }


In [None]:
# Metrics and Evaluation

# Purpose:
# Converts logits into predictions
# Computes accuracy between predicted and true labels


def compute_metrics(eval_pred):
    """Compute accuracy from the (logits, labels) pair produced during evaluation.

    Args:
        eval_pred: Two-item sequence ``(logits, labels)``; logits are expected to be
            2-D (example x choice) and labels 1-D (one class index per example).

    Returns:
        ``{"accuracy": <fraction of examples whose arg-max choice equals the label>}``.

    Raises:
        AssertionError: If the shapes are inconsistent or a label falls outside the
            range of available choices. The reason is also printed with an
            "[ASSERT ERROR]" prefix so it shows up in the notebook output.
    """
    logits, labels = eval_pred
    logits = torch.tensor(logits)
    labels = torch.tensor(labels)

    # Sanity checks. Each failure carries a message: a bare `assert` printed an
    # empty "[ASSERT ERROR]" line, which gave no hint about what went wrong.
    problem = None
    if logits.ndim != 2 or labels.ndim != 1:
        problem = (
            f"expected 2-D logits and 1-D labels, got shapes "
            f"{tuple(logits.shape)} and {tuple(labels.shape)}"
        )
    elif logits.shape[0] != labels.shape[0]:
        problem = (
            f"number of predictions ({logits.shape[0]}) does not match "
            f"number of labels ({labels.shape[0]})"
        )
    elif labels.numel() > 0 and (
        labels.min().item() < 0 or labels.max().item() >= logits.shape[1]
    ):
        problem = (
            f"labels must lie in [0, {logits.shape[1] - 1}], got range "
            f"[{labels.min().item()}, {labels.max().item()}]"
        )

    if problem is not None:
        print("[ASSERT ERROR]", problem)
        raise AssertionError(problem)

    preds = torch.argmax(logits, dim=1)   # Choose the candidate with the highest logit
    acc = accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
    return {"accuracy": acc}



# Custom Call back for Trainer

# Purpose:
# Adds detailed logging of training progress
# Logs intermediate evaluation metrics mid-epoch
# Tracks and saves the best-performing model based on validation accuracy

class CustomLoggerCallback(TrainerCallback):
    """Trainer callback that logs progress, evaluates mid-epoch and keeps the best model.

    The callback needs a reference to the Trainer to run evaluations and save the
    model; assign it to ``callback.trainer`` after the Trainer has been built.
    While ``trainer`` is unset, evaluations and interval reports are skipped.

    Training loss is collected in ``on_log``. In the recorded run the loss buffer
    stayed empty when it was only filled from ``on_step_end`` (every interval
    reported "Avg Train Loss: -1.0000").

    Args:
        eval_dataset: Dataset passed to ``trainer.evaluate`` for every evaluation.
        tokenizer: Tokenizer saved next to the best model.
        eval_interval: Fraction of an epoch between intermediate evaluations.
    """

    def __init__(self, eval_dataset, tokenizer, eval_interval=0.33):
        self.eval_dataset = eval_dataset   # Evaluation data for intermediate and end-of-epoch evals
        self.tokenizer = tokenizer         # Tokenizer saved together with the best model
        self.global_step = 0               # Steps counted by on_step_end
        self.trainer = None                # Set by the caller once the Trainer exists
        self.best_acc = 0.0                # Best validation accuracy seen so far
        self.epoch_start_time = None
        self.eval_interval = eval_interval # When to trigger mid-epoch evaluation
        self.train_loss_buffer = []        # Losses received since the last interval report
        self._interval_steps = None        # Cached number of steps between intermediate evals

    def _log(self, message):
        """Print a timestamped message and append it to the summary log."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        msg = f"[{timestamp}] {message}"
        print(msg)
        with open(summary_log_path, "a") as f:
            f.write(msg + "\n")
        logging.info(msg)

    def _record_loss(self, logs):
        """Append ``logs["loss"]`` to the buffer when a usable value is present."""
        if not isinstance(logs, dict):
            return
        loss = logs.get("loss")
        if loss is None:
            return
        try:
            self.train_loss_buffer.append(float(loss))
        except (TypeError, ValueError) as e:
            print(f"[WARN] Could not log train loss at step {self.global_step}: {e}")

    def _get_interval_steps(self):
        """Return the number of steps between intermediate evals (0 disables them).

        The train dataloader is only built once per training run; the original code
        rebuilt it on every step just to read its length.
        """
        if self._interval_steps is None:
            if self.trainer is None:
                return 0
            steps_per_epoch = len(self.trainer.get_train_dataloader())
            self._interval_steps = int(steps_per_epoch * self.eval_interval)
        return self._interval_steps

    def on_train_begin(self, args, state, control, **kwargs):
        self._interval_steps = None  # Recompute for this run (dataset or interval may have changed)
        self._log("Training started.")

    def on_epoch_begin(self, args, state, control, **kwargs):
        self.epoch_start_time = time.time()
        self.train_loss_buffer = []
        self._log(f"[EPOCH] Starting epoch {state.epoch:.2f}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Collect the training loss whenever the Trainer emits a log entry containing one."""
        self._record_loss(logs)

    def on_step_end(self, args, state, control, logs=None, **kwargs):
        self.global_step += 1

        # Kept for callers that pass `logs` explicitly; losses are otherwise collected in on_log.
        self._record_loss(logs)

        interval_steps = self._get_interval_steps()

        if interval_steps and self.global_step % interval_steps == 0:
            if self.train_loss_buffer:
                avg_loss = sum(self.train_loss_buffer) / len(self.train_loss_buffer)
                loss_text = f"{avg_loss:.4f}"
            else:
                # Say so explicitly instead of printing a misleading "-1.0000".
                loss_text = "n/a (no training loss logged in this interval)"
            self._log(f"[STEP {self.global_step}] Avg Train Loss: {loss_text}")
            self.train_loss_buffer = []
            self._log(f"[INTERMEDIATE EVAL] Step {self.global_step}")
            self._run_eval()

    # Always run eval at end of epoch
    def on_epoch_end(self, args, state, control, **kwargs):
        self._run_eval()
        if self.epoch_start_time:
            elapsed = time.time() - self.epoch_start_time
            self._log(f"[EPOCH] Time: {elapsed/60:.2f} mins")
            self._log(f"[EPOCH END] Best Accuracy So Far: {self.best_acc:.4f}")

    def _run_eval(self):
        """Evaluate on ``eval_dataset`` and save model + tokenizer when accuracy improves."""
        if not self.trainer:
            return
        metrics = self.trainer.evaluate(eval_dataset=self.eval_dataset)
        acc = metrics.get("eval_accuracy", -1)
        loss = metrics.get("eval_loss", -1)
        self._log(f"[EVAL] Accuracy: {acc:.4f} | Loss: {loss:.4f}")

        # If accuracy improved, save the model
        if acc > self.best_acc:
            self.best_acc = acc
            path = os.path.join(log_dir, "best_model")
            self.trainer.save_model(path)
            self.tokenizer.save_pretrained(path)
            self._log(f"[SAVE] New Best Accuracy: {acc:.4f} — model saved to {path}")

    def on_train_end(self, args, state, control, **kwargs):
        self._log("🏁 Training complete.")
        self._log(f"[FINAL LOGGED] Best Accuracy: {self.best_acc:.4f}")


In [None]:
# EXECUTION CONFIG AND TRAINING

# Ask the user which dataset to run; the answer is trimmed and lower-cased before validation
print("Choose dataset: 'cnn' or 'dailymail'")
selected = input().strip().lower()
assert selected in ("cnn", "dailymail"), "Invalid choice"

# Construct the data paths on the mounted Drive from the selection
base_path = f"/content/drive/MyDrive/Capstone File/Data files/Modern {'CNN Data' if selected == 'cnn' else 'Dailymail Data'}"
train_path = os.path.join(base_path, "train.80k.hf.json")
val_path = os.path.join(base_path, "validation.hf.json")

# Ask for a cap on accepted examples (useful for debugging runs).
# NOTE: only the exact lower-case string "all" disables the cap; any other answer
# must parse as an integer or int() raises ValueError.
print("Max number of examples to load (e.g., 1000 or 'all')?")
max_count = input().strip()
max_count = None if max_count == "all" else int(max_count)

# Load and filter datasets using the helper function (the same cap applies to both splits)
print("Loading datasets...")
train_raw = load_hf_data(train_path, max_samples=max_count)
val_raw = load_hf_data(val_path, max_samples=max_count)

# Load the tokenizer; safe_map below reads this module-level name
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenization + Safe Mapping
# Purpose:
# Format and tokenize dataset examples
# Drop malformed examples, if any
# Avoid crashes during formatting

def safe_map(dataset):
    """Format every example with ``format_example`` and drop the ones that fail.

    An example is dropped when ``format_example`` returns ``None`` or raises.
    Exceptions no longer disappear silently: they are counted per exception type
    and reported together with the first error, so a systematic failure (for
    example a broken tokenizer) is visible instead of producing an empty dataset
    without explanation.

    Args:
        dataset: Iterable of example dicts as produced by ``load_hf_data``.

    Returns:
        ``Dataset`` built from the successfully formatted rows.
    """
    dropped = 0
    error_counts = {}     # Exception type name -> number of examples it dropped
    first_error = None    # repr() of the first exception, kept as a debugging hint
    formatted_rows = []

    for example in dataset:
        try:
            formatted = format_example(example, tokenizer)
        except Exception as exc:  # Best effort by design: one bad example must not stop the run
            dropped += 1
            kind = type(exc).__name__
            error_counts[kind] = error_counts.get(kind, 0) + 1
            if first_error is None:
                first_error = repr(exc)
            continue

        if formatted is None:
            dropped += 1
        else:
            formatted_rows.append(formatted)

    print(f"Dropped due to formatting/tokenization: {dropped} examples")
    if error_counts:
        breakdown = ", ".join(f"{kind} x{count}" for kind, count in sorted(error_counts.items()))
        print(f"[WARN] {sum(error_counts.values())} of the dropped examples raised exceptions ({breakdown}); first error: {first_error}")
    return Dataset.from_list(formatted_rows)

# Apply safe formatting to examples
train_ds = safe_map(train_raw)
val_ds = safe_map(val_raw)

print(f"Training set size: {len(train_ds)} | Validation set size: {len(val_ds)}")

# MODEL AND TRAINER SETUP


# Load DistilBERT model for multiple-choice QA
model = DistilBertForMultipleChoice.from_pretrained("distilbert-base-uncased")

# Custom data collator that handles examples with different numbers of choices
collator = DynamicDataCollator(tokenizer)

# Callback for logging, evaluation and model saving (works around Colab limitations)
callback = CustomLoggerCallback(eval_dataset=val_ds, tokenizer=tokenizer)

# How the training will behave
args = TrainingArguments(
    output_dir=os.path.join(log_dir, "checkpoints"), # Where checkpoints are saved
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    logging_dir=log_dir,
    # A float below 1 is a ratio of the total training steps, so the loss is logged
    # about 100 times per run whatever the dataset size. The previous fixed value
    # (10000 steps) exceeded the length of a 1000-example run, so no training loss
    # was ever reported.
    logging_steps=0.01,
    save_strategy="epoch", # Save at end of epoch
    save_total_limit=1,    # Only keep the most recent checkpoint
    report_to="none",      # No outside integration
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA GPU is present
    gradient_accumulation_steps=1,
)

# Trainer object
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[callback],
    data_collator=collator
)

# Give the callback a reference to the trainer so it can evaluate and save
callback.trainer = trainer


# RUN TRAINING

print(f"Starting training on: {selected.upper()} | Samples: {len(train_ds)}")
trainer.train()

# Final performance after training completes
print("Final Evaluation...")
results = trainer.evaluate(eval_dataset=val_ds)
print("Final Eval Accuracy:", results.get("eval_accuracy", -1))
print(f"Best models and logs saved under: {log_dir}")


Choose dataset: 'cnn' or 'dailymail'
cnn
Max number of examples to load (e.g., 1000 or 'all')?
1000
Loading datasets...
[FILTERING] Loaded 1000 / 3143 examples (2143 excluded, 68.18% filtered total — 572 excluded for <5 visible entities, 18.20%)
[FILTERING] Loaded 1000 / 2846 examples (1846 excluded, 64.86% filtered total — 433 excluded for <5 visible entities, 15.21%)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Dropped due to formatting/tokenization: 0 examples
Dropped due to formatting/tokenization: 0 examples
Training set size: 1000 | Validation set size: 1000


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForMultipleChoice were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training on: CNN | Samples: 1000
[2025-05-07 06:25:13] Training started.
[2025-05-07 06:25:13] [EPOCH] Starting epoch 0.00


Step,Training Loss,Validation Loss,Accuracy
330,No log,2.776113,0.309
660,No log,2.005363,0.391
990,No log,1.321793,0.546
1000,No log,1.465895,0.576
1320,No log,1.829397,0.611
1650,No log,1.541703,0.663
1980,No log,1.595614,0.634
2000,No log,1.668392,0.661
2310,No log,1.499104,0.699
2640,No log,1.476245,0.718


[2025-05-07 06:25:42] [STEP 330] Avg Train Loss: -1.0000
[2025-05-07 06:25:42] [INTERMEDIATE EVAL] Step 330
[2025-05-07 06:26:17] [EVAL] Accuracy: 0.3090 | Loss: 2.7761
[2025-05-07 06:26:17] [SAVE] New Best Accuracy: 0.3090 — model saved to /content/drive/MyDrive/training_logs_20250507_062203/best_model
[2025-05-07 06:26:45] [STEP 660] Avg Train Loss: -1.0000
[2025-05-07 06:26:45] [INTERMEDIATE EVAL] Step 660
[2025-05-07 06:27:19] [EVAL] Accuracy: 0.3910 | Loss: 2.0054
[2025-05-07 06:27:20] [SAVE] New Best Accuracy: 0.3910 — model saved to /content/drive/MyDrive/training_logs_20250507_062203/best_model
[2025-05-07 06:27:48] [STEP 990] Avg Train Loss: -1.0000
[2025-05-07 06:27:48] [INTERMEDIATE EVAL] Step 990
[2025-05-07 06:28:23] [EVAL] Accuracy: 0.5460 | Loss: 1.3218
[2025-05-07 06:28:24] [SAVE] New Best Accuracy: 0.5460 — model saved to /content/drive/MyDrive/training_logs_20250507_062203/best_model
[2025-05-07 06:28:59] [EVAL] Accuracy: 0.5760 | Loss: 1.4659
[2025-05-07 06:29:00] [S

Final Eval Accuracy: 0.705
Best models and logs saved under: /content/drive/MyDrive/training_logs_20250507_062203


In [None]:
# FINAL TEST EVALUATION
# Purpose
# Evaluate the best model on the held-out test set
# Log results in a format consistent with the training logs

# Reuse consistent logging function
def log_message(message):
    """Print ``message`` with a timestamp prefix and append the same line to the summary log.

    The line format is "[YYYY-MM-DD HH:MM:SS] <message>"; the log file is the one
    named by the module-level ``summary_log_path``.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{timestamp}] {message}"
    print(line)
    with open(summary_log_path, "a") as log_file:
        log_file.write(line + "\n")

# Load the test split. No max_samples cap is passed, so the whole file is read.
# safe_map runs before `tokenizer` is rebound below, so it uses the tokenizer
# already present in the notebook namespace.
log_message("Loading and evaluating on test set...")
test_path = os.path.join(base_path, "test.hf.json")
test_raw = load_hf_data(test_path)
test_ds = safe_map(test_raw)

# Reload the best model + tokenizer saved under this run's log directory.
# NOTE: this rebinds the notebook-level names `model` and `tokenizer`.
best_model_path = os.path.join(log_dir, "best_model")
model = DistilBertForMultipleChoice.from_pretrained(best_model_path)
tokenizer = DistilBertTokenizerFast.from_pretrained(best_model_path)

# Swap the best model into the existing trainer (moved to the trainer's device) and evaluate
trainer.model = model.to(trainer.args.device)
results = trainer.evaluate(eval_dataset=test_ds)

# Log results in the same format as the training logs (-1 marks a missing metric)
acc = results.get("eval_accuracy", -1)
loss = results.get("eval_loss", -1)
log_message(f"[FINAL TEST EVAL] Accuracy: {acc:.4f} | Loss: {loss:.4f}")


[2025-05-06 06:27:05] Loading and evaluating on test set...
[FILTERING] Loaded 1007 / 3198 examples (2191 excluded, 68.51% filtered total — 605 excluded for <5 visible entities, 18.92%)
Dropped due to formatting/tokenization: 0 examples


[2025-05-06 06:28:28] [FINAL TEST EVAL] Accuracy: 0.7120 | Loss: 1.6096


In [None]:

# Entity Filtering Summary

# Purpose:
#  Analyze entity candidate counts before and after filtering
#  Compare filtering effects across CNN and DailyMail datasets
#  Save a summary table of statistics for reporting

import os
import json
import pandas as pd
import numpy as np

# Paths
# Root folder on the mounted Drive with one subfolder per dataset.
# NOTE(review): this differs from the `base_path` used by the training cells —
# confirm both locations hold the same files.
base_dir = "/content/drive/MyDrive/CapstoneProjectDataStrat"
# Split name -> JSON-lines file for the CNN dataset
cnn_paths = {
    "Train": os.path.join(base_dir, "cnn/train.80k.hf.json"),
    "Validation": os.path.join(base_dir, "cnn/validation.hf.json"),
    "Test": os.path.join(base_dir, "cnn/test.hf.json"),
}
# Split name -> JSON-lines file for the DailyMail dataset
dm_paths = {
    "Train": os.path.join(base_dir, "dailymail/train.80k.hf.json"),
    "Validation": os.path.join(base_dir, "dailymail/validation.hf.json"),
    "Test": os.path.join(base_dir, "dailymail/test.hf.json"),
}

# Filtering Function
# Purpose:
#  Uses the same filtering logic
#  Determines whether a sample passes the filters

def apply_filters(context, question, label, max_tokens=384, min_visible=5):
    """Decide whether one sample passes the candidate filters.

    Candidates are the unique whitespace-delimited tokens of ``context`` that start
    with "@entity". The sample passes when the label is a candidate, the approximate
    token count of the window plus question is within ``max_tokens``, and at least
    ``min_visible`` candidates occur as whole tokens inside the window (up to 250
    characters on either side of the label's first whole-token occurrence).

    Entities are matched as whole tokens, so "@entity1" is not considered found or
    visible merely because "@entity12" is present.

    Args:
        context: Article text with "@entityN" markers.
        question: Question text (only its length is used).
        label: The correct entity marker.
        max_tokens: Budget for the approximate token check.
        min_visible: Minimum number of candidates that must be visible in the window.

    Returns:
        ``(passed, visible_count)``; ``visible_count`` is 0 when the sample is rejected
        before visibility is measured.
    """
    candidates = sorted({w for w in context.split() if w.startswith("@entity")})
    if label not in candidates:
        return False, 0

    # Locate the label as a whole token, skipping hits inside longer tokens.
    index = -1
    search_from = 0
    while True:
        hit = context.find(label, search_from)
        if hit == -1:
            break
        hit_end = hit + len(label)
        starts_clean = hit == 0 or context[hit - 1].isspace()
        ends_clean = hit_end == len(context) or context[hit_end].isspace()
        if starts_clean and ends_clean:
            index = hit
            break
        search_from = hit + 1

    if index != -1:
        start, end = max(0, index - 250), min(len(context), index + 250)
    else:
        # Defensive fallback; not expected because `label` came from context.split()
        start, end = 0, min(len(context), 600)
    window = context[start:end]

    # NOTE(review): character counts divided by 1.3 — a full 500-character window
    # (500 // 1.3 == 384) plus any question of 2+ characters exceeds the default
    # budget of 384. Kept as is; confirm whether a word-based estimate was intended.
    approx_tokens = len(window) // 1.3 + len(question) // 1.3
    if approx_tokens > max_tokens:
        return False, 0

    # Only tokens lying entirely inside the window count as visible; a token cut in
    # half by the window edge is dropped.
    window_tokens = window.split()
    if window_tokens and start > 0 and not context[start - 1].isspace() and not window[0].isspace():
        window_tokens = window_tokens[1:]
    if window_tokens and end < len(context) and not context[end].isspace() and not window[-1].isspace():
        window_tokens = window_tokens[:-1]
    present = set(window_tokens)

    visible = [e for e in candidates if e in present]
    return len(visible) >= min_visible, len(visible)

# Stats Collection
# Purpose:
# For each sample, count entity candidates before and after filtering
# Collect the counts for the comparison table

def collect_entity_stats(filepath):
    """Collect per-sample entity counts before and after filtering for one JSON-lines file.

    Args:
        filepath: Path to a file with one JSON object per line, each providing
            "context", "question" and "label".

    Returns:
        ``(raw_counts, filtered_counts)``: ``raw_counts`` holds the number of unique
        "@entity" tokens of every readable sample; ``filtered_counts`` holds the
        visible-candidate count of every sample accepted by ``apply_filters``.

    Blank lines are ignored. Lines that cannot be processed (invalid JSON, missing
    keys, wrong value types) are skipped as before, but they are now counted and
    reported instead of being swallowed silently.
    """
    raw_counts, filtered_counts = [], []
    skipped = 0
    with open(filepath, 'r', encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue
            try:
                item = json.loads(line)
                context = item["context"]
                question = item["question"]
                label = item["label"]
                raw_total = len({w for w in context.split() if w.startswith("@entity")})
                raw_counts.append(raw_total)
                passed, filtered_count = apply_filters(context, question, label)
                if passed:
                    filtered_counts.append(filtered_count)
            except (ValueError, KeyError, TypeError, AttributeError):
                # Data-shape problems only (json.JSONDecodeError is a ValueError);
                # unrelated failures are allowed to surface.
                skipped += 1
                continue
    if skipped:
        print(f"[WARN] Skipped {skipped} malformed line(s) in {filepath}")
    return raw_counts, filtered_counts

# Summary Table Construction
# Purpose:
# Generate descriptive statistics across splits
# Compare raw vs filtered

# One row of descriptive statistics per (dataset, split) that has both raw and filtered counts
summary_rows = []

for dataset_name, path_dict in (("CNN", cnn_paths), ("DailyMail", dm_paths)):
    for split_name, file_path in path_dict.items():
        if not os.path.exists(file_path):
            print(f"Missing file: {file_path}")
            continue

        raw, filt = collect_entity_stats(file_path)
        if not (raw and filt):
            # Nothing to summarise when either list is empty
            continue

        # Means are reused by several columns, so compute them once
        raw_mean = np.mean(raw)
        filt_mean = np.mean(filt)
        mean_drop = raw_mean - filt_mean

        row = {"Dataset": dataset_name, "Split": split_name}
        row["Raw Count"] = len(raw)
        row["Filtered Count"] = len(filt)
        row["Raw Min"] = min(raw)
        row["Raw Max"] = max(raw)
        row["Filtered Min"] = min(filt)
        row["Filtered Max"] = max(filt)
        row["Raw Mean"] = round(raw_mean, 2)
        row["Filtered Mean"] = round(filt_mean, 2)
        row["Raw Median"] = round(np.median(raw), 2)
        row["Filtered Median"] = round(np.median(filt), 2)
        row["Reduction (Mean)"] = round(mean_drop, 2)
        row["% Reduction (Mean)"] = round(100 * mean_drop / raw_mean, 2)
        summary_rows.append(row)

# Save to CSV next to the data folders
summary_df = pd.DataFrame(summary_rows)
output_csv_path = os.path.join(base_dir, "entity_filtering_summary.csv")
summary_df.to_csv(output_csv_path, index=False)
print(f"Saved summary to: {output_csv_path}")


In [None]:
# Show the table through the notebook's rich display: print() wraps the columns
# across several plain-text blocks, which makes the summary hard to read.
summary_df

     Dataset       Split  Raw Count  Filtered Count  Raw Min  Raw Max  \
0        CNN       Train      80000           25256        1      527   
1        CNN  Validation       3924            1376        2      187   
2        CNN        Test       3198            1007        4      394   
3  DailyMail       Train      80000           16019        1      329   
4  DailyMail  Validation      64835           16424        1      230   
5  DailyMail        Test      53182           12731        1      245   

   Filtered Min  Filtered Max  Raw Mean  Filtered Mean  Raw Median  \
0             5            30     26.39           6.48        23.0   
1             5            15     26.19           6.40        22.0   
2             5            17     24.22           6.34        21.0   
3             5            28     26.25           6.54        22.0   
4             5            29     25.02           6.50        21.0   
5             5            24     25.48           6.47        21.0  