

# Flan-T5 Fine-Tuning for Interiority (none / low / high)

This notebook is part of our INFO 256 (ANLP) final project.  



In [None]:
!pip install -q --upgrade transformers datasets sentencepiece accelerate scikit-learn

import transformers
import pandas as pd
import numpy as np
import torch

from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    precision_recall_fscore_support
)
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
)

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Shared accumulator: each experiment cell appends one metrics dict here, and
# the final-results cell turns the list into a DataFrame.
all_results = []


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m55.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda


In [None]:
def evaluate_interiority_model(y_true, y_pred, model_name="", model_type=""):
    """
    Score interiority predictions and return one row for the results table.

    Prints accuracy, macro F1 and the full classification report, then
    returns the same numbers in a flat dict.

    Args:
        y_true: Gold labels, each one of "none", "low", "high".
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Value stored under the "Model" key.
        model_type: Value stored under the "Type" key.

    Returns:
        dict with keys "Type", "Model", "Accuracy", "Macro F1", the three
        "<Class> Precision/Recall" strings and the three "<Class> F1" values.
        All numeric values are plain Python floats.
    """
    labels = ["none", "low", "high"]

    acc = accuracy_score(y_true, y_pred)
    # Pin the label set (and silence undefined-metric warnings) so this macro
    # average is computed over the same three classes as the per-class scores
    # and the printed report below, even if a class is absent from the inputs.
    macro_f1 = f1_score(
        y_true, y_pred, labels=labels, average="macro", zero_division=0
    )

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, zero_division=0
    )

    # float(...) keeps numpy scalar types out of the results table.
    metrics = {
        "Type": model_type,
        "Model": model_name,
        "Accuracy": round(float(acc), 3),
        "Macro F1": round(float(macro_f1), 3),
        "None Precision/Recall": f"{precision[0]:.2f}/{recall[0]:.2f}",
        "Low Precision/Recall": f"{precision[1]:.2f}/{recall[1]:.2f}",
        "High Precision/Recall": f"{precision[2]:.2f}/{recall[2]:.2f}",
        "None F1": round(float(f1[0]), 2),
        "Low F1": round(float(f1[1]), 2),
        "High F1": round(float(f1[2]), 2),
    }

    print("Accuracy:", acc)
    print("Macro F1:", macro_f1)
    print("\nDetailed classification report:")
    print(
        classification_report(
            y_true,
            y_pred,
            labels=labels,
            digits=3,
            zero_division=0
        )
    )

    return metrics


## Load and inspect the dataset

We use our final **interiority gold** dataset (≈600 passages), annotated with:

- `none` – no inner thoughts or feelings
- `low` – some but limited interiority
- `high` – sustained access to a character’s inner states

We standardize column names to `text` and `label` and do some basic cleaning.


In [None]:
# Path to the final dataset
DATA_PATH = "/content/drive/MyDrive/INFO 256: Applied Natural Language Processing/ANLP Final Project/Data/interiority_gold_final.csv"

# Load the dataset
df = pd.read_csv(DATA_PATH)

df = df.rename(columns={
    "paragraph": "text",      # original column with passage text
    "gold_label": "label"     # original column with label
})

# Drop rows with missing text/labels BEFORE casting to str: astype(str) turns
# NaN into the literal string "nan", which dropna can no longer detect.
df = df.dropna(subset=["text", "label"]).copy()

# Basic cleaning
df["text"] = df["text"].astype(str).str.strip()
df["label"] = df["label"].astype(str).str.lower().str.strip()

# Drop rows whose text or label is empty once whitespace is stripped
df = df[(df["text"] != "") & (df["label"] != "")]

print("Dataset shape:", df.shape)
print("\nLabel distribution:")
print(df["label"].value_counts())
df.head()


Dataset shape: (597, 3)

Label distribution:
label
none    237
high    204
low     156
Name: count, dtype: int64


Unnamed: 0,title,text,label
0,A Room with a View,"“I have been a failure,” said Miss Bartlett, a...",high
1,A Room with a View,Lucy paused. “Cecil said one day—and I thought...,low
2,A Room with a View,"Miss Bartlett, who was poor at figures, became...",high
3,A Room with a View,An engagement is so potent a thing that sooner...,low
4,A Room with a View,“In the course of conversation they said that ...,low


## Turn examples into T5-style prompts

Flan-T5 expects an **instructional input** and a **short textual target**.

We convert each example into:

- **Input**: an instruction + passage + `Label:` prompt  
- **Target**: the label text (`none`, `low`, or `high`)

This keeps the task close to the model’s original instruction-tuning style.


In [None]:
def make_prompt(row):
    """
    Turn one dataset row into a T5-style (input, target) pair.

    Args:
        row: Mapping-like object with a "text" entry (the passage) and a
            "label" entry (the gold label).

    Returns:
        pd.Series with "input_text" (instruction + passage + "Label:" cue)
        and "target_text" (the label, passed through unchanged).
    """
    passage = row["text"]
    prompt_parts = [
        "Classify the level of interiority in the following passage ",
        "as one of: none, low, high.\n\n",
        f"Passage: {passage}\n\n",
        "Label:",
    ]
    return pd.Series(
        {
            "input_text": "".join(prompt_parts),
            "target_text": row["label"],
        }
    )

# Build the prompt frame row by row: one (input_text, target_text) pair per
# passage in df.
df_prompt = df.apply(make_prompt, axis=1)


## Train/test split

We split the data into:

- **80% train**
- **20% test**

We stratify by label so each split has a similar label distribution.


In [None]:
# Stratified 80/20 split with a fixed seed; stratifying on the gold label
# keeps the class mix similar in both parts.
train_df, test_df = train_test_split(
    df_prompt, stratify=df["label"], test_size=0.2, random_state=42
)

# Wrap each split as a Hugging Face Dataset with a fresh 0..n-1 index.
train_dataset, test_dataset = (
    Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_frame in (train_df, test_df)
)
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})


In [None]:
# The splits were already wrapped into `dataset` by the previous cell, so there
# is nothing to rebuild here: check that no rows were lost, then display it.
assert dataset["train"].num_rows + dataset["test"].num_rows == len(df_prompt)
dataset


DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 477
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 120
    })
})

## Tokenization and preprocessing

We tokenize:

- `input_text` with a max length (e.g., 256 tokens)
- `target_text` with a very short max length (the label word)

We store the label token IDs in the `labels` field expected by T5.


In [None]:
def run_finetune_experiment(model_name: str):
    """
    Fine-tune a Flan-T5 model (small/base) and evaluate on the test set.

    Reads the module-level ``dataset`` (a DatasetDict whose splits carry
    ``input_text`` / ``target_text`` columns) and ``device``.

    Args:
        model_name: Hugging Face model id, e.g. ``"google/flan-t5-small"``.

    Returns:
        The metrics dict produced by ``evaluate_interiority_model`` for the
        generation-based predictions on ``dataset["test"]``.
    """
    print("\n==============================")
    print(f"Fine-tuning {model_name}")
    print("==============================")

    tokenizer = T5Tokenizer.from_pretrained(model_name)

    MAX_INPUT_LENGTH = 256
    MAX_TARGET_LENGTH = 5
    IGNORE_INDEX = -100  # label value ignored by the model's cross-entropy loss

    def preprocess(batch):
        # NOTE(review): truncation removes tokens from the end of the prompt, so
        # passages longer than MAX_INPUT_LENGTH lose the trailing "Label:" cue.
        model_inputs = tokenizer(
            batch["input_text"],
            truncation=True,
            padding="max_length",
            max_length=MAX_INPUT_LENGTH,
        )
        labels = tokenizer(
            text_target=batch["target_text"],
            truncation=True,
            padding="max_length",
            max_length=MAX_TARGET_LENGTH,
        )
        # Mask padded label positions so the loss is computed only on the real
        # label tokens, not on predicting <pad>.
        pad_id = tokenizer.pad_token_id
        model_inputs["labels"] = [
            [tok if tok != pad_id else IGNORE_INDEX for tok in seq]
            for seq in labels["input_ids"]
        ]
        return model_inputs

    tokenized = dataset.map(
        preprocess,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

    # The recorded fp16 run logged a training loss of 0.0 and no validation
    # loss, the usual symptom of fp16 overflow in T5-family models. Use bf16
    # where the GPU supports it and full precision otherwise.
    use_bf16 = device == "cuda" and torch.cuda.is_bf16_supported()

    training_args = TrainingArguments(
        output_dir=f"./{model_name.split('/')[-1]}-interiority",
        num_train_epochs=3,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=2,
        learning_rate=3e-4,
        weight_decay=0.01,
        fp16=False,
        bf16=use_bf16,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=20,
        report_to=[],
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["test"],
    )

    trainer.train()

    # ---- generation-based eval ----
    def normalize_label(text):
        # Substring match in priority order; anything unrecognized falls back
        # to "none".
        text = text.strip().lower()
        if "high" in text:
            return "high"
        if "low" in text:
            return "low"
        if "none" in text or "no interiority" in text:
            return "none"
        return "none"

    model.eval()
    decoded_preds = []
    with torch.no_grad():
        for ex in tokenized["test"]:
            input_ids = torch.tensor(ex["input_ids"]).unsqueeze(0).to(device)
            attention_mask = torch.tensor(ex["attention_mask"]).unsqueeze(0).to(device)

            gen_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=10,
            )
            text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
            decoded_preds.append(normalize_label(text))

    # Take the gold labels from the same split that was tokenized above so
    # predictions and references are aligned by construction.
    true_labels = list(dataset["test"]["target_text"])

    metrics = evaluate_interiority_model(
        y_true=true_labels,
        y_pred=decoded_preds,
        model_name=f"{model_name.split('/')[-1]} (fine-tuned)",
        model_type="Seq2Seq transformers",
    )

    # Release the model before the next checkpoint is loaded.
    del trainer, model
    if device == "cuda":
        torch.cuda.empty_cache()

    return metrics


In [None]:
# Fine-tune each checkpoint size in turn and record its test-set metrics in the
# shared results list. NOTE: re-running this cell appends duplicate rows to
# all_results.
for size in ["small", "base"]:
    model_name = f"google/flan-t5-{size}"
    metrics = run_finetune_experiment(model_name)
    all_results.append(metrics)



Fine-tuning google/flan-t5-small


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/477 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,


Accuracy: 0.4
Macro F1: 0.19047619047619047

Detailed classification report:
              precision    recall  f1-score   support

        none      0.400     1.000     0.571        48
         low      0.000     0.000     0.000        31
        high      0.000     0.000     0.000        41

    accuracy                          0.400       120
   macro avg      0.133     0.333     0.190       120
weighted avg      0.160     0.400     0.229       120


Fine-tuning google/flan-t5-base


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/477 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,


Accuracy: 0.4166666666666667
Macro F1: 0.3637101181936189

Detailed classification report:
              precision    recall  f1-score   support

        none      0.437     0.646     0.521        48
         low      0.400     0.129     0.195        31
        high      0.385     0.366     0.375        41

    accuracy                          0.417       120
   macro avg      0.407     0.380     0.364       120
weighted avg      0.409     0.417     0.387       120



**Note:** Flan-T5-Large was not fine-tuned due to storage issues; only the small and base models were fine-tuned above.

# Zero-Shot and Few-Shot Flan-T5

Here we evaluate **google/flan-t5-small/base/large** as a *prompted* model
for our interiority classification task (none / low / high).

We run two settings:

1. **Zero-shot (v2 prompt)**   
2. **Few-shot (v3 prompt)**  

We use the same gold interiority dataset.


In [None]:
# Use the full dataset for prompting (no train/test split here)
# NOTE(review): the fine-tuned models are scored on the 20% test split only, so
# their metrics are computed on a different set of passages than these runs.
texts = df["text"].tolist()
y_true = df["label"].tolist()

print("Number of passages:", len(texts))
print("Label distribution:\n", df["label"].value_counts())


Number of passages: 597
Label distribution:
 label
none    237
high    204
low     156
Name: count, dtype: int64


## Prompt designs

We design two prompts:

- **v2 (zero-shot)** – definition of interiority and descriptions of labels.  
- **v3 (few-shot)** – same as v2 plus a few example passages with gold labels
  before the target passage.


In [None]:
def build_v2_prompt(passage: str) -> str:
    """
    Build the zero-shot (v2) prompt for one passage.

    The prompt defines interiority, describes the three labels, quotes the
    passage in triple double-quotes, and ends with a "Label:" cue.
    """
    prompt_lines = (
        "You are a literary scholar analyzing narrative interiority.",
        "",
        "Interiority is when the text gives access to a character's inner thoughts,",
        "feelings, perceptions, or mental states.",
        "",
        "Classify the level of interiority in the passage below as one of:",
        "- none: no clear access to inner thoughts or feelings.",
        "- low: some interiority, but brief or limited.",
        "- high: sustained or rich access to inner thoughts, feelings, or perceptions.",
        "",
        "Passage:",
        f'"""{passage}"""',
        "",
        "Answer with a single word: none, low, or high.",
        "",
        "Label:",
    )
    return "\n".join(prompt_lines)


def build_v3_prompt(passage: str) -> str:
    """
    Build the few-shot (v3) prompt for one passage.

    Same definition and label descriptions as the zero-shot prompt, preceded
    by three hand-written illustrative examples (one per label). The examples
    are toy sentences and can be swapped for real passages from the dataset.
    """
    prompt_lines = (
        "You are a literary scholar analyzing narrative interiority.",
        "",
        "Interiority is when the text gives access to a character's inner thoughts,",
        "feelings, perceptions, or mental states.",
        "",
        "Here are some examples:",
        "",
        "Example 1 (none):",
        '"There was a knock at the door. Mary stood up and opened it."',
        "",
        "Example 2 (low):",
        '"Mary hesitated for a moment, wondering if she should speak."',
        "",
        "Example 3 (high):",
        "\"Mary's mind raced back to the night of the accident. She felt the same cold",
        "fear rising in her chest as she replayed every word she had said, every",
        'step she had taken."',
        "",
        "Now classify the level of interiority in the passage below as one of:",
        "- none: no clear access to inner thoughts or feelings.",
        "- low: some interiority, but brief or limited.",
        "- high: sustained or rich access to inner thoughts, feelings, or perceptions.",
        "",
        "Passage:",
        f'"""{passage}"""',
        "",
        "Answer with a single word: none, low, or high.",
        "",
        "Label:",
    )
    return "\n".join(prompt_lines)


## Helper functions

We define:

- `normalize_label_prompt` – maps raw generated text to `none`, `low`, or `high`.
- `predict_labels_flant5` – runs a Flan-T5 model (small, base, or large) on a
  list of passages with a given prompt builder.

Metrics are computed with `evaluate_interiority_model`, the function defined near the top of this notebook that produces the rows of our results table.


In [None]:
def normalize_label_prompt(text: str) -> str:
    """
    Reduce raw model output to one of the three labels.

    The text is stripped and lower-cased, then searched for label keywords by
    substring in priority order (high, low, none). Output that contains none
    of them is mapped to "none".
    """
    cleaned = text.strip().lower()

    # (keyword, label) pairs, checked in order; the first substring hit wins.
    keyword_to_label = (
        ("high", "high"),
        ("low", "low"),
        ("none", "none"),
        ("no interiority", "none"),
    )
    for keyword, label in keyword_to_label:
        if keyword in cleaned:
            return label

    return "none"  # fallback for weird outputs


def predict_labels_flant5(passages, prompt_builder, tokenizer, model, max_length=10):
    """
    Run a Flan-T5 model on a list of passages using the given prompt builder.

    Passages are processed one at a time. A passage that would push the prompt
    past the tokenizer's ``model_max_length`` is shortened first, so that the
    closing instructions and "Label:" cue of the prompt are never what gets
    truncated away.

    Args:
        passages: Iterable of passage strings.
        prompt_builder: Callable mapping a passage string to a full prompt.
        tokenizer: Tokenizer matching ``model``.
        model: Seq2seq model exposing ``generate``; expected to already be on
            the module-level ``device``.
        max_length: ``max_length`` forwarded to ``model.generate``.

    Returns:
        A list of normalized labels ("none", "low" or "high"), one per passage.
    """
    preds = []

    # Token budget left for the passage itself once the fixed prompt scaffold
    # (instructions, examples, "Label:" cue) has been accounted for.
    scaffold_tokens = len(tokenizer(prompt_builder(""))["input_ids"])
    passage_budget = max(tokenizer.model_max_length - scaffold_tokens, 1)

    for i, p in enumerate(passages):
        # Plain truncation drops tokens from the END of the prompt, i.e. the
        # answer instructions. Shorten the passage instead.
        passage_ids = tokenizer(p, add_special_tokens=False)["input_ids"]
        if len(passage_ids) > passage_budget:
            p = tokenizer.decode(
                passage_ids[:passage_budget], skip_special_tokens=True
            )

        prompt = prompt_builder(p)

        # truncation=True stays as a safety net: decoding and re-encoding the
        # shortened passage may not reproduce the exact same token count.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
            )

        raw = tokenizer.decode(outputs[0], skip_special_tokens=True)
        label = normalize_label_prompt(raw)
        preds.append(label)

        # Show a few examples to sanity-check behavior
        if i < 3:
            print(f"--- Example {i+1} ---")
            print("Raw output:", raw)
            print("Normalized:", label)
            print()

    return preds


In [None]:
def run_prompt_experiment(model_name: str, setting_label: str, prompt_builder):
    """
    Evaluate one prompted Flan-T5 configuration on the full dataset.

    Loads the tokenizer and model for ``model_name``, predicts a label for
    every passage in the module-level ``texts`` using ``prompt_builder``, and
    scores the predictions against the module-level ``y_true``.

    Args:
        model_name: Hugging Face model id (small/base/large Flan-T5).
        setting_label: Human-readable name of the prompting setting; used in
            the banner and in the "Model" field of the result.
        prompt_builder: Callable mapping a passage to a prompt string.

    Returns:
        The metrics dict for the results table.
    """
    print("\n==============================")
    print(f"Prompted run: {model_name} – {setting_label}")
    print("==============================")

    short_name = model_name.split("/")[-1]

    flan_tokenizer = T5Tokenizer.from_pretrained(model_name)
    flan_model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    flan_model.eval()

    predicted_labels = predict_labels_flant5(
        texts, prompt_builder, flan_tokenizer, flan_model
    )

    return evaluate_interiority_model(
        y_true=y_true,
        y_pred=predicted_labels,
        model_name=f"{short_name} ({setting_label})",
        model_type="Seq2Seq transformers (prompted)",
    )


In [None]:
# Evaluate every model size under both prompting settings; each run reloads the
# model and appends one metrics dict to the shared results list.
for size in ["small", "base", "large"]:
    model_name = f"google/flan-t5-{size}"

    # Zero-shot (v2)
    zs_metrics = run_prompt_experiment(
        model_name=model_name,
        setting_label="zero-shot (v2)",
        prompt_builder=build_v2_prompt,
    )
    all_results.append(zs_metrics)

    # Few-shot (v3)
    fs_metrics = run_prompt_experiment(
        model_name=model_name,
        setting_label="few-shot (v3)",
        prompt_builder=build_v3_prompt,
    )
    all_results.append(fs_metrics)



Prompted run: google/flan-t5-small – zero-shot (v2)
--- Example 1 ---
Raw output: none
Normalized: none

--- Example 2 ---
Raw output: none
Normalized: none

--- Example 3 ---
Raw output: high
Normalized: high

Accuracy: 0.38023450586264657
Macro F1: 0.2860185994345998

Detailed classification report:
              precision    recall  f1-score   support

        none      0.440     0.342     0.385       237
         low      0.000     0.000     0.000       156
        high      0.354     0.716     0.473       204

    accuracy                          0.380       597
   macro avg      0.265     0.352     0.286       597
weighted avg      0.296     0.380     0.314       597


Prompted run: google/flan-t5-small – few-shot (v3)
--- Example 1 ---
Raw output: high
Normalized: high

--- Example 2 ---
Raw output: high
Normalized: high

--- Example 3 ---
Raw output: high
Normalized: high

Accuracy: 0.3165829145728643
Macro F1: 0.20457841831358473

Detailed classification report:
            

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

--- Example 1 ---
Raw output: high
Normalized: high

--- Example 2 ---
Raw output: high
Normalized: high

--- Example 3 ---
Raw output: low
Normalized: low

Accuracy: 0.36515912897822445
Macro F1: 0.26099607915338435

Detailed classification report:
              precision    recall  f1-score   support

        none      0.630     0.072     0.129       237
         low      0.215     0.090     0.127       156
        high      0.370     0.917     0.528       204

    accuracy                          0.365       597
   macro avg      0.405     0.359     0.261       597
weighted avg      0.433     0.365     0.264       597


Prompted run: google/flan-t5-large – few-shot (v3)
--- Example 1 ---
Raw output: high
Normalized: high

--- Example 2 ---
Raw output: high
Normalized: high

--- Example 3 ---
Raw output: low
Normalized: low

Accuracy: 0.34505862646566166
Macro F1: 0.2420912751272909

Detailed classification report:
              precision    recall  f1-score   support

        none 

## FINAL RESULTS

In [None]:
import pandas as pd

# One row per experiment, ordered by model family and then model name.
results_df = (
    pd.DataFrame(all_results)
    .sort_values(by=["Type", "Model"])
    .reset_index(drop=True)
)
results_df


Unnamed: 0,Type,Model,Accuracy,Macro F1,None Precision/Recall,Low Precision/Recall,High Precision/Recall,None F1,Low F1,High F1
0,Seq2Seq transformers,flan-t5-base (fine-tuned),0.417,0.364,0.44/0.65,0.40/0.13,0.38/0.37,0.52,0.2,0.38
1,Seq2Seq transformers,flan-t5-small (fine-tuned),0.4,0.19,0.40/1.00,0.00/0.00,0.00/0.00,0.57,0.0,0.0
2,Seq2Seq transformers (prompted),flan-t5-base (few-shot (v3)),0.422,0.293,0.41/0.89,0.19/0.02,0.53/0.19,0.56,0.03,0.28
3,Seq2Seq transformers (prompted),flan-t5-base (zero-shot (v2)),0.405,0.263,0.40/0.89,0.17/0.01,0.47/0.15,0.55,0.01,0.22
4,Seq2Seq transformers (prompted),flan-t5-large (few-shot (v3)),0.345,0.242,0.55/0.05,0.19/0.10,0.36/0.88,0.09,0.13,0.51
5,Seq2Seq transformers (prompted),flan-t5-large (zero-shot (v2)),0.365,0.261,0.63/0.07,0.22/0.09,0.37/0.92,0.13,0.13,0.53
6,Seq2Seq transformers (prompted),flan-t5-small (few-shot (v3)),0.317,0.205,0.30/0.09,0.20/0.01,0.32/0.81,0.14,0.01,0.46
7,Seq2Seq transformers (prompted),flan-t5-small (zero-shot (v2)),0.38,0.286,0.44/0.34,0.00/0.00,0.35/0.72,0.38,0.0,0.47


## 6 SELECTED BOOKS

In [None]:
# Number of passages per book title in the dataset.
df["title"].value_counts()


Unnamed: 0_level_0,count
title,Unnamed: 1_level_1
A farewell to arms,40
Dubliners,40
Martin Eden,40
My Ántonia,40
Mrs. Dalloway,40
The Age of Innocence,40
Swann's Way,40
The Picture of Dorian Gray,40
The Great Gatsby,40
The Dunwich horror,40


In [None]:
# ============================================
# BOOK-LEVEL PROMPTING: ZERO-SHOT & FEW-SHOT
# Test set = 6 books
# ============================================

print("\n================================")
print("BOOK-LEVEL PROMPTING (6-BOOK TEST SET)")
print("================================")

selected_books_prompt = [
    "Dubliners",
    "The Picture of Dorian Gray",
    "My Ántonia",
    "The murder of Roger Ackroyd",
    "A Farewell to Arms",
    "The Garden Party"
]

# Match titles case-insensitively: the dataset's capitalization is inconsistent
# (e.g. "A farewell to arms"), and an exact-case isin() silently drops such
# books from the subset.
title_keys = df["title"].astype(str).str.strip().str.casefold()
wanted_keys = {title.strip().casefold() for title in selected_books_prompt}

# Subset df to only these 6 books
df_6books = df[title_keys.isin(wanted_keys)].copy()
texts_6 = df_6books["text"].tolist()
y_true_6 = df_6books["label"].tolist()

# Surface any requested title with no passages rather than shrinking the test
# set without notice.
available_keys = set(title_keys)
missing_books = [
    title for title in selected_books_prompt
    if title.strip().casefold() not in available_keys
]
if missing_books:
    print("WARNING: no passages found for:", missing_books)

print("Number of passages in 6-book test set:", len(texts_6))
print("Books included:", df_6books["title"].unique())

# run prompted experiment on an arbitrary subset
def run_prompt_experiment_subset(
    model_name: str,
    setting_label: str,
    prompt_builder,
    texts_subset,
    y_true_subset,
    subset_label: str = "6-book test",
):
    """
    Load a Flan-T5 model (small/base/large),
    run zero- or few-shot prompting on a given subset of passages,
    compute metrics, and return a dict for the results table.

    Args:
        model_name: Hugging Face model id.
        setting_label: Name of the prompting setting, used in the banner and
            in the "Model" field of the result.
        prompt_builder: Callable mapping a passage to a prompt string.
        texts_subset: Passages to classify.
        y_true_subset: Gold labels aligned with ``texts_subset``.
        subset_label: Short name of the subset, shown in the banner and in the
            "Model" field. The default reproduces the original hard-coded
            "6-book test" wording.

    Returns:
        The metrics dict produced by ``evaluate_interiority_model``.
    """
    print("\n==============================")
    print(f"Prompted run: {model_name} – {setting_label} ({subset_label})")
    print("==============================")

    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    model.eval()

    preds = predict_labels_flant5(texts_subset, prompt_builder, tokenizer, model)

    metrics = evaluate_interiority_model(
        y_true=y_true_subset,
        y_pred=preds,
        model_name=f"{model_name.split('/')[-1]} ({setting_label}, {subset_label})",
        model_type="Seq2Seq transformers (prompted, book-generalization)",
    )
    return metrics


# zero-shot (v2) and few-shot (v3) for small, base, large
# Results are kept both in a local list (displayed at the end of this cell) and
# in the shared all_results list.
book_prompt_results = []

for size in ["small", "base", "large"]:
    model_name = f"google/flan-t5-{size}"

    # Zero-shot (v2) on 6-book test set
    zs_metrics_6 = run_prompt_experiment_subset(
        model_name=model_name,
        setting_label="zero-shot (v2)",
        prompt_builder=build_v2_prompt,
        texts_subset=texts_6,
        y_true_subset=y_true_6,
    )
    book_prompt_results.append(zs_metrics_6)
    all_results.append(zs_metrics_6)  # include in global results table

    # Few-shot (v3) on 6-book test set
    fs_metrics_6 = run_prompt_experiment_subset(
        model_name=model_name,
        setting_label="few-shot (v3)",
        prompt_builder=build_v3_prompt,
        texts_subset=texts_6,
        y_true_subset=y_true_6,
    )
    book_prompt_results.append(fs_metrics_6)
    all_results.append(fs_metrics_6)  # include in global results table

book_prompt_results



BOOK-LEVEL PROMPTING (6-BOOK TEST SET)
Number of passages in 6-book test set: 160
Books included: ['Dubliners' 'My Ántonia' 'The Picture of Dorian Gray'
 'The murder of Roger Ackroyd']

Prompted run: google/flan-t5-small – zero-shot (v2) (6-book test)
--- Example 1 ---
Raw output: high
Normalized: high

--- Example 2 ---
Raw output: none
Normalized: none

--- Example 3 ---
Raw output: high
Normalized: high

Accuracy: 0.3875
Macro F1: 0.29194509194509194

Detailed classification report:
              precision    recall  f1-score   support

        none      0.568     0.342     0.427        73
         low      0.000     0.000     0.000        38
        high      0.319     0.755     0.448        49

    accuracy                          0.388       160
   macro avg      0.296     0.366     0.292       160
weighted avg      0.357     0.388     0.332       160


Prompted run: google/flan-t5-small – few-shot (v3) (6-book test)
--- Example 1 ---
Raw output: high
Normalized: high

--- Exam

[{'Type': 'Seq2Seq transformers (prompted, book-generalization)',
  'Model': 'flan-t5-small (zero-shot (v2), 6-book test)',
  'Accuracy': 0.388,
  'Macro F1': 0.292,
  'None Precision/Recall': '0.57/0.34',
  'Low Precision/Recall': '0.00/0.00',
  'High Precision/Recall': '0.32/0.76',
  'None F1': np.float64(0.43),
  'Low F1': np.float64(0.0),
  'High F1': np.float64(0.45)},
 {'Type': 'Seq2Seq transformers (prompted, book-generalization)',
  'Model': 'flan-t5-small (few-shot (v3), 6-book test)',
  'Accuracy': 0.3,
  'Macro F1': 0.185,
  'None Precision/Recall': '0.42/0.07',
  'Low Precision/Recall': '0.00/0.00',
  'High Precision/Recall': '0.29/0.88',
  'None F1': np.float64(0.12),
  'Low F1': np.float64(0.0),
  'High F1': np.float64(0.44)},
 {'Type': 'Seq2Seq transformers (prompted, book-generalization)',
  'Model': 'flan-t5-base (zero-shot (v2), 6-book test)',
  'Accuracy': 0.456,
  'Macro F1': 0.282,
  'None Precision/Recall': '0.45/0.89',
  'Low Precision/Recall': '0.00/0.00',
  'Hi