In [1]:
from google.colab import files
import json
import os
import re
import numpy as np
import calendar

In [2]:
# Upload the prediction files through Colab's `files.upload()`.
# The keys of `uploaded` are the file names that the evaluation loop
# iterates over and opens.
uploaded = files.upload()

Saving predictions_BATCHED_original_offset_iso_date.jsonl to predictions_BATCHED_original_offset_iso_date.jsonl
Saving predictions_BATCHED_original_offset_original_date.jsonl to predictions_BATCHED_original_offset_original_date.jsonl
Saving predictions_BATCHED_total_months_numeric_offset_ordinal_month_date.jsonl to predictions_BATCHED_total_months_numeric_offset_ordinal_month_date.jsonl
Saving predictions_BATCHED_total_months_word_offset_full_words_date.jsonl to predictions_BATCHED_total_months_word_offset_full_words_date.jsonl


In [3]:
import numpy as np
import calendar

# Key of each record that holds the model's predicted answer
# (read via item.get(answer_column_name, ...) in evaluate_predictions).
answer_column_name = 'llama_prediction'
# Directory that receives one metrics_*.json file per evaluated dataset;
# created here so evaluate_predictions() can write into it.
results_output_dir = "/content/results"
os.makedirs(results_output_dir, exist_ok=True)

# Map every prefix of length >= 3 of each month name / abbreviation (as
# reported by `calendar`, lowercased) to its zero-padded month number,
# e.g. "mar", "marc" and "march" all map to "03".
_month_lookup = {}
for month_number in range(1, 13):
    padded_number = f"{month_number:02d}"
    spellings = (
        calendar.month_name[month_number].lower(),
        calendar.month_abbr[month_number].lower().rstrip('.'),
    )
    for spelling in spellings:
        _month_lookup.update(
            (spelling[:end], padded_number)
            for end in range(3, len(spelling) + 1)
        )

# Matches any run of four digits; used to pull years out of free text.
_year_re = re.compile(r"(\d{4})")

def _normalize_text(txt: str) -> str:
    """
    Canonicalise various date strings to ISO 'YYYY-MM' where possible,
    otherwise fall back to the lowercased, whitespace-collapsed text.

    Markdown emphasis markers ('*') are removed *before* whitespace is
    collapsed, so markers separated from the date by spaces do not leave
    stray whitespace behind that would defeat the anchored patterns.

    Args:
        txt: Raw answer text. Empty or non-string values yield "".

    Returns:
        'YYYY-MM' when the whole cleaned string is a recognised date,
        otherwise the cleaned string itself.

    Examples:
        "Mar, 1789"       -> "1789-03"
        "march 1789"      -> "1789-03"
        "1789-03-12"      -> "1789-03"
        "1789-03"         -> "1789-03"
        "** 1789-03 **"   -> "1789-03"
    """
    if not txt or not isinstance(txt, str):
        return ""

    # Strip '*' first, then lowercase and collapse all whitespace runs.
    s = " ".join(txt.replace("*", "").lower().split())

    # 1) ISO patterns: YYYY-MM or YYYY-MM-DD
    m_iso = re.match(r"^(?P<year>\d{4})-(?P<month>\d{2})(?:-\d{2})?$", s)
    if m_iso:
        return f"{m_iso.group('year')}-{m_iso.group('month')}"

    # 2) Month-name patterns, tried in order. The string is fully anchored,
    #    so the regex engine backtracks to whichever prefix fits.
    month_alt = "|".join(re.escape(name) for name in _month_lookup)
    date_patterns = (
        # "1789 march", optionally followed by a second 4-digit number
        # that is ignored.
        rf"^(?P<year>\d{{4}})\s+(?P<month_name>{month_alt})[\.,]?\s*(?:\d{{4}})?$",
        # "mar, 1789" / "march 1789"
        rf"^(?P<month_name>{month_alt})[\.,]?\s+(?P<year>\d{{4}})$",
    )
    for date_pattern in date_patterns:
        m_name = re.match(date_pattern, s)
        if m_name:
            month_num = _month_lookup.get(m_name.group("month_name"))
            if month_num:
                return f"{m_name.group('year')}-{month_num}"

    # 3) If no conversion matched, return the cleaned text
    return s

def _extract_year(txt: str):
    """
    Return the first run of four digits in ``txt`` as an int.

    Empty or non-string input (e.g. a null prediction) yields None instead
    of raising, matching how _normalize_text treats such values.

    Returns:
        The year as an int, or None when no four-digit run is found.
    """
    if not txt or not isinstance(txt, str):
        return None
    m = _year_re.search(txt)
    return int(m.group(1)) if m else None


def _reference_year(question: str):
    """
    Extract the YYYY that appears *last* in the question -
    this is the base date in all L1 questions like '... after Jul, 1699'.

    Empty or non-string input yields None instead of raising.

    Returns:
        The last four-digit run as an int, or None when there is none.
    """
    if not question or not isinstance(question, str):
        return None
    years = _year_re.findall(question)
    return int(years[-1]) if years else None


def _as_text(value) -> str:
    """Coerce a record field to text: None -> "", str unchanged, else str(value)."""
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


def evaluate_predictions(results_for_this_dataset, answer_column_name, dataset_filename, results_output_dir):
    """
    Score one dataset of model predictions, print a summary and save it as JSON.

    Metrics:
        exact_match    : fraction of all examples whose normalised gold answer
                         occurs inside the normalised prediction (containment,
                         not strict equality). An empty gold answer never
                         counts as a match.
        mae_year       : mean |predicted year - gold year| over the examples
                         where a four-digit year is found in both the gold
                         answer and the prediction.
        trend_accuracy : over those same examples, the fraction where the
                         predicted year lies on the same side of the
                         question's reference year as the gold year (a gold
                         year equal to the reference never counts).
    mae_year and trend_accuracy are reported as 0.0 when no example has both
    years.

    Args:
        results_for_this_dataset: list of dict records with the keys
            "text_answers" (a string, or a dict whose "text" entry is a list
            with the gold answer first), "original_question", and optionally
            ``answer_column_name``.
        answer_column_name: key holding the model prediction; a missing or
            null prediction is treated as an empty string.
        dataset_filename: name (or path) of the evaluated file; its base name
            without extension is used for the metrics file name.
        results_output_dir: directory for ``metrics_<name>.json``; created if
            it does not exist.

    Returns:
        dict with keys "dataset", "num_examples", "exact_match", "mae_year"
        and "trend_accuracy".
    """
    g_em, g_abs_err, g_trend_ok, count_year = 0, 0, 0, 0
    total_examples = len(results_for_this_dataset)

    for item in results_for_this_dataset:
        # Extract gold and predicted answers; coerce null / non-string fields
        # so a single malformed record cannot crash the whole evaluation.
        answers = item["text_answers"]
        gold = _as_text(answers["text"][0] if isinstance(answers, dict) else answers)
        pred = _as_text(item.get(answer_column_name))
        question = _as_text(item["original_question"])

        # Exact Match (containment). Guard against an empty gold string:
        # "" is a substring of everything and would always count as a hit.
        gold_norm = _normalize_text(gold)
        if gold_norm and gold_norm in _normalize_text(pred):
            g_em += 1

        # Year-based metrics
        year_gold = _extract_year(gold)
        year_pred = _extract_year(pred)
        year_ref = _reference_year(question)

        if year_gold is not None and year_pred is not None:
            g_abs_err += abs(year_pred - year_gold)

            # Trend: sign wrt reference year
            if year_ref is not None:
                gold_sign = np.sign(year_gold - year_ref)
                pred_sign = np.sign(year_pred - year_ref)
                if gold_sign == pred_sign and gold_sign != 0:
                    g_trend_ok += 1
            count_year += 1

    exact_match = g_em / total_examples if total_examples else 0.0
    mae = g_abs_err / count_year if count_year else 0.0
    trend_accuracy = g_trend_ok / count_year if count_year else 0.0

    print(f"=== Evaluation for {dataset_filename} ===")
    print(f"  Exact Match        : {exact_match:.4f}")
    print(f"  Mean Absolute Error: {mae:.4f}")
    print(f"  Trend Accuracy     : {trend_accuracy:.4f}")

    # Save metrics to JSON file. Use only the base name without its extension
    # so a dataset given as a path still lands inside results_output_dir.
    dataset_stem = os.path.splitext(os.path.basename(dataset_filename))[0]
    os.makedirs(results_output_dir, exist_ok=True)
    metrics_path = os.path.join(results_output_dir, f"metrics_{dataset_stem}.json")
    metrics = {
        "dataset": dataset_filename,
        "num_examples": total_examples,
        "exact_match": exact_match,
        "mae_year": mae,
        "trend_accuracy": trend_accuracy,
    }

    with open(metrics_path, 'w', encoding='utf-8') as mf:
        json.dump(metrics, mf, indent=2)

    return metrics

In [4]:
# Evaluate every uploaded predictions file and report its metrics.
for filename in uploaded.keys():
    print(f"\nProcessing {filename}...")

    # JSONL: one JSON object per line. Blank lines (e.g. a trailing empty
    # line) are skipped so they do not raise json.JSONDecodeError.
    with open(filename, 'r', encoding='utf-8') as f:
        results_for_this_dataset = [json.loads(line) for line in f if line.strip()]

    metrics = evaluate_predictions(
        results_for_this_dataset,
        answer_column_name,
        filename,
        results_output_dir
    )
    print(f"\nReturned metrics: {metrics}")


Processing predictions_BATCHED_original_offset_iso_date.jsonl...
=== Evaluation for predictions_BATCHED_original_offset_iso_date.jsonl ===
  Exact Match        : 0.3355
  Mean Absolute Error: 21.1935
  Trend Accuracy     : 0.9080

Returned metrics: {'dataset': 'predictions_BATCHED_original_offset_iso_date.jsonl', 'num_examples': 4000, 'exact_match': 0.3355, 'mae_year': 21.193531422561136, 'trend_accuracy': 0.9079673941625033}

Processing predictions_BATCHED_original_offset_original_date.jsonl...
=== Evaluation for predictions_BATCHED_original_offset_original_date.jsonl ===
  Exact Match        : 0.3155
  Mean Absolute Error: 23.8682
  Trend Accuracy     : 0.9089

Returned metrics: {'dataset': 'predictions_BATCHED_original_offset_original_date.jsonl', 'num_examples': 4000, 'exact_match': 0.3155, 'mae_year': 23.868249412992434, 'trend_accuracy': 0.9089486042264545}

Processing predictions_BATCHED_total_months_numeric_offset_ordinal_month_date.jsonl...
=== Evaluation for predictions_BATC