# clean_results

load all raw converted TSV files, fix known extraction/ground-truth bugs, and save a single clean parquet for analysis.

**bugs fixed here:**
1. **timezone ground truth** — `preprocessing.py` used `int()` instead of `round()` when formatting minutes, making most timezone correct answers off by 1 minute. we recompute them from the raw inputs.
2. **timezone math_only model_answer** — the math_only prompt is pure arithmetic (`(1+3.0)%24`), so models respond with a decimal number instead of a time string. `extract_time_string` returns None. we interpret those numbers as 24-hour decimal time.
3. **bra_size model_answer** — `extract_clothing_size` regex can't handle double-letter cup sizes (`32AA`) or single-digit Italian band sizes (`1A`). we expand the regex.

In [2]:
import pandas as pd
import numpy as np
import json, re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

## 1. load all raw results

In [3]:
BASE_DIR = Path('full_results')

CONDITIONS = {
    'regular':   'results',
    'no_guide':  'results_no_guide',
    'math_only': 'results_math_only',
}
SUBDIR_TO_CONDITION = {v: k for k, v in CONDITIONS.items()}

NON_REASONING = ['gpt-4o', 'qwen-coder', 'llama-4']
REASONING = ['gpt-5.2', 'deepseek-v3.1', 'qwen3-235b-thinking', 'qwen3-next-thinking']

def parse_model(f):
    return f.parts[2]

def parse_cond(f):
    return SUBDIR_TO_CONDITION.get(f.parts[1], f.parts[1])

def parse_domain(f):
    return (f.parts[3]
            .replace('_converted', '')
            .replace('_math_only', '')
            .replace('_no_guide', '')
            .replace('.tsv', ''))

# load every TSV and tag with model/condition/domain
files = sorted(BASE_DIR.glob('*/*/*_converted.tsv'))
frames = []
for f in files:
    try:
        df = pd.read_csv(f, sep='\t')
        df = df.assign(model=parse_model(f), condition=parse_cond(f), domain=parse_domain(f))
        frames.append(df)
    except Exception as e:
        print(f"⚠ Could not read {f}: {e}")

df_all = pd.concat(frames, ignore_index=True)
print(f"loaded {len(df_all):,} rows from {len(files)} files")
print(f"columns: {df_all.columns.tolist()}")

loaded 2,748,418 rows from 279 files
columns: ['domain', 'distractor', 'prompt', 'number', 'answer', 'difficulty', 'raw_response', 'model_answer', 'loss', 'reasoning_tokens', 'call_seconds', 'model', 'condition']


In [4]:
# filter out invalid responses
_before = len(df_all)
df_all = df_all[
    df_all['raw_response'].notna()
    & ~df_all['raw_response'].isin(['null', 'nan'])
    & ~df_all['raw_response'].astype(str).str.startswith('ERROR:')
    & ~df_all['raw_response'].astype(str).str.contains('model_not_available', na=False)
]
print(f"filtered {_before - len(df_all):,} invalid rows → {len(df_all):,} remaining")

# add helper columns
df_all['is_reasoning'] = df_all['model'].isin(REASONING)

# snapshot of NaN model_answer before any fixes
_nan_before = df_all['model_answer'].isna().sum()
print(f"NaN model_answer before fixes: {_nan_before:,} ({_nan_before/len(df_all)*100:.2f}%)")

filtered 652,650 invalid rows → 2,095,768 remaining
NaN model_answer before fixes: 382 (0.02%)


## 2. fix timezone ground truth (all conditions)

the `format_time_string` function in `preprocessing.py` used `int((hours % 1) * 60)` which truncates instead of rounding, making most generated correct answers off by 1 minute. recompute from raw inputs using `round()`.

In [5]:
# ── time parsing / formatting helpers ────────────────────────────

def parse_time(s):
    """Parse '7:13PM' or '1AM' → hours as float (24h)."""
    s = s.strip().upper()
    m = re.match(r'(\d{1,2}):(\d{2})\s*(AM|PM)', s)
    if m:
        h, mi, p = int(m.group(1)), int(m.group(2)), m.group(3)
        if p == 'PM' and h != 12: h += 12
        elif p == 'AM' and h == 12: h = 0
        return h + mi / 60.0
    m = re.match(r'(\d{1,2})\s*(AM|PM)', s)
    if m:
        h, p = int(m.group(1)), m.group(2)
        if p == 'PM' and h != 12: h += 12
        elif p == 'AM' and h == 12: h = 0
        return float(h)
    return None

def format_time(hours):
    """24h decimal hours → '7:13PM'. uses round() to avoid truncation."""
    hours = hours % 24
    total_min = round(hours * 60)
    h = (total_min // 60) % 24
    m = total_min % 60
    if m == 0:
        if h == 0:     return "12AM"
        elif h < 12:   return f"{h}AM"
        elif h == 12:  return "12PM"
        else:          return f"{h-12}PM"
    else:
        if h == 0:     return f"12:{m:02d}AM"
        elif h < 12:   return f"{h}:{m:02d}AM"
        elif h == 12:  return f"12:{m:02d}PM"
        else:          return f"{h-12}:{m:02d}PM"

# ── load timezone config ─────────────────────────────────────────

with open('conversions/timezone.json') as f:
    tz_config = json.load(f)
tz_offsets = tz_config['timezone_offsets']
city_to_tz = tz_config['city_to_timezone']

def convert_tz(time_str, from_city, to_city):
    """Recompute correct timezone answer from raw inputs."""
    hours = parse_time(time_str)
    if hours is None:
        return None
    from_off = tz_offsets.get(city_to_tz.get(from_city, from_city), 0)
    to_off   = tz_offsets.get(city_to_tz.get(to_city,   to_city),   0)
    return format_time((hours - from_off + to_off) % 24)

# ── recompute correct answers for all timezone rows ──────────────

tz_mask = df_all['domain'] == 'timezone'
old_answers = df_all.loc[tz_mask, 'answer'].copy()
prompts     = df_all.loc[tz_mask, 'prompt'].astype(str)
input_times = df_all.loc[tz_mask, 'number'].astype(str)

# prompt format: "Convert {time} in {from_city} time to {to_city} time..."
parsed = prompts.str.extract(r'Convert .+? in (.+?) time to (.+?) time')

new_answers = [
    convert_tz(t, fc, tc) if pd.notna(fc) else old
    for t, fc, tc, old in zip(input_times, parsed[0], parsed[1], old_answers)
]
df_all.loc[tz_mask, 'answer'] = new_answers

changed = (old_answers.values != np.array(new_answers))
print(f"Timezone rows: {tz_mask.sum():,}")
print(f"Ground truth answers changed: {changed.sum():,} ({changed.mean()*100:.1f}%)")

# show a few examples
idx = old_answers.index[changed][:5]
print(pd.DataFrame({
    'input': input_times.loc[idx].values,
    'old_answer': old_answers.loc[idx].values,
    'new_answer': [new_answers[list(old_answers.index).index(i)] for i in idx],
}).to_string(index=False))

Timezone rows: 10,800
Ground truth answers changed: 0 (0.0%)
Empty DataFrame
Columns: [input, old_answer, new_answer]
Index: []


## 3. fix timezone math_only model_answer extraction

the math_only prompt is pure arithmetic (`(1+3.0)%24`) so models reply with decimal hours. `extract_time_string` needs AM/PM and returns None. re-extract by interpreting the numeric response as 24h decimal time.

In [6]:
def extract_tz_answer(raw_response, is_reasoning):
    """Extract a timezone answer from a raw response.
    
    Priority:
    1. If response has a proper time string (H:MMAM/PM or HAM/PM), use it.
    2. Otherwise extract a number (from <answer> tags if reasoning model,
       else from the raw text) and interpret as 24h decimal time.
    """
    resp = str(raw_response)

    # for reasoning models, prefer <answer> tag content
    source = resp
    if is_reasoning:
        tags = re.findall(r'<answer>\s*(.*?)\s*</answer>', resp, re.DOTALL)
        if tags:
            source = tags[-1].strip()

    # try proper time string first
    m = re.search(r'(\d{1,2}):(\d{2})\s*(AM|PM)', source, re.IGNORECASE)
    if m and 1 <= int(m.group(1)) <= 12 and 0 <= int(m.group(2)) <= 59:
        return f"{int(m.group(1))}:{int(m.group(2)):02d}{m.group(3).upper()}"
    m = re.search(r'(\d{1,2})\s*(AM|PM)', source, re.IGNORECASE)
    if m and 1 <= int(m.group(1)) <= 12:
        return f"{int(m.group(1))}{m.group(2).upper()}"

    # fall back: extract a number and interpret as decimal hours
    num_match = re.search(r'-?\d+\.?\d*', source)
    if num_match:
        try:
            hours = float(num_match.group(0)) % 24
            return format_time(hours)
        except (ValueError, OverflowError):
            pass
    return None

# apply to math_only timezone rows
math_tz = (df_all['domain'] == 'timezone') & (df_all['condition'] == 'math_only')
old_ma = df_all.loc[math_tz, 'model_answer'].copy()

df_all.loc[math_tz, 'model_answer'] = [
    extract_tz_answer(resp, is_r)
    for resp, is_r in zip(df_all.loc[math_tz, 'raw_response'],
                          df_all.loc[math_tz, 'is_reasoning'])
]

was_nan = old_ma.isna()
now_nan = df_all.loc[math_tz, 'model_answer'].isna()
recovered = was_nan & ~now_nan
print(f"Math-only timezone rows: {math_tz.sum():,}")
print(f"  model_answer was NaN: {was_nan.sum():,}")
print(f"  model_answer now NaN: {now_nan.sum():,}")
print(f"  recovered:            {recovered.sum():,}")

# examples
sample = df_all.loc[old_ma.index[recovered][:6], ['answer', 'model_answer', 'model']]
print("\nRecovered examples:")
print(sample.to_string())

Math-only timezone rows: 3,600
  model_answer was NaN: 0
  model_answer now NaN: 0
  recovered:            0

Recovered examples:
Empty DataFrame
Columns: [answer, model_answer, model]
Index: []


In [7]:
def extract_clothing_size_fixed(answer):
    """Extract a clothing size, handling AA cups and single-digit Italian bands."""
    if not answer or pd.isna(answer):
        return None
    answer = str(answer).strip()

    # bra size: number + one or more cup letters (32AA, 70A, 1B, 100D)
    m = re.search(r'\b(\d{1,3})([A-Za-z]{1,3})\b', answer)
    if m:
        return f"{m.group(1)}{m.group(2).upper()}"

    # alpha sizes (XS, S, M, L, etc.)
    m = re.search(r'\b(XS|S|M|L|XL|XXL|XXXL)\b', answer, re.IGNORECASE)
    if m:
        return m.group(1).upper()

    # numeric sizes (shoe sizes, pants)
    m = re.search(r'\b(\d{1,3}\.?\d*)\b', answer)
    if m:
        try:
            num = float(m.group(1))
            return str(int(num)) if num == int(num) else f"{num:.1f}".rstrip('0').rstrip('.')
        except ValueError:
            return m.group(1)
    return None

# quick sanity check
_tests = {'32AA': '32AA', '34AA': '34AA', '1A': '1A', '1B': '1B',
          '32B': '32B', '70A': '70A', '85C': '85C', 'XL': 'XL', '42.5': '42.5'}
for inp, expected in _tests.items():
    got = extract_clothing_size_fixed(inp)
    status = '✓' if got == expected else f'✗ (got {got})'
    print(f"  {inp:>8s} → {got:>8s}  {status}")

# apply to ALL bra_size rows (re-extract model_answer)
bra_mask = df_all['domain'].str.contains('bra_size')
old_bra_ma = df_all.loc[bra_mask, 'model_answer'].copy()

def re_extract_bra(row):
    resp = str(row['raw_response'])
    # reasoning: try <answer> tags first
    source = resp
    if row['is_reasoning']:
        tags = re.findall(r'<answer>\s*(.*?)\s*</answer>', resp, re.DOTALL)
        if tags:
            source = tags[-1].strip()
    return extract_clothing_size_fixed(source)

df_all.loc[bra_mask, 'model_answer'] = df_all.loc[bra_mask].apply(re_extract_bra, axis=1)

was_nan = old_bra_ma.isna()
now_nan = df_all.loc[bra_mask, 'model_answer'].isna()
recovered = was_nan & ~now_nan
print(f"\nBra-size rows: {bra_mask.sum():,}")
print(f"  model_answer was NaN: {was_nan.sum():,}")
print(f"  model_answer now NaN: {now_nan.sum():,}")
print(f"  recovered:            {recovered.sum():,}")

      32AA →     32AA  ✓
      34AA →     34AA  ✓
        1A →       1A  ✓
        1B →       1B  ✓
       32B →      32B  ✓
       70A →      70A  ✓
       85C →      85C  ✓
        XL →       XL  ✓
      42.5 →     42.5  ✓

Bra-size rows: 1,568
  model_answer was NaN: 5
  model_answer now NaN: 5
  recovered:            0


## 5. recompute loss for all fixed rows

In [8]:
# ── loss functions (mirroring extractors.py) ─────────────────────

def tz_loss(model_ans, correct_ans, tolerance_minutes=1.0):
    """Minutes difference, 0.0 if within tolerance."""
    try:
        m_hrs = parse_time(str(model_ans))
        c_hrs = parse_time(str(correct_ans))
        if m_hrs is None or c_hrs is None:
            return None
        diff = abs((m_hrs - c_hrs) * 60)
        if diff > 720:
            diff = 1440 - diff
        return 0.0 if diff <= tolerance_minutes + 1e-9 else round(diff, 4)
    except:
        return None

def clothing_loss(model_ans, correct_ans):
    """Binary: 0.0 if match, 1.0 if not."""
    if model_ans is None or pd.isna(model_ans):
        return None
    try:
        return 0.0 if abs(float(str(model_ans).strip()) - float(str(correct_ans).strip())) < 0.001 else 1.0
    except (ValueError, TypeError):
        return 0.0 if str(model_ans).strip().upper() == str(correct_ans).strip().upper() else 1.0

# ── recompute timezone loss (all conditions — ground truth changed) ──

print("Recomputing timezone loss (all conditions)...")
new_tz_losses = [
    tz_loss(ma, ca)
    for ma, ca in zip(df_all.loc[tz_mask, 'model_answer'], df_all.loc[tz_mask, 'answer'])
]
old_tz_correct = (pd.to_numeric(df_all.loc[tz_mask, 'loss'], errors='coerce') == 0).sum()
df_all.loc[tz_mask, 'loss'] = new_tz_losses
new_tz_correct = sum(1 for l in new_tz_losses if l == 0.0)
print(f"  timezone accuracy before: {old_tz_correct:,} / {tz_mask.sum():,} ({old_tz_correct/tz_mask.sum()*100:.1f}%)")
print(f"  timezone accuracy after:  {new_tz_correct:,} / {tz_mask.sum():,} ({new_tz_correct/tz_mask.sum()*100:.1f}%)")

# ── recompute bra_size loss ──────────────────────────────────────

print("\nRecomputing bra-size loss...")
new_bra_losses = [
    clothing_loss(ma, ca)
    for ma, ca in zip(df_all.loc[bra_mask, 'model_answer'], df_all.loc[bra_mask, 'answer'])
]
old_bra_correct = (pd.to_numeric(df_all.loc[bra_mask, 'loss'], errors='coerce') == 0).sum()
df_all.loc[bra_mask, 'loss'] = new_bra_losses
new_bra_correct = sum(1 for l in new_bra_losses if l == 0.0)
print(f"  bra-size accuracy before: {old_bra_correct:,} / {bra_mask.sum():,} ({old_bra_correct/bra_mask.sum()*100:.1f}%)")
print(f"  bra-size accuracy after:  {new_bra_correct:,} / {bra_mask.sum():,} ({new_bra_correct/bra_mask.sum()*100:.1f}%)")

Recomputing timezone loss (all conditions)...
  timezone accuracy before: 7,366 / 10,800 (68.2%)
  timezone accuracy after:  7,366 / 10,800 (68.2%)

Recomputing bra-size loss...
  bra-size accuracy before: 1,003 / 1,568 (64.0%)
  bra-size accuracy after:  1,003 / 1,568 (64.0%)


## 6. summary & save

In [9]:
# final NaN check
nan_after = df_all['model_answer'].isna().sum()
print(f"NaN model_answer: {_nan_before:,} → {nan_after:,}  (recovered {_nan_before - nan_after:,})")
print(f"Remaining NaN breakdown:")
nan_rows = df_all[df_all['model_answer'].isna()]
for domain in sorted(nan_rows['domain'].unique()):
    n = (nan_rows['domain'] == domain).sum()
    print(f"  {domain:45s}: {n:>5d}")

print(f"\nTotal clean rows: {len(df_all):,}")
print(f"Models: {sorted(df_all['model'].unique())}")
print(f"Conditions: {sorted(df_all['condition'].unique())}")
print(f"Domains: {sorted(df_all['domain'].unique())}")

NaN model_answer: 382 → 382  (recovered 0)
Remaining NaN breakdown:
  clothing_sizes_women_bra_size                :     5
  cooking                                      :    19
  currency                                     :    23
  density                                      :    13
  energy                                       :   121
  moles_to_particles                           :    25
  timezone                                     :    16
  volume                                       :   160

Total clean rows: 2,095,768


Models: ['deepseek-v3.1', 'gpt-4o', 'gpt-5.2', 'llama-4', 'qwen-coder', 'qwen3-235b-thinking', 'qwen3-next-thinking']
Conditions: ['math_only', 'no_guide', 'regular']
Domains: ['bits_bytes', 'clothing_sizes_men_pant_size', 'clothing_sizes_men_shoe_size', 'clothing_sizes_women_bra_size', 'clothing_sizes_women_pant_size', 'clothing_sizes_women_shoe_size', 'cooking', 'currency', 'density', 'energy', 'moles_to_particles', 'speed', 'temperature', 'timezone', 'volume']


In [10]:
# ── DRY RUN: preview what would change in each TSV ──────────────
# Strategy: load original file, identify its valid rows (same filter as cell 4),
# patch only the answer/model_answer/loss columns from df_all, then compare.
# This preserves ALL rows (including errors/nulls) — we only touch valid ones.

TAG_COLS = ['model', 'condition', 'domain', 'is_reasoning']
FIX_COLS = ['answer', 'model_answer', 'loss']

tsv_files = sorted(BASE_DIR.glob('*/*/*_converted.tsv'))
total_changed_rows = 0
files_with_changes = 0

def _valid_mask(df):
    """Same filter as cell 4."""
    return (
        df['raw_response'].notna()
        & ~df['raw_response'].isin(['null', 'nan'])
        & ~df['raw_response'].astype(str).str.startswith('ERROR:')
        & ~df['raw_response'].astype(str).str.contains('model_not_available', na=False)
    )

for f in tsv_files:
    cond = parse_cond(f)
    model = parse_model(f)
    domain = parse_domain(f)
    mem_mask = (
        (df_all['model'] == model)
        & (df_all['condition'] == cond)
        & (df_all['domain'] == domain)
    )
    if mem_mask.sum() == 0:
        continue

    orig = pd.read_csv(f, sep='\t')
    valid = _valid_mask(orig)

    # the K-th valid row in orig corresponds to the K-th row in df_all[mem_mask]
    fixed_vals = df_all.loc[mem_mask, FIX_COLS].reset_index(drop=True)
    assert valid.sum() == len(fixed_vals), (
        f"{f}: {valid.sum()} valid rows on disk vs {len(fixed_vals)} in df_all"
    )

    # build patched copy — only update valid rows
    patched = orig.copy()
    valid_idx = orig.index[valid]
    for col in FIX_COLS:
        patched.loc[valid_idx, col] = fixed_vals[col].values

    # compare original vs patched (all rows, same length)
    old_cmp = orig[FIX_COLS].astype(str).fillna('')
    new_cmp = patched[FIX_COLS].astype(str).fillna('')
    row_changed = (old_cmp != new_cmp).any(axis=1)
    n_changed = row_changed.sum()

    if n_changed > 0:
        total_changed_rows += n_changed
        files_with_changes += 1
        print(f"\n{'─'*60}")
        print(f"{f}  ({n_changed} rows changed out of {len(orig)})")
        idx = row_changed[row_changed].index[:5]
        diff_rows = []
        for i in idx:
            row = {}
            for col in FIX_COLS:
                o, n_ = old_cmp.at[i, col], new_cmp.at[i, col]
                if o != n_:
                    row[f'{col}_old'] = o
                    row[f'{col}_new'] = n_
            diff_rows.append(row)
        print(pd.DataFrame(diff_rows).to_string(index=False))
        if n_changed > 5:
            print(f"  ... and {n_changed - 5} more")

print(f"\n{'═'*60}")
print(f"TOTAL: {files_with_changes} files with changes, {total_changed_rows:,} rows affected")
print(f"       {len(tsv_files) - files_with_changes} files unchanged")


────────────────────────────────────────────────────────────
full_results/results_no_guide/deepseek-v3.1/timezone_no_guide_converted.tsv  (16 rows changed out of 600)
loss_old loss_new
     nan     None
     nan     None
     nan     None
     nan     None
     nan     None
  ... and 11 more

────────────────────────────────────────────────────────────
full_results/results_no_guide/llama-4/clothing_sizes_women_bra_size_no_guide_converted.tsv  (5 rows changed out of 112)
model_answer_old model_answer_new loss_old loss_new
             nan             None      nan     None
             nan             None      nan     None
             nan             None      nan     None
             nan             None      nan     None
             nan             None      nan     None

════════════════════════════════════════════════════════════
TOTAL: 2 files with changes, 21 rows affected
       277 files unchanged


In [11]:
# ── backup existing TSVs before overwriting ──────────────────────
import shutil

BACKUP_DIR = Path('full_results_backup')
tsv_files = sorted(BASE_DIR.glob('*/*/*_converted.tsv'))

backed_up = 0
for f in tsv_files:
    dest = BACKUP_DIR / f.relative_to(BASE_DIR)
    if dest.exists():
        continue  # already backed up — don't overwrite original backup
    dest.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(f, dest)
    backed_up += 1

if backed_up:
    print(f"✓ backed up {backed_up} new TSV files to {BACKUP_DIR}/")
else:
    print(f"✓ all {len(tsv_files)} TSV files already backed up in {BACKUP_DIR}/")

✓ all 279 TSV files already backed up in full_results_backup/


In [12]:
# ── write back to each TSV in-place ──────────────────────────────
# Load each original file, patch only the answer/model_answer/loss
# columns for valid rows, and write back ALL rows (error rows untouched).

updated = 0
for f in tsv_files:
    cond = parse_cond(f)
    model = parse_model(f)
    domain = parse_domain(f)
    mem_mask = (
        (df_all['model'] == model)
        & (df_all['condition'] == cond)
        & (df_all['domain'] == domain)
    )
    if mem_mask.sum() == 0:
        continue

    orig = pd.read_csv(f, sep='\t')
    valid = _valid_mask(orig)
    fixed_vals = df_all.loc[mem_mask, FIX_COLS].reset_index(drop=True)

    if valid.sum() != len(fixed_vals):
        print(f"⚠ SKIPPED {f}: row count mismatch ({valid.sum()} vs {len(fixed_vals)})")
        continue

    valid_idx = orig.index[valid]
    for col in FIX_COLS:
        orig.loc[valid_idx, col] = fixed_vals[col].values

    orig.to_csv(f, sep='\t', index=False)
    updated += 1

print(f"✓ updated {updated} TSV files in-place (all rows preserved)")

# ── also save a combined TSV with ALL rows (including errors/refusals) ──
# re-read the (now fixed) individual TSVs so error rows are preserved
OUT_PATH = Path('full_results/clean_results.tsv')
all_frames = []
for f in tsv_files:
    try:
        df = pd.read_csv(f, sep='\t')
        df = df.assign(model=parse_model(f), condition=parse_cond(f), domain=parse_domain(f))
        all_frames.append(df)
    except Exception as e:
        print(f"⚠ Could not read {f}: {e}")

df_combined = pd.concat(all_frames, ignore_index=True)
df_combined.to_csv(OUT_PATH, sep='\t', index=False)
print(f"✓ saved {len(df_combined):,} rows to {OUT_PATH} (includes error/refusal rows)")
print(f"  file size: {OUT_PATH.stat().st_size / 1e6:.1f} MB")

✓ updated 257 TSV files in-place (all rows preserved)
✓ saved 2,748,418 rows to full_results/clean_results.tsv (includes error/refusal rows)
  file size: 1323.6 MB


In [13]:
# verify: spot-check a few fixed rows
print("Sample timezone rows (math_only):")
print(df_all[math_tz][['number', 'answer', 'model_answer', 'loss', 'model']].head(10).to_string())
print("\nSample bra-size rows:")
print(df_all[bra_mask][['answer', 'model_answer', 'loss', 'model']].head(10).to_string())

Sample timezone rows (math_only):
        number answer model_answer   loss          model
1344428    1AM    4AM          4AM    0.0  deepseek-v3.1
1344429    2AM    5AM          5AM    0.0  deepseek-v3.1
1344430    3AM    6AM          6AM    0.0  deepseek-v3.1
1344431    4AM    7AM          7AM    0.0  deepseek-v3.1
1344432    5AM    8AM          8AM    0.0  deepseek-v3.1
1344433    6AM    9AM          9AM    0.0  deepseek-v3.1
1344434    7AM   10AM         10AM    0.0  deepseek-v3.1
1344435    8AM   11AM         11AM    0.0  deepseek-v3.1
1344436    9AM   12PM         12PM    0.0  deepseek-v3.1
1344437   10AM    1PM          1AM  720.0  deepseek-v3.1

Sample bra-size rows:
     answer model_answer loss          model
1323    70A          70A  0.0  deepseek-v3.1
1324    70B          70B  0.0  deepseek-v3.1
1325    70C          70C  0.0  deepseek-v3.1
1326    70D          70D  0.0  deepseek-v3.1
1327    75A          75A  0.0  deepseek-v3.1
1328    75B          75B  0.0  deepseek-v3.1
1