## **Evaluation Pipeline (Qwen2-Audio Baseline & CoT-Finetuned Model)**

This notebook provides the evaluation workflow for comparing the original Qwen2-Audio-7B-Instruct model with our CoT fine-tuned version.
Before running the pipeline, ensure that the model checkpoint (`MODEL_ID` = `"Qwen/Qwen2-Audio-7B-Instruct"`) has been downloaded locally.
We evaluate both models end-to-end on `TEARS_V2`, using a unified pipeline that handles inference, label extraction, reasoning parsing, and dataset-level scoring.

**1. Preparing the Environment**
*   Make sure the checkpoint ZIP is already uploaded to Colab.
*   The dataset ZIP must also be pre-uploaded.


**2. Core Functions for Inference & Label Extraction**
*   `predict_one()`: Runs inference for a single audio file.
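*   `extract_label()`: Parses the model's output text to obtain the predicted label. If the text contains "most consistent with:" (matched case-insensitively), it takes everything after the last occurrence. Otherwise, it takes the last line and keeps the substring after the last " is " (or the whole last line if " is " does not appear). The result is stripped of surrounding whitespace and periods and lowercased.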
*   `eval_full_dataset()`: Outputs a .jsonl file where each line is one sample.


In [None]:
# Mount Google Drive at /content/drive; later cells read the archives and the
# test JSON from /content/drive/MyDrive and write the evaluation results there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ============================================================
# Install required packages
# ============================================================
# NOTE: transformers is installed unpinned here; a later cell uninstalls it
# and reinstalls the pinned release transformers==4.57.1.
!pip install -q transformers accelerate huggingface_hub

Unzip the fine-tuned checkpoint (`kook-checkpoint-9500`)

In [None]:
# ============================================================
# Unzip Model to Colab Local Disk
# ============================================================

ZIP_FILE = "/content/drive/MyDrive/QwenFinetune/kook-checkpoint-9500.zip"
TARGET_DIR = "/content/kook"  # Extract to Colab local disk

print("Unzipping to Colab local disk (faster)...")
print(f"Source: {ZIP_FILE}")
print(f"Target: {TARGET_DIR}")

!unzip -q "{ZIP_FILE}" -d "{TARGET_DIR}"

print("\n✓ Unzip completed!")
print(f"Checkpoint is now at: {TARGET_DIR}")

Unzip dataset

In [None]:
# ============================================================
# Unzip TEARS_V2 Dataset to Colab Local Disk
# ============================================================

ZIP_FILE = "/content/drive/MyDrive/Tears_v2.zip"
TARGET_DIR = "/content/TEARS_V2"  # Extract to Colab local disk

print("Unzipping to Colab local disk (faster)...")
print(f"Source: {ZIP_FILE}")
print(f"Target: {TARGET_DIR}")
print("\nThis will take approximately 5-10 minutes for 6GB...")

!unzip -q "{ZIP_FILE}" -d "{TARGET_DIR}"

print("\n✓ Unzip completed!")
print(f"Dataset is now at: {TARGET_DIR}")

Update package

In [None]:
# NOTE(review): torchcodec is not imported directly in the visible cells;
# presumably it is a decoding backend required by another library — TODO confirm
# it is still needed, and pin a version if so.
!pip install torchcodec

In [None]:
# datasets, peft and librosa are imported by later cells. The transformers
# version resolved here is replaced by the pinned 4.57.1 install in the next cell.
!pip install -q "transformers>=4.45.0" "datasets" "peft" "accelerate" \
               "soundfile" "librosa" wandb

In [None]:
# Replace whichever transformers version the earlier install cells resolved to
# with the pinned 4.57.1 release (the version the authors found to work).
!pip uninstall transformers -y
!pip install transformers==4.57.1

Filter test data if needed (the cell below is commented out; uncomment it to regenerate `test_only_v3.json`)

In [None]:
# import json

# in_path = "/content/TEARS_V2/TEARS_V2/test_v3.json"
# out_path = "/content/drive/MyDrive/test_only_v3.json" # /content/drive/MyDrive

# with open(in_path, "r") as f:
#     data = json.load(f)

# test_data = [
#     item for item in data
#     if "/test/" in item.get("audio_path", "").lower()
# ]

# print(f"Total samples: {len(data)}, test samples: {len(test_data)}")

# with open(out_path, "w") as f:
#     json.dump(test_data, f, indent=2, ensure_ascii=False)

Core Functions

In [None]:
# ============================================================
# Import necessary libraries
# ============================================================
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
from huggingface_hub import login
import torch
from pathlib import Path
import getpass

In [None]:
import os
import json
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

from datasets import load_dataset
from transformers import (
    Qwen2AudioForConditionalGeneration,
    AutoProcessor,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
from pathlib import Path
import json
from transformers import AutoProcessor
import numpy as np

In [None]:
# ==========================
# Configuration
# ==========================
from peft import PeftModel
# Root folder on the mounted Drive that holds the evaluation inputs.
DATA_ROOT = Path("/content/drive/MyDrive")
# Test-only sample file, kept as a plain string path for open().
TEST_FILE = str(DATA_ROOT.joinpath("test_only_v3.json"))

In [None]:
import torch
from torch.utils.data import Dataset
import json
import librosa
from pathlib import Path
from typing import Dict, Any
import random

Evaluation

In [None]:
# import librosa
# import torch

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# def predict_dialect(audio_path: str, prompt: str, max_new_tokens: int = 64):
#     audio, sr = librosa.load(audio_path, sr=16000)

#     messages = [
#         {
#             "role": "user",
#             "content": [
#                 {"type": "audio", "audio": audio, "sampling_rate": sr},
#                 {"type": "text", "text": prompt},
#             ],
#         }
#     ]

#     text = processor.apply_chat_template(
#         messages,
#         add_generation_prompt=True,
#         tokenize=False,
#     )

#     inputs = processor(
#         text=[text],
#         audios=[audio],
#         sampling_rate=sr,
#         return_tensors="pt",
#         padding=True,
#     ).to(device)

#     # output
#     with torch.no_grad():
#         generated_ids = model.generate(
#             **inputs,
#             max_new_tokens=max_new_tokens,
#             do_sample=False,
#             temperature=0.0,
#         )

#     # decode
#     output_text = processor.batch_decode(
#         generated_ids[:, inputs["input_ids"].shape[-1]:],
#         skip_special_tokens=True,
#     )[0]

#     return output_text


In [None]:
import librosa
import torch

def predict_one(audio_path, prompt, max_new_tokens=96):
    """Run greedy inference for one audio clip and return the generated text.

    Relies on the notebook-level globals ``processor``, ``model`` and
    ``device`` being defined before the first call.

    Args:
        audio_path: Path to the audio file; loaded with librosa at 16 kHz.
        prompt: Text question placed after the audio in the single user turn.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded continuation only (prompt tokens are sliced off before
        decoding), with surrounding whitespace stripped.
    """
    audio, sr = librosa.load(audio_path, sr=16000)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio, "sampling_rate": sr},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Render the chat template to text only; tokenisation happens in the
    # processor call below together with the audio features.
    text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )

    inputs = processor(
        text=[text],
        audios=[audio],
        sampling_rate=sr,
        return_tensors="pt",
        padding=True,
    ).to(device)

    with torch.no_grad():
        # Greedy decoding. `temperature` is deliberately not passed: it only
        # applies to sampling, so combining temperature=0.0 with
        # do_sample=False was contradictory and merely produced a warning.
        gen_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    new_tokens = gen_ids[:, inputs["input_ids"].shape[-1]:]
    output = processor.batch_decode(
        new_tokens,
        skip_special_tokens=True,
    )[0]

    return output.strip()


In [None]:
import json

TEST_JSON = TEST_FILE

# Read explicitly as UTF-8 (the evaluation records are written as UTF-8 too)
# rather than depending on the platform's default encoding.
with open(TEST_JSON, "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Quick sanity signal that the expected file was loaded.
print(f"Loaded {len(test_data)} test samples from {TEST_JSON}")


In [None]:
import re
import random
import json
from pathlib import Path

# Seed Python's RNG. NOTE(review): no active code in the visible cells draws
# from `random` (the random.sample call in eval_full_dataset is commented out)
# — confirm whether this is still needed.
random.seed(11785)
# Directory that each sample's relative "audio_path" is joined onto in eval_full_dataset.
AUDIO_ROOT = Path("/content/TEARS_V2/TEARS_V2")

def extract_label(text: str) -> str:
    """Extract the predicted label from a generated answer.

    If the text contains "most consistent with:" (case-insensitive), the label
    is everything after its last occurrence. Otherwise the last line is used,
    keeping only what follows the last " is " when present.

    Args:
        text: Raw generated text; may be empty.

    Returns:
        The label, lowercased and stripped of surrounding whitespace and
        periods. An empty string is returned for empty or whitespace-only
        input instead of raising.
    """
    marker = "most consistent with:"

    # Work on the lowercased text throughout: the label is returned lowercased
    # anyway, and slicing the same string that was searched keeps the offsets
    # aligned even for characters whose lowercase form has a different length.
    lowered = (text or "").strip().lower()
    if not lowered:
        # An empty generation has no last line to parse.
        return ""

    marker_idx = lowered.rfind(marker)
    if marker_idx != -1:
        label_part = lowered[marker_idx + len(marker):]
    else:
        last_line = lowered.splitlines()[-1].strip()
        is_idx = last_line.rfind(" is ")
        if is_idx != -1:
            label_part = last_line[is_idx + len(" is "):]
        else:
            label_part = last_line

    return label_part.strip().strip(".")

In [None]:
from tqdm import tqdm

def analyze_reasoning(gt_reasoning: str, pred_text: str):
    """Flag suspicious statements in a prediction via case-insensitive substring checks.

    Two kinds of findings are reported, in this order:
      1. The ground truth states one level ("high"/"low") of an acoustic
         attribute while the prediction states the opposite level.
      2. The prediction mentions terms treated as hallucinations
         ("biracial", "jianghuai mandarin", "hefei dialect").

    Returns:
        A list of human-readable issue strings (empty when nothing is flagged).
    """
    reference = gt_reasoning.lower()
    generated = pred_text.lower()
    findings = []

    # 1) Contradictory acoustic descriptions, checked in both directions
    #    (high -> low first, then low -> high) for each attribute.
    for attribute in ("rhoticity", "pitch", "vowel reduction"):
        for gt_level, pred_level in (("high", "low"), ("low", "high")):
            gt_phrase = f"{gt_level} {attribute}"
            pred_phrase = f"{pred_level} {attribute}"
            if gt_phrase in reference and pred_phrase in generated:
                findings.append(f"GT has '{gt_phrase}' but prediction says '{pred_phrase}'")

    # 2) Social-attribute / dialect terms treated as hallucinations.
    if "biracial" in generated:
        findings.append("Mentions 'biracial' (not annotated in ground-truth labels).")
    if any(term in generated for term in ("jianghuai mandarin", "hefei dialect")):
        findings.append("Mentions non-existent dialect label (e.g., Jianghuai Mandarin).")

    return findings


def eval_full_dataset(samples, out_path: str = "eval_full.jsonl", verbose: bool = False):
    """Evaluate every sample, write one JSON record per line, and return the accuracy.

    For each sample the audio path is resolved relative to ``AUDIO_ROOT``, the
    clip and prompt are passed to ``predict_one``, and the label parsed by
    ``extract_label`` is compared (exact match on the stripped, lowercased
    strings) with the sample's answer.

    Args:
        samples: Sized iterable of dicts with the keys "audio_path", "prompt",
            "answer" and "response".
        out_path: Destination .jsonl file; opened in write mode, so an
            existing file is overwritten.
        verbose: If True, print the details of every sample.

    Returns:
        Fraction of samples predicted correctly, or 0.0 when ``samples`` is empty.
    """
    n = len(samples)
    correct = 0

    # The context manager closes (and flushes) the file even if inference
    # raises midway, so the records written so far are not lost and the
    # handle is not leaked.
    with open(out_path, "w", encoding="utf-8") as out_f:
        for i, ex in enumerate(tqdm(samples, desc="Eval full dataset")):
            audio = AUDIO_ROOT / ex["audio_path"]
            prompt = ex["prompt"]
            gt_label = ex["answer"].strip().lower()

            pred = predict_one(audio, prompt)
            pred_label = extract_label(pred)

            is_correct = (pred_label == gt_label)
            correct += int(is_correct)

            # reasoning_issues = analyze_reasoning(ex["response"], pred)

            record = {
                "idx": i,
                "audio_path": str(audio),
                "question": prompt,
                "gt_label": gt_label,
                "pred_label": pred_label,
                "correct": bool(is_correct),
                "gt_reasoning": ex["response"],
                "pred_text": pred,
                # "reasoning_issues": reasoning_issues,
            }
            out_f.write(json.dumps(record, ensure_ascii=False) + "\n")

            if verbose:
                print(f"\n=== Sample {i} ===")
                print(f"Audio: {audio}")
                print(f"Question: {prompt}")
                print(f"GT label: {gt_label}")
                print(f"Pred label: {pred_label}")
                print(f"Full GT: {ex['response']}")
                print(f"Generated: {pred}")
                print(f"Correct: {is_correct}")
                # if reasoning_issues:
                #     print(f"Reasoning issues: {reasoning_issues}")

    acc = correct / n if n > 0 else 0.0
    print(f"\nFull-dataset Accuracy ({n} samples): {acc:.4f}")

    return acc

Evaluate original Qwen

In [None]:
# Device that predict_one moves the processed inputs to: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
BASE_MODEL = "/content/drive/MyDrive/QwenFinetune/Qwen2-Audio-7B-Instruct"
ADAPTER_DIR = "/content/kook" # TODO: test kook-checkpoint-9500

print(f"Loading model and processor from: {BASE_MODEL}")

# Processor used by predict_one to build the chat-template text and model inputs.
processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

print("Load original model...")
# Baseline weights only: no adapter is attached in this cell (ADAPTER_DIR is unused here).
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

In [None]:
# Evaluate whichever `model` is currently loaded (the original model when cells are
# run in order). Per-sample records go to the JSONL file on Drive; the returned
# accuracy is shown as the cell output.
eval_full_dataset(test_data, "/content/drive/MyDrive/eval_full.jsonl", verbose=True)

Evaluate finetuned model

In [None]:
import gc

BASE_MODEL = "/content/drive/MyDrive/QwenFinetune/Qwen2-Audio-7B-Instruct"
ADAPTER_DIR = "/content/kook" # TODO: test kook-checkpoint-9500

# Release any model left over from an earlier cell before loading another copy
# of the base weights; otherwise both copies are alive at the same time.
# Plain assignment works whether or not the names already exist.
model = base_model = None
gc.collect()
torch.cuda.empty_cache()

print(f"Loading model and processor from: {BASE_MODEL}")

processor = AutoProcessor.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
)

print("Load base model...")
base_model = Qwen2AudioForConditionalGeneration.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

print(f"Load LoRA adapter from: {ADAPTER_DIR}")
# This notebook only evaluates, so the adapter is loaded frozen
# (is_trainable=False) rather than in training mode.
model = PeftModel.from_pretrained(
    base_model,
    ADAPTER_DIR,
    is_trainable=False,
)
# Make sure every submodule, including any adapter dropout, is in eval mode
# so that the greedy-decoding results are deterministic.
model.eval()

print("✓ Model and processor loaded successfully")

In [None]:
# Evaluate whichever `model` is currently loaded (the adapter-wrapped model when cells
# are run in order). Results go to a separate JSONL file so the baseline run is kept.
eval_full_dataset(test_data, "/content/drive/MyDrive/eval_full_finetuned.jsonl", verbose=True)

In [None]:
# audio_path = "/content/TEARS_V2/TEARS_V2/ears_dataset/val/p101/freeform_speech_03_640000_800000.wav"
# prompt = "What is the speaker's ethnicity?"

# print(predict_dialect(audio_path, prompt)) # using current model

# # {
# #     "audio_path": "ears_dataset/val/p101/freeform_speech_03_640000_800000.wav",
# #     "prompt": "What is the speaker's ethnicity?",
# #     "response": "Because of the combination of high rhoticity_ratio, presence of syllabic_consonants, and vowel articulation patterns consistent with Southern American English, the speaker is most consistent with: black or african american",
# #     "answer": "black or african american"
# # }