In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import torch

pd.set_option('display.max_rows', 300)

IS_SUBMISSION = False
K = 25
INTERMEDIATE_K = 200  # Retrieve top 200 before re-ranking

device_bi = 'cpu'  # Use CPU to avoid OOM
device_rm = 'cpu'  # Qwen Reward Model also on CPU

print('IS_SUBMISSION:', IS_SUBMISSION)

if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Loading data...")
df_full_train = pd.read_csv("./data/train.csv").fillna(-1)
df_test = pd.read_csv("./data/test.csv")
df_misconception_mapping = pd.read_csv("./data/misconception_mapping.csv")

df_train_split, df_val = train_test_split(df_full_train, test_size=0.2, random_state=42)
df_train_split = df_train_split.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

misconceptions = df_misconception_mapping['MisconceptionName'].astype(str).tolist()

MISCONCEPTION_GUIDANCE = ("A misconception is a commonly misunderstood concept in mathematics. "
                          "Your task: Identify the math misconception related to the chosen wrong answer.\n")

def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
    text = text.replace("\\(", "").replace("\\)", "").replace("\\[", "").replace("\\]", "")
    text = " ".join(text.split())
    return text

def prepare_input_df(df, is_submission):
    items = []
    target_ids = []
    print("Preparing input dataframe...")
    for _, row in df.iterrows():
        correct_answer = row.get("CorrectAnswer", "")
        correct_answer_text = clean_text(row.get(f"Answer{correct_answer}Text",""))
        for choice in ['A', 'B', 'C', 'D']:
            if choice == correct_answer:
                continue
            misconception_id = row.get(f'Misconception{choice}Id', -1)
            if not is_submission and misconception_id == -1:
                continue

            q_id_ans = f"{row['QuestionId']}_{choice}"
            question_text = clean_text(row.get('QuestionText',''))
            cName = clean_text(row.get('ConstructName',''))
            sName = clean_text(row.get('SubjectName',''))
            chosen_wrong = clean_text(row.get(f'Answer{choice}Text',''))
            
            # Focus on question, correct and chosen wrong answer only
            full_context = (
                f"[SUBJECT]: {sName}\n"
                f"[CONSTRUCT]: {cName}\n"
                f"[QUESTION]: {question_text}\n"
                f"[CORRECT_ANSWER]: {correct_answer_text}\n"
                f"[CHOSEN_WRONG_ANSWER]: <<{chosen_wrong}>>\n"
                f"[GUIDANCE]: {MISCONCEPTION_GUIDANCE}"
            )
            
            items.append({'QuestionId_Answer': q_id_ans, 'Text': full_context})
            target_ids.append(int(misconception_id))
    return pd.DataFrame(items), target_ids

df_train_input, train_target_ids = prepare_input_df(df_train_split, is_submission=False)
df_val_input, val_target_ids = prepare_input_df(df_val, is_submission=False)
df_test_input, _ = prepare_input_df(df_test, is_submission=True)

print("Loading bi-encoder model...")
bi_model_name = "sentence-transformers/all-mpnet-base-v2"
bi_model = SentenceTransformer(bi_model_name, device=device_bi)

def embed_texts_bi(texts, model, batch_size=16):
    print(f"Embedding {len(texts)} texts with bi-encoder on CPU...")
    return model.encode(texts, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=True)

print("Embedding misconceptions with bi-encoder...")
misconception_embeds = embed_texts_bi(misconceptions, bi_model)

def initial_retrieve(df_input, doc_embeds):
    print("Initial retrieval with bi-encoder...")
    query_embeds = embed_texts_bi(df_input['Text'].tolist(), bi_model)
    print("Computing similarity for initial retrieval...")
    sim = np.dot(query_embeds, doc_embeds.T)
    top_candidates = np.argsort(sim, axis=1)[:, -INTERMEDIATE_K:][:, ::-1]
    return top_candidates, sim

#############################################
# Load Qwen Qwen2.5-Math-RM-72B as reward model
#############################################
print("Loading Qwen Qwen2.5-Math-RM-72B reward model on CPU...")
tokenizer_rm = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-RM-72B", trust_remote_code=True)
model_rm = AutoModel.from_pretrained("Qwen/Qwen2.5-Math-RM-72B", trust_remote_code=True)
model_rm.eval()
model_rm.to(device_rm)

# We'll assume the model returns a dict with "response_scores" or a similar key.
# Without official doc, we guess: model(**inputs) -> outputs with "response_scores"
# We'll just try outputs['response_scores'] after calling model.
# If needed, adjust this key based on actual model outputs.

def compute_reward_score(qtext, ctext, tokenizer, model):
    # Combine into single input
    # We'll just concatenate them. The reward model expects a single text input or multiple?
    # As a guess, we provide qtext + ctext and expect a scalar reward:
    combined_text = f"{qtext}\n[CANDIDATE_MISCONCEPTION]: {ctext}\n"
    inputs = tokenizer(combined_text, return_tensors='pt', truncation=True, max_length=256)
    for k, v in inputs.items():
        inputs[k] = v.to(device_rm)
    with torch.no_grad():
        outputs = model(**inputs)
        # Guessing the key: If it's a reward model, likely 'response_scores' or 'scores'
        # Let's try 'response_scores'.
        # If original model doc says something else, adjust here.
        reward = outputs['response_scores'].item()
    return reward

def re_rank(df_input, top_candidates):
    print("Re-ranking with Qwen reward model on CPU (batch_size=1, slow)...")
    query_texts = df_input['Text'].tolist()
    reranked_indices = []
    for i, qtext in enumerate(tqdm(query_texts, desc="Re-ranking Queries")):
        cand_indices = top_candidates[i]
        cand_texts = [misconceptions[idx] for idx in cand_indices]
        scores = []
        for ctext in cand_texts:
            score = compute_reward_score(qtext, ctext, tokenizer_rm, model_rm)
            scores.append(score)
        sorted_cand = np.argsort(scores)[::-1][:K]
        final_selection = cand_indices[sorted_cand]
        reranked_indices.append(final_selection)
    return reranked_indices

def map_at_k(y_true, y_pred, k=25):
    print("Calculating MAP@K...")
    average_precisions = []
    for true, preds in zip(y_true, y_pred):
        if true in preds[:k]:
            rank = (preds[:k] == true).nonzero()[0][0] + 1
            ap = 1.0 / rank
        else:
            ap = 0.0
        average_precisions.append(ap)
    return np.mean(average_precisions) if average_precisions else 0.0

print("Starting evaluation on training split...")
train_candidates, _ = initial_retrieve(df_train_input, misconception_embeds)
train_reranked = re_rank(df_train_input, train_candidates)
train_map25 = map_at_k(train_target_ids, train_reranked, k=25)
print(f"Train MAP@25: {train_map25:.4f}")

print("Starting evaluation on validation split...")
val_candidates, _ = initial_retrieve(df_val_input, misconception_embeds)
val_reranked = re_rank(df_val_input, val_candidates)
val_map25 = map_at_k(val_target_ids, val_reranked, k=25)
print(f"Validation MAP@25: {val_map25:.4f}")

print("Generating final submission for test set...")
test_candidates, _ = initial_retrieve(df_test_input, misconception_embeds)
test_reranked = re_rank(df_test_input, test_candidates)
df_test_input["MisconceptionId"] = [" ".join(map(str, row)) for row in test_reranked]
df_test_input[["QuestionId_Answer", "MisconceptionId"]].to_csv("./data/submissions.csv", index=False)
print("submission.csv created successfully at './data/submissions.csv'!")


IS_SUBMISSION: False
Loading data...
Preparing input dataframe...
Preparing input dataframe...
Preparing input dataframe...
Loading bi-encoder model...
Embedding misconceptions with bi-encoder...
Embedding 2587 texts with bi-encoder on CPU...


Batches: 100%|██████████| 162/162 [00:10<00:00, 14.85it/s]


Loading Qwen Qwen2.5-Math-RM-72B reward model on CPU...


Downloading shards: 100%|██████████| 37/37 [34:16<00:00, 55.57s/it]
Loading checkpoint shards: 100%|██████████| 37/37 [33:04<00:00, 53.64s/it]
Some weights of the model checkpoint at Qwen/Qwen2.5-Math-RM-72B were not used when initializing Qwen2ForRewardModel: ['lm_head.weight']
- This IS expected if you are initializing Qwen2ForRewardModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Qwen2ForRewardModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Starting evaluation on training split...
Initial retrieval with bi-encoder...
Embedding 3503 texts with bi-encoder on CPU...


Batches: 100%|██████████| 219/219 [02:14<00:00,  1.63it/s]


Computing similarity for initial retrieval...
Re-ranking with Qwen reward model on CPU (batch_size=1, slow)...


Re-ranking Queries:   0%|          | 0/3503 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
Re-ranking Queries:   0%|          | 0/3503 [00:54<?, ?it/s]


KeyError: 'response_scores'