In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Show up to 300 rows whenever a DataFrame is displayed.
pd.set_option('display.max_rows', 300)

# --- Run configuration ---
IS_SUBMISSION = False
K = 25
# Inference defaults to CPU because of the model size; switch to a CUDA device
# string (e.g. 'cuda:5') when enough GPU memory is available.
device = 'cpu'

print('IS_SUBMISSION:', IS_SUBMISSION)

# Drop any cached CUDA allocations before the model is loaded.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# --- Data loading ---
# Every missing value in the training frame is replaced with -1.
df_full_train = pd.read_csv("./data/train.csv")
df_full_train = df_full_train.fillna(-1)
df_test = pd.read_csv("./data/test.csv")
df_misconception_mapping = pd.read_csv("./data/misconception_mapping.csv")

# Reproducible 80/20 train/validation split; both parts get a fresh 0..n-1 index.
df_train_split, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(df_full_train, test_size=0.2, random_state=42)
)

def prepare_input_df(df, is_submission):
    """Build one text prompt per (question, wrong answer) pair.

    For every row of ``df`` and every answer letter A-D other than the row's
    ``CorrectAnswer``, a prompt is assembled from the question's construct,
    subject, question text, the four answer texts and the chosen wrong answer.

    Args:
        df: DataFrame with one question per row. ``QuestionId`` and
            ``CorrectAnswer`` are required; all other columns are read with
            ``row.get`` and fall back to a default when absent.
        is_submission: When False, wrong answers without a misconception label
            (missing column, NaN, or -1) are skipped. When True, every wrong
            answer is kept and unlabeled ones get the target id -1.

    Returns:
        A tuple ``(df_input, target_ids)``. ``df_input`` has the columns
        ``QuestionId_Answer`` (``"<QuestionId>_<choice>"``) and ``Text`` (the
        prompt); ``target_ids`` is a list of ints aligned with its rows.
    """
    items = []
    target_ids = []
    for _, row in df.iterrows():
        # Only the LaTeX delimiters are stripped; the math content is kept.
        # The question text is the same for every choice, so clean it once.
        question_text = str(row.get('QuestionText', ''))
        for delimiter in ("\\(", "\\)", "\\[", "\\]"):
            question_text = question_text.replace(delimiter, "")

        for choice in ['A', 'B', 'C', 'D']:
            if choice == row["CorrectAnswer"]:
                continue

            # Treat NaN like the -1 sentinel so the function does not depend on
            # the caller having run fillna(-1) first (int(nan) would raise).
            raw_id = row.get(f'Misconception{choice}Id', -1)
            target_id = -1 if pd.isna(raw_id) else int(raw_id)
            if not is_submission and target_id == -1:
                continue

            q_id_ans = f"{row['QuestionId']}_{choice}"
            full_context = (
                f"Math problem:\n"
                f"ConstructName: {row.get('ConstructName','')}\n"
                f"SubjectName: {row.get('SubjectName','')}\n"
                f"Q: {question_text}\n"
                f"A) {row.get('AnswerAText','')}\nB) {row.get('AnswerBText','')}\n"
                f"C) {row.get('AnswerCText','')}\nD) {row.get('AnswerDText','')}\n"
                f"CorrectAnswer: {row.get('CorrectAnswer','')}\n"
                f"ChosenWrongAnswer: {choice}\n"
                f"ChosenWrongAnswerText: {row.get(f'Answer{choice}Text','')}\n"
                "Identify the related math misconception."
            )
            items.append({'QuestionId_Answer': q_id_ans, 'Text': full_context})
            target_ids.append(target_id)

    # Fix the column set so an empty result still exposes both columns.
    df_input = pd.DataFrame(items, columns=['QuestionId_Answer', 'Text'])
    return df_input, target_ids

# Build prompts for each split. Train/validation keep only labeled wrong
# answers; the test split keeps every wrong answer and its targets are unused.
df_train_input, train_target_ids = prepare_input_df(df_train_split, False)
df_val_input, val_target_ids = prepare_input_df(df_val, False)
df_test_input, _ = prepare_input_df(df_test, True)

# Candidate documents: misconception names as plain strings, in file order.
misconceptions = list(df_misconception_mapping['MisconceptionName'].astype(str))

#############################################
# Load Qwen Math Model (Huge!)
#############################################
# NOTE: This model is extremely large. Running it locally without powerful hardware might OOM.
# We try CPU load here.
MODEL_NAME = "Qwen/Qwen2.5-Math-RM-72B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# torch_dtype="auto" keeps the dtype stored in the checkpoint. Without it the
# weights are instantiated in float32, which roughly doubles the memory needed
# for a checkpoint saved in 16-bit precision.
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True, torch_dtype="auto")
model = model.to(device)
model.eval()  # inference only: disables dropout and similar training-time behavior

#############################################
# Embedding Extraction Function
#############################################
# We'll get the last hidden state for embeddings and do a simple mean pooling.
def embed_texts(texts, tokenizer, model, device, batch_size=1):
    """Embed texts by mean-pooling the model's last hidden layer.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Callable invoked with a list of strings and
            ``return_tensors='pt', padding=True, truncation=True,
            max_length=256``. Its result must provide ``attention_mask`` and
            support ``.items()``.
        model: Callable invoked with the tokenized tensors and
            ``output_hidden_states=True``; its output must expose
            ``hidden_states``.
        device: Device the tokenized tensors are moved to before the forward
            pass.
        batch_size: Number of texts per forward pass.

    Returns:
        A float32 NumPy array with one row per input text. For empty input the
        result has shape ``(0, hidden_size)``, where ``hidden_size`` is read
        from ``model.config`` and falls back to 0 when unavailable.
    """
    # Materialize once so len() and slicing work for any iterable.
    texts = list(texts)
    if not texts:
        # np.concatenate([]) raises, so return an empty matrix explicitly.
        config = getattr(model, "config", None)
        hidden_size = getattr(config, "hidden_size", 0) or 0
        return np.zeros((0, hidden_size), dtype=np.float32)

    all_embeds = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Batches"):
        batch_texts = texts[start:start + batch_size]
        batch = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=256)
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            outputs = model(**batch, output_hidden_states=True)
            # Last entry of hidden_states is the final layer; presumably shaped
            # (batch, seq_len, hidden_dim) — confirm for the loaded model class.
            # Pool in float32 so the result converts to NumPy regardless of the
            # model's compute dtype.
            hidden_states = outputs.hidden_states[-1].float()
            # Padding positions get weight 0 and do not affect the mean.
            mask = batch['attention_mask'].unsqueeze(-1).float()
            summed = (hidden_states * mask).sum(dim=1)
            # Clamp guards against division by zero for an all-padding row.
            counts = mask.sum(dim=1).clamp(min=1e-9)
            all_embeds.append((summed / counts).cpu().numpy())
    return np.concatenate(all_embeds, axis=0)

#############################################
# Compute Embeddings
#############################################
# One text per forward pass on the configured device, to keep peak memory
# as low as possible with a model this large.
misconception_embeds = embed_texts(
    texts=misconceptions,
    tokenizer=tokenizer,
    model=model,
    device=device,
    batch_size=1,
)

def get_predictions(df_input, doc_embeds):
    """Rank documents for every prompt and keep the K best.

    The ``Text`` column of ``df_input`` is embedded with the module-level
    tokenizer/model, scored against ``doc_embeds`` by dot product, and the
    row indices of the ``K`` highest-scoring documents are returned per query,
    best first.
    """
    queries = df_input['Text'].tolist()
    query_embeds = embed_texts(queries, tokenizer, model, device, batch_size=1)
    scores = query_embeds @ doc_embeds.T
    # argsort is ascending: take the last K columns, then flip to best-first.
    ascending = np.argsort(scores, axis=1)
    best_first = ascending[:, -K:][:, ::-1]
    return best_first

def map_at_k(y_true, y_pred, k=25):
    """Compute MAP@k when each query has exactly one relevant item.

    Args:
        y_true: Iterable with the single correct id for each query.
        y_pred: Iterable of ranked predictions per query, best first. Each
            entry may be a list, tuple or NumPy array.
        k: Only the first ``k`` predictions of each query are considered.

    Returns:
        The mean over queries of ``1 / rank`` of the first occurrence of the
        correct id within the top ``k`` (0.0 for a query where it is absent).
        Returns 0.0 when there are no queries.
    """
    average_precisions = []
    for true_id, preds in zip(y_true, y_pred):
        # asarray lets plain lists work; a list == scalar comparison would
        # yield a single bool instead of an element-wise mask.
        top_k = np.asarray(preds)[:k]
        hit_positions = np.flatnonzero(top_k == true_id)
        if hit_positions.size:
            # Ranks are 1-based; only the first hit counts.
            average_precisions.append(1.0 / (hit_positions[0] + 1))
        else:
            average_precisions.append(0.0)
    return float(np.mean(average_precisions)) if average_precisions else 0.0

# get_predictions returns row positions in the misconception table. Translate
# them to the table's MisconceptionId values before scoring or submitting, so
# the results stay correct even if the ids are not exactly 0..N-1 in file order.
misconception_ids = df_misconception_mapping['MisconceptionId'].to_numpy()
submission_path = "./data/submissionsv7.csv"

# Evaluate Train
train_sorted_indices = get_predictions(df_train_input, misconception_embeds)
train_map25 = map_at_k(train_target_ids, misconception_ids[train_sorted_indices], k=25)
print(f"Train MAP@25: {train_map25:.4f}")

# Evaluate Val
val_sorted_indices = get_predictions(df_val_input, misconception_embeds)
val_map25 = map_at_k(val_target_ids, misconception_ids[val_sorted_indices], k=25)
print(f"Validation MAP@25: {val_map25:.4f}")

# Submission for Test: one row per QuestionId_Answer with space-separated ids, best first.
test_sorted_indices = get_predictions(df_test_input, misconception_embeds)
test_predicted_ids = misconception_ids[test_sorted_indices]
df_test_input["MisconceptionId"] = [" ".join(map(str, row)) for row in test_predicted_ids]
df_test_input[["QuestionId_Answer", "MisconceptionId"]].to_csv(submission_path, index=False)
# Report the path that was actually written.
print(f"Submission file created successfully at '{submission_path}'!")


  from .autonotebook import tqdm as notebook_tqdm


IS_SUBMISSION: False


A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B:
- configuration_qwen2_rm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B:
- modeling_qwen2_rm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading shards: 100%|██████████| 37/37 [58:04<00:00, 94.17s/it]
Loading checkpoint shards: 100%|██████████| 37/37 [33:04<00:00, 53.63s/it]
Some weights of the model checkpoint at Qwen/Qwen2.5-Math-RM-72B were not used when initializing Qwen2ForRewardModel: ['lm_head.weight']
- This IS expected if you are initializing Qwen2ForRewardModel from the checkpoint of a model trained on another task or with another architecture (e.g. init

KeyboardInterrupt: 