In [1]:
##############################################
# Dependencies
##############################################
import os, math, numpy as np
import pandas as pd
import re, gc
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Show up to 300 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 300)

##############################################
# Configuration
##############################################
# NOTE(review): in the code visible here this flag is only printed; no branch
# reads it. The train/val evaluation and the test submission both always run.
IS_SUBMISSION = False  # Intended switch between train/val evaluation (False) and final submission (True)
# Hugging Face model id used for both the tokenizer and the embedding model.
base_model_path = "Qwen/Qwen2.5-7B-Instruct"
# query_max_len is the token budget passed to get_new_queries and also becomes
# MAX_LENGTH. NOTE(review): doc_max_len is not read anywhere in the visible code.
query_max_len, doc_max_len = 320, 48
# Instruction text placed in front of every query by get_detailed_instruct.
task = "Given a math multiple-choice problem with a student's wrong answer, retrieve the math misconceptions"
K = 25  # Number of top-ranked misconceptions kept per query in get_predictions

print('IS_SUBMISSION:', IS_SUBMISSION)

##############################################
# Loading Data
##############################################
# fillna(-1) replaces missing values in *every* column with -1; prepare_input_df
# relies on -1 as the "no misconception labelled" sentinel for the
# Misconception{A..D}Id columns.
df_full_train = pd.read_csv("./data/train.csv").fillna(-1)
# The test frame is loaded as-is (no fillna).
df_test = pd.read_csv("./data/test.csv")
# Its MisconceptionName column supplies the documents that get embedded.
df_misconception_mapping = pd.read_csv("./data/misconception_mapping.csv")

# Create train/validation split from full training data
# 80/20 row-level split with a fixed seed so the split is reproducible.
df_train_split, df_val = train_test_split(df_full_train, test_size=0.2, random_state=42)
# Drop the shuffled original index so both frames are indexed 0..n-1.
df_train_split = df_train_split.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

# Note on scenarios:
# IS_SUBMISSION is meant to switch between train/val evaluation (False) and
# final submission (True), but the code below does not branch on it: it always
# evaluates on the train/val splits and then writes a submission from df_test.

##############################################
# Prompt Formatting Function
##############################################
TEMPLATE_INPUT_V3 = '{QUESTION}\nCorrect answer: {CORRECT_ANSWER}\nStudent wrong answer: {STUDENT_WRONG_ANSWER}'


def format_input_v3(row, wrong_choice):
    """Build the retrieval prompt for one (question, wrong answer) pair.

    Args:
        row: Mapping-like object with a ``.get`` method (e.g. a pandas Series
            or a dict) holding the question columns: ``QuestionText``,
            ``SubjectName``, ``ConstructName``, ``CorrectAnswer``,
            ``Answer{A..D}Text`` and optionally ``Misconception{A..D}Id``.
        wrong_choice: The distractor letter, exactly one of "A", "B", "C", "D".

    Returns:
        dict with keys ``QUESTION``, ``CORRECT_ANSWER``,
        ``STUDENT_WRONG_ANSWER``, ``MISCONCEPTION_ID`` (``None`` when the row
        has no such entry) and ``PROMPT`` (``TEMPLATE_INPUT_V3`` filled in).

    Raises:
        AssertionError: if ``wrong_choice`` is not a single valid letter or
            equals the row's correct answer.
    """
    # Tuple membership, not `in "ABCD"`: the substring test also accepted
    # "", "AB", "ABCD", ... and silently produced placeholder answer text.
    assert wrong_choice in ("A", "B", "C", "D")
    question_text = row.get("QuestionText", "No question text provided")
    subject_name = row.get("SubjectName", "Unknown subject")
    construct_name = row.get("ConstructName", "Unknown construct")
    correct_answer = row.get("CorrectAnswer", "Unknown")
    assert wrong_choice != correct_answer
    correct_answer_text = row.get(f"Answer{correct_answer}Text", "No correct answer text available")
    wrong_answer_text = row.get(f"Answer{wrong_choice}Text", "No wrong answer text available")

    # The second line is four spaces followed by a newline. It is kept
    # byte-for-byte so the prompt text fed to the model does not change.
    formatted_question = (
        f"Question: {question_text}\n"
        "    \n"
        f"SubjectName: {subject_name}\n"
        f"ConstructName: {construct_name}"
    )

    ret = {
        "QUESTION": formatted_question,
        "CORRECT_ANSWER": correct_answer_text,
        "STUDENT_WRONG_ANSWER": wrong_answer_text,
        "MISCONCEPTION_ID": row.get(f'Misconception{wrong_choice}Id'),
    }
    # str.format ignores the extra MISCONCEPTION_ID key.
    ret["PROMPT"] = TEMPLATE_INPUT_V3.format(**ret)
    return ret

##############################################
# Data Preparation Function
##############################################
def prepare_input_df(df, is_submission):
    """Expand each question row into one prompt row per wrong answer choice.

    Args:
        df: DataFrame with ``QuestionId``, ``CorrectAnswer`` and the columns
            read by ``format_input_v3``; ``Misconception{A..D}Id`` columns are
            optional.
        is_submission: When False, wrong answers without a labelled
            misconception (missing column, NaN, or -1) are skipped because
            they cannot be scored. When True, every wrong answer is kept.

    Returns:
        ``(df_input, target_ids)``: ``df_input`` has the columns
        ``QuestionId_Answer`` and ``Prompt``; ``target_ids`` is a parallel
        list of int misconception ids, with -1 meaning "unknown".
    """
    items = []
    target_ids = []
    for _, row in df.iterrows():
        for choice in ['A', 'B', 'C', 'D']:
            if choice == row["CorrectAnswer"]:
                continue
            # Normalise the label once. NaN is treated like the -1 sentinel so
            # the function does not depend on the caller having run
            # fillna(-1); int(NaN) would otherwise raise ValueError.
            raw_id = row.get(f'Misconception{choice}Id', -1)
            target_id = -1 if pd.isna(raw_id) else int(raw_id)
            if not is_submission and target_id == -1:
                # Skip if we don't have a known misconception ID in training/val (for evaluation)
                continue
            item = {'QuestionId_Answer': '{}_{}'.format(row['QuestionId'], choice)}
            item['Prompt'] = format_input_v3(row, choice)['PROMPT']
            items.append(item)
            # Store ground truth ID if available
            target_ids.append(target_id)
    # Explicit columns keep the schema stable even when no row qualifies.
    df_input = pd.DataFrame(items, columns=['QuestionId_Answer', 'Prompt'])
    return df_input, target_ids

# Prepare train split input
# is_submission=False: only wrong answers with a labelled misconception are kept,
# so train_target_ids lines up one-to-one with the rows of df_train_input.
df_train_input, train_target_ids = prepare_input_df(df_train_split, is_submission=False)

# Prepare validation input
df_val_input, val_target_ids = prepare_input_df(df_val, is_submission=False)

# Prepare test input
# The is_submission=True argument (independent of the IS_SUBMISSION global)
# keeps every wrong answer; the returned target ids are discarded.
df_test_input, _ = prepare_input_df(df_test, is_submission=True)

##############################################
# Construct Queries and Documents
##############################################
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Return the task description and query joined in the tagged layout.

    The result is ``<instruct>`` + task description, a newline, then
    ``<query>`` + query.
    """
    layout = '<instruct>{}\n<query>{}'
    return layout.format(task_description, query)

def get_new_queries(queries, query_max_len, examples_prefix, tokenizer):
    """Truncate queries to a token budget and wrap them with prefix/suffix.

    Each query is tokenized without special tokens and truncated to
    ``query_max_len`` minus the token counts of ``'<s>'`` and
    ``'\\n<response></s>'``, decoded back to text, and then surrounded by
    ``examples_prefix`` and ``'\\n<response>'``.

    Args:
        queries: List of query strings.
        query_max_len: Token budget for a query.
        examples_prefix: Text placed in front of every query.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and offering ``batch_decode``.

    Returns:
        ``(new_max_length, new_queries)`` where ``new_max_length`` is
        ``(prefix_tokens + suffix_tokens + query_max_len + 8) // 8 * 8 + 8``
        and ``new_queries`` is the list of wrapped query strings.
    """
    response_tag = '\n<response>'

    def _token_count(text):
        # Length of `text` in tokens, with no special tokens added.
        return len(tokenizer(text, add_special_tokens=False)['input_ids'])

    # Leave room for the markers that surround the query text.
    budget = query_max_len - _token_count('<s>') - _token_count('\n<response></s>')
    truncated = tokenizer(
        queries,
        max_length=budget,
        return_token_type_ids=False,
        truncation=True,
        return_tensors=None,
        add_special_tokens=False,
    )

    overhead = _token_count(examples_prefix) + _token_count(response_tag)
    new_max_length = (overhead + query_max_len + 8) // 8 * 8 + 8

    decoded = tokenizer.batch_decode(truncated['input_ids'])
    new_queries = [examples_prefix + text + response_tag for text in decoded]
    return new_max_length, new_queries

# One document per row of the mapping file, in file order; a document's list
# position is what get_predictions later returns as its ranked index.
documents = df_misconception_mapping['MisconceptionName'].tolist()

# Tokenizer matching the embedding model loaded further down.
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
# Text prepended to every query by get_new_queries; empty means no prefix.
examples_prefix = ''

##############################################
# Embedding Helper Functions
##############################################
# Truncation length handed to get_embeddings_in_batches for documents and queries.
MAX_LENGTH = query_max_len

def last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Return the hidden state of the last non-padding token of each sequence.

    Works for both padding sides: with left padding (or no padding at all) the
    last real token sits in the final position of every row; with right
    padding its position is ``attention_mask.sum(dim=1) - 1``.

    Args:
        last_hidden_states: Tensor indexed as (batch, position, hidden).
        attention_mask: Tensor indexed as (batch, position), 1 for real
            tokens and 0 for padding.

    Returns:
        Tensor with one hidden vector per batch row.
    """
    batch_size = last_hidden_states.shape[0]
    # Every row ends in a real token => batch is left-padded or unpadded, so
    # indexing by length would point into the middle of left-padded rows.
    ends_with_real_token = attention_mask[:, -1].sum() == batch_size
    if ends_with_real_token:
        return last_hidden_states[:, -1]
    sequence_lengths = attention_mask.sum(dim=1) - 1
    return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def get_embeddings_in_batches(model, tokenizer, texts, max_length, batch_size=4):
    """Embed ``texts`` batch by batch and return L2-normalised vectors on CPU.

    The model is moved to ``cuda:0`` when CUDA is available (otherwise CPU)
    and put in eval mode. Each batch is tokenized with padding and truncation
    to ``max_length``, run without gradients, pooled with ``last_token_pool``
    over the final hidden layer, normalised along dim 1, and moved to CPU.

    Args:
        model: Model that accepts ``output_hidden_states=True`` and exposes
            ``hidden_states`` on its output.
        tokenizer: Tokenizer producing tensors (including ``attention_mask``).
        texts: Sequence of strings to embed.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of texts per forward pass.

    Returns:
        The per-batch embeddings concatenated along dim 0.
    """
    target = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(target)
    model.eval()

    chunks = []
    batch_starts = range(0, len(texts), batch_size)
    for start in tqdm(batch_starts, desc="Embedding"):
        encoded = tokenizer(
            texts[start : start + batch_size],
            max_length=max_length,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        # Inputs must live on the same device as the model.
        encoded = {name: tensor.to(target) for name, tensor in encoded.items()}
        with torch.no_grad():
            result = model(**encoded, output_hidden_states=True)
            pooled = last_token_pool(result.hidden_states[-1], encoded["attention_mask"])
            unit_vectors = F.normalize(pooled, p=2, dim=1).cpu()
        chunks.append(unit_vectors)
    return torch.cat(chunks, dim=0)

##############################################
# Load the Qwen Model
##############################################
# Load the causal-LM checkpoint; embeddings are later taken from its hidden
# states (get_embeddings_in_batches passes output_hidden_states=True).
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    # Half precision when a GPU is available, full precision on CPU.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    # No automatic placement here; get_embeddings_in_batches calls model.to(device).
    device_map=None
)

##############################################
# Compute Embeddings for Documents (Misconceptions)
##############################################
# We'll embed documents once since they'll be reused
# Row i of doc_embeds corresponds to documents[i].
data_docs = documents
# NOTE(review): documents are truncated at MAX_LENGTH (= query_max_len);
# doc_max_len from the configuration is not used here — confirm this is intended.
doc_embeds = get_embeddings_in_batches(model, tokenizer, data_docs, max_length=MAX_LENGTH, batch_size=2)

##############################################
# Function to get predictions for a given df_input
##############################################
def get_predictions(df_input, doc_embeds, model, tokenizer):
    """Rank all documents for every prompt in ``df_input``.

    Args:
        df_input: DataFrame with a ``Prompt`` column.
        doc_embeds: Normalised document embeddings, one row per document.
        model: Embedding model passed to ``get_embeddings_in_batches``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        List (one entry per prompt) of the ``K`` best document row indices,
        ordered from highest to lowest similarity. These are positions into
        ``doc_embeds``, not MisconceptionId values.
    """
    queries = [get_detailed_instruct(task, q) for q in df_input['Prompt']]
    # Truncate the raw queries, then wrap them with the prefix and the
    # '\n<response>' suffix.
    new_max_length, new_queries = get_new_queries(queries, query_max_len, examples_prefix, tokenizer)
    # Embed with the length get_new_queries computed for the *wrapped* text.
    # Re-truncating at MAX_LENGTH could cut off the appended suffix (or the
    # query tail once examples_prefix is non-empty).
    query_embeds = get_embeddings_in_batches(model, tokenizer, new_queries, max_length=new_max_length, batch_size=2)
    # Both sides are L2-normalised, so the dot product is cosine similarity.
    scores = query_embeds @ doc_embeds.T
    sorted_indices = torch.argsort(scores, dim=1, descending=True)[:, :K].tolist()
    return sorted_indices

##############################################
# Evaluate Predictions using MAP@K
##############################################
def map_at_k(y_true, y_pred, k=25):
    """Return the mean over samples of the average precision at ``k``.

    Each sample has exactly one relevant item, so its average precision is
    ``1 / rank`` of that item within the first ``k`` predictions (1-based),
    or 0 when the item is not among them.

    Args:
        y_true: Iterable with the single relevant id of every sample.
        y_pred: Iterable of ranked prediction lists, parallel to ``y_true``.
        k: Cut-off rank.

    Returns:
        Mean of the per-sample scores, or 0.0 when there are no samples.
    """
    def _reciprocal_rank(target, ranked):
        # Only the first k predictions can earn credit.
        head = ranked[:k]
        if target not in head:
            return 0.0
        return 1.0 / (head.index(target) + 1)

    per_sample = [_reciprocal_rank(target, ranked) for target, ranked in zip(y_true, y_pred)]
    if not per_sample:
        return 0.0
    return np.mean(per_sample)

##############################################
# Get predictions and evaluate for Train Split
##############################################
# get_predictions returns row positions into `documents`. Translate them to
# MisconceptionId values before scoring or writing the submission, so the
# results stay correct even if misconception_mapping.csv is not ordered with
# ids equal to row positions (when it is, this mapping is the identity).
misconception_ids = df_misconception_mapping['MisconceptionId'].tolist()


def _indices_to_misconception_ids(sorted_indices):
    """Convert ranked document row positions into ranked MisconceptionIds."""
    return [[misconception_ids[idx] for idx in row] for row in sorted_indices]


train_sorted_indices = get_predictions(df_train_input, doc_embeds, model, tokenizer)
train_pred_ids = _indices_to_misconception_ids(train_sorted_indices)
train_map25 = map_at_k(train_target_ids, train_pred_ids, k=25)
print(f"Train MAP@25: {train_map25:.4f}")

##############################################
# Get predictions and evaluate for Validation
##############################################
val_sorted_indices = get_predictions(df_val_input, doc_embeds, model, tokenizer)
val_pred_ids = _indices_to_misconception_ids(val_sorted_indices)
val_map25 = map_at_k(val_target_ids, val_pred_ids, k=25)
print(f"Validation MAP@25: {val_map25:.4f}")

##############################################
# Finally, produce submission for Test Set
##############################################
# This runs regardless of IS_SUBMISSION; df_test_input keeps every wrong answer.
test_sorted_indices = get_predictions(df_test_input, doc_embeds, model, tokenizer)
test_pred_ids = _indices_to_misconception_ids(test_sorted_indices)

# One row per QuestionId_Answer with its ranked ids joined by single spaces.
df_test_input["MisconceptionId"] = [" ".join(str(x) for x in row) for row in test_pred_ids]
df_test_input[["QuestionId_Answer", "MisconceptionId"]].to_csv("submission.csv", index=False)
print("submission.csv created successfully!")


  from .autonotebook import tqdm as notebook_tqdm


IS_SUBMISSION: False


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.21it/s]
Embedding: 100%|██████████| 1294/1294 [00:57<00:00, 22.42it/s]
Embedding: 100%|██████████| 1752/1752 [03:05<00:00,  9.43it/s]


Train MAP@25: 0.0035


Embedding: 100%|██████████| 434/434 [00:45<00:00,  9.51it/s]


Validation MAP@25: 0.0054


Embedding: 100%|██████████| 5/5 [00:00<00:00, 11.32it/s]

submission.csv created successfully!



