In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import torch

pd.set_option('display.max_rows', 300)

# Run-mode flag. In the code visible here it is only echoed by the print below.
IS_SUBMISSION = False
# Number of ranked misconception candidates kept per query.
K = 25

# Use the requested model
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Smaller model

print('IS_SUBMISSION:', IS_SUBMISSION)

# GPU index 4 is the preferred device, but a machine with fewer than five
# visible GPUs would fail on 'cuda:4'; fall back to GPU 0 in that case and
# to the CPU when CUDA is not available at all.
PREFERRED_GPU_INDEX = 4
if torch.cuda.is_available():
    gpu_index = (
        PREFERRED_GPU_INDEX
        if torch.cuda.device_count() > PREFERRED_GPU_INDEX
        else 0
    )
    device = f'cuda:{gpu_index}'
    torch.cuda.empty_cache()
else:
    device = 'cpu'

# Load data
# NOTE: fillna(-1) is applied to the whole training frame, so every missing
# cell (whatever its column) becomes -1; the test frame is read as-is.
df_full_train = pd.read_csv("./data/train.csv")
df_full_train = df_full_train.fillna(-1)
df_test = pd.read_csv("./data/test.csv")
df_misconception_mapping = pd.read_csv("./data/misconception_mapping.csv")

# Hold out 20% of the training rows for validation (fixed seed), and give
# both parts a fresh 0..n-1 index.
df_train_split, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(df_full_train, test_size=0.2, random_state=42)
)

# Instruction sentences appended to every query under the [GUIDANCE] tag.
# The two sentences are joined with a single space.
MISCONCEPTION_GUIDANCE = " ".join([
    "A misconception is a commonly misunderstood concept in mathematics.",
    "Your task: Identify the math misconception related to the chosen wrong answer.",
])

def clean_text(text):
    """Return *text* as a single-line string without LaTeX math delimiters.

    Non-string inputs are converted with ``str()`` first. The delimiters
    ``\\(``, ``\\)``, ``\\[`` and ``\\]`` are then removed, one kind at a
    time in that order, and every run of whitespace is collapsed to a single
    space (leading and trailing whitespace is dropped).
    """
    cleaned = text if isinstance(text, str) else str(text)

    # Strip the delimiters sequentially; the order matters when removing one
    # marker leaves another one behind.
    for marker in ("\\(", "\\)", "\\[", "\\]"):
        cleaned = cleaned.replace(marker, "")

    # split() without arguments splits on any whitespace run.
    return " ".join(cleaned.split())

def prepare_input_df(df, is_submission):
    """Build one retrieval query per eligible wrong answer of every question.

    For each row of *df*, every choice in A-D other than the row's
    ``CorrectAnswer`` yields a tagged text block (subject, construct,
    question, all four answers, the correct answer text, the chosen wrong
    answer and the guidance sentence).

    Args:
        df: DataFrame of questions. Columns are read with ``row.get``, so a
            missing column falls back to an empty string (text fields) or
            -1 (``Misconception{choice}Id``).
        is_submission: When False, choices whose misconception id is -1 or
            missing (NaN/None) are skipped. When True, every wrong choice is
            kept and an absent id is recorded as -1.

    Returns:
        A tuple ``(df_input, target_ids)``: ``df_input`` has the columns
        ``QuestionId_Answer`` and ``Text`` (present even when no row
        qualifies), and ``target_ids`` is the list of integer misconception
        ids aligned with its rows.
    """
    items = []
    target_ids = []
    for _, row in df.iterrows():
        correct_answer = row.get("CorrectAnswer", "")
        correct_answer_text = clean_text(row.get(f"Answer{correct_answer}Text", ""))

        # These fields are the same for every choice of the row, so they are
        # cleaned once here instead of once per wrong answer.
        question_text = clean_text(row.get('QuestionText', ''))
        answer_texts = {
            letter: clean_text(row.get(f'Answer{letter}Text', ''))
            for letter in ('A', 'B', 'C', 'D')
        }
        cName = clean_text(row.get('ConstructName', ''))
        sName = clean_text(row.get('SubjectName', ''))

        for choice, chosen_wrong in answer_texts.items():
            if choice == correct_answer:
                continue

            raw_id = row.get(f'Misconception{choice}Id', -1)
            # A missing id (NaN/None) is treated like the -1 sentinel rather
            # than being passed to int(), which would raise.
            id_missing = pd.isna(raw_id)
            if not is_submission and (id_missing or raw_id == -1):
                continue
            target_id = -1 if id_missing else int(raw_id)

            q_id_ans = f"{row['QuestionId']}_{choice}"

            # Add structured tags and highlight chosen wrong answer
            full_context = (
                f"[SUBJECT]: {sName}\n"
                f"[CONSTRUCT]: {cName}\n"
                f"[QUESTION]: {question_text}\n"
                f"[ANSWERS]: A) {answer_texts['A']} B) {answer_texts['B']} "
                f"C) {answer_texts['C']} D) {answer_texts['D']}\n"
                f"[CORRECT_ANSWER]: {correct_answer_text}\n"
                f"[CHOSEN_WRONG_ANSWER]: <<{chosen_wrong}>>\n"
                f"[GUIDANCE]: {MISCONCEPTION_GUIDANCE}"
            )

            items.append({'QuestionId_Answer': q_id_ans, 'Text': full_context})
            target_ids.append(target_id)

    # Explicit columns keep the frame usable (df_input['Text']) even when no
    # row produced a query.
    df_input = pd.DataFrame(items, columns=['QuestionId_Answer', 'Text'])
    return df_input, target_ids

# Turn each labelled split into queries; only wrong answers that carry a
# misconception id are kept for these two.
(df_train_input, train_target_ids), (df_val_input, val_target_ids) = (
    prepare_input_df(labelled_split, is_submission=False)
    for labelled_split in (df_train_split, df_val)
)
# The test frame keeps every wrong answer; its target ids are discarded.
df_test_input, _ = prepare_input_df(df_test, is_submission=True)

# Retrieval candidates: the misconception names, coerced to str, in the
# row order of the mapping file.
misconceptions = (
    df_misconception_mapping['MisconceptionName']
    .astype(str)
    .tolist()
)

# Load the chosen sentence-transformer model
model = SentenceTransformer(embedding_model_name, device=device)

def embed_texts(texts, model, batch_size=8):
    """Encode *texts* with *model* and return whatever ``model.encode`` gives.

    Args:
        texts: The texts to embed, passed through to ``model.encode``.
        model: Object exposing an ``encode`` method (here a
            SentenceTransformer instance).
        batch_size: Batch size forwarded to ``encode``.

    The call always requests NumPy output and shows a progress bar.
    """
    encode_options = {
        "batch_size": batch_size,
        "convert_to_numpy": True,
        "show_progress_bar": True,
    }
    return model.encode(texts, **encode_options)

# Embed the candidate misconception names once; every query set below is
# scored against this same matrix.
print("Embedding misconceptions...")
misconception_embeds = embed_texts(texts=misconceptions, model=model)

def get_predictions(df_input, doc_embeds):
    """Rank candidate documents for every query in *df_input*.

    The ``Text`` column is embedded with the module-level ``model``, scored
    against *doc_embeds* with a plain dot product, and the column indices of
    the ``K`` highest scores are returned per query, best first.

    Args:
        df_input: DataFrame with a ``Text`` column holding the queries.
        doc_embeds: Candidate embeddings; its transpose is multiplied with
            the query embeddings, so rows are presumably one candidate each.

    Returns:
        Integer array of candidate indices, one row per query, ordered from
        highest to lowest score.
    """
    query_texts = df_input['Text'].tolist()
    query_vectors = embed_texts(query_texts, model)

    scores = query_vectors @ doc_embeds.T

    # argsort is ascending, so the best K sit at the end of each row; slice
    # them off and flip so the best candidate comes first.
    ascending_order = np.argsort(scores, axis=1)
    best_k_ascending = ascending_order[:, -K:]
    return best_k_ascending[:, ::-1]

def map_at_k(y_true, y_pred, k=25):
    """Mean average precision at *k* for queries with one relevant item each.

    For every ``(true, preds)`` pair the score is ``1 / rank`` of the first
    occurrence of ``true`` within the first *k* entries of ``preds``
    (1-based rank), or 0.0 when it does not appear there.

    Args:
        y_true: Iterable of the single relevant id for each query.
        y_pred: Iterable of ranked predictions per query; each entry may be
            a NumPy array or any sequence (list, tuple).
        k: Cut-off; only the first *k* predictions of each query count.

    Returns:
        The mean score over all queries as a float; 0.0 when there are no
        queries.
    """
    average_precisions = []
    for true, preds in zip(y_true, y_pred):
        # Coerce to an array so the element-wise comparison also works for
        # plain lists, where `preds[:k] == true` would be a single bool.
        top_k = np.asarray(preds)[:k]
        hit_positions = np.flatnonzero(top_k == true)
        if hit_positions.size:
            average_precisions.append(1.0 / (hit_positions[0] + 1))
        else:
            average_precisions.append(0.0)
    return float(np.mean(average_precisions)) if average_precisions else 0.0

# NOTE(review): get_predictions returns positional indices into the
# misconception list. Scoring them against the target ids, and writing them
# out as MisconceptionId, presumably relies on the mapping file listing ids
# 0..n-1 in row order -- confirm against misconception_mapping.csv.

# Score the training split (no fitting happens in this cell, so this is just
# a second evaluation set).
train_sorted_indices = get_predictions(df_train_input, misconception_embeds)
train_map25 = map_at_k(train_target_ids, train_sorted_indices, 25)
print(f"Train MAP@25: {train_map25:.4f}")

# Score the held-out validation split.
val_sorted_indices = get_predictions(df_val_input, misconception_embeds)
val_map25 = map_at_k(val_target_ids, val_sorted_indices, 25)
print(f"Validation MAP@25: {val_map25:.4f}")

# Rank candidates for the test queries and write them as space-separated
# indices, best first, one line per QuestionId_Answer.
test_sorted_indices = get_predictions(df_test_input, misconception_embeds)
df_test_input["MisconceptionId"] = [
    " ".join(str(candidate) for candidate in ranked_row)
    for ranked_row in test_sorted_indices
]
submission_columns = ["QuestionId_Answer", "MisconceptionId"]
df_test_input[submission_columns].to_csv("submissionv12.csv", index=False)
print("submissionv12.csv created successfully!")


  from .autonotebook import tqdm as notebook_tqdm


IS_SUBMISSION: False
Embedding misconceptions...


Batches: 100%|██████████| 324/324 [00:01<00:00, 168.09it/s]
Batches: 100%|██████████| 438/438 [00:04<00:00, 106.85it/s]


Train MAP@25: 0.1471


Batches: 100%|██████████| 109/109 [00:00<00:00, 111.73it/s]


Validation MAP@25: 0.1546


Batches: 100%|██████████| 2/2 [00:00<00:00, 69.32it/s]


submissionv12.csv created successfully!
