## Dependencies

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, CrossEncoder
from tqdm import tqdm
import torch
from utils import *
# Show up to 300 rows when a DataFrame is displayed in the notebook.
pd.set_option('display.max_rows', 300)

# Run-level switches and retrieval sizes.
IS_SUBMISSION = False
K = 25                # number of misconceptions kept per query after re-ranking
INTERMEDIATE_K = 200  # Increased from 100 to 200

# Device selection: keep using the second GPU ('cuda:1') when one exists, but
# fall back to the only GPU ('cuda:0') on single-GPU machines instead of
# handing the models a device index that does not exist. CPU is used when
# CUDA is unavailable.
if not torch.cuda.is_available():
    device_bi = 'cpu'
elif torch.cuda.device_count() > 1:
    device_bi = 'cuda:1'
else:
    device_bi = 'cuda:0'
# The bi-encoder and the cross-encoder share the same device.
device_ce = device_bi

print('IS_SUBMISSION:', IS_SUBMISSION)

torch.cuda.empty_cache()


In [None]:
print("Loading data...")
# Missing values in the training frame are replaced by the sentinel -1;
# the test frame and the misconception table are read as-is.
df_full_train = pd.read_csv("./data/train.csv").fillna(-1)
df_misconception_mapping = pd.read_csv("./data/misconception_mapping.csv")
df_test = pd.read_csv("./data/test.csv")

# 80/20 train/validation split with a fixed seed; both parts get a fresh
# 0..n-1 index.
df_train_split, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(df_full_train, test_size=0.2, random_state=42)
)

# Candidate documents for retrieval: the misconception names, as strings,
# in the row order of the mapping file.
misconceptions = df_misconception_mapping['MisconceptionName'].astype(str).tolist()

# Instruction text appended to every query.
MISCONCEPTION_GUIDANCE = (
    "A misconception is a commonly misunderstood concept in mathematics. "
    "Your task: Identify the math misconception related to the chosen wrong answer."
)


## Helper functions

In [None]:
def clean_text(text):
    """Strip LaTeX math delimiters and collapse whitespace.

    Non-string input is converted with ``str()`` first. The delimiters
    ``\\(``, ``\\)``, ``\\[`` and ``\\]`` are removed, then every run of
    whitespace (including newlines) is replaced by a single space and
    leading/trailing whitespace is dropped.

    Returns:
        The cleaned string.
    """
    cleaned = text if isinstance(text, str) else str(text)
    for delimiter in ("\\(", "\\)", "\\[", "\\]"):
        cleaned = cleaned.replace(delimiter, "")
    # split() with no argument splits on any whitespace run and discards
    # empty pieces, so re-joining normalizes spacing.
    return " ".join(cleaned.split())

def prepare_input_df(df, is_submission):
    """Build one retrieval query per (question, wrong answer) pair.

    For every row of ``df`` and every answer choice in A-D other than the
    row's ``CorrectAnswer``, a tagged text block is assembled from the
    subject, construct, question, correct answer text and the chosen wrong
    answer text, followed by ``MISCONCEPTION_GUIDANCE``.

    Args:
        df: DataFrame of questions. Columns are read with ``row.get`` and
            default to an empty string (or -1 for the misconception id)
            when absent; ``QuestionId`` is required.
        is_submission: When False, choices whose misconception id is
            missing (-1 or NaN) are skipped. When True, every wrong choice
            is kept and a missing id is recorded as -1.

    Returns:
        A tuple ``(inputs, target_ids)`` where ``inputs`` is a DataFrame
        with columns ``QuestionId_Answer`` and ``Text`` (present even when
        no rows are produced) and ``target_ids`` is a list of ints aligned
        with the rows of ``inputs``.
    """
    items = []
    target_ids = []
    print("Preparing input dataframe...")
    for _, row in df.iterrows():
        correct_answer = row.get("CorrectAnswer", "")
        correct_answer_text = clean_text(row.get(f"Answer{correct_answer}Text", ""))
        # These fields are the same for every choice of the row, so clean
        # them once instead of once per choice.
        question_text = clean_text(row.get('QuestionText', ''))
        cName = clean_text(row.get('ConstructName', ''))
        sName = clean_text(row.get('SubjectName', ''))

        for choice in ['A', 'B', 'C', 'D']:
            if choice == correct_answer:
                continue
            misconception_id = row.get(f'Misconception{choice}Id', -1)
            # Treat NaN like the -1 sentinel so a frame that was not passed
            # through fillna(-1) does not crash in int() below.
            if pd.isna(misconception_id):
                misconception_id = -1
            if not is_submission and misconception_id == -1:
                continue

            q_id_ans = f"{row['QuestionId']}_{choice}"
            chosen_wrong = clean_text(row.get(f'Answer{choice}Text', ''))

            # Focus on question, correct and chosen wrong answer only
            # Emphasize wrong answer by special tokens
            full_context = (
                f"[SUBJECT]: {sName}\n"
                f"[CONSTRUCT]: {cName}\n"
                f"[QUESTION]: {question_text}\n"
                f"[CORRECT_ANSWER]: {correct_answer_text}\n"
                f"[CHOSEN_WRONG_ANSWER]: <<{chosen_wrong}>>\n"
                f"[GUIDANCE]: {MISCONCEPTION_GUIDANCE}"
            )

            items.append({'QuestionId_Answer': q_id_ans, 'Text': full_context})
            target_ids.append(int(misconception_id))
    # Explicit columns keep the schema stable when no rows were produced,
    # so callers can still access inputs['Text'].
    return pd.DataFrame(items, columns=['QuestionId_Answer', 'Text']), target_ids



## Model and evaluation

In [None]:
# Turn each split into (query frame, target ids). Training and validation
# keep only labelled wrong answers; the test split keeps every wrong answer
# and its target list is discarded.
df_train_input, train_target_ids = prepare_input_df(df=df_train_split, is_submission=False)
df_val_input, val_target_ids = prepare_input_df(df=df_val, is_submission=False)
df_test_input, _ = prepare_input_df(df=df_test, is_submission=True)

# Checkpoint names for the two retrieval stages.
bi_model_name = "sentence-transformers/all-mpnet-base-v2"
# A slightly larger cross-encoder for better re-ranking performance
ce_model_name = "cross-encoder/ms-marco-MiniLM-L-12-v2"

print("Loading bi-encoder model...")
bi_model = SentenceTransformer(bi_model_name, device=device_bi)

print("Loading cross-encoder model...")
ce_model = CrossEncoder(ce_model_name, device=device_ce)


In [None]:
def embed_texts_bi(texts, model, batch_size=16):
    """Encode ``texts`` with ``model`` and return the result of ``model.encode``.

    Args:
        texts: Sized collection of strings to embed.
        model: Object exposing ``encode(texts, batch_size=...,
            convert_to_numpy=..., show_progress_bar=...)``.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns; NumPy output and a progress bar
        are requested.
    """
    n_texts = len(texts)
    print(f"Embedding {n_texts} texts with bi-encoder...")
    encode_options = {
        "batch_size": batch_size,
        "convert_to_numpy": True,
        "show_progress_bar": True,
    }
    return model.encode(texts, **encode_options)

# Embed the candidate misconception names once; every retrieval call below
# reuses this matrix as its document side.
print("Embedding misconceptions with bi-encoder...")
misconception_embeds = embed_texts_bi(texts=misconceptions, model=bi_model)

def initial_retrieve(df_input, doc_embeds):
    """First-stage retrieval: rank documents by dot product with each query.

    The ``Text`` column of ``df_input`` is embedded with the module-level
    ``bi_model`` and scored against ``doc_embeds`` with a plain dot product.

    Args:
        df_input: DataFrame with a ``Text`` column of query strings.
        doc_embeds: 2-D array of document embeddings, one row per document.

    Returns:
        ``(top_candidates, sim)`` where ``sim`` is the full query-by-document
        score matrix and ``top_candidates`` holds, per query, the indices of
        the ``INTERMEDIATE_K`` highest-scoring documents in descending score
        order.
    """
    print("Initial retrieval with bi-encoder...")
    queries = df_input['Text'].tolist()
    query_embeds = embed_texts_bi(queries, bi_model)
    print("Computing similarity for initial retrieval...")
    sim = query_embeds.dot(doc_embeds.T)
    # argsort is ascending: keep the last INTERMEDIATE_K columns, then flip
    # them so the best match comes first.
    ascending_order = np.argsort(sim, axis=1)
    best_last = ascending_order[:, -INTERMEDIATE_K:]
    top_candidates = best_last[:, ::-1]
    return top_candidates, sim

def re_rank(df_input, top_candidates):
    """Second-stage ranking: re-score first-stage candidates with the cross-encoder.

    Every (query text, candidate misconception text) pair is collected into
    one flat list and scored with a single call to the module-level
    ``ce_model.predict``. The scores are then cut back into per-query
    slices, and each query keeps its ``K`` best candidates.

    Args:
        df_input: DataFrame with a ``Text`` column of query strings.
        top_candidates: Per-query candidate indices into ``misconceptions``.
            Each row is indexed with an integer array below, so rows are
            expected to be NumPy arrays.

    Returns:
        A list with one entry per query: the candidate indices ordered by
        descending cross-encoder score, truncated to ``K``.
    """
    print("Re-ranking with cross-encoder...")
    pair_buffer = []
    spans = []  # (start, stop, candidate ids) for each query's slice of pair_buffer
    cursor = 0
    for row_idx, query in enumerate(df_input['Text'].tolist()):
        candidate_ids = top_candidates[row_idx]
        pair_buffer.extend((query, misconceptions[cid]) for cid in candidate_ids)
        stop = cursor + len(candidate_ids)
        spans.append((cursor, stop, candidate_ids))
        cursor = stop

    print("Predicting scores with cross-encoder...")
    scores = ce_model.predict(pair_buffer)
    print("Done cross-encoder predictions, now sorting results...")
    # Ascending argsort reversed gives best-first positions within the slice;
    # those positions are mapped back to misconception indices.
    return [
        candidate_ids[np.argsort(scores[begin:stop])[::-1][:K]]
        for begin, stop, candidate_ids in spans
    ]

def map_at_k(y_true, y_pred, k=25):
    """Mean average precision at ``k`` for single-label targets.

    Each query has exactly one correct label, so its average precision is
    ``1 / rank`` of the first occurrence of that label within the top ``k``
    predictions, or 0 when it does not appear.

    Args:
        y_true: Iterable of true labels, one per query.
        y_pred: Iterable of ranked predictions per query. Each entry may be
            a NumPy array or any sequence accepted by ``np.asarray``
            (for example a plain list).
        k: Cutoff; only the first ``k`` predictions of each query count.

    Returns:
        The mean of the per-query scores, or 0.0 when there are no queries.
    """
    print("Calculating MAP@K...")
    average_precisions = []
    for true, preds in zip(y_true, y_pred):
        # Coerce to an array so the element-wise comparison also works for
        # plain lists (list == scalar would yield a single bool).
        top_k = np.asarray(preds)[:k]
        hit_positions = np.flatnonzero(top_k == true)
        if hit_positions.size:
            average_precisions.append(1.0 / (hit_positions[0] + 1))
        else:
            average_precisions.append(0.0)
    return np.mean(average_precisions) if average_precisions else 0.0

In [1]:

# Evaluate the two-stage pipeline on the training split. The metric cutoff
# and the printed label follow K so they stay in step with the number of
# candidates that re_rank keeps.
print("Starting evaluation on training split...")
train_candidates, _ = initial_retrieve(df_train_input, misconception_embeds)
train_reranked = re_rank(df_train_input, train_candidates)
train_map25 = map_at_k(train_target_ids, train_reranked, k=K)
print(f"Train MAP@{K}: {train_map25:.4f}")

# Same evaluation on the held-out validation split.
print("Starting evaluation on validation split...")
val_candidates, _ = initial_retrieve(df_val_input, misconception_embeds)
val_reranked = re_rank(df_val_input, val_candidates)
val_map25 = map_at_k(val_target_ids, val_reranked, k=K)
print(f"Validation MAP@{K}: {val_map25:.4f}")

# Rank misconceptions for the test set and write them as space-separated
# ids, one row per QuestionId_Answer.
print("Generating final submission for test set...")
test_candidates, _ = initial_retrieve(df_test_input, misconception_embeds)
test_reranked = re_rank(df_test_input, test_candidates)
# Single source for the output path so the file written and the message
# printed cannot drift apart.
submission_path = "./data/submissionsv9.csv"
df_test_input["MisconceptionId"] = [" ".join(map(str, row)) for row in test_reranked]
df_test_input[["QuestionId_Answer", "MisconceptionId"]].to_csv(submission_path, index=False)
print(f"submission.csv created successfully at '{submission_path}'!")


  from .autonotebook import tqdm as notebook_tqdm


IS_SUBMISSION: False
Loading data...
Preparing input dataframe...
Preparing input dataframe...
Preparing input dataframe...
Loading bi-encoder model...
Loading cross-encoder model...
Embedding misconceptions with bi-encoder...
Embedding 2587 texts with bi-encoder...


Batches: 100%|██████████| 162/162 [00:02<00:00, 79.73it/s] 


Starting evaluation on training split...
Initial retrieval with bi-encoder...
Embedding 3503 texts with bi-encoder...


Batches: 100%|██████████| 219/219 [00:15<00:00, 14.02it/s]


Computing similarity for initial retrieval...
Re-ranking with cross-encoder...
Predicting scores with cross-encoder...
Done cross-encoder predictions, now sorting results...
Calculating MAP@K...
Train MAP@25: 0.1169
Starting evaluation on validation split...
Initial retrieval with bi-encoder...
Embedding 867 texts with bi-encoder...


Batches: 100%|██████████| 55/55 [00:03<00:00, 13.91it/s]


Computing similarity for initial retrieval...
Re-ranking with cross-encoder...
Predicting scores with cross-encoder...
Done cross-encoder predictions, now sorting results...
Calculating MAP@K...
Validation MAP@25: 0.1045
Generating final submission for test set...
Initial retrieval with bi-encoder...
Embedding 9 texts with bi-encoder...


Batches: 100%|██████████| 1/1 [00:00<00:00, 20.39it/s]

Computing similarity for initial retrieval...
Re-ranking with cross-encoder...
Predicting scores with cross-encoder...





Done cross-encoder predictions, now sorting results...
submission.csv created successfully at './data/submissionsv9.csv'!


In [None]:
##############################################
# Combined Code Incorporating Multiple Methods
##############################################

# Methods Used:
# 1) Add Structured Features to the Text Representation
#    - We will prepend tags like [SUBJECT], [CONSTRUCT].
#    - Highlight the wrong answer text distinctly.
#    - Remove LaTeX and normalize formatting.
#
# 2) Two-Stage Retrieval (Re-rank with a Cross-Encoder)
#    - Stage 1: Use a sentence-transformer bi-encoder for initial retrieval of the top INTERMEDIATE_K
#      misconceptions (originally 100, now 200).
#    - Stage 2: Use a cross-encoder to re-rank these candidates and pick the top K (25).
#
# 3) Incorporate Misconception Definitions into Query
#    - Add a generic definition of a "misconception" or guidance text to the query prompt.
#
# 4) Filter or Normalize Input Text
#    - Remove LaTeX markers and any complicated formatting from the question text and answers.
#
# Model Choices:
# - For the initial retrieval (Stage 1), we use a sentence-transformer model suitable for semantic search:
#   "sentence-transformers/all-mpnet-base-v2" (good trade-off between performance and model size).
#
# - For the cross-encoder (Stage 2), we use a model specialized for re-ranking:
#   "cross-encoder/ms-marco-MiniLM-L-12-v2" (a larger variant of "cross-encoder/ms-marco-MiniLM-L-6-v2").
#
# Adjust batch sizes and device as needed to handle memory constraints.

##############################################
# Attempt with Different Strategies:
# - Larger cross-encoder (e.g. "cross-encoder/ms-marco-MiniLM-L-12-v2")
# - Only use correct and chosen wrong answer in the query to reduce noise
# - Increase intermediate retrieval to 200
# - Emphasize wrong answer
##############################################