In [1]:
##############################################
# Combined Code Incorporating Multiple Methods
##############################################

# Methods Used:
# 1) Add Structured Features to the Text Representation
#    - We will prepend tags like [SUBJECT], [CONSTRUCT].
#    - Highlight the wrong answer text distinctly.
#    - Strip LaTeX math delimiters (\( \) \[ \]) and normalize whitespace.
#
# 2) Two-Stage Retrieval (Re-rank with a Cross-Encoder)
#    - Stage 1: Use a sentence-transformer bi-encoder for initial retrieval of top 100 misconceptions.
#    - Stage 2: Use a cross-encoder to re-rank these top 100 misconceptions and pick the top 25.
#
# 3) Incorporate Misconception Definitions into Query
#    - Add a generic definition of a "misconception" or guidance text to the query prompt.
#
# 4) Filter or Normalize Input Text
#    - Remove LaTeX markers and any complicated formatting from the question text and answers.
#
# Model Choices:
# - For the initial retrieval (Stage 1), we use a sentence-transformer model suitable for semantic search:
#   "sentence-transformers/all-mpnet-base-v2" (good trade-off between performance and model size).
#
# - For the cross-encoder (Stage 2), we use a model specialized for re-ranking:
#   "cross-encoder/ms-marco-MiniLM-L-6-v2"
#
# Adjust batch sizes and device as needed to handle memory constraints.

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, CrossEncoder
from tqdm import tqdm
import torch

pd.set_option('display.max_rows', 300)

IS_SUBMISSION = False
K = 25  # final top K after re-ranking
INTERMEDIATE_K = 100  # number of top misconceptions to retrieve before cross-encoder re-ranking

# Set devices (if GPU memory is limited, consider cpu for large steps).
# GPU index 4 is the preferred card, but that index only exists on hosts with
# at least five visible GPUs. Fall back to the first GPU on smaller machines
# instead of failing later when the models are moved to a non-existent device.
PREFERRED_GPU_INDEX = 4
if torch.cuda.is_available():
    _gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    device_bi = f'cuda:{_gpu_index}'
    device_ce = f'cuda:{_gpu_index}'
else:
    device_bi = 'cpu'
    device_ce = 'cpu'

print('IS_SUBMISSION:', IS_SUBMISSION)

torch.cuda.empty_cache()

##############################################
# Load Data
##############################################
# Training rows: every missing value (including absent misconception ids)
# is replaced by the sentinel -1.
df_full_train = pd.read_csv("./data/train.csv")
df_full_train = df_full_train.fillna(-1)
df_test = pd.read_csv("./data/test.csv")
df_misconception_mapping = pd.read_csv("./data/misconception_mapping.csv")

# Hold out 20% of the questions for validation (fixed seed) and give both
# partitions a fresh 0..n-1 index.
df_train_split, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(df_full_train, test_size=0.2, random_state=42)
)

# Candidate documents for retrieval: the misconception names, as strings,
# in the row order of the mapping file.
misconceptions = (
    df_misconception_mapping['MisconceptionName'].astype(str).tolist()
)

##############################################
# Text Preprocessing and Query Formation
##############################################
def clean_text(text):
    """Return ``text`` with LaTeX math delimiters removed and whitespace collapsed.

    Non-string inputs are converted with ``str()`` first. The delimiters
    ``\\(``, ``\\)``, ``\\[`` and ``\\]`` are deleted; any other LaTeX markup
    is left as is. Runs of whitespace (including newlines and tabs) become a
    single space, and leading/trailing whitespace is dropped.
    """
    cleaned = text if isinstance(text, str) else str(text)
    for delimiter in ("\\(", "\\)", "\\[", "\\]"):
        cleaned = cleaned.replace(delimiter, "")
    # split() with no argument splits on any whitespace run and discards
    # empty pieces, so re-joining normalizes the spacing.
    return " ".join(cleaned.split())

# Generic definition of a misconception plus a short task instruction,
# kept as a single constant so the wording is defined in one place.
MISCONCEPTION_GUIDANCE = " ".join((
    "A misconception is a commonly misunderstood concept in mathematics.",
    "Identify the math misconception related to the chosen wrong answer.",
))

def prepare_input_df(df, is_submission):
    """Build one retrieval query per (question, wrong answer) pair.

    Args:
        df: DataFrame with one row per question. ``QuestionId`` is required;
            ``CorrectAnswer``, ``QuestionText``, ``Answer{A..D}Text``,
            ``ConstructName``, ``SubjectName`` and ``Misconception{A..D}Id``
            are read with defaults when absent.
        is_submission: When False, wrong answers without a known misconception
            id (``-1`` or NaN) are skipped. When True, every wrong answer is
            kept and its target id is ``-1`` if unknown.

    Returns:
        A tuple ``(df_input, target_ids)``. ``df_input`` has the columns
        ``QuestionId_Answer`` (``"<QuestionId>_<choice>"``) and ``Text`` (the
        tagged query string); it has these columns even when no rows are
        produced. ``target_ids`` is a list of ints aligned with the rows of
        ``df_input``.
    """
    items = []
    target_ids = []
    for _, row in df.iterrows():
        correct_answer = row.get("CorrectAnswer", "")

        # Everything except the chosen wrong answer is identical for all
        # choices of a question, so clean and format it once per row.
        question_text = clean_text(row.get('QuestionText', ''))
        answers = {
            letter: clean_text(row.get(f'Answer{letter}Text', ''))
            for letter in ('A', 'B', 'C', 'D')
        }
        construct_name = clean_text(row.get('ConstructName', ''))
        subject_name = clean_text(row.get('SubjectName', ''))
        shared_prefix = (
            f"[SUBJECT]: {subject_name}\n"
            f"[CONSTRUCT]: {construct_name}\n"
            f"[QUESTION]: {question_text}\n"
            f"[ANSWERS]: A) {answers['A']} B) {answers['B']} C) {answers['C']} D) {answers['D']}\n"
            f"[CORRECT_ANSWER]: {correct_answer}\n"
        )

        for choice in ['A', 'B', 'C', 'D']:
            if choice == correct_answer:
                continue
            misconception_id = row.get(f'Misconception{choice}Id', -1)
            # A frame that was not pre-filled carries NaN for unknown ids;
            # NaN != -1 and int(NaN) raises, so normalize to the sentinel.
            if pd.isna(misconception_id):
                misconception_id = -1
            if not is_submission and misconception_id == -1:
                # Skip cases without known misconception for evaluation
                continue

            q_id_ans = f"{row['QuestionId']}_{choice}"

            # Structured representation: shared tags, then the highlighted
            # wrong answer, then the misconception guidance.
            full_context = (
                f"{shared_prefix}"
                f"[CHOSEN_WRONG_ANSWER]: {choice} => {answers[choice]}\n"
                f"[GUIDANCE]: {MISCONCEPTION_GUIDANCE}"
            )

            items.append({'QuestionId_Answer': q_id_ans, 'Text': full_context})
            target_ids.append(int(misconception_id))

    # Explicit columns keep the schema stable when ``items`` is empty, so
    # callers can still index ``df_input['Text']``.
    df_input = pd.DataFrame(items, columns=['QuestionId_Answer', 'Text'])
    return df_input, target_ids

# Build the query frames. Train/validation keep only wrong answers that have
# a labelled misconception; the test frame keeps every wrong answer and its
# target list is discarded.
# NOTE(review): the module-level IS_SUBMISSION flag is not consulted here;
# is_submission is passed as a literal for each split.
df_train_input, train_target_ids = prepare_input_df(df_train_split, is_submission=False)
df_val_input, val_target_ids = prepare_input_df(df_val, is_submission=False)
df_test_input, _ = prepare_input_df(df_test, is_submission=True)

##############################################
# Loading Models
##############################################
# Stage 1: Bi-Encoder for initial retrieval (placed on device_bi)
bi_model_name = "sentence-transformers/all-mpnet-base-v2"
bi_model = SentenceTransformer(bi_model_name, device=device_bi)

# Stage 2: Cross-Encoder for re-ranking top candidates (placed on device_ce)
ce_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
ce_model = CrossEncoder(ce_model_name, device=device_ce)

##############################################
# Embedding Functions
##############################################
def embed_texts_bi(texts, model, batch_size=16):
    """Encode ``texts`` with ``model`` and return whatever ``model.encode`` returns.

    Args:
        texts: The texts to embed; passed to ``model.encode`` unchanged.
        model: Object exposing an ``encode`` method (a bi-encoder in this file).
        batch_size: Forwarded to ``model.encode`` as ``batch_size``.

    Returns:
        The result of ``model.encode`` called with ``convert_to_numpy=True``
        and ``show_progress_bar=True``.
    """
    encode_options = {
        "batch_size": batch_size,
        "convert_to_numpy": True,
        "show_progress_bar": True,
    }
    return model.encode(texts, **encode_options)

# Pre-embed misconceptions with bi-encoder.
# Computed once at module level; the result is passed as `doc_embeds` to every
# initial_retrieve call below, so rows follow the order of `misconceptions`.
misconception_embeds = embed_texts_bi(misconceptions, bi_model)

def initial_retrieve(df_input, doc_embeds):
    """Stage 1: rank documents for every query by embedding dot product.

    Args:
        df_input: DataFrame whose ``Text`` column holds the query strings.
        doc_embeds: Document embedding matrix; it is transposed and multiplied
            with the query embeddings, so its rows are the documents.

    Returns:
        A tuple ``(top_candidates, sim)``. ``sim`` is the full query-by-document
        dot-product matrix. ``top_candidates`` holds, per query, the column
        indices of the ``INTERMEDIATE_K`` highest scores, best first.
    """
    query_embeds = embed_texts_bi(df_input['Text'].tolist(), bi_model)
    # NOTE(review): a plain dot product equals cosine similarity only if the
    # encoder returns unit-length vectors -- confirm for the chosen model.
    sim = query_embeds.dot(doc_embeds.T)
    # argsort is ascending: take the last INTERMEDIATE_K columns of each row,
    # then reverse them so the most similar document comes first.
    ascending = np.argsort(sim, axis=1)
    top_candidates = ascending[:, -INTERMEDIATE_K:][:, ::-1]
    return top_candidates, sim

def re_rank(df_input, top_candidates):
    """Stage 2: re-score each query's candidates with the cross-encoder.

    Args:
        df_input: DataFrame whose ``Text`` column holds the query strings.
        top_candidates: Per-query arrays of indices into ``misconceptions``,
            aligned row by row with ``df_input``.

    Returns:
        A list with one array per query: the candidate indices with the ``K``
        highest cross-encoder scores, best first.
    """
    reranked_indices = []
    for row_idx, query in enumerate(df_input['Text'].tolist()):
        candidates = top_candidates[row_idx]
        # One (query, misconception text) pair per candidate.
        pairs = [(query, misconceptions[idx]) for idx in candidates]
        scores = ce_model.predict(pairs)  # higher = more relevant
        # Positions of the K best scores, highest first.
        best_first = np.argsort(scores)[::-1][:K]
        # Map those positions back to misconception indices.
        reranked_indices.append(candidates[best_first])
    return reranked_indices

##############################################
# MAP@K Calculation
##############################################
def map_at_k(y_true, y_pred, k=25):
    """Mean average precision at ``k`` for queries with exactly one true label.

    With a single relevant item per query, the average precision is the
    reciprocal of the rank of the true label within the first ``k``
    predictions, or 0 if it is not among them.

    Args:
        y_true: Iterable of true labels, one per query.
        y_pred: Iterable of ranked predictions per query (best first). Each
            entry may be a NumPy array, list, or tuple.
        k: Number of leading predictions to consider per query.

    Returns:
        The mean of the per-query scores, or ``0.0`` when there are no queries.
    """
    average_precisions = []
    for true, preds in zip(y_true, y_pred):
        # Coerce to an array so the element-wise comparison also works for
        # plain lists and tuples, where `preds[:k] == true` would be a bool.
        top_k = np.asarray(preds)[:k]
        hits = np.flatnonzero(top_k == true)
        # Rank is 1-based; only the first occurrence counts.
        average_precisions.append(1.0 / (hits[0] + 1) if hits.size else 0.0)
    return np.mean(average_precisions) if average_precisions else 0.0

##############################################
# Evaluate on Training Split
##############################################
# Two-stage pipeline on the training split: bi-encoder retrieval, then
# cross-encoder re-ranking, scored with MAP@25.
# NOTE(review): no model is fitted in the visible code, so this split acts as
# a second held-out measurement rather than a training-fit score.
# NOTE(review): predictions are row positions in `misconceptions`, compared
# against MisconceptionId targets -- assumes position == id in the mapping
# file; confirm.
train_candidates, _ = initial_retrieve(df_train_input, misconception_embeds)
train_reranked = re_rank(df_train_input, train_candidates)
train_map25 = map_at_k(train_target_ids, train_reranked, k=25)
print(f"Train MAP@25: {train_map25:.4f}")

##############################################
# Evaluate on Validation
##############################################
# Same pipeline and metric on the 20% validation split.
val_candidates, _ = initial_retrieve(df_val_input, misconception_embeds)
val_reranked = re_rank(df_val_input, val_candidates)
val_map25 = map_at_k(val_target_ids, val_reranked, k=25)
print(f"Validation MAP@25: {val_map25:.4f}")

##############################################
# Final Submission
##############################################
# Single source of truth for the output location, so the file that is written
# and the path reported in the success message cannot drift apart. The path
# kept here is the one the file was actually being written to.
SUBMISSION_PATH = "./data/submissionsv8.csv"

# Run the two-stage pipeline on the test queries.
test_candidates, _ = initial_retrieve(df_test_input, misconception_embeds)
test_reranked = re_rank(df_test_input, test_candidates)
# One space-separated string of ranked misconception indices per query.
df_test_input["MisconceptionId"] = [" ".join(map(str, row)) for row in test_reranked]
df_test_input[["QuestionId_Answer", "MisconceptionId"]].to_csv(SUBMISSION_PATH, index=False)
print(f"submission.csv created successfully at '{SUBMISSION_PATH}'!")


  from .autonotebook import tqdm as notebook_tqdm


IS_SUBMISSION: False


Batches: 100%|██████████| 162/162 [00:02<00:00, 72.77it/s] 
Batches: 100%|██████████| 219/219 [00:19<00:00, 11.17it/s]


Train MAP@25: 0.1297


Batches: 100%|██████████| 55/55 [00:05<00:00, 10.72it/s]


Validation MAP@25: 0.1174


Batches: 100%|██████████| 1/1 [00:00<00:00, 18.10it/s]


submission.csv created successfully at './data/submissionv8.csv'!
