In [1]:
import re
import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel

from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Directory from which both the tokenizer and the embedding model are loaded.
# (An earlier revision read this from cfg.MODEL_OUTPUT_PATH.)
embed_model_pth = "../model_output/trained_model"

In [3]:
# Read the three competition tables: the train and test question tables and the
# misconception lookup table whose names are embedded further down.
train = pd.read_csv("../competitions_data/train.csv")
test = pd.read_csv("../competitions_data/test.csv")
misconception_mapping = pd.read_csv("../competitions_data/misconception_mapping.csv")

In [4]:
# Keep only the columns of `train` that contain no missing values at all.
train = train.dropna(axis="columns")

In [5]:
# Load the tokenizer and the encoder from the same checkpoint directory, then
# move the encoder onto the first CUDA device (a GPU is required here).
tokenizer = AutoTokenizer.from_pretrained(embed_model_pth)
model = AutoModel.from_pretrained(embed_model_pth)
model = model.to("cuda:0")

In [6]:
# https://www.kaggle.com/code/pshikk/similarity-preprocessing

def preprocess_text(x):
    """Normalise a piece of text before it is tokenised.

    The steps run in this order:

    1. lowercase the text;
    2. delete ``@mention`` tokens;
    3. delete digit runs (including a directly preceding apostrophe);
    4. delete URL-like tokens that start with ``http``;
    5. collapse every whitespace run (newlines included) into one space;
    6. collapse repeated periods / repeated commas into a single one;
    7. strip leading and trailing whitespace.

    Args:
        x: The text to clean. Missing values (``None``, NaN, ``pd.NA``) are
            treated as empty text; any other non-string is converted with
            ``str`` first.

    Returns:
        The cleaned string.
    """
    if not isinstance(x, str):
        # A missing cell read from CSV arrives as float NaN, which has no
        # `.lower()`; map it to empty text instead of crashing.
        x = "" if pd.isna(x) else str(x)

    x = x.lower()
    # Raw strings throughout: "\w" / "\d" in a normal literal are invalid
    # escape sequences and trigger warnings on current Python versions.
    x = re.sub(r"@\w+", "", x)      # @mentions
    x = re.sub(r"'\d+", "", x)      # digits introduced by an apostrophe
    x = re.sub(r"\d+", "", x)       # remaining digit runs
    # `\S+` (not `\w+`) so the "://host/path" part of a URL is removed as well.
    x = re.sub(r"http\S+", "", x)
    x = re.sub(r"\s+", " ", x)      # whitespace runs -> single space
    x = re.sub(r"\.+", ".", x)      # "..." -> "."
    x = re.sub(r",+", ",", x)       # ",," -> ","
    return x.strip()

In [7]:
def generate_embeddings(texts, model, tokenizer, device, batch_size=8):
    """Encode texts into L2-normalised sentence vectors.

    Every text is passed through ``preprocess_text`` and the cleaned texts are
    encoded in batches of ``batch_size``. For each sequence the hidden state at
    position 0 of the model's last layer is taken as its embedding and scaled
    to unit L2 norm.

    Args:
        texts: Iterable of strings to embed.
        model: Encoder called as ``model(**inputs)``; its output must expose
            ``last_hidden_state``. Assumed to already live on ``device``.
        tokenizer: Tokenizer called on a list of strings, returning tensors.
        device: Device the tokenised batch is moved to.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A NumPy array with one row per input text.
    """
    cleaned = list(map(preprocess_text, texts))  # not part of the original upstream code
    chunks = []
    for start in range(0, len(cleaned), batch_size):
        encoded = tokenizer(
            cleaned[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=1024,
            return_tensors="pt",
        ).to(device)
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            model_output = model(**encoded)
        first_token_state = model_output.last_hidden_state[:, 0, :]  # CLS position
        unit_vectors = torch.nn.functional.normalize(first_token_state, p=2, dim=1)
        chunks.append(unit_vectors.cpu().numpy())
    return np.concatenate(chunks, axis=0)

In [8]:
# Embed every misconception name once; the test-set texts are compared
# against these vectors later on.
MisconceptionName = [*misconception_mapping["MisconceptionName"].values]
all_ctx_vector = generate_embeddings(
    MisconceptionName,
    model,
    tokenizer,
    device="cuda:0",
)

In [9]:
# Prepare test data
def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_question_text`` column built from subject, construct and question.

    The three source columns are joined with a blank line between them and the
    result is cleaned with ``preprocess_text`` (which collapses that blank line
    into a single space). ``df`` is modified in place and also returned.
    """
    separator = "\n\n"
    subject, construct, question = (
        df[column] for column in ("SubjectName", "ConstructName", "QuestionText")
    )
    df["all_question_text"] = subject + separator + construct + separator + question
    df["all_question_text"] = df["all_question_text"].apply(preprocess_text)
    return df

In [10]:
def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the question table to one row per (question, answer option).

    The four ``Answer?Text`` columns are unpivoted: the originating column name
    goes into ``Answer`` and the answer text into ``value``. ``QuestionId``,
    ``all_question_text`` and ``CorrectAnswer`` are repeated on every row; all
    other columns are dropped. The input frame is not modified.
    """
    key_columns = ["QuestionId", "all_question_text", "CorrectAnswer"]
    answer_columns = [f"Answer{letter}Text" for letter in "ABCD"]
    selected = df[key_columns + answer_columns]
    return selected.melt(id_vars=key_columns, var_name="Answer", value_name="value")

In [11]:
def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_text`` column combining question text and answer text.

    Uses ``all_question_text`` as-is and ``value`` after ``preprocess_text``,
    whichever of the two columns are present. Per row, missing parts are
    skipped and the remaining parts are joined with a blank line. ``df`` is
    modified in place and also returned.
    """
    def _join_present(row):
        # Drop missing parts so they do not end up as the text "nan".
        return "\n\n".join(row.dropna().astype(str))

    pieces = []
    if "all_question_text" in df.columns:
        pieces.append(df["all_question_text"])
    if "value" in df.columns:
        pieces.append(df["value"].apply(preprocess_text))

    side_by_side = pd.concat(pieces, axis=1)
    df["all_text"] = side_by_side.apply(_join_present, axis=1)
    return df

In [12]:
def simple_cosine(all_text_vector, all_ctx_vector):
    """Return the pairwise cosine-similarity matrix of two sets of row vectors.

    Entry ``[i, j]`` compares row ``i`` of ``all_text_vector`` with row ``j``
    of ``all_ctx_vector``.
    """
    return cosine_similarity(all_text_vector, all_ctx_vector)

def cdist_similarity(all_text_vector, all_ctx_vector, m):
    """Turn pairwise distances into similarity scores in ``(0, 1]``.

    Args:
        all_text_vector: 2-D array with one query vector per row.
        all_ctx_vector: 2-D array with one candidate vector per row.
        m: Distance metric passed to ``scipy.spatial.distance.cdist``.

    Returns:
        An array of shape ``(n_queries, n_candidates)`` holding
        ``1 / (1 + distance)``; a distance of zero maps to 1.
    """
    pairwise_distance = cdist(all_text_vector, all_ctx_vector, metric=m)
    return 1 / (pairwise_distance + 1)

In [13]:
# One row per (question, answer option), carrying the combined text to embed.
test = make_all_question_text(test)
test_long = make_all_text(wide_to_long(test))
test_long = test_long.sort_values(["QuestionId", "Answer"]).reset_index(drop=True)

# Embed the combined question/answer texts.
test_texts = list(test_long["all_text"].values)
all_text_vector = generate_embeddings(test_texts, model, tokenizer, device="cuda:0")

# Similarity derived from Euclidean distance; negating it before argsort puts
# the closest misconception first in every row.
e_sim = cdist_similarity(all_text_vector, all_ctx_vector, "euclidean")
si = e_sim
sim = np.argsort(-si, axis=1)

# Submission key "<QuestionId>_<letter>", the letter being taken from the
# melted column name (e.g. "AnswerBText" -> "B").
test_long["Answer_alphabet"] = test_long["Answer"].str.extract(r'Answer([A-Z])Text$')
test_long["QuestionId_Answer"] = (
    test_long["QuestionId"].astype("str") + "_" + test_long["Answer_alphabet"]
)

# Top 25 candidates per row, space separated.
# NOTE(review): these are row positions in `misconception_mapping`; this assumes
# position i corresponds to MisconceptionId i — confirm against the CSV.
test_long["MisconceptionId"] = [
    " ".join(map(str, top_ids)) for top_ids in sim[:, :25].tolist()
]

# Only incorrect answer options are kept; the correct one is dropped.
test_long = test_long[test_long["CorrectAnswer"].ne(test_long["Answer_alphabet"])]

In [14]:
# Keep just the two submission columns and renumber the rows from zero.
submission_columns = ["QuestionId_Answer", "MisconceptionId"]
submission = test_long.loc[:, submission_columns].reset_index(drop=True)

In [15]:
# Preview the first ten submission rows as a sanity check.
submission.head(n=10)

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,2306 1507 706 2488 1005 1516 328 2532 1672 196...
1,1869_C,2306 1507 706 2488 1005 328 1516 1672 2532 208...
2,1869_D,2306 1507 2488 706 1005 2532 1516 638 1392 205...
3,1870_A,2142 2068 1535 167 547 1755 891 1593 1256 2398...
4,1870_B,2142 2068 1535 167 547 1755 891 1593 1256 2398...
5,1870_C,2068 2142 1755 167 891 1535 547 2398 885 1593 ...
6,1871_A,1287 1073 2439 397 365 1923 1349 1677 2551 231...
7,1871_C,1287 1073 2439 397 365 1923 1349 1677 2319 255...
8,1871_D,1287 1073 2439 397 365 1923 1349 1677 2551 231...


In [16]:
# Write the submission file without the DataFrame index column.
submission.to_csv(
    path_or_buf="../competitions_data/submission.csv",
    index=False,
)