In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [2]:
# Import required libraries
import pandas as pd
from utils import (
    load_qwen_with_lora, prepare_items, get_detailed_instruct, 
    get_new_queries, save_data
)
print(1)
# Configuration
IS_SUBMISSION = True
base_model_path = "Qwen/Qwen2.5-7B-Instruct"  # Base model
lora_path = "saved_models/lora-14b-1126"  # LoRA adapter from Hugging Face
query_max_len, doc_max_len = 320, 48
examples_prefix = ""
task = "Given a math multiple-choice problem with a student's wrong answer, retrieve the math misconceptions"
print(2)
# Load datasets
df_train = pd.read_csv("./data/train.csv").fillna(-1).sample(10, random_state=42).reset_index(drop=True)
df_test = pd.read_csv("./data/test.csv")
df_misconception_mapping = pd.read_csv("./data/misconception_mapping.csv")
print(3)
# Select the appropriate dataset
df_ret = df_test if IS_SUBMISSION else df_train

# Prepare items and targets
df_input, target_ids = prepare_items(df_ret, IS_SUBMISSION)
print(4)
# Prepare queries and documents
queries = [get_detailed_instruct(task, q) for q in df_input['Prompt']]
documents = df_misconception_mapping['MisconceptionName'].tolist()
print(5)
# Load Qwen model with LoRA integration
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)

# Integrate the LoRA adapter
model = PeftModel.from_pretrained(model, lora_path)
print(6)
# Tokenize queries and generate new queries
new_query_max_len, new_queries = get_new_queries(queries, query_max_len, examples_prefix, tokenizer)

# Save the tokenized data to a JSON file
save_data(new_queries, documents)


  from .autonotebook import tqdm as notebook_tqdm


1
2
3
4
5


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.63it/s]
Some parameters are on the meta device because they were offloaded to the cpu.


6




In [3]:
import json

with open("data/data.json", "r") as f:
    data = json.load(f)

# Inspect the keys and sample entries
print(data.keys())
print(data["texts"][:5])


dict_keys(['texts'])
["<instruct>Given a math multiple-choice problem with a student's wrong answer, retrieve the math misconceptions\n<query>Question: \\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\n\nSubjectName: BIDMAS\nConstructName: Use the order of operations to carry out calculations involving powers\nCorrect answer: \\( 3 \\times(2+4)-5 \\)\nStudent wrong answer: \\( 3 \\times 2+(4-5) \\)\n<response>", "<instruct>Given a math multiple-choice problem with a student's wrong answer, retrieve the math misconceptions\n<query>Question: \\[\n3 \\times 2+4-5\n\\]\nWhere do the brackets need to go to make the answer equal \\( 13 \\) ?\n\nSubjectName: BIDMAS\nConstructName: Use the order of operations to carry out calculations involving powers\nCorrect answer: \\( 3 \\times(2+4)-5 \\)\nStudent wrong answer: \\( 3 \\times(2+4-5) \\)\n<response>", "<instruct>Given a math multiple-choice problem with a student's wrong answer, retrieve the m

In [4]:
# Step 7: Load the tokenized data
import json

with open("data.json", "r") as f:
    data = json.load(f)

queries = data['texts'][:len(new_queries)]
documents = data['texts'][len(new_queries):]

# Step 8: Encode queries and documents
from transformers import pipeline

# Create an embedding pipeline using the loaded tokenizer and model
embedder = pipeline("feature-extraction", model=model, tokenizer=tokenizer, device=0)

query_embeddings = embedder(queries)
document_embeddings = embedder(documents)

# Step 9: Calculate cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = cosine_similarity(query_embeddings, document_embeddings)

# Step 10: Retrieve top misconceptions
import numpy as np

top_k = 5  # Number of top misconceptions to retrieve
results = []
for i, scores in enumerate(similarity_scores):
    top_indices = np.argsort(scores)[-top_k:][::-1]
    top_docs = [documents[idx] for idx in top_indices]
    results.append({
        "Query": queries[i],
        "TopMisconceptions": top_docs
    })

# Step 11: Save results for submission
output_file = "submission.json"
with open(output_file, "w") as f:
    json.dump(results, f, indent=4)

print(f"Results saved to {output_file}")


ValueError: The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please discard the `device` argument when creating your pipeline object.