In [1]:
import os
import json
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch
from utils import (
    load_qwen_with_lora, prepare_items, get_detailed_instruct, 
    get_new_queries, save_data
)

torch.cuda.empty_cache()
print("CUDA memory cleared.")
# Set CUDA device and configure memory management
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("CUDA device set to 4 and memory configuration adjusted.")

# Configuration
print("Setting up configuration...")
IS_SUBMISSION = True
base_model_path = "Qwen/Qwen2.5-7B-Instruct"  # Base model path
lora_path = "saved_models/lora-14b-1126"  # LoRA adapter path
query_max_len, doc_max_len = 320, 48
examples_prefix = ""
task = "Given a math multiple-choice problem with a student's wrong answer, retrieve the math misconceptions"
print("Configuration setup completed.")

# Step 1: Load datasets
print("Loading datasets...")
df_train = pd.read_csv("./data/train.csv").fillna(-1).sample(10, random_state=42).reset_index(drop=True)
df_test = pd.read_csv("./data/test.csv")
df_misconception_mapping = pd.read_csv("./data/misconception_mapping.csv")
print("Datasets loaded successfully.")

# Step 2: Select the appropriate dataset
df_ret = df_test if IS_SUBMISSION else df_train
print(f"Dataset selected for {'submission' if IS_SUBMISSION else 'training'}.")

# Step 3: Prepare items and targets
print("Preparing items and targets...")
df_input, target_ids = prepare_items(df_ret, IS_SUBMISSION)
print(f"Prepared {len(df_input)} items and targets.")

# Step 4: Prepare queries and documents
print("Preparing queries and documents...")
queries = [get_detailed_instruct(task, q) for q in df_input['Prompt']]
documents = df_misconception_mapping['MisconceptionName'].tolist()
print(f"Prepared {len(queries)} queries and {len(documents)} documents.")

# Step 5: Load Qwen model with LoRA integration
print("Loading Qwen model with LoRA integration...")
model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model, tokenizer = load_qwen_with_lora(base_model_path, lora_path)
print("Model and tokenizer loaded successfully.")

# Step 6: Tokenize queries and generate new queries
print("Tokenizing queries and generating new queries...")
new_query_max_len, new_queries = get_new_queries(queries, query_max_len, examples_prefix, tokenizer)
print(f"Tokenized {len(new_queries)} new queries.")

# Step 7: Save the tokenized data to a JSON file
print("Saving tokenized data to 'data.json'...")
save_data(new_queries, documents)
print("Tokenized data saved successfully.")

# Step 8: Load the tokenized data
print("Loading tokenized data from 'data.json'...")
with open("data.json", "r") as f:
    data = json.load(f)

queries = data['texts'][:len(new_queries)]
documents = data['texts'][len(new_queries):]
print(f"Loaded {len(queries)} queries and {len(documents)} documents from 'data.json'.")

# Step 9: Encode queries and documents with reduced batch size
print("Encoding queries and documents...")
embedder = pipeline("feature-extraction", model=model, tokenizer=tokenizer, framework="pt")

# Process queries and documents in smaller batches to reduce memory usage
def batch_process(items, batch_size):
    for i in range(0, len(items), batch_size):
        yield items[i:i+batch_size]

query_embeddings = []
for batch in batch_process(queries, batch_size=2):  # Adjust batch size if needed
    query_embeddings.extend(embedder(batch))

print("Queries encoded successfully.")

document_embeddings = []
for batch in batch_process(documents, batch_size=2):
    document_embeddings.extend(embedder(batch))

print("Documents encoded successfully.")

# Step 10: Calculate cosine similarity
print("Calculating cosine similarity...")

# Ensure embeddings are 2D arrays
query_embeddings = np.squeeze(query_embeddings)
document_embeddings = np.squeeze(document_embeddings)

similarity_scores = cosine_similarity(query_embeddings, document_embeddings)
print("Cosine similarity calculated.")

# Step 11: Retrieve top misconceptions
print("Retrieving top misconceptions...")
top_k = 5  # Number of top misconceptions to retrieve
results = []
for i, scores in enumerate(similarity_scores):
    top_indices = np.argsort(scores)[-top_k:][::-1]
    top_docs = [documents[idx] for idx in top_indices]
    results.append({
        "Query": queries[i],
        "TopMisconceptions": top_docs
    })
print("Top misconceptions retrieved.")

# Step 12: Save results for submission
print("Saving results to 'submission.json'...")
output_file = "submission.json"
with open(output_file, "w") as f:
    json.dump(results, f, indent=4)
print(f"Results saved to {output_file}.")


  from .autonotebook import tqdm as notebook_tqdm


CUDA memory cleared.
CUDA device set to 4 and memory configuration adjusted.
Setting up configuration...
Configuration setup completed.
Loading datasets...
Datasets loaded successfully.
Dataset selected for submission.
Preparing items and targets...
Prepared 9 items and targets.
Preparing queries and documents...
Prepared 9 queries and 2587 documents.
Loading Qwen model with LoRA integration...


Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.07s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.08it/s]
Device set to use cuda:0


Model and tokenizer loaded successfully.
Tokenizing queries and generating new queries...
Tokenized 9 new queries.
Saving tokenized data to 'data.json'...
Tokenized data saved successfully.
Loading tokenized data from 'data.json'...
Loaded 9 queries and 2587 documents from 'data.json'.
Encoding queries and documents...


OutOfMemoryError: CUDA out of memory. Tried to allocate 26.00 MiB. GPU 0 has a total capacity of 15.72 GiB of which 7.44 MiB is free. Including non-PyTorch memory, this process has 15.71 GiB memory in use. Of the allocated memory 15.54 GiB is allocated by PyTorch, and 16.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [2]:
import os
import json
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from utils import (
    load_qwen_with_lora, prepare_items, get_detailed_instruct, 
    get_new_queries, save_data
)

# Set CUDA device and configure memory management
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("CUDA device set to 4 and memory configuration adjusted.")

# Configuration
print("Setting up configuration...")
IS_SUBMISSION = True
base_model_path = "Qwen/Qwen2.5-7B-Instruct"  # Base model path
lora_path = "saved_models/lora-14b-1126"  # LoRA adapter path
query_max_len, doc_max_len = 320, 48
examples_prefix = ""
task = "Given a math multiple-choice problem with a student's wrong answer, retrieve the math misconceptions"
print("Configuration setup completed.")

# Step 1: Load datasets
print("Loading datasets...")
df_train = pd.read_csv("./data/train.csv").fillna(-1).sample(10, random_state=42).reset_index(drop=True)
df_test = pd.read_csv("./data/test.csv")
df_misconception_mapping = pd.read_csv("./data/misconception_mapping.csv")
print("Datasets loaded successfully.")

# Step 2: Select the appropriate dataset
df_ret = df_test if IS_SUBMISSION else df_train
print(f"Dataset selected for {'submission' if IS_SUBMISSION else 'training'}.")

# Step 3: Prepare items and targets
print("Preparing items and targets...")
df_input, target_ids = prepare_items(df_ret, IS_SUBMISSION)
print(f"Prepared {len(df_input)} items and targets.")

# Step 4: Prepare queries and documents
print("Preparing queries and documents...")
queries = [get_detailed_instruct(task, q) for q in df_input['Prompt']]
documents = df_misconception_mapping['MisconceptionName'].tolist()
print(f"Prepared {len(queries)} queries and {len(documents)} documents.")

# Step 5: Load Qwen model with LoRA integration
print("Loading Qwen model with LoRA integration...")
model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model, tokenizer = load_qwen_with_lora(base_model_path, lora_path)
print("Model and tokenizer loaded successfully.")

# Step 6: Tokenize queries and generate new queries
print("Tokenizing queries and generating new queries...")
new_query_max_len, new_queries = get_new_queries(queries, query_max_len, examples_prefix, tokenizer)
print(f"Tokenized {len(new_queries)} new queries.")

# Step 7: Save the tokenized data to a JSON file
print("Saving tokenized data to 'data.json'...")
save_data(new_queries, documents)
print("Tokenized data saved successfully.")

# Step 8: Load the tokenized data
print("Loading tokenized data from 'data.json'...")
with open("data.json", "r") as f:
    data = json.load(f)

queries = data['texts'][:len(new_queries)]
documents = data['texts'][len(new_queries):]
print(f"Loaded {len(queries)} queries and {len(documents)} documents from 'data.json'.")

# Step 9: Encode queries and documents with reduced batch size
print("Encoding queries and documents...")
embedder = pipeline("feature-extraction", model=model, tokenizer=tokenizer, framework="pt")

# Process queries and documents in smaller batches to reduce memory usage
def batch_process(items, batch_size):
    for i in range(0, len(items), batch_size):
        yield items[i:i+batch_size]

query_embeddings = []
for batch in batch_process(queries, batch_size=2):  # Adjust batch size if needed
    query_embeddings.extend(embedder(batch))

print("Queries encoded successfully.")

document_embeddings = []
for batch in batch_process(documents, batch_size=2):
    document_embeddings.extend(embedder(batch))

print("Documents encoded successfully.")

# Step 10: Calculate cosine similarity
print("Calculating cosine similarity...")
similarity_scores = cosine_similarity(query_embeddings, document_embeddings)
print("Cosine similarity calculated.")

# Step 11: Retrieve top misconceptions
print("Retrieving top misconceptions...")
top_k = 5  # Number of top misconceptions to retrieve
results = []
for i, scores in enumerate(similarity_scores):
    top_indices = np.argsort(scores)[-top_k:][::-1]
    top_docs = [documents[idx] for idx in top_indices]
    results.append({
        "Query": queries[i],
        "TopMisconceptions": top_docs
    })
print("Top misconceptions retrieved.")

# Step 12: Save results for submission
print("Saving results to 'submission.json'...")
output_file = "submission.json"
with open(output_file, "w") as f:
    json.dump(results, f, indent=4)
print(f"Results saved to {output_file}.")

CUDA device set to 4 and memory configuration adjusted.
Setting up configuration...
Configuration setup completed.
Loading datasets...
Datasets loaded successfully.
Dataset selected for submission.
Preparing items and targets...
Prepared 9 items and targets.
Preparing queries and documents...
Prepared 9 queries and 2587 documents.
Loading Qwen model with LoRA integration...


Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.04it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.09it/s]
Device set to use cuda:0


Model and tokenizer loaded successfully.
Tokenizing queries and generating new queries...
Tokenized 9 new queries.
Saving tokenized data to 'data.json'...
Tokenized data saved successfully.
Loading tokenized data from 'data.json'...
Loaded 9 queries and 2587 documents from 'data.json'.
Encoding queries and documents...
Queries encoded successfully.


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Documents encoded successfully.
Calculating cosine similarity...


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (9, 1) + inhomogeneous part.