In [12]:
import pandas as pd
# Model responses to be scored; the path is kept in its own variable.
targetStr = 'output/zero_shot_model_responses_qwen.csv'
targetDf = pd.read_csv(filepath_or_buffer=targetStr)
# Reference table; later cells read its 'Embedding' and 'MisconceptionId' columns.
ground_truth = pd.read_json(path_or_buf='groud_truth_embedding.json')

In [13]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from tqdm import tqdm  # For progress bar

# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# .eval() returns the module itself, so evaluation mode can be set as part of loading.
model = DistilBertModel.from_pretrained('distilbert-base-uncased').eval()
def get_embedding(text, max_length=512, mask_padding=False):
    """Embed ``text`` as one vector by mean-pooling the model's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str
        Text to embed.
    max_length : int, default 512
        Length the tokenized input is truncated and padded to.
    mask_padding : bool, default False
        If False (the default), average over every one of the ``max_length``
        positions, padding included; this reproduces the original behaviour.
        If True, use the tokenizer's attention mask so that only real tokens
        contribute to the average.

    Returns
    -------
    numpy.ndarray
        The pooled hidden state with singleton dimensions squeezed away.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    with torch.no_grad():  # Inference only, no gradients needed
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    if mask_padding:
        # Zero out padded positions and divide by the number of real tokens,
        # so short texts are not dominated by padding.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
    else:
        # NOTE(review): this plain mean also averages over padded positions.
        # It stays the default because the stored ground-truth embeddings were
        # presumably produced with the same pooling - confirm before switching.
        pooled = hidden.mean(dim=1)

    return pooled.squeeze().numpy()
# Embed every generated response in row order; tqdm reports progress as the list is built.
embeddings = [
    get_embedding(response)
    for response in tqdm(targetDf['Generated Response'], desc="Generating embeddings")
]
targetDf['Embeddings_Generated'] = embeddings

Generating embeddings: 100%|██████████| 437/437 [00:40<00:00, 10.90it/s]


In [14]:
# Lookup table that is merged onto the predictions by 'MisconceptionId' further down.
misconception_df = pd.read_csv(
    filepath_or_buffer='../embedding_generator/misconception_mapping.csv'
)
from sklearn.metrics.pairwise import cosine_similarity
def find_most_similar_id(target_embedding, ground_truth_df):
    """Return the ``MisconceptionId`` of the reference row closest to ``target_embedding``.

    Closeness is the cosine similarity between ``target_embedding`` and each
    entry of ``ground_truth_df['Embedding']``; the row with the highest score
    is selected.
    """
    reference_vectors = list(ground_truth_df['Embedding'])

    # The query is wrapped in a one-element list so it is passed as a single row.
    scores = cosine_similarity([target_embedding], reference_vectors)

    # With only one query row, the flat argmax is the position of the best reference row.
    best_position = scores.argmax()

    best_row = ground_truth_df.iloc[best_position]
    return best_row['MisconceptionId']

# Label every generated embedding with the id of its nearest ground-truth entry.
# `args` forwards `ground_truth` as the second positional argument on each call.
targetDf['prediction_result'] = targetDf['Embeddings_Generated'].apply(
    find_most_similar_id, args=(ground_truth,)
)

In [15]:
# Rename the prediction column to the join key, then left-join the lookup table
# so every row of targetDf is kept.
targetDf = (
    targetDf
    .rename(columns={'prediction_result': 'MisconceptionId'})
    .merge(misconception_df, on='MisconceptionId', how='left')
)

In [16]:
# Share of rows whose looked-up misconception name equals the expected one.
print(
    int(targetDf['Expected Misconception'].eq(targetDf['MisconceptionName']).sum())
    / len(targetDf)
)

0.002288329519450801


In [17]:
# Spot-check: generated response for the row labelled 100.
print(targetDf.loc[100, 'Generated Response'])

Both Tom and Katie are incorrect.


In [18]:
# Spot-check: prompt for the row labelled 100.
print(targetDf.loc[100, 'Prompt'])

Instruction: Why is the given answer wrong under such circumstances?
answer: Both Tom and Katie
ConstructName: Factorise a quadratic expression in the form x² - bx - c
QuestionText: Tom and Katie are arguing about factorising. Tom says \( x^{2}+5 x+6 \equiv(x+3)(x+2) \) 
Katie says \( x^{2}-5 x-6 \equiv(x-3)(x-2) \) 
Who is correct?


In [19]:
# Spot-check: generated response for the row labelled 0.
targetDf.loc[0, 'Generated Response']

'The given answer is incorrect because it does not accurately represent the function \\( y = x^2 + 4 \\). The provided image describes a function machine where the input \\( x \\) goes through a sequence of operations (addition and squaring), but it does not include the addition of 4 at any point, which is necessary for the equation \\( y = x^2 + 4 \\).'

In [20]:
# Spot-check: prompt for the row labelled 0.
targetDf.loc[0, 'Prompt']

'Instruction: Why is the given answer wrong under such circumstances?\nanswer: ![A function machine which has 4 parts joined by arrows pointing from left to right. "y" is the first part, written on the left, followed by a horizontal arrow pointing to a rectangle that has "+ 4" written inside it, followed by a horizontal arrow pointing to a rectangle that has "square" written inside it, followed by a horizontal arrow pointing to "𝑥"]()\nConstructName: Express a non-linear equation as a function machine\nQuestionText: Which function machine matches the equation \\( y=x^{2}+4 ? \\)'