In [22]:
import pandas as pd
# Reference table read from JSON; later cells read its 'Embedding' and
# 'MisconceptionId' columns.
# NOTE(review): the file name is spelled "groud_..." -- presumably this matches
# the file on disk; confirm before renaming anything.
ground_truth = pd.read_json(path_or_buf='groud_truth_embedding.json')

# Model responses under evaluation; later cells read the 'Generated Response',
# 'Prompt' and 'Expected Misconception' columns of this frame.
targetStr = 'output/few_shot_model_responses_qwen.csv'
targetDf = pd.read_csv(filepath_or_buffer=targetStr)

In [24]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from tqdm import tqdm  # For progress bar
# Tokenizer and encoder are loaded from the same pretrained checkpoint so that
# the vocabulary and the weights agree.
_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(_checkpoint)
model = DistilBertModel.from_pretrained(_checkpoint)
# Inference only: switch the module to evaluation mode.
model.eval()
def get_embedding(text, max_length=512, ignore_padding=False):
    """Return one pooled encoder vector for ``text`` as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (truncated and
    padded to ``max_length``), run through the module-level ``model`` without
    gradient tracking, and the last hidden state is averaged along dim 1.

    Args:
        text: Input text handed to the tokenizer.
        max_length: Length the tokenizer pads/truncates to. Defaults to 512,
            the value that was previously hard-coded.
        ignore_padding: When False (default), the mean runs over every
            position, padding included -- identical to the previous
            behaviour. When True, positions whose ``attention_mask`` is 0 are
            excluded from the mean, so the padding no longer dilutes the
            vector for short inputs.
            NOTE(review): any stored reference embeddings were presumably
            produced with padding included; regenerate them with the same
            setting before switching this on, otherwise the two sets of
            vectors are not comparable.

    Returns:
        The pooled vector after ``squeeze()``, converted with ``.numpy()``.
    """
    # Tokenize into a fixed-length PyTorch batch containing this one text.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    with torch.no_grad():  # inference only, no autograd bookkeeping
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state  # assumed (batch, seq, hidden) -- per HF docs
    if ignore_padding:
        # Weight each position by its attention mask so padded positions
        # contribute nothing, then divide by the number of real tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    else:
        # Plain mean over all positions (padding included).
        pooled = hidden.mean(dim=1)
    return pooled.squeeze().numpy()
# Embed every generated response (one get_embedding call per row, with a tqdm
# progress bar) and keep the vectors next to the text they came from.
embeddings = [
    get_embedding(response)
    for response in tqdm(targetDf['Generated Response'], desc="Generating embeddings")
]
targetDf['Embeddings_Generated'] = embeddings

Generating embeddings: 100%|██████████| 437/437 [00:42<00:00, 10.33it/s]


In [25]:
# Mapping table that is later merged onto the predictions by 'MisconceptionId'.
misconception_df = pd.read_csv(
    filepath_or_buffer='../embedding_generator/misconception_mapping.csv'
)
from sklearn.metrics.pairwise import cosine_similarity
def find_most_similar_id(target_embedding, ground_truth_df):
    """Return the ``MisconceptionId`` of the reference row closest to a vector.

    Args:
        target_embedding: One embedding vector to look up.
        ground_truth_df: Frame with an 'Embedding' column (one vector per row)
            and a 'MisconceptionId' column.

    Returns:
        The 'MisconceptionId' value of the row whose embedding has the highest
        cosine similarity to ``target_embedding``.
    """
    reference_vectors = ground_truth_df['Embedding'].tolist()
    # One query row against every reference row.
    scores = cosine_similarity([target_embedding], reference_vectors)

    # With a single query row, the flat argmax is the best reference row.
    best_position = np.argmax(scores)
    best_row = ground_truth_df.iloc[best_position]
    return best_row['MisconceptionId']

# Nearest-neighbour lookup for every generated embedding: each element is
# passed to find_most_similar_id together with the reference frame.
targetDf['prediction_result'] = targetDf['Embeddings_Generated'].apply(
    find_most_similar_id, args=(ground_truth,)
)

In [26]:
# Attach the mapping table's columns to each prediction by MisconceptionId.
#
# The guard makes this cell safe to re-run: once 'prediction_result' has been
# renamed and merged, running the cell again is a no-op. Without it, a second
# run would merge the mapping table again and pandas would suffix the
# overlapping columns with _x/_y, so the later lookup of 'MisconceptionName'
# would fail. rename() is used without inplace so the frame is rebuilt in one
# chain rather than mutated and then rebound.
if 'prediction_result' in targetDf.columns:
    targetDf = (
        targetDf
        .rename(columns={'prediction_result': 'MisconceptionId'})
        .merge(misconception_df, on='MisconceptionId', how='left')
    )

In [27]:
# Top-1 accuracy: share of rows whose retrieved misconception name equals the
# expected one. The mean of the boolean match mask is the vectorized form of
# sum(mask) / len(frame); on an empty frame it prints nan instead of raising
# ZeroDivisionError.
is_match = targetDf['Expected Misconception'] == targetDf['MisconceptionName']
print(is_match.mean())

0.004576659038901602


In [28]:
# Spot-check: generated response for the row labelled 100.
print(targetDf.loc[100, 'Generated Response'])

Both Tom and Katie are incorrect. The correct factorizations are:
- For \( x^2 + 5x + 6 \), the correct factorization is \( (x+2)(x+3) \).
- For \( x^2 - 5x - 6 \), the correct factorization is \( (x-6)(x+1) \).


In [29]:
# Spot-check: prompt that produced the response for the row labelled 100.
print(targetDf.loc[100, 'Prompt'])

Instruction: Why is the given answer wrong under such circumstances? Some of the examples are given below
Example from before: 
Example2731, Question :\( 24 \) people went to an local ice hockey match.
The pie chart shows the colours of their shirts.
How many people wore red? ![Pie chart divided into 8 equal sections. 4 sections are yellow, 2 sections are red, 1 section is black and 1 section is white.]()
Answer: \( 3 \)
Example567, Question :This is a part of the table of values for the equation
\[y=3 x^{2}\] \begin{tabular}{|l|l|}
\hline\( x \) & \( 0.3 \) \\
\hline\( y \) & \( \bigstar \) \\
\hline
\end{tabular} What should replace the star?
Answer: \( 0.81 \)
Example3546, Question :Tom and Katie are discussing regular polygons. Tom says this is a regular polygon ![A triangle with 3 equal sides and 3 equal angles]() Katie says this is a regular polygon ![A parallelogram with 2 pairs of parallel sides marked]() Who is correct?
Answer: Neither is correct
Example3892, Question :Simplif

In [30]:
# Spot-check: raw (repr) generated response for the row labelled 0.
targetDf.loc[0, 'Generated Response']

"Example2832, Answer: Both Tom and Katie\n\nThe given answer is incorrect because Tom's proposed next step, \\( \\frac{3x}{2} = 0.6 \\), is indeed a valid transformation from the original equation \\( \\frac{3x}{2} + 1 = 1.6 \\). However, Katie's proposed next step, \\( 3x + 2 = 2.12 \\), is incorrect. The correct next step should isolate the term with \\( x \\) on one side of the equation, and the correct form after subtracting 1 from both sides would be \\( \\frac{3x}{2} = 0.6 \\), not \\( 3x + 2 = 2.12 \\)."

In [31]:
# Spot-check: raw (repr) prompt for the row labelled 0.
targetDf.loc[0, 'Prompt']

'Instruction: Why is the given answer wrong under such circumstances? Some of the examples are given below\nExample from before: \nExample958, Question :Factorise this expression, if possible:\r\n\\(\r\np^{2}-9\r\n\\)\nAnswer: \\( (p-3)(p-3) \\)\nExample917, Question :I am facing East. \\( \\mathrm{E} \\longrightarrow \\) How many degrees anti-clockwise will I need to turn so I am facing South?\nAnswer: \\( 90^{\\circ} \\)\nExample129, Question :These two lines are ... ![Two lines on a graph meeting at a right angle]()\nAnswer: parallelogram\nExample234, Question :Which angle is corresponding to angle \\( p \\) ? ![Image showing two parallel lines cut by a transversal creating two distinct angles around a point. On the left hand side the pink angle is labelled with A, vertically opposite this is the angle labelled p. On the right hand side, angle B is co-interior to p. Then lying on the same straight line as B is the angle labelled C. Vertically opposite angle C is the angle labelled D