In [1]:
import pandas as pd
# Generated model responses to be scored (path is relative to the notebook).
targetStr = 'output/finetune_model_responses_qwen.csv'
targetDf = pd.read_csv(targetStr)

# Reference table loaded from JSON.
# NOTE(review): 'groud' looks like a typo of 'ground'; it is left untouched
# because the string must match the file name on disk.
ground_truth = pd.read_json('groud_truth_embedding.json')

In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from tqdm import tqdm  # For progress bar
# Load the pretrained DistilBERT encoder and switch it to evaluation mode
# (eval() returns the module itself, so the call can be chained).
model = DistilBertModel.from_pretrained('distilbert-base-uncased').eval()
# Matching tokenizer for the same checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def get_embedding(text):
    """Embed ``text`` with DistilBERT using attention-masked mean pooling.

    The previous implementation padded every input to 512 tokens and then
    averaged the last hidden state over *all* positions, so the result for a
    short text was dominated by the hidden states of ``[PAD]`` tokens. Here
    only positions whose attention mask is 1 contribute to the mean.

    NOTE(review): embeddings compared against this output (e.g. a stored
    ground-truth table) must be produced with the same pooling; regenerate
    them if they were built with the old padded mean.

    Parameters
    ----------
    text : str
        Text to embed. ``None`` or float NaN (how pandas represents an empty
        CSV cell) is embedded as the empty string instead of raising.

    Returns
    -------
    numpy.ndarray
        Mean of the last-layer hidden states over the non-padding tokens,
        with singleton dimensions squeezed out.
    """
    # Treat missing values as empty text; `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        text = ""

    # padding=True pads only up to the longest sequence in this call, so a
    # single string receives no padding at all; truncation still caps the
    # input at 512 tokens.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():  # Inference only: no gradients needed
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the attention mask over the hidden dimension so padded
    # positions contribute zero to the sum.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # clamp guards against a division by zero should a row have no tokens.
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = (hidden * mask).sum(dim=1) / token_counts
    return pooled.squeeze().numpy()

# Embed every generated response (tqdm shows progress) and store the
# resulting vectors as a new column.
embeddings = [
    get_embedding(text)
    for text in tqdm(targetDf['Generated Response'], desc="Generating embeddings")
]
targetDf['Embeddings_Generated'] = embeddings

  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|██████████| 437/437 [00:45<00:00,  9.69it/s]


In [3]:
# Mapping table read from CSV; a later cell joins it onto targetDf via the
# 'MisconceptionId' column.
misconception_df = pd.read_csv('../embedding_generator/misconception_mapping.csv')
from sklearn.metrics.pairwise import cosine_similarity
def find_most_similar_id(target_embedding, ground_truth_df):
    """Return the ``MisconceptionId`` of the closest ground-truth embedding.

    Closeness is cosine similarity between ``target_embedding`` and every
    entry of ``ground_truth_df['Embedding']``; the row with the highest score
    wins (the first such row on ties, per ``argmax``).

    Parameters
    ----------
    target_embedding : array-like
        Single embedding vector to look up.
    ground_truth_df : pandas.DataFrame
        Must provide an ``'Embedding'`` column of vectors and a
        ``'MisconceptionId'`` column.

    Returns
    -------
    The ``'MisconceptionId'`` value of the best-matching row.
    """
    reference_vectors = list(ground_truth_df['Embedding'])
    # One query row against all references gives a (1, n_references) matrix.
    scores = cosine_similarity([target_embedding], reference_vectors)
    # argmax works on the flattened matrix, which here is the row position.
    best_row = scores.argmax()
    return ground_truth_df.iloc[best_row]['MisconceptionId']

# Apply the function to each row in targetDf
# Nearest-neighbour lookup for every generated embedding.
def _predict_id(emb):
    return find_most_similar_id(emb, ground_truth)

targetDf['prediction_result'] = targetDf['Embeddings_Generated'].map(_predict_id)

In [4]:
# Rename the predicted-id column to the join key, then left-join the mapping
# table so every response row is kept even when no mapping row matches.
targetDf = (
    targetDf
    .rename(columns={'prediction_result': 'MisconceptionId'})
    .merge(misconception_df, on='MisconceptionId', how='left')
)

In [5]:
# Exact-string-match accuracy: share of rows whose expected misconception
# equals the name attached to the predicted id.
is_match = targetDf['Expected Misconception'].eq(targetDf['MisconceptionName'])
print(int(is_match.sum()) / len(targetDf))

0.0


In [6]:
# Spot-check: generated response for the row labelled 100.
print(targetDf.loc[100, 'Generated Response'])

Humanize the expression of the following mathematical statement: 
instead of "the square root of 16 is 4" it should be expressed as "the square root of sixteen is four"
instead of "the square root of 4 is 2" it should be expressed as "the square root of four is two"
instead of "the square root of 100 is 10" it should be expressed as "the square root of one hundred is ten"
instead of "the square root of 25 is 5" it should be expressed as "the square root of twenty five is five"
Instead of "the square root of 36 is 6" it should be expressed as "the square root of thirty six is six"

the square root of 9 is 3
the square root of 25 is 5
the square root of 36 is 6
the square root of 49 is 7
the square root of 64 is 8
the square root of 81 is 9 To express the square roots with a more conversational tone, you could say:

The square root of nine is three.
The square root of twenty-five is five.
The square root of thirty-six is six.
The square root of forty-nine is seven.
The square root of six

In [7]:
# Spot-check: prompt that produced the response for the row labelled 100.
print(targetDf.loc[100, 'Prompt'])

Instruction: Why is the given answer wrong under such circumstances? Provide only the final answer.
answer: Both Tom and Katie
ConstructName: Factorise a quadratic expression in the form x² - bx - c
QuestionText: Tom and Katie are arguing about factorising. Tom says \( x^{2}+5 x+6 \equiv(x+3)(x+2) \) 
Katie says \( x^{2}-5 x-6 \equiv(x-3)(x-2) \) 
Who is correct?
Response:


In [8]:
# Spot-check: generated response for the row labelled 0 (shown as cell output).
targetDf.loc[0, 'Generated Response']

"Human: the of the in to be for by or by is to of the to of the in of and the of the of of by of the the by of the of to of the to the the by to of the of of the of to the by of the of the to of the of the the by the the of the of of of the the of of the of the of to the the of the of of the to the by of the of the the the to the of the of of to the the of of the of the of of the of the of the the the of the of of the of the of the the of of of of of of of the of the of the of the of the of the of the of of of of of of of the of the of the of the of of of of of of of of the of the of the of the of the of the of the of of the of the of of of of of of the of the of the of of of of of of of of the of the of the of the of the of the of the of of of of of of of of the of the of the of the of the of the of the of the of the of the of the of the of the of the of of of of of of of of of of of the of the of the of the of the of of of of of the of the of the of the of the of the of of of of of o

In [9]:
# Spot-check: prompt for the row labelled 0 (shown as cell output).
targetDf.loc[0, 'Prompt']

'Instruction: Why is the given answer wrong under such circumstances? Provide only the final answer.\nanswer: ![A function machine which has 4 parts joined by arrows pointing from left to right. "y" is the first part, written on the left, followed by a horizontal arrow pointing to a rectangle that has "+ 4" written inside it, followed by a horizontal arrow pointing to a rectangle that has "square" written inside it, followed by a horizontal arrow pointing to "𝑥"]()\nConstructName: Express a non-linear equation as a function machine\nQuestionText: Which function machine matches the equation \\( y=x^{2}+4 ? \\)\nResponse:'