<a href="https://colab.research.google.com/github/jimzijun/Eedi---Mining-Misconceptions-in-Mathematics/blob/main/exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install kaggle scikit-learn datasets transformers torch accelerate

In [2]:
# !kaggle competitions download -c eedi-mining-misconceptions-in-mathematics -p ./datasets/eedi
# !unzip ./datasets/eedi/eedi-mining-misconceptions-in-mathematics.zip -d ./datasets/eedi

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import torch

# Check if a GPU is available and move model to the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocressing

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Read the CSV files
dataset_root_path = '/content/drive/MyDrive/datasets/eedi'

df = pd.read_csv(f'{dataset_root_path}/train.csv')
misconception_df = pd.read_csv(f'{dataset_root_path}/misconception_mapping.csv')
sample_submission_df = pd.read_csv(f'{dataset_root_path}/sample_submission.csv')

# Sample and split the train data
train_df = df.sample(n=200, random_state=42)
train_df, test_df = train_test_split(train_df, test_size=0.1, random_state=42)
test_df, eval_df = train_test_split(test_df, test_size=0.5, random_state=42)

def preprocess_dataframe(df, misconception_df=None):
    # List of options
    options = ['A', 'B', 'C', 'D']

    # List of columns to keep
    id_vars = ['QuestionId', 'QuestionText', 'ConstructId', 'ConstructName', 'SubjectId', 'SubjectName', 'CorrectAnswer']

    # Initialize an empty list to collect data
    data_list = []

    # Loop over each option to collect data
    for option in options:
        answer_col = f'Answer{option}Text'
        misconception_col = f'Misconception{option}Id'

        # Check if the misconception column exists
        if misconception_col in df.columns:
            temp_df = df[id_vars + [misconception_col, answer_col]].copy()
            temp_df.rename(columns={
                misconception_col: 'MisconceptionId',
                answer_col: 'AnswerText'
            }, inplace=True)
        else:
            # Only include the answer column if misconception column doesn't exist
            temp_df = df[id_vars + [answer_col]].copy()
            temp_df['MisconceptionId'] = None  # Assign None to MisconceptionId
            temp_df.rename(columns={
                answer_col: 'AnswerText'
            }, inplace=True)

        temp_df['Option'] = option
        data_list.append(temp_df)

    # Concatenate all the data into a single DataFrame
    df_combined = pd.concat(data_list, ignore_index=True)

    # Exclude the rows where the option matches the correct answer
    df_combined = df_combined[df_combined['Option'] != df_combined['CorrectAnswer']]

    df_combined = df_combined.merge(misconception_df, on='MisconceptionId', how='left')

    # Rename 'MisconceptionName' to 'misconceptionText'
    df_combined = df_combined.rename(columns={'MisconceptionName': 'MisconceptionText'})

    # Drop rows with missing 'misconceptionText' (only for training data)
    if 'MisconceptionText' in df_combined.columns:
        df_combined = df_combined.dropna(subset=['MisconceptionText'])

    return df_combined

# Dataset

In [6]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW
from torch.nn import MSELoss
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [7]:
class DiagnosticQuestionDataset(Dataset):
    def __init__(self, dataframe):
        # Store the dataframe
        self.data = dataframe
        # Load pre-trained BERT tokenizer and model for embedding
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.bert_model.eval()

    def __len__(self):
        # Return the total number of samples
        return len(self.data)

    def __getitem__(self, idx):
        # Extract the row at the given index
        row = self.data.iloc[idx]

        # Extract the relevant features for the model input
        construct = row['ConstructName']
        subject = row['SubjectName']
        question = row['QuestionText']
        answer = row['AnswerText']
        misconception = row['MisconceptionText']

        # Concatenate the sequences with [CLS] and [SEP] tokens
        input_text = f"[CLS] {construct} [SEP] {subject} [SEP] {question} [SEP] {answer} [SEP]"

        # Tokenize the misconception text and get BERT embeddings
        inputs = self.tokenizer(misconception, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
            misconception_embedding = outputs.last_hidden_state[:, 0, :].squeeze()  # Use [CLS] token representation

        return {
            'input_text': input_text,
            'misconception_embedding': misconception_embedding,
            'misconception_text': misconception
        }

def load_data(dataframe, batch_size=4, shuffle=True):
    # Create the dataset
    dataset = DiagnosticQuestionDataset(dataframe)

    # Create the dataloader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return dataloader

# Example usage
train_processed_df = preprocess_dataframe(train_df, misconception_df)
eval_processed_df = preprocess_dataframe(eval_df, misconception_df)
test_processed_df = preprocess_dataframe(test_df, misconception_df)

dataloader = load_data(train_processed_df)
test_dataloader = load_data(test_processed_df, batch_size=1, shuffle=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Load pre-trained BERT tokenizer and model for regression
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')  # Using BERT for regression

# Add a linear layer for regression
regression_head = torch.nn.Linear(model.config.hidden_size, model.config.hidden_size)
model.to('cuda' if torch.cuda.is_available() else 'cpu')
regression_head.to('cuda' if torch.cuda.is_available() else 'cpu')

# Training parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = AdamW(list(model.parameters()) + list(regression_head.parameters()), lr=2e-5)
epochs = 3
criterion = MSELoss()

# Store training misconception embeddings and corresponding texts for lookup
training_misconception_dict = {}

# Training loop
model.train()
regression_head.train()
for epoch in range(epochs):
    epoch_loss = 0
    for batch in tqdm(dataloader, desc=f"Training Epoch {epoch + 1}"):
        # Tokenize the input text
        inputs = tokenizer(batch['input_text'], padding=True, truncation=True, return_tensors='pt')

        # Move input tensors to the appropriate device
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        target_embedding = batch['misconception_embedding'].to(device)

        # Zero out gradients
        optimizer.zero_grad()

        # Forward pass through BERT
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_embedding = regression_head(outputs.last_hidden_state[:, 0, :])

        # Compute loss using mean squared error
        loss = criterion(predicted_embedding, target_embedding)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Track the loss
        epoch_loss += loss.item()

        # Store training misconception embeddings and texts in a dictionary (latest embedding overwrites previous one)
        misconception_texts = batch['misconception_text']
        for misconception_text, target_emb in zip(misconception_texts, target_embedding):
            training_misconception_dict[misconception_text] = target_emb.cpu().numpy()

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch + 1} Loss: {avg_loss}")

# Convert training embeddings dictionary to numpy array and list of texts
training_misconception_texts = list(training_misconception_dict.keys())
training_misconception_embeddings = np.vstack(list(training_misconception_dict.values()))


Training Epoch 1: 100%|██████████| 105/105 [01:08<00:00,  1.54it/s]


Epoch 1 Loss: 0.125529942519608


Training Epoch 2: 100%|██████████| 105/105 [01:05<00:00,  1.61it/s]


Epoch 2 Loss: 0.06407153368705795


Training Epoch 3: 100%|██████████| 105/105 [01:04<00:00,  1.64it/s]

Epoch 3 Loss: 0.06155384204217366





In [9]:
training_misconception_texts = list(training_misconception_dict.keys())
training_misconception_embeddings = np.vstack(list(training_misconception_dict.values()))

In [10]:
regression_head.eval()


Linear(in_features=768, out_features=768, bias=True)

In [11]:
len(test_dataloader)

24

In [12]:
for i, batch in enumerate(test_dataloader):
    if i >= 10:
        break

    print(f"========== Batch {i + 1} ==========")

    input_text = batch['input_text'][0]
    label = batch['misconception_text'][0]

    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True).to(device)

    with torch.no_grad():
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        predicted_embedding = regression_head(outputs.last_hidden_state[:, 0, :]).cpu().numpy()

    # Calculate cosine similarity between the predicted embedding and all training misconception embeddings
    similarities = cosine_similarity(predicted_embedding.reshape(1, -1), training_misconception_embeddings)
    top_10_indices = np.argsort(similarities[0])[-10:][::-1]

    # Check if label is in training misconception texts
    if label in training_misconception_texts:
        print("||||| Does exist |||||")
    else:
        print("||||| Does not exist |||||")

    # Retrieve the top 10 closest misconception texts from the training set
    closest_misconception_texts = [training_misconception_texts[idx] for idx in top_10_indices]

    print("Input Text:", input_text)
    print("Label:", label)
    print("Top 10 Closest Misconception Texts:")
    for j, text in enumerate(closest_misconception_texts, 1):
        print(f"{j}: {text}")

||||| Does not exist |||||
Input Text: [CLS] Express one quantity as a fraction of another [SEP] Simplifying Fractions [SEP] Write \( 16 \) over \( 28 \) as a fraction in its simplest terms. [SEP] \( \frac{16}{28} \) [SEP]
Label: Forgot to simplify the fraction
Top 10 Closest Misconception Texts:
1: Thinks a negative x value in a column vector means to move right rather than left 
2: Thinks a positive x value in a column vector means to move left rather than right
3: Finds an equivalent improper fraction when asked for a mixed number
4: Multiplies by the denominator instead of dividing when finding a fraction of an amount
5: Thinks the shape of the graph does not affect the area calculation
6: Thinks you can divide terms by different factors when simplifying an algebraic fraction
7: Thinks a positive y value in a column vector means to move down rather than up 
8: Does not think a square root can be negative
9: Does not interpret the correct order of operations from a worded problem
10