In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import re

In [3]:
def convert_features(train, test):
    options = ["A", "B", "C", "D"]
    train_dataset = pd.DataFrame(columns=["QuestionId_Answer", "Construct", "Subject", "QuestionText", "AnswerText", "IsCorrect", "ActualAnswer", "MisconceptionId"])
    for idx, item in train.iterrows():
        for opt in options:
            train_dataset.loc[len(train_dataset)] = [
                f"{item['QuestionId']}_{opt}",
                item['ConstructName'],
                item['SubjectName'],
                item['QuestionText'],
                item[f'Answer{opt}Text'],
                1 if opt == item['CorrectAnswer'] else 0,
                item[f'Answer{item["CorrectAnswer"]}Text'],
                int(item[f'Misconception{opt}Id']) if not np.isnan(item[f'Misconception{opt}Id']) else None
            ]
    
    test_dataset = pd.DataFrame(columns=["QuestionId_Answer", "Construct", "Subject", "QuestionText", "AnswerText", "IsCorrect", "ActualAnswer"])
    for idx, item in test.iterrows():
        for opt in options:
            test_dataset.loc[len(test_dataset)] = [
                f"{item['QuestionId']}_{opt}",
                item['ConstructName'],
                item['SubjectName'],
                item['QuestionText'],
                item[f'Answer{opt}Text'],
                1 if opt == item['CorrectAnswer'] else 0,
                item[f'Answer{item["CorrectAnswer"]}Text']
            ]
    return train_dataset, test_dataset


train_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv")
test_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
train_df, test_df = convert_features(train_df, test_df)


In [4]:
# Load datasets
misconceptions_df = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv')

In [5]:
# Display basic information
print(train_df.info())
print(test_df.info())
print(misconceptions_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 7476 entries, 0 to 7475
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   QuestionId_Answer  7476 non-null   object
 1   Construct          7476 non-null   object
 2   Subject            7476 non-null   object
 3   QuestionText       7476 non-null   object
 4   AnswerText         7476 non-null   object
 5   IsCorrect          7476 non-null   int64 
 6   ActualAnswer       7476 non-null   object
 7   MisconceptionId    4370 non-null   object
dtypes: int64(1), object(7)
memory usage: 525.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 12 entries, 0 to 11
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   QuestionId_Answer  12 non-null     object
 1   Construct          12 non-null     object
 2   Subject            12 non-null     object
 3   QuestionText       12 non-null     

In [6]:
# Handle missing values
text_columns = ['QuestionText', 'AnswerText']
train_df[text_columns] = train_df[text_columns].fillna('No text provided')
test_df[text_columns] = test_df[text_columns].fillna('No text provided')

misconception_columns = ['MisconceptionId']
train_df[misconception_columns] = train_df[misconception_columns].fillna('')

In [7]:
train_df

Unnamed: 0,QuestionId_Answer,Construct,Subject,QuestionText,AnswerText,IsCorrect,ActualAnswer,MisconceptionId
0,0_A,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),1,\( 3 \times(2+4)-5 \),
1,0_B,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times 2+(4-5) \),0,\( 3 \times(2+4)-5 \),
2,0_C,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4-5) \),0,\( 3 \times(2+4)-5 \),
3,0_D,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,Does not need brackets,0,\( 3 \times(2+4)-5 \),1672
4,1_A,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",\( m+1 \),0,Does not simplify,2142
...,...,...,...,...,...,...,...,...
7471,1867_D,Distinguish between congruency and similarity,Congruency in Other Shapes,Tom and Katie are discussing congruence and si...,Neither is correct,0,Only Katie,2312
7472,1868_A,Describe a 90° or 270° rotation giving the ang...,Rotation,Jo and Paul are arguing about how to fully des...,Only\nJo,0,Only Paul,801
7473,1868_B,Describe a 90° or 270° rotation giving the ang...,Rotation,Jo and Paul are arguing about how to fully des...,Only Paul,1,Only Paul,
7474,1868_C,Describe a 90° or 270° rotation giving the ang...,Rotation,Jo and Paul are arguing about how to fully des...,Both Jo and Paul,0,Only Paul,801


In [8]:
!pip install -q -U /kaggle/input/pylatexenc-wheel/pylatexenc-2.10-py3-none-any.whl

In [9]:
from pylatexenc.latex2text import LatexNodes2Text

def clean_text(text):
    #text = re.sub(r'[^\w\s\[\]\(\)\{\}+\-\*/=]', '', text)  # Keep mathematical symbols
    text = text.lower()
    text = LatexNodes2Text().latex_to_text(text)
    return ' '.join(text.split())

for col in text_columns:
    train_df[col] = train_df[col].apply(clean_text)
    test_df[col] = test_df[col].apply(clean_text)

In [10]:
train_df

Unnamed: 0,QuestionId_Answer,Construct,Subject,QuestionText,AnswerText,IsCorrect,ActualAnswer,MisconceptionId
0,0_A,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,3 ×(2+4)-5,1,\( 3 \times(2+4)-5 \),
1,0_B,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,3 × 2+(4-5),0,\( 3 \times(2+4)-5 \),
2,0_C,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,3 ×(2+4-5),0,\( 3 \times(2+4)-5 \),
3,0_D,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,does not need brackets,0,\( 3 \times(2+4)-5 \),1672
4,1_A,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"simplify the following, if possible: m^2+2 m-3...",m+1,0,Does not simplify,2142
...,...,...,...,...,...,...,...,...
7471,1867_D,Distinguish between congruency and similarity,Congruency in Other Shapes,tom and katie are discussing congruence and si...,neither is correct,0,Only Katie,2312
7472,1868_A,Describe a 90° or 270° rotation giving the ang...,Rotation,jo and paul are arguing about how to fully des...,only jo,0,Only Paul,801
7473,1868_B,Describe a 90° or 270° rotation giving the ang...,Rotation,jo and paul are arguing about how to fully des...,only paul,1,Only Paul,
7474,1868_C,Describe a 90° or 270° rotation giving the ang...,Rotation,jo and paul are arguing about how to fully des...,both jo and paul,0,Only Paul,801


In [11]:
# feature engineering
train_df['full_text'] = train_df['QuestionText'] + ' [SEP] ' + train_df['AnswerText']
test_df['full_text'] = test_df['QuestionText'] + ' [SEP] ' + test_df['AnswerText']

train_df['question_length'] = train_df['QuestionText'].str.len()
test_df['question_length'] = test_df['QuestionText'].str.len()

In [12]:
train_df

Unnamed: 0,QuestionId_Answer,Construct,Subject,QuestionText,AnswerText,IsCorrect,ActualAnswer,MisconceptionId,full_text,question_length
0,0_A,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,3 ×(2+4)-5,1,\( 3 \times(2+4)-5 \),,3 × 2+4-5 where do the brackets need to go to ...,72
1,0_B,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,3 × 2+(4-5),0,\( 3 \times(2+4)-5 \),,3 × 2+4-5 where do the brackets need to go to ...,72
2,0_C,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,3 ×(2+4-5),0,\( 3 \times(2+4)-5 \),,3 × 2+4-5 where do the brackets need to go to ...,72
3,0_D,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,does not need brackets,0,\( 3 \times(2+4)-5 \),1672,3 × 2+4-5 where do the brackets need to go to ...,72
4,1_A,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"simplify the following, if possible: m^2+2 m-3...",m+1,0,Does not simplify,2142,"simplify the following, if possible: m^2+2 m-3...",50
...,...,...,...,...,...,...,...,...,...,...
7471,1867_D,Distinguish between congruency and similarity,Congruency in Other Shapes,tom and katie are discussing congruence and si...,neither is correct,0,Only Katie,2312,tom and katie are discussing congruence and si...,343
7472,1868_A,Describe a 90° or 270° rotation giving the ang...,Rotation,jo and paul are arguing about how to fully des...,only jo,0,Only Paul,801,jo and paul are arguing about how to fully des...,477
7473,1868_B,Describe a 90° or 270° rotation giving the ang...,Rotation,jo and paul are arguing about how to fully des...,only paul,1,Only Paul,,jo and paul are arguing about how to fully des...,477
7474,1868_C,Describe a 90° or 270° rotation giving the ang...,Rotation,jo and paul are arguing about how to fully des...,both jo and paul,0,Only Paul,801,jo and paul are arguing about how to fully des...,477


In [15]:
# Ensure test_df has all columns present in train_df
missing_cols = set(train_df.columns) - set(test_df.columns)
if missing_cols:
    new_cols = pd.DataFrame({col: 0 for col in missing_cols}, index=test_df.index)
    test_df = pd.concat([test_df, new_cols], axis=1)

# Align columns
test_df = test_df[train_df.columns]

In [16]:
# Handle misconception labels (only for train data)
num_misconceptions = len(misconceptions_df)

for col in misconception_columns:
    train_df[f'{col}_processed'] = train_df[col].apply(lambda x: [int(i) for i in str(x).split() if i.isdigit()] if not pd.isna(x) and x != '' else [])
    train_df[f'{col}_labels'] = train_df[f'{col}_processed'].apply(lambda misconceptions: np.array([1 if i in misconceptions else 0 for i in range(num_misconceptions)]))


In [17]:
# Prepare data for BERT model
tokenizer = BertTokenizer.from_pretrained('/kaggle/input/model-set/bert-base-uncased')
def tokenize_and_encode(texts, max_length=512):
    return tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

In [18]:
#tokenizer.save_pretrained('./bert-base-uncased')

In [19]:
train_datasets = {}
val_datasets = {}
test_datasets = {}

num_misconceptions = len(misconceptions_df)

def create_multi_hot_labels(misconceptions):
    labels = np.zeros(num_misconceptions)
    if isinstance(misconceptions, (int, float)) and not np.isnan(misconceptions):
        labels[int(misconceptions)] = 1
    elif isinstance(misconceptions, list):
        labels[misconceptions] = 1
    return labels

unique_misconceptions = train_df['MisconceptionId'].unique()
misconception_to_index = {misc: idx for idx, misc in enumerate(unique_misconceptions)}

# Only for incorrect answers
train_texts = train_df['full_text'].tolist()
train_labels = np.array([create_multi_hot_labels(misc) for misc in train_df['MisconceptionId']])
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)


train_encodings = tokenize_and_encode(train_texts)
val_encodings = tokenize_and_encode(val_texts)

train_datasets = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(train_labels, dtype=torch.float)  # Change to float for multi-label
)

val_datasets = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    torch.tensor(val_labels, dtype=torch.float)  # Change to float for multi-label
)

test_texts = test_df['full_text'].tolist()
test_encodings = tokenize_and_encode(test_texts)
test_datasets = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask']
)

In [20]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch
from sklearn.metrics import f1_score
import numpy as np

In [21]:
# Set up the model
num_labels = len(misconceptions_df)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = BertForSequenceClassification.from_pretrained(
    '/kaggle/input/model-set/bert-base-uncased',
    num_labels=num_labels,
    problem_type="multi_label_classification"
)
#model.save_pretrained('./bert-base-uncased')

# Training parameters
epochs = 5
batch_size = 16
# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Calculate total steps for all answer options
total_steps = len(train_datasets) * epochs // batch_size

# Set up the scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [22]:
#model.save_pretrained('./bert-base-uncased')

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [23]:
    
train_dataloader = DataLoader(
    train_datasets,
    sampler=RandomSampler(train_datasets),
    batch_size=batch_size
)

val_dataloader = DataLoader(
    val_datasets,
    sampler=SequentialSampler(val_datasets),
    batch_size=batch_size
)

In [24]:
# Prediction
model.eval()
final_predictions = {}

test_dataloader = DataLoader(test_datasets, batch_size=batch_size, shuffle=False)

with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        batch = tuple(t.to(device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1]
        }
        
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.sigmoid(logits).cpu().numpy()
        
        for j, pred in enumerate(preds):
            idx = i * batch_size + j
            question_id = test_df['QuestionId_Answer'].iloc[idx].split('_')[0]
            answer = test_df['QuestionId_Answer'].iloc[idx].split('_')[1]
            
            # Get top 25 predictions
            top_25 = np.argsort(pred)[-25:][::-1]
            misconceptions = ' '.join([str(k) for k in top_25 if pred[k] > 0.5])
            if not misconceptions:
                misconceptions = '0'
            final_predictions[f"{question_id}_{answer}"] = misconceptions

# Create submission file
sample_submission = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/sample_submission.csv', index_col='QuestionId_Answer')
submission_df = sample_submission.copy()
submission_df['MisconceptionId'] = pd.Series(final_predictions)

# Ensure all required predictions are present
missing_predictions = submission_df[submission_df['MisconceptionId'].isnull()].index
if len(missing_predictions) > 0:
    print(f"Warning: Missing predictions for {len(missing_predictions)} entries. Filling with '0'.")
    submission_df.loc[missing_predictions, 'MisconceptionId'] = '0'

submission_df.to_csv('submission.csv')
print("Predictions completed and submission file created.")

Predictions completed and submission file created.


In [25]:
test_df

Unnamed: 0,QuestionId_Answer,Construct,Subject,QuestionText,AnswerText,IsCorrect,ActualAnswer,MisconceptionId,full_text,question_length
0,1869_A,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,3 ×(2+4)-5,1,\( 3 \times(2+4)-5 \),0,3 × 2+4-5 where do the brackets need to go to ...,72
1,1869_B,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,3 × 2+(4-5),0,\( 3 \times(2+4)-5 \),0,3 × 2+4-5 where do the brackets need to go to ...,72
2,1869_C,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,3 ×(2+4-5),0,\( 3 \times(2+4)-5 \),0,3 × 2+4-5 where do the brackets need to go to ...,72
3,1869_D,Use the order of operations to carry out calcu...,BIDMAS,3 × 2+4-5 where do the brackets need to go to ...,does not need brackets,0,\( 3 \times(2+4)-5 \),0,3 × 2+4-5 where do the brackets need to go to ...,72
4,1870_A,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"simplify the following, if possible: m^2+2 m-3...",m+1,0,Does not simplify,0,"simplify the following, if possible: m^2+2 m-3...",50
5,1870_B,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"simplify the following, if possible: m^2+2 m-3...",m+2,0,Does not simplify,0,"simplify the following, if possible: m^2+2 m-3...",50
6,1870_C,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"simplify the following, if possible: m^2+2 m-3...",m-1,0,Does not simplify,0,"simplify the following, if possible: m^2+2 m-3...",50
7,1870_D,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"simplify the following, if possible: m^2+2 m-3...",does not simplify,1,Does not simplify,0,"simplify the following, if possible: m^2+2 m-3...",50
8,1871_A,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,tom and katie are discussing the 5 plants with...,only tom,0,Only\nKatie,0,tom and katie are discussing the 5 plants with...,265
9,1871_B,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,tom and katie are discussing the 5 plants with...,only katie,1,Only\nKatie,0,tom and katie are discussing the 5 plants with...,265
