In [1]:
import pandas as pd
import json
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Flatten subtasks_with_tools into a text string
def subtasks_tools_to_text(subtasks_with_tools):
    """Flatten a list of subtask dicts into one text string.

    For every subtask the values under the keys ``"task"``, ``"tool"`` and
    ``"details"`` are joined with single spaces; the per-subtask strings are
    then joined with ``" | "``.

    Parameters
    ----------
    subtasks_with_tools : iterable of dict, None, or float
        The subtasks to flatten. ``None`` or a float (pandas uses a float NaN
        for a missing cell) is treated as "no subtasks".

    Returns
    -------
    str
        The flattened text, or ``""`` when there is nothing to flatten.
    """
    # A missing cell is not iterable, so short-circuit instead of raising.
    if subtasks_with_tools is None or isinstance(subtasks_with_tools, float):
        return ""

    parts = []
    for subtask in subtasks_with_tools:
        values = (subtask.get(key) for key in ("task", "tool", "details"))
        # Skip null fields so they do not show up as the literal text "None".
        fields = [str(value).strip() for value in values if value is not None]
        # Skip blank fields so a missing middle field leaves no doubled space.
        part = " ".join(field for field in fields if field)
        # Skip subtasks with no content so the output has no empty segment.
        if part:
            parts.append(part)
    return " | ".join(parts)


In [2]:
class BERTVectorizer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that embeds texts with a BERT model.

    Each text is represented by the last-layer hidden state at the first token
    position (the ``[CLS]`` position for BERT tokenizers). The model is put in
    eval mode and run under ``torch.no_grad()``, so it is never trained here.

    Parameters
    ----------
    model_name : str, default='bert-base-uncased'
        Name or path handed to ``from_pretrained`` for tokenizer and model.
    device : str or None, default=None
        Torch device. ``None`` picks ``'cuda'`` when available, else ``'cpu'``.
    max_length : int, default=128
        Texts longer than this many tokens are truncated.
    batch_size : int, default=32
        Number of texts encoded per forward pass.

    Attributes
    ----------
    tokenizer, model
        The loaded tokenizer and model (loaded eagerly in ``__init__``).
    device
        The resolved device, i.e. never ``None`` after construction.
    """

    def __init__(self, model_name='bert-base-uncased', device=None, max_length=128,
                 batch_size=32):
        # Every constructor argument must be kept as a same-named attribute:
        # BaseEstimator.get_params() (behind repr() and clone()) reads each one
        # back with getattr and fails if any is missing.
        self.model_name = model_name
        self.max_length = max_length
        self.batch_size = batch_size
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')

        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

    def fit(self, X, y=None):
        """No-op; present for the scikit-learn API. Returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Embed every text in ``X``.

        Parameters
        ----------
        X : iterable of str
            Texts to embed.
        y : ignored

        Returns
        -------
        numpy.ndarray
            One row per text, one column per hidden unit of the model.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1.
        """
        if self.batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {self.batch_size}")

        # Materialise once so any iterable (array, Series, generator) can be sliced.
        texts = list(X)
        if not texts:
            # Keep the result 2-D so downstream estimators see (0, n_features).
            # NOTE(review): float32 assumes the model runs in its default dtype.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), self.batch_size):
                batch = texts[start:start + self.batch_size]
                # Pad only up to the longest text in the batch; the attention
                # mask returned by the tokenizer hides the padding from the model.
                encoded = self.tokenizer(
                    batch, padding=True, truncation=True, max_length=self.max_length,
                    return_tensors='pt'
                ).to(self.device)
                output = self.model(**encoded)
                # Position 0 of every sequence -> (len(batch), hidden_size).
                chunks.append(output.last_hidden_state[:, 0, :].cpu().numpy())
        return np.concatenate(chunks, axis=0)


In [3]:
# Load structured training JSON
# Read as UTF-8 explicitly so the result does not depend on the platform's
# default locale encoding.
with open("parsed_dataset_with_tools.json", "r", encoding="utf-8") as f:
    data_json = json.load(f)

df = pd.DataFrame(data_json)

# Flatten subtasks & answers
df["subtasks_text"] = df["subtasks_with_tools"].apply(subtasks_tools_to_text)
# str() every option first: str.join raises TypeError on non-string items
# (e.g. numeric answer options).
df["answers_text"] = df["answer_options"].apply(lambda opts: " ".join(map(str, opts)))

# Combine all text features
# A missing topic/problem would otherwise propagate as NaN through the string
# concatenation and make the whole combined text NaN.
df["combined_text"] = (
    df["topic"].fillna("").astype(str) + " " +
    df["original_problem"].fillna("").astype(str) + " " +
    df["subtasks_text"] + " " +
    df["answers_text"]
)

X = df["combined_text"].values
y = df["correct_option_number"].values

# Encode labels (maps the original option numbers to 0..n_classes-1)
le = LabelEncoder()
y_enc = le.fit_transform(y)


In [4]:
# Hold out 5% of the samples for testing. Stratifying on the encoded labels
# keeps the class mix comparable between the two partitions; the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    stratify=y_enc,
    test_size=0.05,
    random_state=42,
)
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")


Training samples: 364, Test samples: 20


In [5]:
# Build the BERT embedder with its default settings
bert_vectorizer = BERTVectorizer()

# Embed the training texts first, then the test texts
X_train_emb, X_test_emb = (
    bert_vectorizer.transform(texts) for texts in (X_train, X_test)
)


In [6]:
# Train a logistic-regression classifier on the training embeddings,
# allowing the solver up to 1000 iterations.
clf = LogisticRegression(max_iter=1000)
clf.fit(X=X_train_emb, y=y_train)


In [7]:
# Predict on test set
y_pred = clf.predict(X_test_emb)

# Accuracy and F1 (F1 is averaged over classes, weighted by class support)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Test Accuracy: {acc:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print("\nClassification Report:")
# Pin the label set to every class the encoder knows (LabelEncoder codes are
# 0..n_classes-1). Without `labels`, the report infers classes from
# y_test/y_pred and raises when that count differs from len(target_names),
# which happens when a class is missing from a small test split.
# zero_division=0 reports 0 for such classes without emitting a warning.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=[str(i) for i in le.classes_],
    zero_division=0,
))


Test Accuracy: 0.3500
Test F1 Score: 0.3465

Classification Report:
              precision    recall  f1-score   support

           1       0.50      0.40      0.44         5
           2       0.25      0.20      0.22         5
           3       0.33      0.25      0.29         4
           4       0.50      0.50      0.50         2
           5       0.29      0.50      0.36         4

    accuracy                           0.35        20
   macro avg       0.37      0.37      0.36        20
weighted avg       0.36      0.35      0.35        20

