In [4]:
import pandas as pd
import json
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Flatten subtasks_with_tools into a text string
def subtasks_tools_to_text(subtasks_with_tools):
    """Flatten a list of subtask dicts into one text string.

    Each subtask contributes its "task", "tool" and "details" values separated
    by single spaces (surrounding whitespace stripped); subtasks are joined
    with " | ".

    Parameters
    ----------
    subtasks_with_tools : iterable of dict or None
        Subtask records. ``None`` is treated as "no subtasks".

    Returns
    -------
    str
        The flattened text; an empty string when there are no subtasks.
    """
    if subtasks_with_tools is None:
        return ""

    def _field(subtask, key):
        # dict.get() only applies its default when the key is absent; a key
        # that is present with a None value (JSON null) would otherwise be
        # rendered as the literal word "None" by the f-string below.
        value = subtask.get(key, "")
        return "" if value is None else value

    parts = []
    for subtask in subtasks_with_tools:
        task_name = _field(subtask, "task")
        tool = _field(subtask, "tool")
        details = _field(subtask, "details")
        parts.append(f"{task_name} {tool} {details}".strip())
    return " | ".join(parts)


In [5]:
class BERTVectorizer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that turns texts into BERT [CLS] embeddings.

    Parameters
    ----------
    model_name : str, default='bert-base-uncased'
        Identifier passed to ``BertTokenizer.from_pretrained`` and
        ``BertModel.from_pretrained``.
    device : str or None, default=None
        Torch device to run the model on. When falsy, 'cuda' is used if
        ``torch.cuda.is_available()`` and 'cpu' otherwise.
    max_length : int, default=128
        Every text is truncated and padded to this many tokens.
    batch_size : int, default=32
        Number of texts sent through the model per forward pass.
    """

    def __init__(self, model_name='bert-base-uncased', device=None, max_length=128,
                 batch_size=32):
        # Each constructor argument is stored under its own name:
        # BaseEstimator.get_params() (used by repr() and sklearn.base.clone)
        # reads them back with getattr and fails if one is missing.
        self.model_name = model_name
        self.max_length = max_length
        self.batch_size = batch_size
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')

        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.to(self.device)
        # Inference only: put the model in eval mode.
        self.model.eval()

    def transform(self, X, y=None):
        """Embed each text in ``X`` and return a 2-D array, one row per text.

        Parameters
        ----------
        X : iterable of str
            Texts to embed.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            Array with ``len(X)`` rows holding the hidden state at token
            position 0 ([CLS]) for each text.
        """
        texts = list(X)
        if not texts:
            # Keep the result two-dimensional so downstream estimators see a
            # feature matrix with zero rows instead of a flat empty array.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        step = max(1, int(self.batch_size))
        chunks = []
        with torch.no_grad():
            for start in range(0, len(texts), step):
                encoded = self.tokenizer(
                    texts[start:start + step],
                    padding='max_length', truncation=True, max_length=self.max_length,
                    return_tensors='pt'
                ).to(self.device)
                output = self.model(**encoded)
                # Use [CLS] token embedding (position 0) of every sequence in the batch
                chunks.append(output.last_hidden_state[:, 0, :].cpu().numpy())
        return np.concatenate(chunks, axis=0)

    def fit(self, X, y=None):
        """No-op: the transformer has nothing to learn. Returns ``self``."""
        return self


In [6]:
# Load structured training JSON.
# The encoding is stated explicitly: without it open() uses the platform's
# locale encoding, which is not UTF-8 on every system.
with open("parsed_dataset_with_tools.json", "r", encoding="utf-8") as f:
    train_json = json.load(f)

train_df = pd.DataFrame(train_json)

# Prepare text features
train_df["subtasks_text"] = train_df["subtasks_with_tools"].apply(subtasks_tools_to_text)
# str() each option so that numeric answer options do not make join() raise.
train_df["answers_text"] = train_df["answer_options"].apply(
    lambda options: " ".join(map(str, options))
)

# Combine all text features into a single string
train_df["combined_text"] = (
    train_df["topic"] + " " +
    train_df["original_problem"] + " " +
    train_df["subtasks_text"] + " " +
    train_df["answers_text"]
)

X_train = train_df["combined_text"].values
y_train = train_df["correct_option_number"].values

# Encode labels; `le` is kept so predictions can be mapped back later
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)


In [7]:
# Build the BERT feature extractor (its constructor loads tokenizer and model)
bert_vectorizer = BERTVectorizer()

# Embed the training texts
X_train_emb = bert_vectorizer.transform(X_train)

# Fit a logistic-regression classifier on the embeddings; fit() returns the
# estimator itself, so construction and fitting are chained.
clf = LogisticRegression(max_iter=1000).fit(X_train_emb, y_train_enc)
clf


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
# Load test CSV
test_df = pd.read_csv("test.csv")

from problem_parser import ProblemParser
from problem_decomposer import ProblemDecomposer
from tool_selector import ToolAssigner

AVAILABLE_TOOLS = ["calculator", "symbolic_solver", "LLM_reasoning"]


def _cell_to_text(value):
    """Return a CSV cell as text; missing cells (None / NaN) become ''."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if pd.isna(value):
        return ""
    return str(value)


# Run every test problem through parse -> decompose -> tool assignment and
# build the same text features that were built for the training set.
test_json = []
for _, row in test_df.iterrows():
    topic = row["topic"]
    problem_statement = row["problem_statement"]
    # read_csv may yield numbers or NaN for answer cells; normalise them to
    # strings so that " ".join() below cannot raise TypeError.
    answer_options = [
        _cell_to_text(row.get(f"answer_option_{i}", "")) for i in range(1, 6)
    ]

    # Parse problem
    parser = ProblemParser(topic, problem_statement)
    parsed_problem = parser.parse()

    # Decompose
    decomposer = ProblemDecomposer(parsed_problem)
    subtasks = decomposer.decompose()

    # Assign tools
    tool_assigner = ToolAssigner(topic, parsed_problem, subtasks)
    subtasks_with_tools = tool_assigner.assign_tools()

    subtasks_text = subtasks_tools_to_text(subtasks_with_tools)
    answers_text = " ".join(answer_options)

    test_json.append({
        "topic": topic,
        "original_problem": problem_statement,
        "answer_options": answer_options,
        "subtasks_with_tools": subtasks_with_tools,
        "subtasks_text": subtasks_text,
        "answers_text": answers_text,
        "combined_text": topic + " " + problem_statement + " " + subtasks_text + " " + answers_text
    })

test_df_structured = pd.DataFrame(test_json)


In [1]:
# Embed the structured test problems. This cell uses `bert_vectorizer`, `clf`
# and `le`, which are created in earlier cells and must exist in the kernel.
X_test_emb = bert_vectorizer.transform(test_df_structured["combined_text"].values)

# The classifier predicts encoded labels; decode them back to option numbers
y_pred_enc = clf.predict(X_test_emb)
y_pred_numbers = le.inverse_transform(y_pred_enc)

# Look up the answer text for each predicted (1-based) option number
solutions = []
for predicted_number, options in zip(y_pred_numbers, test_df_structured["answer_options"]):
    position = predicted_number - 1
    in_range = 0 <= position < len(options)
    solutions.append(options[position] if in_range else "Invalid prediction")

# Attach the predictions to the structured test frame
test_df_structured["solution"] = solutions
test_df_structured["correct_option"] = y_pred_numbers

# Write only the columns needed for the final result
output_columns = ["topic", "original_problem", "solution", "correct_option"]
result_df = test_df_structured[output_columns]
result_df.to_csv("output.csv", index=False)


NameError: name 'bert_vectorizer' is not defined