In [1]:
# Install dependencies (versions are not pinned, so pip resolves whatever is current at run time)
!pip install transformers tqdm

# Mount Google Drive at /content/drive; the dataset CSV is later read from /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [9]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import AdamW
from tqdm.notebook import tqdm
import numpy as np

# Define a custom Dataset class
class MisconceptionDataset(Dataset):
    """Map-style dataset that tokenizes every text up front and serves one example per index.

    Args:
        texts: Sequence of strings; a ``pd.Series`` or ``np.ndarray`` is converted to a list.
        labels: Integer class ids aligned positionally with ``texts`` (list, ndarray or
            ``pd.Series``). They are stored as a ``torch.long`` tensor, so lookups are always
            positional regardless of any pandas index the caller's Series carries.
        tokenizer: Callable invoked once with the full list of texts; its result must expose
            ``.items()`` yielding indexable per-example tensors.
        max_len: Length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        if isinstance(texts, (pd.Series, np.ndarray)):
            texts = texts.tolist()
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=max_len,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Convert once to a positional tensor. Indexing a pandas Series with an int is
        # label-based, which breaks on the shuffled index produced by train_test_split.
        self.labels = torch.tensor(np.asarray(labels, dtype=np.int64), dtype=torch.long)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for example ``idx`` plus its ``'label'`` tensor."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        # clone() hands out a fresh tensor so callers cannot mutate the stored labels
        item['label'] = self.labels[idx].clone()
        return item


def prepare_data(file_path):
    """Read the CSV at ``file_path`` and produce train/validation/test splits.

    Each row's ``answer``, ``ConstructName`` and ``QuestionText`` columns are cast to
    strings and joined into one tagged text. ``MisconceptionId`` is cast to int and
    re-encoded to contiguous class indices starting at 0.

    Returns:
        ``(X_train, Y_train, X_val, Y_val, X_test, Y_test, label_encoder, num_labels)``
        where the split is 80% / 10% / 10% with ``random_state=42`` for both cuts.
    """
    frame = pd.read_csv(file_path)

    # One tagged string per row, built from the three text columns
    text_cols = frame[['answer', 'ConstructName', 'QuestionText']].astype(str)
    texts = (
        "answer: " + text_cols['answer']
        + " " + "ConstructName: " + text_cols['ConstructName']
        + " " + "QuestionText: " + text_cols['QuestionText']
    )

    # Raw misconception ids -> class indices 0..num_labels-1
    raw_ids = frame['MisconceptionId'].astype(int)
    encoder = LabelEncoder()
    targets = encoder.fit_transform(raw_ids)
    n_classes = len(encoder.classes_)

    # First cut: 80% train vs. 20% held out
    X_train, X_held, Y_train, Y_held = train_test_split(
        texts, targets, test_size=0.2, random_state=42
    )
    # Second cut: held-out part halved into validation and test (10% each overall)
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_held, Y_held, test_size=0.5, random_state=42
    )

    return X_train, Y_train, X_val, Y_val, X_test, Y_test, encoder, n_classes


def create_dataloaders(train_x, train_y, val_x, val_y, test_x, test_y, tokenizer, batch_size=32):
    """Wrap the three splits in ``MisconceptionDataset`` objects and build their loaders.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``; only the training
        loader shuffles, the other two keep dataset order.
    """
    split_pairs = ((train_x, train_y), (val_x, val_y), (test_x, test_y))

    # Tokenize every split first (train, then val, then test)
    datasets = [MisconceptionDataset(xs, ys, tokenizer) for xs, ys in split_pairs]

    # Shuffle flag per split, in the same train/val/test order
    shuffle_flags = (True, False, False)
    loaders = tuple(
        DataLoader(ds, batch_size=batch_size, shuffle=flag)
        for ds, flag in zip(datasets, shuffle_flags)
    )
    return loaders

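# Train and evaluate
# NOTE: the functions below write their output files (model checkpoint, embeddings) to relative
# paths, i.e. the current working directory rather than Google Drive; copy or point them to
# /content/drive/MyDrive if the files must outlive the Colab runtime.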

def train_and_evaluate(train_dataloader, val_dataloader, model, device, epochs=10, learning_rate=2e-5,
                       checkpoint_path='best_model_checkpoint.pth'):
    """Fine-tune ``model`` with AdamW and cross-entropy, validating after every epoch.

    Each batch must be a dict with ``'input_ids'``, ``'attention_mask'`` and ``'label'``
    tensors, and the model's output must expose ``.logits``. Whenever the mean
    per-batch validation loss improves, the model's ``state_dict`` is written to
    ``checkpoint_path``.

    Note that the best weights exist only on disk: when this function returns, ``model``
    holds the weights from the final epoch.

    Args:
        train_dataloader: Loader for the training split.
        val_dataloader: Loader for the validation split.
        model: Sequence-classification model to fine-tune (moved to ``device`` in place).
        device: Torch device on which to run.
        epochs: Number of passes over ``train_dataloader``.
        learning_rate: AdamW learning rate.
        checkpoint_path: Destination of the best checkpoint. Defaults to the original
            relative file name; pass a Google Drive path to persist it.

    Raises:
        ValueError: If either dataloader contains no batches.
    """
    # Fail fast: the per-epoch averages divide by the batch counts, so an empty loader
    # would otherwise only surface as a ZeroDivisionError after a full training epoch.
    if len(train_dataloader) == 0 or len(val_dataloader) == 0:
        raise ValueError("train_dataloader and val_dataloader must each contain at least one batch.")

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    best_loss = float('inf')
    model.to(device)

    num_training_steps = epochs * len(train_dataloader)
    with tqdm(total=num_training_steps, desc="Fine-tuning") as pbar:
        for epoch in range(epochs):
            # Training phase
            model.train()
            train_loss = 0.0

            for batch in train_dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                optimizer.zero_grad()
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels)
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                pbar.update(1)

            avg_train_loss = train_loss / len(train_dataloader)
            print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}")

            # Validation phase
            model.eval()
            val_loss = 0.0
            correct_predictions = 0
            total_predictions = 0

            with torch.no_grad():
                for batch in val_dataloader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['label'].to(device)

                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                    logits = outputs.logits
                    val_loss += loss_fn(logits, labels).item()

                    _, preds = torch.max(logits, dim=1)
                    # Accumulate plain Python ints so the counter never lives on the device
                    correct_predictions += int((preds == labels).sum().item())
                    total_predictions += labels.size(0)

            avg_val_loss = val_loss / len(val_dataloader)
            val_accuracy = correct_predictions / total_predictions
            print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

            # Save the best model checkpoint
            if avg_val_loss < best_loss:
                best_loss = avg_val_loss
                torch.save(model.state_dict(), checkpoint_path)
                print(f"Best model saved with Validation Loss: {avg_val_loss:.4f}")

def save_embeddings(dataloader, model, device, output_file="fine_tuned_embeddings.pt"):
    """Run the model's encoder over ``dataloader`` and save one summary vector per example.

    The encoder is taken from the model's ``bert``, ``roberta`` or ``xlnet`` attribute.
    The saved file is a dict with ``'embeddings'`` (CPU tensor, one row per example)
    and ``'labels'`` (NumPy array in the same order), written with ``torch.save``.

    Args:
        dataloader: Yields dicts with ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        model: Classification model wrapping one of the recognized encoders.
        device: Torch device to run on; the model is moved there in place.
        output_file: Destination path of the saved dict.

    Raises:
        AttributeError: If the model has no recognized encoder attribute, or the encoder
            output exposes neither ``last_hidden_state`` nor ``hidden_states``.
    """
    model.eval()
    embeddings_list = []
    labels_list = []

    # Determine the encoder attribute dynamically, together with the position of the
    # classification token. BERT/RoBERTa put it first; XLNet appends <cls> at the end of
    # the sequence and its tokenizer pads on the left by default, so position 0 would be
    # padding there and the summary vector sits at the last position instead.
    for attr_name, cls_position in (('bert', 0), ('roberta', 0), ('xlnet', -1)):
        if hasattr(model, attr_name):
            encoder = getattr(model, attr_name)
            break
    else:
        raise AttributeError("Model does not have a recognized encoder attribute.")

    # Inputs are moved to `device` below, so the weights must live there too
    model.to(device)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Saving Embeddings"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are only stored, never fed to the model, so they stay on the CPU
            labels = batch['label'].cpu().numpy()

            # Get the encoder outputs
            outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)
            # Depending on the model, the attribute might be 'last_hidden_state' or different
            if hasattr(outputs, 'last_hidden_state'):
                last_hidden_state = outputs.last_hidden_state
            elif hasattr(outputs, 'hidden_states'):
                last_hidden_state = outputs.hidden_states[-1]
            else:
                raise AttributeError("Encoder output does not have 'last_hidden_state' or 'hidden_states'.")

            cls_embeddings = last_hidden_state[:, cls_position, :]
            embeddings_list.append(cls_embeddings.cpu())
            labels_list.extend(labels)

    embeddings = torch.cat(embeddings_list, dim=0)
    labels = np.array(labels_list)
    torch.save({'embeddings': embeddings, 'labels': labels}, output_file)
    print(f"Embeddings and labels saved to {output_file}")


In [10]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Dataset CSV on the mounted Google Drive
file_path = '/content/drive/MyDrive/finalDataSet.csv'
# 80/10/10 train/val/test split plus the fitted label encoder and the class count
X_train, Y_train, X_val, Y_val, X_test, Y_test, label_encoder, num_labels = prepare_data(file_path)
model_name = 'tbs17/MathBERT'  # pretrained checkpoint identifier passed to from_pretrained
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels sizes the classification output to the number of encoded MisconceptionId classes
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)


train_dataloader, val_dataloader, test_dataloader = create_dataloaders(
    X_train, Y_train, X_val, Y_val, X_test, Y_test, tokenizer, batch_size=32
)

# Fine-tune, then export test-split embeddings. The embeddings come from the model as it
# stands after the last epoch; the best checkpoint is only written to disk, not reloaded.
train_and_evaluate(train_dataloader, val_dataloader, model, device, epochs=10, learning_rate=2e-5)
save_embeddings(test_dataloader, model, device, output_file="fine_tuned_test_embeddings.pt")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at tbs17/MathBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine-tuning:   0%|          | 0/1100 [00:00<?, ?it/s]

Epoch 1/10, Train Loss: 7.2886
Epoch 1/10, Validation Loss: 7.1638, Validation Accuracy: 0.0297
Best model saved with Validation Loss: 7.1638
Epoch 2/10, Train Loss: 6.9070
Epoch 2/10, Validation Loss: 6.9138, Validation Accuracy: 0.0503
Best model saved with Validation Loss: 6.9138
Epoch 3/10, Train Loss: 6.5089
Epoch 3/10, Validation Loss: 6.6293, Validation Accuracy: 0.1076
Best model saved with Validation Loss: 6.6293
Epoch 4/10, Train Loss: 6.1036
Epoch 4/10, Validation Loss: 6.3748, Validation Accuracy: 0.1442
Best model saved with Validation Loss: 6.3748
Epoch 5/10, Train Loss: 5.7562
Epoch 5/10, Validation Loss: 6.1489, Validation Accuracy: 0.1739
Best model saved with Validation Loss: 6.1489
Epoch 6/10, Train Loss: 5.4340
Epoch 6/10, Validation Loss: 6.0001, Validation Accuracy: 0.1854
Best model saved with Validation Loss: 6.0001
Epoch 7/10, Train Loss: 5.1436
Epoch 7/10, Validation Loss: 5.8004, Validation Accuracy: 0.2105
Best model saved with Validation Loss: 5.8004
Epoch 

Saving Embeddings:   0%|          | 0/14 [00:00<?, ?it/s]

Embeddings and labels saved to fine_tuned_test_embeddings.pt


In [11]:
import os
# Show the directory that relative output paths (checkpoint, embeddings) resolve against
print("Current Working Directory:", os.getcwd())

Current Working Directory: /content


In [13]:
from google.colab import files
# Download the test-split embeddings file from the Colab runtime
files.download('fine_tuned_test_embeddings.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
def save_embeddings(dataloader, model, device, output_file="fine_tuned_embeddings.pt"):
    """Run the model's encoder over ``dataloader`` and save one summary vector per example.

    The encoder is taken from the model's ``bert``, ``roberta`` or ``xlnet`` attribute.
    The saved file is a dict with ``'embeddings'`` (CPU tensor, one row per example)
    and ``'labels'`` (NumPy array in the same order), written with ``torch.save``.

    Args:
        dataloader: Yields dicts with ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        model: Classification model wrapping one of the recognized encoders.
        device: Torch device to run on; the model is moved there in place.
        output_file: Destination path of the saved dict.

    Raises:
        AttributeError: If the model has no recognized encoder attribute, or the encoder
            output exposes neither ``last_hidden_state`` nor ``hidden_states``.
    """
    model.eval()
    embeddings_list = []
    labels_list = []

    # Determine the encoder attribute dynamically, together with the position of the
    # classification token. BERT/RoBERTa put it first; XLNet appends <cls> at the end of
    # the sequence and its tokenizer pads on the left by default, so position 0 would be
    # padding there and the summary vector sits at the last position instead.
    for attr_name, cls_position in (('bert', 0), ('roberta', 0), ('xlnet', -1)):
        if hasattr(model, attr_name):
            encoder = getattr(model, attr_name)
            break
    else:
        raise AttributeError("Model does not have a recognized encoder attribute.")

    # Inputs are moved to `device` below, so the weights must live there too
    model.to(device)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Saving Embeddings"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Labels are only stored, never fed to the model, so they stay on the CPU
            # instead of being copied to the device and straight back.
            labels = batch['label'].cpu().numpy()

            # Get the encoder outputs
            outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)
            # Depending on the model, the attribute might be 'last_hidden_state' or different
            if hasattr(outputs, 'last_hidden_state'):
                last_hidden_state = outputs.last_hidden_state
            elif hasattr(outputs, 'hidden_states'):
                last_hidden_state = outputs.hidden_states[-1]
            else:
                raise AttributeError("Encoder output does not have 'last_hidden_state' or 'hidden_states'.")

            cls_embeddings = last_hidden_state[:, cls_position, :]  # classification-token embeddings
            embeddings_list.append(cls_embeddings.cpu())
            labels_list.extend(labels)

    embeddings = torch.cat(embeddings_list, dim=0)
    labels = np.array(labels_list)
    torch.save({'embeddings': embeddings, 'labels': labels}, output_file)
    print(f"Embeddings and labels saved to {output_file}")

# Make sure the model weights are on the same device the batches will be moved to
model.to(device)
# Export embeddings for the train and validation splits. The train loader was built with
# shuffle=True, so row order is random; labels are saved alongside in the same order.
save_embeddings(train_dataloader, model, device, output_file="fine_tuned_train_embeddings.pt")
save_embeddings(val_dataloader, model, device, output_file="fine_tuned_val_embeddings.pt")

Saving Embeddings:   0%|          | 0/110 [00:00<?, ?it/s]

Embeddings and labels saved to fine_tuned_train_embeddings.pt


Saving Embeddings:   0%|          | 0/14 [00:00<?, ?it/s]

Embeddings and labels saved to fine_tuned_val_embeddings.pt


In [22]:
# Download the train and validation embeddings; `files` comes from the earlier
# `from google.colab import files` cell, so that cell must have been run first.
files.download('fine_tuned_train_embeddings.pt')
files.download('fine_tuned_val_embeddings.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>