In [25]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from google.colab import drive
from sklearn.model_selection import train_test_split

# Expose Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive')

# Location of the saved embedding tensor on Drive
# (file name suggests MathBERT outputs -- TODO confirm provenance).
file_path1 = '/content/drive/MyDrive/mathbert_embeddings.pt'

# Deserialize onto the CPU regardless of the device the tensor was saved from;
# weights_only=True asks torch.load to use its restricted unpickler.
embeddings = torch.load(
    file_path1,
    map_location='cpu',
    weights_only=True,
)
print("Loaded embeddings shape:", embeddings.shape)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded embeddings shape: torch.Size([4370, 768])


In [26]:
import pandas as pd
file_path2 = '/content/drive/MyDrive/finalDataSet.csv'
df = pd.read_csv(file_path2)

# Target labels: the MisconceptionId column cast to integers, one per CSV row.
Y = df['MisconceptionId'].astype(int)
print(Y[:5])

# Selecting a single column and calling astype always yields a pandas Series,
# so the labels can be converted to a tensor directly (no type check needed).
Y_tensor = torch.tensor(Y.values)

# Fail early with a readable message if labels and embeddings disagree in
# length. Assumes row i of `embeddings` belongs to row i of the CSV -- TODO
# confirm against the code that produced the embeddings file.
if Y_tensor.shape[0] != embeddings.shape[0]:
    raise ValueError(
        f"Label count ({Y_tensor.shape[0]}) does not match number of "
        f"embedding rows ({embeddings.shape[0]})"
    )

# First split: 80% train, 20% held out. The fixed random_state keeps the
# partition identical between runs.
train_embeddings, val_test_embeddings, train_Y, val_test_Y = train_test_split(
    embeddings, Y_tensor, test_size=0.2, random_state=42
)

# Second split: halve the held-out 20% into 10% validation and 10% test.
val_embeddings, test_embeddings, val_Y, test_Y = train_test_split(
    val_test_embeddings, val_test_Y, test_size=0.5, random_state=42
)


0    2142
1    1287
2    1180
3     686
4     329
Name: MisconceptionId, dtype: int64


In [36]:
import torch.nn as nn

# Define a simple linear classifier model
class LinearClassifier(nn.Module):
    """A single fully connected layer used as a classifier head.

    Args:
        input_dim: Size of the last dimension of the input features.
        output_dim: Number of scores produced per input row.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # The attribute name `linear` also determines the state_dict keys
        # ("linear.weight", "linear.bias").
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the linear layer to `x` and return the raw (unnormalized) scores."""
        logits = self.linear(x)
        return logits

# Input width of the classifier = size of the embeddings' second dimension.
input_dim = embeddings.size(1)

# Largest label value present in Y_tensor, as a plain Python int.
max_label_value = int(Y_tensor.max())

# The output layer gets max_label_value + 1 units so that every label value in
# Y_tensor is a valid class index for the loss, even if some indices in
# between never occur as labels.
model = LinearClassifier(input_dim, max_label_value + 1)


In [37]:
import torch.optim as optim
from sklearn.metrics import accuracy_score

learning_rate = 1e-4
batch_size = 32
num_epochs = 250
val_interval = 25  # validate (and print progress) every `val_interval` epochs

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


def predict_and_score(net, inputs, targets):
    """Run `net` on `inputs` in eval mode and score its top-1 predictions.

    Args:
        net: Module whose forward pass returns one row of class scores per
            input row.
        inputs: Tensor passed unchanged to `net`.
        targets: Tensor of integer class labels, one per row of `inputs`.

    Returns:
        A tuple `(outputs, preds, accuracy)`: the raw scores, the index of the
        highest score in each row, and the fraction of rows where that index
        equals the target.
    """
    net.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = net(inputs)
        # argmax(dim=1) is the index tensor that torch.max(outputs, dim=1)
        # returns as its second value.
        preds = outputs.argmax(dim=1)
    accuracy = accuracy_score(targets.cpu(), preds.cpu())
    return outputs, preds, accuracy


for epoch in range(num_epochs):
    epoch_loss = 0.0
    model.train()  # Back to training mode (validation switches to eval mode)

    # Shuffle training data at the beginning of each epoch.
    # NOTE(review): no seed is set in this cell, so the permutation depends on
    # the global torch RNG state at the time the cell runs.
    perm = torch.randperm(train_embeddings.shape[0])
    train_embeddings_shuffled = train_embeddings[perm]
    train_Y_shuffled = train_Y[perm]

    # Process inputs in batches for training; the final batch may be smaller
    # than batch_size.
    for i in range(0, train_embeddings.shape[0], batch_size):
        batch_embeddings = train_embeddings_shuffled[i:i + batch_size]
        batch_labels = train_Y_shuffled[i:i + batch_size]

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_embeddings)

        # Compute loss
        loss = criterion(outputs, batch_labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate the loss. This is a running SUM of per-batch losses, not
        # a per-sample average, so the printed value scales with the number of
        # batches.
        epoch_loss += loss.item()

    # Periodic validation on the held-out validation split
    if (epoch + 1) % val_interval == 0:
        val_outputs, val_preds, val_accuracy = predict_and_score(
            model, val_embeddings, val_Y
        )
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

# Evaluate the final model once on the test set
test_outputs, test_preds, test_accuracy = predict_and_score(
    model, test_embeddings, test_Y
)
print(f"Test Accuracy: {test_accuracy:.4f}")


Epoch [25/250], Loss: 250.0403, Validation Accuracy: 0.2151
Epoch [50/250], Loss: 118.6969, Validation Accuracy: 0.2334
Epoch [75/250], Loss: 78.3719, Validation Accuracy: 0.2311
Epoch [100/250], Loss: 58.9213, Validation Accuracy: 0.2494
Epoch [125/250], Loss: 47.4682, Validation Accuracy: 0.2563
Epoch [150/250], Loss: 39.2452, Validation Accuracy: 0.2494
Epoch [175/250], Loss: 33.5048, Validation Accuracy: 0.2563
Epoch [200/250], Loss: 29.2119, Validation Accuracy: 0.2586
Epoch [225/250], Loss: 25.9719, Validation Accuracy: 0.2586
Epoch [250/250], Loss: 23.2783, Validation Accuracy: 0.2586
Test Accuracy: 0.2609
