In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
import math
import os

# Load the tokenized dataset
df = pd.read_csv("/content/tokenized_equations.csv")

# Ensure the 'tokens' column is present
if "tokens" not in df.columns:
    raise ValueError("The CSV file must contain a 'tokens' column with tokenized sequences.")

# Tokenize: each non-null row is a single string whose tokens are separated by " | "
tokenized_equations = [row.split(" | ") for row in df["tokens"].dropna()]
all_tokens = [token for seq in tokenized_equations for token in seq]

# Define special tokens
special_tokens = ["<PAD>", "<UNK>", "<EOS>"]

# Build the vocabulary with the special tokens first so "<PAD>" is always id 0,
# which is the id the padding helper uses by default. Enumerating a raw set
# would hand out ids in arbitrary (per-process) order, so padding could collide
# with a real token. Sorting the remaining tokens makes the mapping reproducible.
regular_tokens = sorted(set(all_tokens) - set(special_tokens))
vocab = {token: idx for idx, token in enumerate(special_tokens + regular_tokens)}

# Encode tokenized sequences into numerical format
def encode_sequence(sequence, vocab):
    """Translate tokens to vocabulary ids and terminate the result with EOS.

    Tokens missing from ``vocab`` are replaced by the id of ``"<UNK>"``.
    The returned list is one element longer than ``sequence`` because the
    id of ``"<EOS>"`` is appended at the end.
    """
    ids = []
    for token in sequence:
        ids.append(vocab.get(token, vocab["<UNK>"]))
    ids.append(vocab["<EOS>"])  # Append EOS
    return ids

# Encode every equation, then derive the next-token training pairs from it:
# the input drops the final id and the target drops the first id, so
# target[i] is the id that follows input[i].
encoded_sequences = []
input_sequences = []
output_sequences = []
for equation_tokens in tokenized_equations:
    equation_ids = encode_sequence(equation_tokens, vocab)
    encoded_sequences.append(equation_ids)
    input_sequences.append(equation_ids[:-1])  # Remove last token
    output_sequences.append(equation_ids[1:])  # Shift left by 1 token

# Length every sequence is padded to: the longest input sequence
max_len = max(map(len, input_sequences))
def pad_sequence(seq, max_len, pad_token=0):
    """Right-pad ``seq`` with ``pad_token`` until it holds ``max_len`` items.

    A new list is returned and ``seq`` itself is left untouched. When ``seq``
    already has ``max_len`` or more items nothing is appended (a non-positive
    repeat count yields an empty filler), so the result is not truncated.
    """
    filler = [pad_token] * (max_len - len(seq))
    return seq + filler

# Pad with the vocabulary's own <PAD> id instead of relying on the helper's
# default of 0: the loss is configured to ignore vocab["<PAD>"], so the padded
# positions are only skipped when exactly that id is used here.
pad_id = vocab["<PAD>"]
input_sequences = [pad_sequence(seq, max_len, pad_token=pad_id) for seq in input_sequences]
output_sequences = [pad_sequence(seq, max_len, pad_token=pad_id) for seq in output_sequences]

# Convert to PyTorch tensors (all rows now share the length max_len)
input_tensor = torch.tensor(input_sequences, dtype=torch.long)
output_tensor = torch.tensor(output_sequences, dtype=torch.long)

# Train-test split: the first 80% of rows train, the remainder validates.
# Rows are taken in file order; nothing is shuffled before the split.
split_idx = int(0.8 * len(input_tensor))
train_inputs, val_inputs = input_tensor[:split_idx], input_tensor[split_idx:]
train_outputs, val_outputs = output_tensor[:split_idx], output_tensor[split_idx:]

# PyTorch Dataset & DataLoader
class EquationDataset(Dataset):
    """Dataset that pairs each input sequence with its target by position."""

    def __init__(self, inputs, outputs):
        # Both containers are stored as given; len() and indexing are
        # delegated to them.
        self.inputs, self.outputs = inputs, outputs

    def __len__(self):
        """Return the number of samples, as reported by the inputs container."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        source = self.inputs[idx]
        target = self.outputs[idx]
        return source, target

# Wrap the two splits so they can be batched
train_dataset = EquationDataset(inputs=train_inputs, outputs=train_outputs)
val_dataset = EquationDataset(inputs=val_inputs, outputs=val_outputs)

# Batches of 32; only the training loader reshuffles, validation order is fixed
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

# Sinusoidal Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A table of shape ``(1, max_len, embed_dim)`` is computed once and stored
    as the non-trainable buffer ``pe``: even columns hold sines and odd
    columns hold cosines of ``position * frequency``.

    Args:
        embed_dim: Size of the last (embedding) dimension. Both even and odd
            sizes are supported.
        max_len: Number of positions the table covers.
    """

    def __init__(self, embed_dim, max_len=100):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        angles = position * div_term
        pe = torch.zeros(1, max_len, embed_dim)
        pe[0, :, 0::2] = torch.sin(angles)
        # With an odd embed_dim there is one fewer odd column than even
        # columns, so drop the surplus frequency rather than fail on a shape
        # mismatch. For even embed_dim this slice keeps every column.
        pe[0, :, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.shape[1]`` positions.

        Args:
            x: Tensor whose dimension 1 indexes positions and whose last
                dimension is ``embed_dim``.

        Raises:
            ValueError: If ``x`` has more positions than the table holds.
        """
        seq_len = x.shape[1]
        table_len = self.pe.shape[1]
        if seq_len > table_len:
            # Without this check the addition fails with an opaque broadcast error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional table length {table_len}."
            )
        return x + self.pe[:, :seq_len]

# Transformer Model
class TransformerModel(nn.Module):
    """Next-token predictor: embedding -> positions -> encoder stack -> logits.

    The encoder stack is run with a causal (upper-triangular) attention mask.
    The training targets are the inputs shifted left by one token, so without
    the mask position ``i`` could attend to input token ``i + 1``, which is
    exactly the token it is asked to predict.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embed_dim: Width of the token embeddings.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Feed-forward width inside each encoder layer.
        max_positions: Length of the positional-encoding table. When ``None``
            the module-level ``max_len`` (the padded sequence length) is used.
    """

    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=3, hidden_dim=256,
                 max_positions=None):
        super().__init__()
        if max_positions is None:
            # Fall back to the padded length computed during preprocessing.
            max_positions = max_len
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(embed_dim, max_positions)
        encoder_layers = nn.TransformerEncoderLayer(embed_dim, num_heads, hidden_dim, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape ``(batch, seq_len, vocab_size)`` for token ids ``x``.

        Args:
            x: Integer token ids of shape ``(batch, seq_len)``.
        """
        seq_len = x.shape[1]
        # -inf strictly above the diagonal: position i may attend to j <= i only.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"), device=x.device),
            diagonal=1,
        )
        x = self.embedding(x)
        x = self.positional_encoding(x)
        x = self.transformer(x, mask=causal_mask)
        x = self.fc(x)
        return x

# Pick the device and place a freshly initialised model on it
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = TransformerModel(vocab_size=len(vocab))
model = model.to(device)

# Loss (padding targets are excluded), optimiser and step-wise LR decay
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=0.0006)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9)

# Number of full passes over the training data
num_epochs = 60

for epoch in range(num_epochs):
    # ---- optimisation pass over the training batches ----
    model.train()
    total_loss = 0
    for inputs, targets in tqdm(train_loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        # The loss expects the class dimension second: (batch, vocab, seq)
        logits = model(inputs).transpose(1, 2)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # ---- gradient-free pass over the validation batches ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs).transpose(1, 2)
            loss = criterion(logits, targets)
            val_loss += loss.item()

    # The learning-rate schedule advances once per epoch
    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {total_loss/len(train_loader):.4f} - Val Loss: {val_loss/len(val_loader):.4f}")

# Persist the final weights and the token->id mapping side by side
os.makedirs("/mnt/data", exist_ok=True)
for payload, destination in (
    (model.state_dict(), "/mnt/data/transformer_model.pth"),
    (vocab, "/mnt/data/vocab.pkl"),
):
    torch.save(payload, destination)
print("Model saved at /mnt/data/transformer_model.pth")
print("Vocabulary saved at /mnt/data/vocab.pkl")


100%|██████████| 3/3 [00:00<00:00, 12.82it/s]


Epoch 1/60 - Train Loss: 3.9520 - Val Loss: 2.7216


100%|██████████| 3/3 [00:00<00:00, 11.31it/s]


Epoch 2/60 - Train Loss: 2.1771 - Val Loss: 2.1778


100%|██████████| 3/3 [00:00<00:00, 11.97it/s]


Epoch 3/60 - Train Loss: 1.7553 - Val Loss: 1.9370


100%|██████████| 3/3 [00:00<00:00, 13.42it/s]


Epoch 4/60 - Train Loss: 1.5909 - Val Loss: 1.7249


100%|██████████| 3/3 [00:00<00:00, 12.75it/s]


Epoch 5/60 - Train Loss: 1.4084 - Val Loss: 1.5423


100%|██████████| 3/3 [00:00<00:00, 12.63it/s]


Epoch 6/60 - Train Loss: 1.2783 - Val Loss: 1.4258


100%|██████████| 3/3 [00:00<00:00, 12.52it/s]


Epoch 7/60 - Train Loss: 1.1367 - Val Loss: 1.3559


100%|██████████| 3/3 [00:00<00:00, 13.23it/s]


Epoch 8/60 - Train Loss: 1.0903 - Val Loss: 1.2792


100%|██████████| 3/3 [00:00<00:00, 13.16it/s]


Epoch 9/60 - Train Loss: 1.0389 - Val Loss: 1.2156


100%|██████████| 3/3 [00:00<00:00, 12.65it/s]


Epoch 10/60 - Train Loss: 0.9910 - Val Loss: 1.1624


100%|██████████| 3/3 [00:00<00:00, 12.66it/s]


Epoch 11/60 - Train Loss: 0.9506 - Val Loss: 1.1219


100%|██████████| 3/3 [00:00<00:00, 12.65it/s]


Epoch 12/60 - Train Loss: 0.9378 - Val Loss: 1.0924


100%|██████████| 3/3 [00:00<00:00, 13.06it/s]


Epoch 13/60 - Train Loss: 0.8539 - Val Loss: 1.0647


100%|██████████| 3/3 [00:00<00:00, 12.42it/s]


Epoch 14/60 - Train Loss: 0.8334 - Val Loss: 1.0411


100%|██████████| 3/3 [00:00<00:00, 12.79it/s]


Epoch 15/60 - Train Loss: 0.8301 - Val Loss: 1.0208


100%|██████████| 3/3 [00:00<00:00, 13.62it/s]


Epoch 16/60 - Train Loss: 0.8286 - Val Loss: 1.0037


100%|██████████| 3/3 [00:00<00:00, 13.30it/s]


Epoch 17/60 - Train Loss: 0.7526 - Val Loss: 0.9837


100%|██████████| 3/3 [00:00<00:00, 12.03it/s]


Epoch 18/60 - Train Loss: 0.7517 - Val Loss: 0.9672


100%|██████████| 3/3 [00:00<00:00, 11.98it/s]


Epoch 19/60 - Train Loss: 0.7390 - Val Loss: 0.9503


100%|██████████| 3/3 [00:00<00:00, 13.09it/s]


Epoch 20/60 - Train Loss: 0.7202 - Val Loss: 0.9340


100%|██████████| 3/3 [00:00<00:00, 12.49it/s]


Epoch 21/60 - Train Loss: 0.7061 - Val Loss: 0.9213


100%|██████████| 3/3 [00:00<00:00, 12.23it/s]


Epoch 22/60 - Train Loss: 0.6643 - Val Loss: 0.9065


100%|██████████| 3/3 [00:00<00:00, 12.70it/s]


Epoch 23/60 - Train Loss: 0.6497 - Val Loss: 0.8938


100%|██████████| 3/3 [00:00<00:00, 12.92it/s]


Epoch 24/60 - Train Loss: 0.6620 - Val Loss: 0.8855


100%|██████████| 3/3 [00:00<00:00, 13.00it/s]


Epoch 25/60 - Train Loss: 0.6470 - Val Loss: 0.8769


100%|██████████| 3/3 [00:00<00:00, 12.60it/s]


Epoch 26/60 - Train Loss: 0.6320 - Val Loss: 0.8701


100%|██████████| 3/3 [00:00<00:00, 12.51it/s]


Epoch 27/60 - Train Loss: 0.6252 - Val Loss: 0.8620


100%|██████████| 3/3 [00:00<00:00, 12.28it/s]


Epoch 28/60 - Train Loss: 0.6138 - Val Loss: 0.8553


100%|██████████| 3/3 [00:00<00:00, 13.24it/s]


Epoch 29/60 - Train Loss: 0.6074 - Val Loss: 0.8504


100%|██████████| 3/3 [00:00<00:00, 12.24it/s]


Epoch 30/60 - Train Loss: 0.6095 - Val Loss: 0.8434


100%|██████████| 3/3 [00:00<00:00, 12.49it/s]


Epoch 31/60 - Train Loss: 0.5853 - Val Loss: 0.8358


100%|██████████| 3/3 [00:00<00:00, 12.97it/s]


Epoch 32/60 - Train Loss: 0.5936 - Val Loss: 0.8298


100%|██████████| 3/3 [00:00<00:00, 12.65it/s]


Epoch 33/60 - Train Loss: 0.5664 - Val Loss: 0.8227


100%|██████████| 3/3 [00:00<00:00, 12.21it/s]


Epoch 34/60 - Train Loss: 0.5532 - Val Loss: 0.8170


100%|██████████| 3/3 [00:00<00:00, 12.65it/s]


Epoch 35/60 - Train Loss: 0.5334 - Val Loss: 0.8122


100%|██████████| 3/3 [00:00<00:00, 12.69it/s]


Epoch 36/60 - Train Loss: 0.5223 - Val Loss: 0.8049


100%|██████████| 3/3 [00:00<00:00, 12.45it/s]


Epoch 37/60 - Train Loss: 0.5281 - Val Loss: 0.7991


100%|██████████| 3/3 [00:00<00:00,  9.01it/s]


Epoch 38/60 - Train Loss: 0.5203 - Val Loss: 0.7964


100%|██████████| 3/3 [00:00<00:00,  9.51it/s]


Epoch 39/60 - Train Loss: 0.5144 - Val Loss: 0.7939


100%|██████████| 3/3 [00:00<00:00,  9.09it/s]


Epoch 40/60 - Train Loss: 0.5107 - Val Loss: 0.7872


100%|██████████| 3/3 [00:00<00:00, 10.11it/s]


Epoch 41/60 - Train Loss: 0.4818 - Val Loss: 0.7821


100%|██████████| 3/3 [00:00<00:00,  9.15it/s]


Epoch 42/60 - Train Loss: 0.4566 - Val Loss: 0.7817


100%|██████████| 3/3 [00:00<00:00,  9.81it/s]


Epoch 43/60 - Train Loss: 0.4779 - Val Loss: 0.7780


100%|██████████| 3/3 [00:00<00:00, 12.48it/s]


Epoch 44/60 - Train Loss: 0.4794 - Val Loss: 0.7752


100%|██████████| 3/3 [00:00<00:00, 12.47it/s]


Epoch 45/60 - Train Loss: 0.4769 - Val Loss: 0.7744


100%|██████████| 3/3 [00:00<00:00, 12.74it/s]


Epoch 46/60 - Train Loss: 0.4605 - Val Loss: 0.7692


100%|██████████| 3/3 [00:00<00:00, 12.97it/s]


Epoch 47/60 - Train Loss: 0.4432 - Val Loss: 0.7657


100%|██████████| 3/3 [00:00<00:00, 12.83it/s]


Epoch 48/60 - Train Loss: 0.4324 - Val Loss: 0.7653


100%|██████████| 3/3 [00:00<00:00, 13.06it/s]


Epoch 49/60 - Train Loss: 0.4215 - Val Loss: 0.7673


100%|██████████| 3/3 [00:00<00:00, 13.08it/s]


Epoch 50/60 - Train Loss: 0.4059 - Val Loss: 0.7579


100%|██████████| 3/3 [00:00<00:00, 13.10it/s]


Epoch 51/60 - Train Loss: 0.4142 - Val Loss: 0.7537


100%|██████████| 3/3 [00:00<00:00, 13.30it/s]


Epoch 52/60 - Train Loss: 0.4089 - Val Loss: 0.7549


100%|██████████| 3/3 [00:00<00:00, 12.15it/s]


Epoch 53/60 - Train Loss: 0.3833 - Val Loss: 0.7579


100%|██████████| 3/3 [00:00<00:00, 12.48it/s]


Epoch 54/60 - Train Loss: 0.3920 - Val Loss: 0.7575


100%|██████████| 3/3 [00:00<00:00, 12.62it/s]


Epoch 55/60 - Train Loss: 0.3872 - Val Loss: 0.7522


100%|██████████| 3/3 [00:00<00:00, 12.62it/s]


Epoch 56/60 - Train Loss: 0.3784 - Val Loss: 0.7492


100%|██████████| 3/3 [00:00<00:00, 12.46it/s]


Epoch 57/60 - Train Loss: 0.3568 - Val Loss: 0.7554


100%|██████████| 3/3 [00:00<00:00, 13.29it/s]


Epoch 58/60 - Train Loss: 0.3681 - Val Loss: 0.7552


100%|██████████| 3/3 [00:00<00:00, 12.09it/s]


Epoch 59/60 - Train Loss: 0.3631 - Val Loss: 0.7515


100%|██████████| 3/3 [00:00<00:00, 12.88it/s]

Epoch 60/60 - Train Loss: 0.3559 - Val Loss: 0.7531
Model saved at /mnt/data/transformer_model.pth
Vocabulary saved at /mnt/data/vocab.pkl





In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# --- Load Model and Vocab ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load vocabulary
# weights_only=True restricts unpickling to tensors, primitives and plain
# containers, which is all a token->id dict needs; it avoids executing
# arbitrary pickled code and the warning torch emits when the flag is omitted.
vocab = torch.load("/mnt/data/vocab.pkl", weights_only=True)
vocab_size = len(vocab)

# Define Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A table of shape ``(1, max_len, embed_dim)`` is computed once and stored
    as the non-trainable buffer ``pe``: even columns hold sines and odd
    columns hold cosines of ``position * frequency``.

    Args:
        embed_dim: Size of the last (embedding) dimension. Both even and odd
            sizes are supported.
        max_len: Number of positions the table covers.
    """

    def __init__(self, embed_dim, max_len=100):
        # Imported here because this cell's own import list does not include
        # math; the class then works even when the cell is run on its own.
        import math

        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        angles = position * div_term
        pe = torch.zeros(1, max_len, embed_dim)
        pe[0, :, 0::2] = torch.sin(angles)
        # With an odd embed_dim there is one fewer odd column than even
        # columns, so drop the surplus frequency rather than fail on a shape
        # mismatch. For even embed_dim this slice keeps every column.
        pe[0, :, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.shape[1]`` positions.

        Args:
            x: Tensor whose dimension 1 indexes positions and whose last
                dimension is ``embed_dim``.

        Raises:
            ValueError: If ``x`` has more positions than the table holds.
        """
        seq_len = x.shape[1]
        table_len = self.pe.shape[1]
        if seq_len > table_len:
            # Without this check the addition fails with an opaque broadcast error.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional table length {table_len}."
            )
        return x + self.pe[:, :seq_len]

# Define Transformer Model
class TransformerModel(nn.Module):
    """Next-token predictor: embedding -> positions -> encoder stack -> logits.

    The encoder stack is run with a causal (upper-triangular) attention mask
    so that the prediction at position ``i`` cannot attend to later input
    tokens, which in a shifted-by-one next-token setup include the target
    itself. NOTE: weights loaded into this model are only meaningful if they
    were trained with the same masking.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output logits).
        embed_dim: Width of the token embeddings.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Feed-forward width inside each encoder layer.
        max_positions: Length of the positional-encoding table. It must equal
            the length used when the checkpoint was saved, because the table
            is a buffer stored in the state_dict. Defaults to 46.
    """

    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=3, hidden_dim=256,
                 max_positions=46):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(embed_dim, max_len=max_positions)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(embed_dim, num_heads, hidden_dim, batch_first=True),
            num_layers
        )
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape ``(batch, seq_len, vocab_size)`` for token ids ``x``.

        Args:
            x: Integer token ids of shape ``(batch, seq_len)``.
        """
        seq_len = x.shape[1]
        # -inf strictly above the diagonal: position i may attend to j <= i only.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"), device=x.device),
            diagonal=1,
        )
        x = self.embedding(x)
        x = self.positional_encoding(x)
        x = self.transformer(x, mask=causal_mask)
        return self.fc(x)

# Load Model
model = TransformerModel(vocab_size).to(device)
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict holds; it avoids executing arbitrary pickled code.
state_dict = torch.load("/mnt/data/transformer_model.pth", map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()  # inference mode for every pass below (disables dropout)

# --- Load Data ---
# NOTE: train_dataset and val_dataset are not created in this cell; they must
# already exist in the running session.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# --- Define Loss Function ---
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<PAD>"])  # Ignore padding tokens


def _evaluate(loader):
    """Run the model over ``loader`` without tracking gradients.

    Returns:
        A tuple ``(loss_sum, correct, tokens)``: the sum of the per-batch mean
        losses, the number of correctly predicted non-PAD target tokens, and
        the number of non-PAD target tokens.
    """
    pad_id = vocab["<PAD>"]
    loss_sum, correct, tokens = 0.0, 0, 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)  # (batch, seq_len, vocab_size)
            # The loss expects the class dimension second: (batch, vocab, seq)
            loss_sum += criterion(outputs.permute(0, 2, 1), targets).item()

            # Count hits only where the target is a real token, not padding
            predictions = torch.argmax(outputs, dim=-1)
            mask = targets != pad_id
            correct += ((predictions == targets) & mask).sum().item()
            tokens += mask.sum().item()
    return loss_sum, correct, tokens


# --- Evaluate Training Loss ---
total_train_loss, _, _ = _evaluate(train_loader)
# max(..., 1) keeps an empty loader from raising ZeroDivisionError
avg_train_loss = total_train_loss / max(len(train_loader), 1)

# --- Evaluate Validation Loss & Accuracy ---
total_val_loss, total_correct, total_tokens = _evaluate(val_loader)
avg_val_loss = total_val_loss / max(len(val_loader), 1)
accuracy = total_correct / total_tokens * 100 if total_tokens else 0.0

# --- Print Evaluation Results ---
print(f"Training Loss: {avg_train_loss:.4f}")
print(f"Validation Loss: {avg_val_loss:.4f}")
print(f"Validation Accuracy: {accuracy:.2f}%")


Training Loss: 0.3006
Validation Loss: 0.7531
Validation Accuracy: 80.98%


  vocab = torch.load("/mnt/data/vocab.pkl")
  model.load_state_dict(torch.load("/mnt/data/transformer_model.pth", map_location=device))
