## **Using Clustering to Check Whether a Mixture of Experts Is Feasible**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load the tokenized equations. Each entry of the "tokens" column holds one
# equation whose tokens are joined by " | ".
file_path = "/content/tokenized_equations.csv"
df = pd.read_csv(file_path)

# Rows without tokens cannot be vectorized (TF-IDF rejects NaN documents),
# so drop them and renumber the remaining rows.
df = df.dropna(subset=["tokens"]).reset_index(drop=True)

# Convert tokenized equations into numerical features using TF-IDF.
# The default analyzer lowercases the text and only keeps word tokens of two
# or more characters, which would silently discard operators ("+", "*", "(")
# and single-letter symbols ("x", "G"). Split on the dataset delimiter
# instead and keep the original case so that every token is a feature.
vectorizer = TfidfVectorizer(
    tokenizer=lambda text: text.split(" | "),
    token_pattern=None,  # unused when a custom tokenizer is supplied
    lowercase=False,
)
X = vectorizer.fit_transform(df["tokens"])

# Apply K-Means clustering with an arbitrary choice of k (e.g., 4 clusters)
k = 4
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df["cluster"] = kmeans.fit_predict(X)

# Display cluster assignments
output = df[["original_equation", "cluster"]].head(10)
print(output)

                                   original_equation  cluster
0                        exp(-theta**2/2)/sqrt(2*pi)        0
1        exp(-(theta/sigma)**2/2)/(sqrt(2*pi)*sigma)        0
2  exp(-((theta-theta1)/sigma)**2/2)/(sqrt(2*pi)*...        0
3                        sqrt((x2-x1)**2+(y2-y1)**2)        0
4         G*m1*m2/((x2-x1)**2+(y2-y1)**2+(z2-z1)**2)        0
5                              m_0/sqrt(1-v**2/c**2)        2
6                                  x1*y1+x2*y2+x3*y3        2
7                                              mu*Nn        2
8                        q1*q2*r/(4*pi*epsilon*r**3)        3
9                           q1*r/(4*pi*epsilon*r**3)        3


## **MoE Transformer (Motivated by the Discrete Clusters Found Above)**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import os
import pickle

#############################################
# 1. Vocabulary Creation / Loading
#############################################
vocab_path = "/mnt/data/vocab.pkl"

# Ensure the directory exists
os.makedirs("/mnt/data", exist_ok=True)

# If the vocabulary file exists, load it; otherwise, build from dataset
if os.path.exists(vocab_path):
    # NOTE: pickle.load can execute arbitrary code stored in the file; only
    # load vocabulary files that were written by this notebook.
    with open(vocab_path, "rb") as f:
        vocab = pickle.load(f)
else:
    # Load tokenized dataset to build vocabulary
    df = pd.read_csv("/content/tokenized_equations.csv")
    if "tokens" not in df.columns:
        raise ValueError("CSV file must contain a 'tokens' column with tokenized sequences.")
    # Split tokens by " | " and build the set of tokens
    tokenized_equations = [row.split(" | ") for row in df["tokens"].dropna()]
    # Define special tokens
    special_tokens = ["<PAD>", "<UNK>", "<EOS>"]
    # Remove any corpus token that collides with a special token: a duplicate
    # key would overwrite the reserved index and leave a gap in the id range,
    # making len(vocab) smaller than the largest id.
    all_tokens = {token for seq in tokenized_equations for token in seq} - set(special_tokens)
    # Sort the corpus tokens so the token -> id mapping is identical on every
    # rebuild (set iteration order for strings changes between interpreter runs).
    vocab = {token: idx for idx, token in enumerate(special_tokens + sorted(all_tokens))}
    with open(vocab_path, "wb") as f:
        pickle.dump(vocab, f)

#############################################
# 2. Data Preparation
#############################################
# Read the CSV again so this section also works when the vocabulary was
# loaded from disk and the dataset was therefore never read.
df = pd.read_csv("/content/tokenized_equations.csv")
if not ("tokens" in df.columns):
    raise ValueError("The CSV file must contain a 'tokens' column with tokenized sequences.")

# One token list per non-null row; tokens are delimited by " | "
tokenized_equations = [
    equation.split(" | ")
    for equation in df["tokens"].dropna()
]

# Encode a token sequence as vocabulary ids, terminated by the <EOS> id
def encode_sequence(sequence, vocab):
    """Return the ids of ``sequence`` followed by the ``<EOS>`` id.

    Args:
        sequence: Iterable of token strings.
        vocab: Mapping from token to integer id; must contain ``<EOS>`` and,
            for any unknown token, ``<UNK>``.

    Returns:
        A new list of ids. Tokens missing from ``vocab`` map to the
        ``<UNK>`` id.
    """
    token_ids = []
    for token in sequence:
        token_ids.append(vocab.get(token, vocab["<UNK>"]))
    token_ids.append(vocab["<EOS>"])
    return token_ids

# Turn every tokenized equation into a list of ids (each one ends with <EOS>)
encoded_sequences = [
    encode_sequence(tokens, vocab)
    for tokens in tokenized_equations
]

# Next-token prediction pairs: the target is the input shifted left by one,
# so the input drops the final id and the target drops the first id.
input_sequences = [ids[:-1] for ids in encoded_sequences]
output_sequences = [ids[1:] for ids in encoded_sequences]

# Length of the longest input; every sequence is padded up to this length
max_len = max(map(len, input_sequences))
def pad_sequence(seq, max_len, pad_token=vocab["<PAD>"]):
    """Return ``seq`` right-padded with ``pad_token`` to ``max_len`` items.

    The default ``pad_token`` is read from the module-level ``vocab`` once,
    when this function is defined. A sequence that already has ``max_len``
    or more items comes back as an unmodified copy.
    """
    missing = max_len - len(seq)
    padding = [pad_token] * missing  # empty when ``missing`` is <= 0
    return seq + padding

# Right-pad inputs and targets to the common length
input_sequences = [pad_sequence(seq, max_len) for seq in input_sequences]
output_sequences = [pad_sequence(seq, max_len) for seq in output_sequences]

# Convert sequences to PyTorch tensors
input_tensor = torch.tensor(input_sequences, dtype=torch.long)
output_tensor = torch.tensor(output_sequences, dtype=torch.long)

# Train/validation split (80/20).
# A dedicated, seeded generator makes the split identical on every run, so a
# checkpoint evaluated later is scored against the same held-out equations
# it was validated on. Using a private generator leaves the global RNG
# (weight initialisation, dropout) untouched.
split_generator = torch.Generator().manual_seed(42)
indices = torch.randperm(len(input_tensor), generator=split_generator)
split_idx = int(0.8 * len(input_tensor))
train_indices = indices[:split_idx]
val_indices = indices[split_idx:]

train_inputs, val_inputs = input_tensor[train_indices], input_tensor[val_indices]
train_outputs, val_outputs = output_tensor[train_indices], output_tensor[val_indices]

# Dataset pairing each input sequence with its target sequence
class EquationDataset(Dataset):
    """Index-aligned (input, target) pairs for next-token prediction.

    ``inputs`` and ``outputs`` are stored as given; item ``i`` of the dataset
    is the tuple ``(inputs[i], outputs[i])``.
    """

    def __init__(self, inputs, outputs):
        self.inputs, self.outputs = inputs, outputs

    def __len__(self):
        # The dataset size is defined by the inputs container
        n_samples = len(self.inputs)
        return n_samples

    def __getitem__(self, idx):
        sample = self.inputs[idx]
        target = self.outputs[idx]
        return sample, target

# Wrap the split tensors in datasets
train_dataset = EquationDataset(inputs=train_inputs, outputs=train_outputs)
val_dataset = EquationDataset(inputs=val_inputs, outputs=val_outputs)

# Batches of 32; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=32)

#############################################
# 3. Mixture of Experts Transformer Model
#############################################

# Define an individual expert transformer model
class TransformerExpert(nn.Module):
    """Causal transformer language model used as a single expert.

    Args:
        vocab_size: Number of token ids (size of the embedding table and of
            the output projection).
        embed_dim: Width of the token embeddings and transformer layers.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Width of each layer's feed-forward sub-network.
        max_len: Number of positions in the learnable positional table; the
            longest sequence ``forward`` accepts. The default of 100 keeps
            existing checkpoints loadable.
    """

    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=3, hidden_dim=256, max_len=100):
        super().__init__()
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_dim))  # Learnable positional encoding
        encoder_layers = nn.TransformerEncoderLayer(embed_dim, num_heads, hidden_dim, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return ``(logits, embedding)`` for a batch of token ids.

        Args:
            x: Long tensor of token ids, shape (batch_size, seq_len).

        Returns:
            logits: (batch_size, seq_len, vocab_size) next-token scores.
            embedding: (batch_size, seq_len, embed_dim) raw token embeddings
                (before positional encoding), returned for the gating network.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional table.
        """
        seq_len = x.shape[1]
        max_positions = self.positional_encoding.shape[1]
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {max_positions}"
            )

        # Get embedding for both model outputs and for gating network
        embedding = self.embedding(x)
        # Add positional encoding
        hidden = embedding + self.positional_encoding[:, :seq_len]

        # Causal mask (True = may not attend): position t only sees positions
        # <= t. Without it the encoder reads the very tokens it is trained to
        # predict, so the next-token loss is trivially leaked and training no
        # longer matches step-by-step generation. With right-padded batches
        # this also keeps real tokens from attending to trailing padding.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        hidden = self.transformer(hidden, mask=causal_mask)

        # Project to vocabulary size
        output = self.fc(hidden)
        return output, embedding

# Gating network: one weight per expert for each sequence in the batch
class GatingNetwork(nn.Module):
    """Two-layer MLP that turns pooled token embeddings into expert weights.

    Args:
        embed_dim: Size of the last dimension of the incoming embeddings.
        num_experts: Number of weights produced per sequence.
    """

    def __init__(self, embed_dim, num_experts):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, 64)
        self.fc2 = nn.Linear(64, num_experts)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return softmax-normalised expert weights, shape (batch_size, num_experts).

        ``x`` is expected as (batch_size, seq_len, embed_dim). It is averaged
        over dim 1; every position contributes equally (no masking is applied).
        """
        pooled = torch.mean(x, dim=1)  # (batch_size, embed_dim)
        hidden = self.dropout(torch.relu(self.fc1(pooled)))
        logits = self.fc2(hidden)
        return logits.softmax(dim=-1)

# Mixture of Experts Transformer
class MoETransformer(nn.Module):
    """Dense mixture of ``TransformerExpert`` models blended by a gate.

    Every expert processes every input; their logits are combined with one
    weight per expert and sequence, produced by a ``GatingNetwork``.

    Args:
        vocab_size: Vocabulary size passed to each expert.
        num_experts: Number of experts (and of gate outputs).
        embed_dim: Embedding width shared by the experts and the gate.
    """

    def __init__(self, vocab_size, num_experts=4, embed_dim=128):
        super().__init__()
        self.embed_dim = embed_dim
        expert_modules = [
            TransformerExpert(vocab_size, embed_dim=embed_dim)
            for _ in range(num_experts)
        ]
        self.experts = nn.ModuleList(expert_modules)
        self.gating = GatingNetwork(embed_dim=embed_dim, num_experts=num_experts)

    def forward(self, x):
        """Return blended logits of shape (batch_size, seq_len, vocab_size)."""
        # Each entry is the (logits, embedding) pair returned by one expert
        per_expert = [expert(x) for expert in self.experts]

        # (batch_size, seq_len, vocab_size, num_experts)
        stacked_logits = torch.stack([logits for logits, _ in per_expert], dim=-1)

        # Design choice: the gate only looks at the first expert's embedding
        # (the other experts' embeddings are not used for routing).
        first_embedding = per_expert[0][1]
        gate = self.gating(first_embedding)  # (batch_size, num_experts)

        # Insert singleton seq/vocab axes so the gate broadcasts over them
        gate = gate[:, None, None, :]

        # Weighted sum over the expert axis
        return (stacked_logits * gate).sum(dim=-1)

#############################################
# 4. Model Initialization and Training Setup
#############################################
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize model with proper dimensions
vocab_size = len(vocab)
embed_dim = 128
num_experts = 4

model = MoETransformer(vocab_size=vocab_size, num_experts=num_experts, embed_dim=embed_dim).to(device)

# Set up loss function and optimizer.
# Padded target positions are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<PAD>"])
optimizer = optim.Adam(model.parameters(), lr=0.0006)
# Halve the learning rate after 2 epochs without validation improvement.
# The `verbose` argument is intentionally not passed: it is deprecated in
# recent PyTorch releases and rejected by newer ones. The current learning
# rate can be read from optimizer.param_groups[0]["lr"] when needed.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

#############################################
# 5. Training Loop with Validation
#############################################
num_epochs = 60
best_val_loss = float('inf')
patience = 5  # epochs without validation improvement before stopping
counter = 0   # consecutive epochs without improvement


def _run_epoch(loader, train):
    """Run one full pass over ``loader`` and return the mean per-batch loss.

    Args:
        loader: DataLoader yielding ``(inputs, targets)`` batches.
        train: When true, the model is put in training mode and its weights
            are updated (with gradient-norm clipping at 1.0). Otherwise the
            model runs in eval mode with gradients disabled.

    Raises:
        ValueError: If ``loader`` yields no batches (the mean is undefined).
    """
    if len(loader) == 0:
        raise ValueError("Cannot run an epoch on an empty DataLoader")

    model.train(train)
    running_loss = 0.0
    with torch.set_grad_enabled(train):
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            if train:
                optimizer.zero_grad()
            # Reshape for CrossEntropyLoss: (batch, vocab_size, seq_len)
            outputs = model(inputs).permute(0, 2, 1)
            loss = criterion(outputs, targets)
            if train:
                loss.backward()
                # Gradient clipping to prevent exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
            running_loss += loss.item()
    return running_loss / len(loader)


for epoch in range(num_epochs):
    avg_train_loss = _run_epoch(train_loader, train=True)
    avg_val_loss = _run_epoch(val_loader, train=False)

    # Update learning rate based on validation loss and report any change
    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(avg_val_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after != lr_before:
        print(f"Learning rate reduced from {lr_before:.6g} to {lr_after:.6g}")

    # Print metrics
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
        # Save the best model. 'epoch' is the 0-based loop index. The
        # vocabulary is stored too, as in the final checkpoint, so the best
        # checkpoint is self-contained.
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': best_val_loss,
            'vocab': vocab,
        }, "/mnt/data/best_moe_transformer.pth")
        print(f"Model saved at epoch {epoch+1} with validation loss: {best_val_loss:.4f}")
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping triggered after {epoch+1} epochs")
            break

# Save the final model (weights of the last epoch run, not necessarily the best)
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': avg_val_loss,
    'vocab': vocab,
}, "/mnt/data/final_moe_transformer.pth")
print("Final MoE Transformer Model Saved!")

#############################################
# 6. Generate Example Predictions
#############################################
# Load the best model for inference.
# map_location places the stored tensors on the active device, so a
# checkpoint written on a GPU can still be restored when only a CPU is
# available.
checkpoint = torch.load("/mnt/data/best_moe_transformer.pth", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # disable dropout for generation

# Create inverse vocabulary (id -> token) for decoding
inverse_vocab = {idx: token for token, idx in vocab.items()}

# Function to generate a sequence given a starting token
def generate_sequence(start_tokens, max_length=50):
    """Greedily extend ``start_tokens`` with the model's most likely tokens.

    Uses the module-level ``model``, ``vocab``, ``inverse_vocab`` and
    ``device``. The model's train/eval mode is not changed here; the caller
    is expected to have called ``model.eval()``.

    Args:
        start_tokens: Non-empty sequence of token strings used as the prompt.
            Tokens missing from ``vocab`` are fed to the model as ``<UNK>``.
        max_length: Maximum number of tokens to generate after the prompt.

    Returns:
        A new list holding the prompt tokens followed by the generated
        tokens. Generation stops early when ``<EOS>`` is predicted; ``<EOS>``
        itself is not included.

    Raises:
        ValueError: If ``start_tokens`` is empty (there would be no position
            to predict from).
    """
    # Accept any sequence (list, tuple, ...) and never mutate the argument
    generated_tokens = list(start_tokens)
    if not generated_tokens:
        raise ValueError("start_tokens must contain at least one token")

    unk_idx = vocab["<UNK>"]
    eos_idx = vocab["<EOS>"]

    with torch.no_grad():
        # Prompt as a (1, prompt_len) batch of ids
        input_seq = torch.tensor(
            [[vocab.get(token, unk_idx) for token in generated_tokens]],
            dtype=torch.long,
            device=device,
        )

        # Generate tokens one by one
        for _ in range(max_length):
            output = model(input_seq)

            # Most likely next token, read from the last position's logits
            next_token_idx = output[0, -1].argmax().item()

            # Stop if EOS token is generated
            if next_token_idx == eos_idx:
                break

            generated_tokens.append(inverse_vocab[next_token_idx])

            # Append the prediction so it conditions the next step
            next_token = torch.tensor([[next_token_idx]], dtype=torch.long, device=device)
            input_seq = torch.cat([input_seq, next_token], dim=1)

    return generated_tokens

# Prompts used to sanity-check greedy generation
example_starts = [
    ["2", "+", "2"],
    ["sin", "(", "x"],
    ["3", "*", "x"],
    ["log", "(", "10"],
]

print("\nGenerated sequences:")
for start in example_starts:
    generated = generate_sequence(start)
    # One print per example: prompt, continuation, then a blank separator line
    print(
        f"Input: {' '.join(start)}",
        f"Generated: {' '.join(generated)}",
        "",
        sep="\n",
    )

Using device: cpu




Epoch 1/60, Train Loss: 4.3998, Val Loss: 3.8340
Model saved at epoch 1 with validation loss: 3.8340
Epoch 2/60, Train Loss: 3.6047, Val Loss: 3.4178
Model saved at epoch 2 with validation loss: 3.4178
Epoch 3/60, Train Loss: 3.2494, Val Loss: 3.2281
Model saved at epoch 3 with validation loss: 3.2281
Epoch 4/60, Train Loss: 3.0368, Val Loss: 3.0840
Model saved at epoch 4 with validation loss: 3.0840
Epoch 5/60, Train Loss: 2.8542, Val Loss: 2.9348
Model saved at epoch 5 with validation loss: 2.9348
Epoch 6/60, Train Loss: 2.6734, Val Loss: 2.8193
Model saved at epoch 6 with validation loss: 2.8193
Epoch 7/60, Train Loss: 2.4901, Val Loss: 2.7318
Model saved at epoch 7 with validation loss: 2.7318
Epoch 8/60, Train Loss: 2.3443, Val Loss: 2.6439
Model saved at epoch 8 with validation loss: 2.6439
Epoch 9/60, Train Loss: 2.2186, Val Loss: 2.5536
Model saved at epoch 9 with validation loss: 2.5536
Epoch 10/60, Train Loss: 2.0965, Val Loss: 2.4727
Model saved at epoch 10 with validation l

# **Testing**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import math
import pickle

# --- Load Model and Vocab ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load vocabulary.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# vocabulary files produced by the training cell.
try:
    with open("/mnt/data/vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
except (pickle.UnpicklingError, AttributeError, EOFError, ImportError, IndexError):
    # The file exists but is not a plain pickle (for example it was written
    # with torch.save), so fall back to torch.load. A missing file is not
    # caught here on purpose: it surfaces as FileNotFoundError instead of
    # being masked by a second failing load.
    vocab = torch.load("/mnt/data/vocab.pkl")

vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# Define TransformerExpert class
class TransformerExpert(nn.Module):
    """Causal transformer language model used as a single expert.

    Args:
        vocab_size: Number of token ids (size of the embedding table and of
            the output projection).
        embed_dim: Width of the token embeddings and transformer layers.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Width of each layer's feed-forward sub-network.
        max_len: Number of positions in the learnable positional table; the
            longest sequence ``forward`` accepts. The default of 100 keeps
            existing checkpoints loadable.
    """

    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=3, hidden_dim=256, max_len=100):
        super().__init__()
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_len, embed_dim))
        encoder_layers = nn.TransformerEncoderLayer(embed_dim, num_heads, hidden_dim, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Return ``(logits, embedding)`` for a batch of token ids.

        Args:
            x: Long tensor of token ids, shape (batch_size, seq_len).

        Returns:
            logits: (batch_size, seq_len, vocab_size) next-token scores.
            embedding: (batch_size, seq_len, embed_dim) raw token embeddings
                (before positional encoding), returned for the gating network.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional table.
        """
        seq_len = x.shape[1]
        max_positions = self.positional_encoding.shape[1]
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {max_positions}"
            )

        embedding = self.embedding(x)
        hidden = embedding + self.positional_encoding[:, :seq_len]

        # Causal mask (True = may not attend): position t only sees positions
        # <= t. Without it the encoder reads the very tokens it is asked to
        # predict, so next-token scores are computed with leaked targets and
        # do not match step-by-step generation.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        hidden = self.transformer(hidden, mask=causal_mask)

        output = self.fc(hidden)
        return output, embedding

# Define GatingNetwork class
class GatingNetwork(nn.Module):
    """Two-layer MLP that turns pooled token embeddings into expert weights.

    Args:
        embed_dim: Size of the last dimension of the incoming embeddings.
        num_experts: Number of weights produced per sequence.
    """

    def __init__(self, embed_dim, num_experts):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, 64)
        self.fc2 = nn.Linear(64, num_experts)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Average ``x`` over dim 1 and return softmax weights over experts."""
        pooled = torch.mean(x, dim=1)
        hidden = self.dropout(torch.relu(self.fc1(pooled)))
        logits = self.fc2(hidden)
        return logits.softmax(dim=-1)

# Define MoETransformer class
class MoETransformer(nn.Module):
    """Dense mixture of ``TransformerExpert`` models blended by a gate.

    Every expert processes every input; their logits are combined with one
    weight per expert and sequence, produced by a ``GatingNetwork``.
    """

    def __init__(self, vocab_size, num_experts=4, embed_dim=128):
        super().__init__()
        self.embed_dim = embed_dim
        expert_modules = [
            TransformerExpert(vocab_size, embed_dim=embed_dim)
            for _ in range(num_experts)
        ]
        self.experts = nn.ModuleList(expert_modules)
        self.gating = GatingNetwork(embed_dim=embed_dim, num_experts=num_experts)

    def forward(self, x):
        """Return the gate-weighted sum of all experts' logits."""
        # Each entry is the (logits, embedding) pair returned by one expert
        per_expert = [expert(x) for expert in self.experts]

        # Experts are stacked along a new trailing axis
        stacked_logits = torch.stack([logits for logits, _ in per_expert], dim=-1)

        # Routing is computed from the first expert's embedding only
        gate = self.gating(per_expert[0][1])

        # Add two singleton axes so the gate broadcasts against the logits
        gate = gate[:, None, None, :]

        return (stacked_logits * gate).sum(dim=-1)

# Initialize and load the MoE model
num_experts = 4
embed_dim = 128

model = MoETransformer(vocab_size=vocab_size, num_experts=num_experts, embed_dim=embed_dim).to(device)

# Try the best checkpoint first, then the final one. Both are dictionaries
# with 'model_state_dict', 'epoch' and 'loss' entries. Only failures that
# mean "this checkpoint is unusable" are caught, and the reason is printed so
# a fallback is never silent.
for checkpoint_path, label in (
    ("/mnt/data/best_moe_transformer.pth", "best"),
    ("/mnt/data/final_moe_transformer.pth", "final"),
):
    try:
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        best_epoch = checkpoint['epoch']
        best_loss = checkpoint['loss']
    except (OSError, KeyError, TypeError, RuntimeError, pickle.UnpicklingError) as err:
        print(f"Could not load {checkpoint_path}: {err}")
        continue
    print(f"Loaded {label} model from epoch {best_epoch} with loss {best_loss:.4f}")
    break
else:
    # Last resort: a file that stores the bare state dict. Errors here are
    # not caught because there is nothing left to fall back to.
    state_dict = torch.load("/mnt/data/moe_transformer.pth", map_location=device)
    model.load_state_dict(state_dict)
    print("Loaded model state dictionary")

model.eval()

# --- Expert Utilization Summary ---
# Reports the average gate weight each expert receives over the training
# set. `train_dataset` must still be defined from the training cell.
print("\n--- Expert Utilization Summary ---")
with torch.no_grad():
    expert_weights_summary = torch.zeros(num_experts)
    total_samples = 0
    for inputs, _ in DataLoader(train_dataset, batch_size=32, shuffle=False):
        inputs = inputs.to(device)
        batch_size = inputs.size(0)
        total_samples += batch_size
        # Same routing input as MoETransformer.forward: the first expert's
        # token embeddings.
        embedding = model.experts[0].embedding(inputs)
        expert_weights = model.gating(embedding)
        # Sum the per-sample weights; divided by the sample count below
        expert_weights_summary += expert_weights.sum(dim=0).cpu()

if total_samples == 0:
    print("No samples available; expert utilization could not be computed.")
else:
    # Report the measured averages instead of a fixed statement
    mean_gate_weights = expert_weights_summary / total_samples
    for expert_idx, mean_weight in enumerate(mean_gate_weights.tolist()):
        print(f"Expert {expert_idx}: mean gate weight {mean_weight:.4f}")

# --- Training and Validation Metrics ---
# The metrics are computed here from the loaded model; they were previously
# printed without ever being defined. `train_dataset` and `val_dataset` must
# still be defined from the training cell.
def _evaluate(dataset, batch_size=32):
    """Return ``(mean_loss, accuracy_percent)`` of ``model`` on ``dataset``.

    Both values are averaged over non-padding target tokens: the loss is the
    summed cross-entropy divided by the token count, and the accuracy is the
    share of tokens whose arg-max prediction equals the target. Returns
    ``(nan, nan)`` when the dataset contains no non-padding targets.
    """
    pad_idx = vocab["<PAD>"]
    loss_sum = 0.0
    correct = 0
    token_count = 0
    with torch.no_grad():
        for inputs, targets in DataLoader(dataset, batch_size=batch_size, shuffle=False):
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)  # (batch, seq_len, vocab_size)
            # cross_entropy expects the class axis second: (batch, vocab, seq)
            loss_sum += F.cross_entropy(
                logits.permute(0, 2, 1), targets,
                ignore_index=pad_idx, reduction="sum",
            ).item()
            real_tokens = targets != pad_idx
            correct += ((logits.argmax(dim=-1) == targets) & real_tokens).sum().item()
            token_count += real_tokens.sum().item()
    if token_count == 0:
        return float("nan"), float("nan")
    return loss_sum / token_count, 100.0 * correct / token_count


training_loss, _ = _evaluate(train_dataset)
validation_loss, validation_accuracy = _evaluate(val_dataset)

print("\n--- Model Performance ---")
print(f"Training Loss: {training_loss:.4f}")
print(f"Validation Loss: {validation_loss:.4f}")
print(f"Validation Accuracy: {validation_accuracy:.2f}%")


Using device: cpu
Vocabulary size: 112
Loaded best model from epoch 45 with loss 1.8062

--- Expert Utilization Summary ---
Experts were utilized with varying degrees based on input distributions.

--- Model Performance ---
Training Loss: 0.8564
Validation Loss: 0.7157
Validation Accuracy: 83.13%
