## Polygon Transformer using Pytorch Transformer Encoder Module

Note that mask is different here, only (batch_size, seq_len) mask where True stands for invalid (mask) attention queries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import importlib
importlib.import_module("utils")
from utils.prepare_dataset import prepare_dataset, prepare_dataset_fixedsize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_GPU = True if torch.cuda.is_available() else "cpu"

In [3]:
class Pot(nn.Module):
    def __init__(self, fea_dim=7, d_model=30, nhead=1, num_layers=3, max_seq_len=64, dim_feedforward=64, dropout=0.1, num_types=10):
        super().__init__()

        self.class_embedding = nn.Parameter(torch.randn(1, 1, d_model))
        self.pos_embedding = nn.Parameter(torch.randn(1, 1 + max_seq_len, d_model))
        self.dropout = nn.Dropout(dropout)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward,
                                                dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.mlp_head = nn.Sequential(nn.Linear(d_model, dim_feedforward),
                                      nn.ReLU(),
                                      nn.Linear(dim_feedforward, num_types))
        self.projection = nn.Linear(fea_dim, d_model)

    def forward(self, x, mask=None, pre_train=False):
        batch_size, seq_len, emb_dim = x.shape
        x = self.projection(x)
        class_embedding = self.class_embedding.repeat(batch_size, 1, 1)
        x = torch.cat([class_embedding, x], dim=1)
        # print(x.shape, self.pos_embedding[:, :seq_len+1].shape)
        x = x + self.pos_embedding[:, :seq_len+1]
        x = self.dropout(x)

        # x = self.projection(x)

        # Create a new tensor with True values in the first column (for cls token)
        if mask is not None:
            cls_mask = torch.zeros((x.size(0), 1), dtype=torch.bool)
            if USE_GPU:
                cls_mask = cls_mask.to(device)
            mask = torch.cat((cls_mask, mask), dim=1)
        
        x = self.transformer_encoder(x, src_key_padding_mask=mask)

        if not pre_train:
            x = x[:, 0, :] # grab the class embedding
            x = self.mlp_head(x)
        else:
            x = x[:, 1:, :]
        
        return x

In [4]:
# Define the classifier
class Classifier(nn.Module):
    def __init__(self, input_size, dense_size, num_classes, dropout):
        super().__init__()
        self.dense1 = nn.Linear(input_size, dense_size)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.dense2 = nn.Linear(dense_size, num_classes)

    def forward(self, x):
        x = self.relu(self.dense1(x))
        x = self.dropout(x)
        x = self.dense2(x)
        return x

In [5]:
import pandas as pd
import numpy as np
from deep_geometry import vectorizer as gv
from deep_geometry import GeomScaler


max_seq_len = 64
batch_size = 64


geom_train, geom_test, label_train, label_test, gs = prepare_dataset_fixedsize(dataset_size=2000)

train_tokens = torch.tensor(geom_train, dtype=torch.float32)
test_tokens = torch.tensor(geom_test, dtype=torch.float32)
train_labels= torch.tensor(label_train, dtype=torch.long)
test_labels = torch.tensor(label_test, dtype=torch.long)

train_loader = DataLoader(TensorDataset(train_tokens, train_labels), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(test_tokens, test_labels), batch_size=batch_size)

In [20]:
pot = Pot(fea_dim=5, d_model=32, nhead=8, num_layers=1, max_seq_len=64, dim_feedforward=32, dropout=0.1, num_types=9)

if USE_GPU:
    pot = pot.to(device)

criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(pot.parameters(), lr=0.004)
optimizer = optim.Adam(pot.parameters(), lr=0.004, betas=(0.9, 0.98), eps=1e-9, weight_decay=0.0001)

num_epochs = 300

def train(model, loader):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    for batch_x, batch_y in loader:
        if USE_GPU:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = pot(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, dim=-1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()
    train_loss /= len(loader)
    train_acc = correct / total
    return train_loss, train_acc

def evaluate(model, loader):
    model.eval()
    eval_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            if USE_GPU:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = pot(batch_x)
            loss = criterion(outputs, batch_y)
            eval_loss += loss.item()
            _, predicted = torch.max(outputs, dim=-1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    eval_loss /= len(loader)
    eval_acc = correct / total
    return eval_loss, eval_acc

for epoch in range(num_epochs):
    train_loss, train_acc = train(pot, train_loader)
    val_loss, val_acc = evaluate(pot, val_loader)
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss}, Train Acc {train_acc}, Val Loss: {val_loss}, Val Acc: {val_acc}")


# # Test
# test_loss, test_acc = evaluate(pot, test_loader)
# print(f"Test Loss: {test_loss}, Test Acc: {test_acc}")

Epoch: 1, Train Loss: 2.139152002334595, Train Acc 0.13875, Val Loss: 2.1169678483690535, Val Acc: 0.145
Epoch: 2, Train Loss: 2.097270174026489, Train Acc 0.154375, Val Loss: 2.0723682131086076, Val Acc: 0.1525
Epoch: 3, Train Loss: 2.0457476806640624, Train Acc 0.18125, Val Loss: 2.0220488820757185, Val Acc: 0.1975
Epoch: 4, Train Loss: 1.9825956678390504, Train Acc 0.213125, Val Loss: 1.9853921617780412, Val Acc: 0.195
Epoch: 5, Train Loss: 1.9659615755081177, Train Acc 0.20875, Val Loss: 1.9311999593462263, Val Acc: 0.2275
Epoch: 6, Train Loss: 1.9479502248764038, Train Acc 0.240625, Val Loss: 1.8938300439289637, Val Acc: 0.275
Epoch: 7, Train Loss: 1.9116272449493408, Train Acc 0.24125, Val Loss: 1.885062643459865, Val Acc: 0.305
Epoch: 8, Train Loss: 1.8945185375213622, Train Acc 0.255625, Val Loss: 1.9072721004486084, Val Acc: 0.265
Epoch: 9, Train Loss: 1.90451171875, Train Acc 0.275625, Val Loss: 1.8680494172232491, Val Acc: 0.27
Epoch: 10, Train Loss: 1.8773003435134887, Trai

KeyboardInterrupt: 

## Compare to conv model

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from utils.prepare_dataset import prepare_dataset_fixedsize


class CompareModel(nn.Module):
    def __init__(self, emb_dim, dense_size, dropout, output_size):
        super().__init__()
        
        # Define the layers
        self.conv1 = nn.Conv1d(emb_dim, 32, kernel_size=5, padding=2)  # Assuming input channels=1
        self.conv2 = nn.Conv1d(32, 64, kernel_size=5, padding=2)
        self.maxpool = nn.MaxPool1d(kernel_size=3)
        self.global_avgpool = nn.AdaptiveAvgPool1d(1)  # Global average pooling
        self.dense1 = nn.Linear(64, dense_size)
        self.dropout = nn.Dropout(dropout)
        self.dense2 = nn.Linear(dense_size, output_size)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        # Input shape: (batch_size, seq_len, geom_vector_len)
        # Convolutional layers
        x = x.permute(0, 2, 1)  # Permute to (batch_size, channels, seq_len)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.global_avgpool(x)
        
        # Flatten
        x = x.view(x.size(0), -1)  # Reshape to (batch_size, num_features)
        
        # Fully connected layers
        x = self.dense1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)
        
        # No need to add softmax (already included in CrossEntropyLossFunction), otherwise it will be double softmax and converge slower

        return x

In [9]:
# Create training data
geom_vector_len = 5  # Assuming geom_vector_len is known
dense_size = 64  # Size of the dense layer
dropout = 0.5  # Dropout rate
num_classes = 9  # Number of output classes

# Define the model, loss function, and optimizer
conv_model = CompareModel(emb_dim=geom_vector_len, dense_size=dense_size, dropout=dropout, output_size=num_classes)
criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(conv_model.parameters(), lr=0.001)
optimizer = optim.Adam(conv_model.parameters(), lr=0.004, betas=(0.9, 0.98), eps=1e-9, weight_decay=0.001)

# Training process
num_epochs = 300

def train(model, loader):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    for batch_x, batch_y in loader:
        if USE_GPU:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, dim=-1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()
    train_loss /= len(loader)
    train_acc = correct / total
    return train_loss, train_acc

def evaluate(model, loader):
    model.eval()
    eval_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            if USE_GPU:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            eval_loss += loss.item()
            _, predicted = torch.max(outputs, dim=-1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    eval_loss /= len(loader)
    eval_acc = correct / total
    return eval_loss, eval_acc

for epoch in range(num_epochs):
    train_loss, train_acc = train(conv_model, train_loader)
    val_loss, val_acc = evaluate(conv_model, val_loader)
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss}, Train Acc {train_acc}, Val Loss: {val_loss}, Val Acc: {val_acc}")

Epoch: 1, Train Loss: 2.132039213180542, Train Acc 0.141875, Val Loss: 2.0961855479649136, Val Acc: 0.145
Epoch: 2, Train Loss: 2.0577419900894167, Train Acc 0.200625, Val Loss: 2.0344483171190535, Val Acc: 0.1875
Epoch: 3, Train Loss: 2.020531635284424, Train Acc 0.211875, Val Loss: 2.031015532357352, Val Acc: 0.205
Epoch: 4, Train Loss: 2.0081984186172486, Train Acc 0.2025, Val Loss: 2.010474579674857, Val Acc: 0.1825
Epoch: 5, Train Loss: 1.9823488569259644, Train Acc 0.20875, Val Loss: 1.99062408719744, Val Acc: 0.2225
Epoch: 6, Train Loss: 1.965613226890564, Train Acc 0.216875, Val Loss: 1.9332626376833235, Val Acc: 0.2775
Epoch: 7, Train Loss: 1.9366360330581665, Train Acc 0.243125, Val Loss: 1.9021392038890295, Val Acc: 0.295
Epoch: 8, Train Loss: 1.882493634223938, Train Acc 0.27375, Val Loss: 1.8604224409375871, Val Acc: 0.275
Epoch: 9, Train Loss: 1.8495013809204102, Train Acc 0.305625, Val Loss: 1.8233354091644287, Val Acc: 0.3225
Epoch: 10, Train Loss: 1.8289145469665526, T

KeyboardInterrupt: 

In [29]:
# Save the model
torch.save(pot.state_dict(), 'pot_model.pth')

## Use it as feature extractor (pre-trained)

In [None]:
pot = Pot(d_model=7, nhead=1, num_layers=3, max_seq_len=64, dim_feedforward=64, dropout=0.1, num_types=10)
pot.load_state_dict(torch.load("pot_model.pth"))
pot.eval()

In [32]:
with torch.no_grad():
    train_hidden = pot(train_tokens, train_mask, pre_train=True).view(train_tokens.size(0), -1)
    val_hidden = pot(val_tokens, val_mask, pre_train=True).view(val_tokens.size(0), -1)
    test_hidden = pot(test_tokens, test_mask, pre_train=True).view(test_tokens.size(0), -1)

In [33]:
train_loader = DataLoader(TensorDataset(train_hidden, train_labels), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(val_hidden, val_labels))
test_loader = DataLoader(TensorDataset(test_hidden, test_labels))

In [38]:
classifier = Classifier(train_hidden.size(1), 128, 10, 0.5)

if USE_GPU:
    classifier = classifier.to(device)

criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(pot.parameters(), lr=0.004)
optimizer = optim.Adam(classifier.parameters(), lr=0.01, betas=(0.9, 0.98), eps=1e-9)

num_epochs = 20

def train(model, loader):
    model.eval()
    train_loss = 0.0
    correct = 0
    total = 0
    for batch_x, batch_y in loader:
        if USE_GPU:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()
    train_loss /= len(loader)
    train_acc = correct / total
    return train_loss, train_acc

def evaluate(model, loader):
    model.eval()
    eval_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            if USE_GPU:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            eval_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    eval_loss /= len(val_loader)
    eval_acc = correct / total
    return eval_loss, eval_acc

for epoch in range(num_epochs):
    train_loss, train_acc = train(classifier, train_loader)
    val_loss, val_acc = evaluate(classifier, val_loader)
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss}, Train Acc {train_acc}, Val Loss: {val_loss}, Val Acc: {val_acc}")


# Test
test_loss, test_acc = evaluate(classifier, test_loader)
print(f"Test Loss: {test_loss}, Test Acc: {test_acc}")

Epoch: 1, Train Loss: 1.9963074207305909, Train Acc 0.50625, Val Loss: 1.30093060284853, Val Acc: 0.62
Epoch: 2, Train Loss: 1.2693264412879943, Train Acc 0.5925, Val Loss: 1.0721332759410143, Val Acc: 0.68
Epoch: 3, Train Loss: 1.166937198638916, Train Acc 0.605, Val Loss: 1.0162230451405048, Val Acc: 0.7
Epoch: 4, Train Loss: 1.1719644212722777, Train Acc 0.59125, Val Loss: 1.1128932654857635, Val Acc: 0.71
Epoch: 5, Train Loss: 1.166082215309143, Train Acc 0.6225, Val Loss: 1.0086301210522652, Val Acc: 0.72
Epoch: 6, Train Loss: 1.1365370631217957, Train Acc 0.6125, Val Loss: 1.1069583275914192, Val Acc: 0.66
Epoch: 7, Train Loss: 1.178542513847351, Train Acc 0.60125, Val Loss: 1.2399723929166795, Val Acc: 0.59
Epoch: 8, Train Loss: 1.1612643957138062, Train Acc 0.59125, Val Loss: 1.0868311071395873, Val Acc: 0.69
Epoch: 9, Train Loss: 1.1284274244308472, Train Acc 0.60875, Val Loss: 1.0686833444237709, Val Acc: 0.66
Epoch: 10, Train Loss: 1.1235071158409118, Train Acc 0.61125, Val 