In [2]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


# Building models

In [3]:
class PositionWiseFFN(nn.Module):
    """Same MLP applied to all token(position) representations"""
    def __init__(self, emb_dim, ffn_dim):
        super().__init__()
        self.fc1 = nn.Linear(emb_dim, ffn_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(ffn_dim, emb_dim)
    
    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class PositionalEncoding(nn.Module):
    def __init__(self, emb_dim, max_seq_len):
        super().__init__()

        pe = torch.zeros(max_seq_len, emb_dim)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_dim, 2).float() * -(math.log(10000.0) / emb_dim))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0))
    
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class MultiHeadAttention(nn.Module):
    def __init__(self, emb_dim, num_heads):
        super().__init__()
        assert emb_dim % num_heads == 0, "Embedding dimension must be divided by number of heads"

        # Dimensions initialization
        self.emb_dim = emb_dim
        self.num_heads = num_heads
        # all features are divided into multi head, each head have a part of features
        self.head_emb_dim = self.emb_dim // self.num_heads

        # Transformation matrixs
        self.W_q = nn.Linear(emb_dim, emb_dim)
        self.W_k = nn.Linear(emb_dim, emb_dim)
        self.W_v = nn.Linear(emb_dim, emb_dim)
        self.W_o = nn.Linear(emb_dim, emb_dim)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Calculate attention scores
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_emb_dim)

        # Mask scores (where positions are 0) with near negative inf
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        # Apply sofxmax to attention scores
        attn_scores = torch.softmax(attn_scores, dim=-1)

        # Get the final output
        output = torch.matmul(attn_scores, V)
        return output
    
    def split(self, x):
        # Reshape the input emb_dim (to multi-head, each head owns a part of input features) for multi-head attention
        batch_size, seq_len, emb_dim = x.size()
        # transpose to fix batch_size and num_heads, let seq_len, head_emb_dim participate in matrix multiplication
        return x.view(batch_size, seq_len, self.num_heads, self.head_emb_dim).transpose(1, 2)

    def combine(self, x):
        batch_size, num_heads, seq_len, head_emb_dim = x.size()
        # contiguous() ensures the memory layout of the tensor is contiguous
        return x.transpose(1, 2).contiguous().view(batch_size, seq_len, self.emb_dim)
    
    def forward(self, Q, K, V, mask=None):
        # Split input to multi heads
        Q = self.split(self.W_q(Q))
        K = self.split(self.W_k(K))
        V = self.split(self.W_v(V))

        # Perform scaled dot-product attention
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)

        # Combine outputs and apply transformation
        output = self.W_o(self.combine(attn_output))
        return output
    
class EncoderLayer(nn.Module):
    def __init__(self, emb_dim, num_heads, ffn_dim, dropout):
        super().__init__()
        self.self_atten = MultiHeadAttention(emb_dim, num_heads)
        self.ffn = PositionWiseFFN(emb_dim, ffn_dim)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.norm2 = nn.LayerNorm(emb_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_atten(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))
        return x

In [4]:
class PolygonEncoder(nn.Module):
    def __init__(self, emb_dim, num_heads,
                num_layers, ffn_dim, max_seq_len, dropout):
        super().__init__()
        self.encoder_layers = nn.ModuleList([EncoderLayer(emb_dim, num_heads, ffn_dim, dropout) for _ in range(num_layers)])
        self.class_embedding = nn.Parameter(torch.randn(1, 1, emb_dim))
        self.pos_embedding = nn.Parameter(torch.randn(1, 1 + max_seq_len, emb_dim))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # token_mask = (tokens != 0).unsqueeze(1).unsqueeze(2)
        batch_size, seq_len, emb_dim = x.shape
        class_embedding = self.class_embedding.repeat(batch_size, 1, 1)
        x = torch.cat([class_embedding, x], dim=1)
        # print(x.shape, self.pos_embedding[:, :seq_len+1].shape)
        x = x + self.pos_embedding[:, :seq_len+1]
        x = self.dropout(x)

        # Create a new tensor with True values in the first column (for cls token)
        cls_mask = torch.ones((batch_size, 1, 1, 1), dtype=torch.bool)
        mask = torch.cat((cls_mask, mask), dim=3)
        
        for enc_layer in self.encoder_layers:
            x = enc_layer(x, mask)
        
        return x
    
class PolygonTransformer(nn.Module):
    def __init__(self, num_types, emb_dim, num_heads, num_layers, ffn_dim, max_seq_len, dropout):
        super().__init__()
        self.encoder = PolygonEncoder(emb_dim, num_heads, num_layers, ffn_dim, max_seq_len, dropout)
        self.mlp_head = nn.Sequential(nn.Linear(emb_dim, ffn_dim),
                                      nn.ReLU(),
                                      nn.Linear(ffn_dim, num_types))
        
    def forward(self, x, mask=None):
        x = self.encoder(x, mask)
        x = x[:, 0, :] # grab the class embedding
        x = self.mlp_head(x)
        return x

In [13]:
import torch
import torch.nn as nn

class CompareModel(nn.Module):
    def __init__(self, emb_dim, dense_size, dropout, output_size):
        super().__init__()
        
        # Define the layers
        self.conv1 = nn.Conv1d(emb_dim, 32, kernel_size=5, padding=2)  # Assuming input channels=1
        self.conv2 = nn.Conv1d(32, 64, kernel_size=5, padding=2)
        self.maxpool = nn.MaxPool1d(kernel_size=3)
        self.global_avgpool = nn.AdaptiveAvgPool1d(1)  # Global average pooling
        self.dense1 = nn.Linear(64, dense_size)
        self.dropout = nn.Dropout(dropout)
        self.dense2 = nn.Linear(dense_size, output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        # Input shape: (batch_size, seq_len, geom_vector_len)
        # Convolutional layers
        x = x.permute(0, 2, 1)  # Permute to (batch_size, channels, seq_len)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.global_avgpool(x)
        
        # Flatten
        x = x.view(x.size(0), -1)  # Reshape to (batch_size, num_features)
        
        # Fully connected layers
        x = self.dense1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.dense2(x)
        
        return x

# Prepare Dataset

In [6]:
import pandas as pd
import numpy as np
from deep_geometry import vectorizer as gv
from deep_geometry import GeomScaler

gs = GeomScaler()
types_dict = {'PK':0, 'MR': 1, 'KL':2, 'NV':3, 'WA':4, 'LG':5, 'HO':6, 'GR':7, 'REC':8, 'PGK':9}
df = pd.read_csv("archaeology.csv")
df['type'] = df['Aardspoor'].map(types_dict)
df = df.dropna().reset_index(drop=True)
# wkts = df['WKT'][:1000].tolist()

In [7]:
def prepare_polygon_dataset(df, dataset_size, max_seq_len): # TODO - 1. split into train, validate, test. 2. randomly sample
    wkts = df['WKT'][:dataset_size]
    types = df['type'][:dataset_size]
    geoms, labels, start_points = [], [], []
    for i, wkt in enumerate(wkts):
        num_point = gv.num_points_from_wkt(wkt)
        if  num_point > max_seq_len:
             continue
        geom = gv.vectorize_wkt(wkt, max_points=max_seq_len, fixed_size=True)
        geoms.append(geom)
        labels.append(types[i])
        start_points.append(num_point)

    start_points = torch.tensor(start_points).unsqueeze(1)
    indices = torch.arange(max_seq_len).unsqueeze(0)
    mask = indices < start_points
    mask = mask.unsqueeze(1).unsqueeze(2)
    tokens = np.stack(geoms, axis=0)
    gs.fit(tokens)
    tokens = gs.transform(tokens)
    tokens = torch.tensor(tokens, dtype=torch.float32)
    labels = torch.tensor(labels, dtype=torch.long)
    
    return tokens, labels, mask

In [8]:
dataset_size = 1000
max_seq_len = 64
batch_size = 32

In [9]:
tokens, labels, mask = prepare_polygon_dataset(df, dataset_size, max_seq_len)

In [10]:
dataset = TensorDataset(tokens, mask, labels)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model transformer

In [16]:
pot = PolygonTransformer(num_types=10,
                        emb_dim=7,
                        num_heads=1,
                        num_layers=6,
                        ffn_dim=64, 
                        max_seq_len=64,
                        dropout=0.5)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(pot.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

pot.train()

for epoch in range(20):
    for batch_x, batch_mask, batch_y in loader:
        optimizer.zero_grad()
        output = pot(batch_x, batch_mask)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 2.2751638889312744
Epoch: 2, Loss: 2.267160654067993
Epoch: 3, Loss: 2.076948404312134
Epoch: 4, Loss: 2.061600923538208
Epoch: 5, Loss: 2.050260066986084
Epoch: 6, Loss: 1.950992465019226
Epoch: 7, Loss: 1.757738709449768
Epoch: 8, Loss: 1.6771175861358643
Epoch: 9, Loss: 1.7088524103164673
Epoch: 10, Loss: 1.7894134521484375
Epoch: 11, Loss: 1.5296170711517334
Epoch: 12, Loss: 1.529089093208313
Epoch: 13, Loss: 1.5843652486801147
Epoch: 14, Loss: 1.6824382543563843
Epoch: 15, Loss: 1.491610050201416
Epoch: 16, Loss: 1.16709566116333
Epoch: 17, Loss: 1.4058548212051392
Epoch: 18, Loss: 1.1387109756469727
Epoch: 19, Loss: 1.306123971939087
Epoch: 20, Loss: 1.4469846487045288


# Model Conv

In [12]:
tokens.shape

torch.Size([894, 64, 7])

In [15]:
# Create the model
geom_vector_len = 7  # Assuming geom_vector_len is known
dense_size = 64  # Size of the dense layer
dropout = 0.5  # Dropout rate
output_size = 10  # Number of output classes
conv_model = CompareModel(geom_vector_len, dense_size, dropout, output_size)

# Print the model architecture
# print(model)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(pot.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)

conv_model.train()

for epoch in range(50):
    for batch_x, batch_mask, batch_y in loader:
        optimizer.zero_grad()
        output = conv_model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

Epoch: 1, Loss: 2.3755531311035156
Epoch: 2, Loss: 2.3280155658721924
Epoch: 3, Loss: 2.3263702392578125
Epoch: 4, Loss: 2.328519105911255
Epoch: 5, Loss: 2.367708683013916
Epoch: 6, Loss: 2.300119161605835
Epoch: 7, Loss: 2.3383395671844482
Epoch: 8, Loss: 2.328643560409546
Epoch: 9, Loss: 2.29815411567688
Epoch: 10, Loss: 2.3713440895080566
Epoch: 11, Loss: 2.336233615875244
Epoch: 12, Loss: 2.325115203857422
Epoch: 13, Loss: 2.3378660678863525
Epoch: 14, Loss: 2.3752248287200928
Epoch: 15, Loss: 2.3822243213653564
Epoch: 16, Loss: 2.359609842300415
Epoch: 17, Loss: 2.302586555480957
Epoch: 18, Loss: 2.3237926959991455
Epoch: 19, Loss: 2.3504207134246826
Epoch: 20, Loss: 2.333038568496704
Epoch: 21, Loss: 2.3355369567871094
Epoch: 22, Loss: 2.33678936958313
Epoch: 23, Loss: 2.361912965774536
Epoch: 24, Loss: 2.3079512119293213
Epoch: 25, Loss: 2.3314294815063477
Epoch: 26, Loss: 2.2821238040924072
Epoch: 27, Loss: 2.350801944732666
Epoch: 28, Loss: 2.3137223720550537
Epoch: 29, Loss: