In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from Prepare_dataset import prepare_dataset, prepare_dataset_fixedsize


class CompareModel(nn.Module):
    """1-D convolutional classifier over sequences of per-step feature vectors.

    Layer stack: Conv1d(emb_dim -> 32) -> ReLU -> MaxPool1d(3) ->
    Conv1d(32 -> 64) -> ReLU -> global average pool -> Linear(64 -> dense_size)
    -> ReLU -> Dropout -> Linear(dense_size -> output_size).

    Args:
        emb_dim: Length of each per-step feature vector; it becomes the input
            channel count of the first convolution.
        dense_size: Width of the hidden fully connected layer.
        dropout: Dropout probability applied after the hidden layer.
        output_size: Number of logits produced per sample.
    """

    def __init__(self, emb_dim, dense_size, dropout, output_size):
        super().__init__()

        # Convolutional feature extractor. kernel_size=5 with padding=2 keeps
        # the sequence length unchanged through each convolution.
        self.conv1 = nn.Conv1d(in_channels=emb_dim, out_channels=32, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=5, padding=2)
        self.maxpool = nn.MaxPool1d(kernel_size=3)
        # Output size 1 collapses the sequence axis to a single averaged value
        # per channel, independent of the incoming sequence length.
        self.global_avgpool = nn.AdaptiveAvgPool1d(output_size=1)

        # Classification head.
        self.dense1 = nn.Linear(in_features=64, out_features=dense_size)
        self.dropout = nn.Dropout(p=dropout)
        self.dense2 = nn.Linear(in_features=dense_size, out_features=output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch shaped (batch_size, seq_len, emb_dim) to raw logits.

        Returns:
            Tensor of shape (batch_size, output_size). No softmax is applied:
            nn.CrossEntropyLoss already includes it, and applying it twice
            would slow convergence.
        """
        # Conv1d wants channels before the sequence axis.
        channels_first = x.permute(0, 2, 1)

        stage1 = self.maxpool(self.relu(self.conv1(channels_first)))
        stage2 = self.global_avgpool(self.relu(self.conv2(stage1)))

        # (batch_size, 64, 1) -> (batch_size, 64)
        pooled = stage2.reshape(stage2.shape[0], -1)

        hidden = self.dropout(self.relu(self.dense1(pooled)))
        return self.dense2(hidden)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# import numpy as np

# def pad_sequences(sequences, maxlen=None, padding='pre', truncating='pre', value=0):
#     if maxlen is None:
#         maxlen = max(len(seq) for seq in sequences)

#     padded_sequences = []
#     for seq in sequences:
#         if len(seq) >= maxlen:
#             if truncating == 'pre':
#                 padded_seq = seq[-maxlen:]
#             else:
#                 padded_seq = seq[:maxlen]
#         else:
#             if padding == 'pre':
#                 padded_seq = [value] * (maxlen - len(seq)) + seq
#             else:
#                 padded_seq = seq + [value] * (maxlen - len(seq))
#         padded_sequences.append(padded_seq)
    
#     return np.array(padded_sequences)

# # Example usage
# sequences = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
# padded_sequences = pad_sequences(sequences, maxlen=5, padding='post', value=0)

# print(padded_sequences)

[[1 2 3 0 0]
 [4 5 0 0 0]
 [6 7 8 9 0]]


In [6]:
# import numpy as np

# train_loaded = np.load("archaeology_train_v8.npz", allow_pickle=True)
# train_geoms = train_loaded['geoms']
# train_labels = train_loaded['feature_type']

# batch_size = 32
# dataset_size = 1000
# train_geoms = train_geoms[:1000]
# train_labels = train_labels[:1000]

# # Normalize
# import geom_scaler

# gs = geom_scaler.scale(train_geoms)
# train_geoms = geom_scaler.transform(train_geoms, gs)

In [None]:
# zipped = zip(train_geoms, train_labels)
# train_input_sorted = {}
# train_labels_sorted = {}

# for geom, label in sorted(zipped, key=lambda x: len(x[0]), reverse=True):
#     sequence_len = geom.shape[0]
#     smallest_size_subset = sorted(train_input_sorted.keys())[0] if train_input_sorted else None

#     if not smallest_size_subset:  # This is the first data point
#         train_input_sorted[sequence_len] = [geom]
#         train_labels_sorted[sequence_len] = [label]
#         continue

#     if sequence_len in train_input_sorted:  # the entry exists, append
#         train_input_sorted[sequence_len].append(geom)
#         train_labels_sorted[sequence_len].append(label)
#         continue

#     # the size subset does not exist yet
#     # append the data to the smallest size subset if it isn't batch-sized yet
#     if len(train_input_sorted[smallest_size_subset]) < batch_size:
#         print(geom)
#         geom = pad_sequences([geom], smallest_size_subset)[0]  # make it the same size as the rest in the subset
#         train_input_sorted[smallest_size_subset].append(geom)
#         train_labels_sorted[smallest_size_subset].append(label)
#     else:
#         train_input_sorted[sequence_len] = [geom]
#         train_labels_sorted[sequence_len] = [label]

In [3]:
import pandas as pd
import numpy as np
from deep_geometry import vectorizer as gv
from deep_geometry import GeomScaler


# NOTE(review): max_seq_len is not referenced anywhere in this cell; presumably
# it mirrors the fixed sequence length produced by prepare_dataset_fixedsize().
# Confirm against Prepare_dataset before relying on it.
max_seq_len = 64
# Mini-batch size shared by the training and validation loaders.
batch_size = 32


# Fixed-size train/test split: geometry sequences and their class labels.
geom_train, geom_test, label_train, label_test = prepare_dataset_fixedsize()

# Inputs are float32 for the convolution layers; labels are int64 class
# indices, as nn.CrossEntropyLoss requires.
train_tokens = torch.tensor(geom_train, dtype=torch.float32)
test_tokens = torch.tensor(geom_test, dtype=torch.float32)
train_labels = torch.tensor(label_train, dtype=torch.long)
test_labels = torch.tensor(label_test, dtype=torch.long)

# Both loaders use the batch_size constant above so that changing it in one
# place actually takes effect. Only the training data is shuffled.
train_loader = DataLoader(TensorDataset(train_tokens, train_labels), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(TensorDataset(test_tokens, test_labels), batch_size=batch_size)

In [4]:
# Select the compute device: CUDA when a GPU is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Boolean GPU-availability flag. It must be a real bool: a non-empty string
# such as "cpu" is truthy and would make every `if USE_GPU:` check pass.
USE_GPU = torch.cuda.is_available()

In [6]:
# Model hyperparameters
geom_vector_len = 5  # Assuming geom_vector_len is known
dense_size = 64  # Size of the dense layer
dropout = 0.5  # Dropout rate
num_classes = 9  # Number of output classes

# Define the model, loss function, and optimizer.
# The model is moved to `device` so its weights live on the same device as the
# batches used in training; otherwise the forward pass fails on CUDA machines.
# The move happens before the optimizer is created so Adam tracks the
# parameters on their final device.
conv_model = CompareModel(
    emb_dim=geom_vector_len,
    dense_size=dense_size,
    dropout=dropout,
    output_size=num_classes,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(conv_model.parameters(), lr=0.001)

# Training process
num_epochs = 100

# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for seq_len in train_input_sorted:
#         inputs = torch.tensor(train_input_sorted[seq_len], dtype=torch.float32)
#         labels = torch.tensor(train_labels_sorted[seq_len], dtype=torch.long)
#         dataset = TensorDataset(inputs, labels)
#         loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
#         for batch_x, batch_y in loader:
#             optimizer.zero_grad()
#             output = conv_model(batch_x)
#             loss = criterion(output, batch_y)
#             loss.backward()
#             optimizer.step()
#             # Print statistics
#             running_loss += loss.item()
#     print(f"Epoch {epoch+1}, Loss: {running_loss/train_geoms.shape[0]}")

def train(model, loader, loss_fn=None, opt=None):
    """Run one training epoch over ``loader``, updating ``model`` in place.

    Args:
        model: Network to optimise; it is switched to training mode.
        loader: Sized iterable of ``(inputs, targets)`` batches.
        loss_fn: Callable taking ``(outputs, targets)`` and returning a scalar
            loss tensor. Defaults to the notebook-level ``criterion``.
        opt: Optimizer holding ``model``'s parameters. Defaults to the
            notebook-level ``optimizer``.

    Returns:
        Tuple ``(train_loss, train_acc)``: the per-batch losses averaged over
        the number of batches, and the fraction of samples whose arg-max
        prediction equals the target.

    Raises:
        ZeroDivisionError: If ``loader`` contains no batches.
    """
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt

    # Move each batch to the device that holds the model's weights. Taking the
    # device from the model keeps inputs and parameters together no matter how
    # the notebook-level device flags are set.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    for batch_x, batch_y in loader:
        if target_device is not None:
            batch_x, batch_y = batch_x.to(target_device), batch_y.to(target_device)
        opt.zero_grad()
        outputs = model(batch_x)
        loss = loss_fn(outputs, batch_y)
        loss.backward()
        opt.step()
        train_loss += loss.item()
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()
    train_loss /= len(loader)
    train_acc = correct / total
    return train_loss, train_acc

def evaluate(model, loader, loss_fn=None):
    """Measure loss and accuracy of ``model`` on ``loader`` without training.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        loader: Sized iterable of ``(inputs, targets)`` batches.
        loss_fn: Callable taking ``(outputs, targets)`` and returning a scalar
            loss tensor. Defaults to the notebook-level ``criterion``.

    Returns:
        Tuple ``(eval_loss, eval_acc)``: the per-batch losses averaged over
        the number of batches, and the fraction of samples whose arg-max
        prediction equals the target.

    Raises:
        ZeroDivisionError: If ``loader`` contains no batches.
    """
    loss_fn = criterion if loss_fn is None else loss_fn

    # Move each batch to the device that holds the model's weights. Taking the
    # device from the model keeps inputs and parameters together no matter how
    # the notebook-level device flags are set.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.eval()
    eval_loss = 0.0
    correct = 0
    total = 0
    # Gradients are not needed for evaluation; disabling them saves memory.
    with torch.no_grad():
        for batch_x, batch_y in loader:
            if target_device is not None:
                batch_x, batch_y = batch_x.to(target_device), batch_y.to(target_device)
            outputs = model(batch_x)
            loss = loss_fn(outputs, batch_y)
            eval_loss += loss.item()
            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    eval_loss /= len(loader)
    eval_acc = correct / total
    return eval_loss, eval_acc

# Main optimisation loop: one training pass followed by one validation pass
# per epoch, logging both sets of metrics with a 1-based epoch number.
for epoch in range(num_epochs):
    train_loss, train_acc = train(model=conv_model, loader=train_loader)
    val_loss, val_acc = evaluate(model=conv_model, loader=val_loader)
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss}, Train Acc {train_acc}, Val Loss: {val_loss}, Val Acc: {val_acc}")

Epoch: 1, Train Loss: 2.179936170578003, Train Acc 0.1375, Val Loss: 2.1592420169285367, Val Acc: 0.105
Epoch: 2, Train Loss: 2.109309597015381, Train Acc 0.15625, Val Loss: 2.088898471423558, Val Acc: 0.16
Epoch: 3, Train Loss: 2.062109155654907, Train Acc 0.15125, Val Loss: 2.033884252820696, Val Acc: 0.21
Epoch: 4, Train Loss: 2.0190877771377562, Train Acc 0.20125, Val Loss: 2.0060332162039622, Val Acc: 0.19
Epoch: 5, Train Loss: 2.0122658586502076, Train Acc 0.23, Val Loss: 2.006356017930167, Val Acc: 0.2
Epoch: 6, Train Loss: 1.9931556606292724, Train Acc 0.23125, Val Loss: 1.9960494552339827, Val Acc: 0.215
Epoch: 7, Train Loss: 1.9912152910232543, Train Acc 0.21, Val Loss: 1.9718333312443324, Val Acc: 0.215
Epoch: 8, Train Loss: 1.9654785919189453, Train Acc 0.2475, Val Loss: 1.9672352245875768, Val Acc: 0.225
Epoch: 9, Train Loss: 1.9493554019927979, Train Acc 0.2375, Val Loss: 1.9623815332140242, Val Acc: 0.215
Epoch: 10, Train Loss: 1.93159939289093, Train Acc 0.2375, Val Los