In [1]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt


In [2]:
# Prefer the CUDA GPU when one is present, but fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA. (A bare `assert`
# is also removed under `python -O`, so it is not a dependable guard.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    print("CUDA GPU not available; running on CPU")
# Let cuDNN benchmark its algorithms and keep the fastest one it finds.
torch.backends.cudnn.benchmark = True

In [3]:
# Both ACSF1 splits are tab-separated files without a header row, so pandas
# is told not to consume the first record as column names.
df_train = pd.read_csv('ACSF1_TRAIN.tsv', header=None, sep='\t')
df_test = pd.read_csv('ACSF1_TEST.tsv', header=None, sep='\t')

In [4]:
# Training split: column 0 is taken as the label, all remaining columns as
# the series values.
labels_train_cpu = df_train.iloc[:, 0].to_numpy()
data_train_cpu = df_train.iloc[:, 1:].to_numpy()
print("train Labels shape:", labels_train_cpu.shape)
print("train Data shape: ", data_train_cpu.shape)
# Copy both arrays onto `device` as float32 tensors (the labels are stored
# as float32 as well).
labels_train = torch.tensor(
    labels_train_cpu, dtype=torch.float32, device=device
)
data_train = torch.tensor(
    data_train_cpu, dtype=torch.float32, device=device
)

train Labels shape: (100,)
train Data shape:  (100, 1460)


In [5]:
# Test split: column 0 is taken as the label, all remaining columns as the
# series values.
labels_test_cpu = df_test.iloc[:, 0].to_numpy()
data_test_cpu = df_test.iloc[:, 1:].to_numpy()
print("test Labels shape:", labels_test_cpu.shape)
print("test Data shape: ", data_test_cpu.shape)
# Copy both arrays onto `device` as float32 tensors (the labels are stored
# as float32 as well).
labels_test = torch.tensor(
    labels_test_cpu, dtype=torch.float32, device=device
)
data_test = torch.tensor(
    data_test_cpu, dtype=torch.float32, device=device
)

test Labels shape: (100,)
test Data shape:  (100, 1460)


In [6]:
# Report (without enforcing) whether any cell of either split is missing.
print("\nAny missing values in the train dataset?", df_train.isna().to_numpy().any())
print("\nAny missing values in the test dataset?", df_test.isna().to_numpy().any())


Any missing values in the train dataset? False

Any missing values in the test dataset? False


In [7]:
# Preview the first five rows of each split; the slices are moved to host
# memory and converted to NumPy for printing.
print(data_train.detach()[:5].cpu().numpy())  # train
print(data_test.detach()[:5].cpu().numpy())   # test


[[-0.58475375 -0.58475375  1.730991   ... -0.5786034   1.7327257
  -0.584734  ]
 [-0.59143436 -0.51110417  1.7268198  ... -0.5807305   1.7273961
  -0.5807305 ]
 [-0.57794535 -0.57794535  1.7307931  ... -0.5497977   1.7347268
  -0.5777511 ]
 [-0.5889254  -0.53808755  1.7357183  ... -0.57648265  1.7436645
  -0.5888756 ]
 [-0.5966332  -0.532188    1.7180674  ... -0.57515806  1.7432584
  -0.5924034 ]]
[[-0.577967   -0.577967    1.7381622  ... -0.577967    1.7167429
  -0.577967  ]
 [-0.58857465 -0.58857465  1.7231334  ... -0.55418974  1.7424324
  -0.5884301 ]
 [-0.58289725 -0.573761    1.7534016  ... -0.5534375   1.72404
  -0.58268136]
 [-0.59095055 -0.5374745   1.7439328  ... -0.5755263   1.7281231
  -0.59089124]
 [-0.5768208  -0.5636649   1.7305657  ... -0.5778275   1.73903
  -0.5778275 ]]


In [8]:
def deterministic_train_test_split_gpu(data, labels, step=10, run=0, val_ratio=0.2):
    """
    Deterministically split tensors into training and validation subsets.

    ``block_size = ceil(n * val_ratio)`` validation rows are chosen by walking
    the row indices with stride ``step`` (wrapping modulo ``n``), starting at
    ``run * block_size * step``. Every row that is not chosen goes to training.

    If the strided walk lands on a row that was already chosen (this happens
    whenever ``n / gcd(step, n) < block_size``, e.g. the default ``step=10``
    with ``n=100``), the walk advances to the next free row instead of
    selecting the same row twice. Otherwise the validation set would contain
    repeated rows and the training set would keep more than ``n - block_size``
    rows. When no collision occurs the selection is exactly the strided one.

    Note: the start offset only matters modulo ``n``, so different ``run``
    values can yield the same split.

    Args:
        data: tensor whose first dimension indexes the samples.
        labels: tensor indexed along its first dimension like ``data``.
        step: stride between consecutive validation indices.
        run: repetition number; shifts the starting index.
        val_ratio: fraction of samples used for validation, in ``[0, 1]``.

    Returns:
        ``(data_training, data_validation, labels_training, labels_validation)``.
        Validation rows are in selection order, training rows in ascending
        index order.

    Raises:
        ValueError: if ``val_ratio`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= val_ratio <= 1.0:
        raise ValueError(f"val_ratio must be in [0, 1], got {val_ratio}")

    n = data.shape[0]
    block_size = int(torch.ceil(torch.tensor(n * val_ratio)).item())
    # Never request more rows than exist, so the probing loop below terminates.
    block_size = min(block_size, n)
    start = run * block_size * step

    # Strided walk with linear probing: on a collision move forward to the
    # next row that has not been selected yet.
    taken = set()
    picked = []
    for k in range(block_size):
        idx = (start + k * step) % n
        while idx in taken:
            idx = (idx + 1) % n
        taken.add(idx)
        picked.append(idx)

    val_idx = torch.tensor(picked, dtype=torch.long, device=data.device)
    train_mask = torch.ones(n, dtype=torch.bool, device=data.device)
    train_mask[val_idx] = False
    train_idx = torch.arange(n, device=data.device)[train_mask]

    data_training = data[train_idx]
    data_validation = data[val_idx]
    labels_training = labels[train_idx]
    labels_validation = labels[val_idx]
    return data_training, data_validation, labels_training, labels_validation

In [9]:
def pairwise_otw_smooth_signed(X1, X2, m, s, beta):
    """
    Smoothed, signed OTW distance between every row of X1 and every row of X2.

    Each series is split into its positive part and the magnitude of its
    negative part; the one-sided distance is evaluated on each part and the
    two results are added.

    Args:
        X1: tensor of shape (N1, L).
        X2: tensor of shape (N2, L).
        m: weight applied to the final-window mismatch term.
        s: window length used for the windowed cumulative sums.
        beta: threshold below which the smooth absolute value is quadratic.

    Returns:
        Tensor of shape (N1, N2) holding the pairwise distances.
    """
    # Unpacking into two names doubles as a check that both inputs are 2-D.
    _n_left, _length = X1.shape
    _n_right, _ = X2.shape

    def huber_abs(delta):
        # Quadratic inside (-beta, beta), shifted absolute value outside.
        magnitude = torch.abs(delta)
        return torch.where(
            magnitude < beta,
            (delta ** 2) / (2 * beta),
            magnitude - beta / 2,
        )

    def cumulative_and_windowed(values):
        # Running sum and the sum over the trailing window of length s.
        running = torch.cumsum(values, dim=-1)
        lagged = torch.cat(
            [torch.zeros_like(running[..., :s]), running[..., :-s]], dim=-1
        )
        return running, running - lagged

    def one_sided_distance(left, right):
        run_l, win_l = cumulative_and_windowed(left)
        run_r, win_r = cumulative_and_windowed(right)
        length = left.shape[-1]

        # Windowed mismatch summed over every position except the first.
        interior = huber_abs(win_l[..., 1:] - win_r[..., 1:]).sum(dim=-1)

        # Mass in the last window; the whole running sum if the window does
        # not fit inside the series.
        if s <= length - 1:
            tail_l = run_l[..., -1] - run_l[..., -1 - s]
            tail_r = run_r[..., -1] - run_r[..., -1 - s]
        else:
            tail_l = run_l[..., -1] - 0.0
            tail_r = run_r[..., -1] - 0.0
        return m * huber_abs(tail_l - tail_r) + interior

    # Broadcast layout: rows of X1 along dim 0, rows of X2 along dim 1.
    left = X1.unsqueeze(1)   # (N1, 1, L)
    right = X2.unsqueeze(0)  # (1, N2, L)

    positive_term = one_sided_distance(
        torch.clamp(left, min=0), torch.clamp(right, min=0)
    )
    negative_term = one_sided_distance(
        torch.clamp(-left, min=0), torch.clamp(-right, min=0)
    )
    return positive_term + negative_term

In [10]:
# Hyper-parameters passed to pairwise_otw_smooth_signed:
#   m    - weight on the final-window mismatch term
#   s    - window length of the windowed cumulative sums
#   beta - threshold below which the smooth absolute value is quadratic
m, s, beta = 1, 5, 0.1

In [11]:
# Repeat the train/validation split ten times and record the 1-nearest-
# neighbour validation error of every repetition.
# NOTE(review): with step=8 and 100 training rows the split's start offset
# repeats every 5 runs, so runs 5-9 appear to reuse the splits of runs 0-4 -
# confirm this is intended.
error_list = []
for k in range(10):
    print(k)
    (data_training, data_validation,
     labels_training, labels_validation) = deterministic_train_test_split_gpu(
        data_train, labels_train, step=8, run=k, val_ratio=0.2)

    # Row i: distances from validation sample i to every training sample.
    dists = pairwise_otw_smooth_signed(data_validation, data_training, m, s, beta)
    nn_indices = dists.argmin(dim=1)
    preds = labels_training[nn_indices].cpu().numpy()
    # Fraction of validation samples whose nearest neighbour has another label.
    error = (preds != labels_validation.cpu().numpy()).mean()
    error_list.append(error)

0
1
2
3
4
5
6
7
8
9


In [12]:
# Summarise the per-run validation errors as mean +/- 95% margin.
error_list = np.array(error_list, dtype=np.float64)
print("OTW classification validation error rates over 10 runs:", error_list)
n = len(error_list)
mean_error = np.mean(error_list)
# The margin is a standard error of the mean, so it needs the *sample*
# standard deviation (ddof=1). np.std defaults to the population form
# (ddof=0), which understates the spread for a handful of runs. A single run
# carries no spread estimate, so fall back to 0 in that case.
std_error = np.std(error_list, ddof=1) if n > 1 else 0.0
# 1.96 is the normal-approximation quantile for a 95% interval.
# NOTE(review): the runs draw from the same samples, so they are not
# independent and this interval is only indicative.
margin_error = 1.96 * std_error / np.sqrt(n)
print(f"OTW classification validation error rate: {mean_error:.2f} ± {margin_error:.2f}")

OTW classification validation error rates over 10 runs: [0.35 0.35 0.25 0.35 0.45 0.35 0.35 0.25 0.35 0.45]
OTW classification validation error rate: 0.35 ± 0.04


In [13]:
# Final evaluation: label every test series with the label of its nearest
# training series under the OTW distance, one query at a time.
preds = []
for i in range(data_test.shape[0]):
    if not i % 10:
        print(f"Processing test sample {i}/{data_test.shape[0]}")
    # Keep the query 2-D, shape (1, L), as the pairwise function expects.
    Q = data_test[i].unsqueeze(0)
    dists = pairwise_otw_smooth_signed(Q, data_train, m, s, beta)
    nn_idx = dists.argmin(dim=1)
    pred = labels_train[nn_idx].cpu().numpy()[0]
    preds.append(pred)
preds = np.array(preds)
# Fraction of test series whose predicted label differs from the true label.
error = np.mean(preds != labels_test.cpu().numpy())
print("OTW classification test error rate on the test set: {:.2f}".format(error))

Processing test sample 0/100
Processing test sample 10/100
Processing test sample 20/100
Processing test sample 30/100
Processing test sample 40/100
Processing test sample 50/100
Processing test sample 60/100
Processing test sample 70/100
Processing test sample 80/100
Processing test sample 90/100
OTW classification test error rate on the test set: 0.35
