In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pdb import run

In [2]:
# Both splits are tab-separated files without a header row, so pandas must
# not consume the first line as column names.
df_train, df_test = (
    pd.read_csv(tsv_path, sep='\t', header=None)
    for tsv_path in ('ACSF1_TRAIN.tsv', 'ACSF1_TEST.tsv')
)

In [3]:
# Train split: column 0 is taken as the label vector (NumPy array),
# every remaining column is kept as the feature frame.
labels_train, data_train = df_train.iloc[:, 0].to_numpy(), df_train.iloc[:, 1:]
print("train Labels shape:", labels_train.shape)
print("train Data shape: ", data_train.shape)

train Labels shape: (100,)
train Data shape:  (100, 1460)


In [4]:
# Test split: column 0 is taken as the label vector (NumPy array),
# every remaining column is kept as the feature frame.
labels_test, data_test = df_test.iloc[:, 0].to_numpy(), df_test.iloc[:, 1:]
print("test Labels shape:", labels_test.shape)
print("test Data shape: ", data_test.shape)

test Labels shape: (100,)
test Data shape:  (100, 1460)


In [5]:
# Sanity check: report whether any cell of either feature frame is NaN.
train_has_missing = data_train.isna().to_numpy().any()
print("\nAny missing values in the train dataset?", train_has_missing)
test_has_missing = data_test.isna().to_numpy().any()
print("\nAny missing values in the test dataset?", test_has_missing)


Any missing values in the train dataset? False

Any missing values in the test dataset? False


In [6]:
# Preview the first 5 rows of the train features, then of the test features.
for preview_frame in (data_train, data_test):
    print(preview_frame.head())

       1         2         3         4         5         6         7     \
0 -0.584754 -0.584754  1.730991 -0.584754 -0.584754 -0.584754  1.729917   
1 -0.591434 -0.511104  1.726820 -0.580422 -0.591434 -0.511104  1.727921   
2 -0.577945 -0.577945  1.730793 -0.577945 -0.578946 -0.564882  1.731094   
3 -0.588925 -0.538088  1.735718 -0.588716 -0.589962 -0.523551  1.735619   
4 -0.596633 -0.532188  1.718067 -0.592117 -0.596633 -0.532188  1.715241   

       8         9         10    ...      1451      1452      1453      1454  \
0 -0.584754 -0.584754 -0.584754  ...  1.732726 -0.584734 -0.583729 -0.578603   
1 -0.580422 -0.591434 -0.511104  ...  1.727396 -0.580731 -0.580731 -0.580731   
2 -0.577829 -0.580956 -0.548788  ...  1.734727 -0.577751 -0.580956 -0.549798   
3 -0.588646 -0.588925 -0.524598  ...  1.743664 -0.588876 -0.586852 -0.576483   
4 -0.592117 -0.595605 -0.532188  ...  1.743258 -0.592403 -0.591524 -0.575158   

       1455      1456      1457      1458      1459      1460  
0  1

In [7]:
def deterministic_train_test_split(data, labels, step=10, run=0, val_ratio=0.2):
    """
    Deterministically split `data`/`labels` into training and validation parts.

    Validation rows are chosen by walking the row positions in strides of
    `step`, starting at ``run * block_size * step`` and wrapping around modulo
    the number of rows, where ``block_size = ceil(n * val_ratio)``.

    If the strided walk lands on a position that was already chosen (the
    stride can cycle before `block_size` positions are collected, e.g.
    ``n=100, step=10, val_ratio=0.2``), the next free position is used
    instead.  The validation set therefore never contains repeated rows and
    always holds exactly `block_size` rows.  When no collision occurs the
    selection is exactly the plain strided one.

    Because the start offset is reduced modulo the number of rows, different
    `run` values can yield the same split whenever their offsets coincide.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples, one per row; rows are selected positionally via ``.iloc``.
    labels : numpy.ndarray
        Labels aligned with the rows of `data`; indexed positionally.
    step : int, default 10
        Stride between consecutive validation positions.
    run : int, default 0
        Run number; shifts the starting position of the strided walk.
    val_ratio : float, default 0.2
        Requested fraction of rows in the validation part.  The resulting
        row count is clamped to the range ``[0, n]``.

    Returns
    -------
    tuple
        ``(data_training, data_validation, labels_training, labels_validation)``.
        Both frames have their index reset; validation rows keep the order in
        which they were selected, training rows keep ascending position order.
    """
    n = len(data)
    # Clamp so the probing loop below can always find a free position.
    block_size = min(max(int(np.ceil(n * val_ratio)), 0), n)
    start = run * block_size * step

    val_idx = []
    taken = set()
    for i in range(block_size):
        pos = (start + i * step) % n
        # Linear probing: skip positions already used by an earlier stride.
        while pos in taken:
            pos = (pos + 1) % n
        taken.add(pos)
        val_idx.append(pos)

    train_idx = np.setdiff1d(np.arange(n), val_idx)
    data_training = data.iloc[train_idx].reset_index(drop=True)
    data_validation = data.iloc[val_idx].reset_index(drop=True)
    labels_training = labels[train_idx]
    labels_validation = labels[val_idx]
    return data_training, data_validation, labels_training, labels_validation


In [8]:
def pairwise_otw_smooth_signed(X1, X2, m, s, beta):
    """
    Smoothed, signed OTW distance between time series.

    Each series is split into its positive part ``max(x, 0)`` and its negative
    part ``max(-x, 0)``.  For each part a windowed cumulative sum ``W`` is
    built (sum of the last `s` values up to each position, fewer near the
    start), and the distance between two series is the sum over both parts of

        m * L(W1[n-1] - W2[n-1]) + sum_{i=1}^{n-1} L(W1[i] - W2[i])

    where ``L(x) = x**2 / (2*beta)`` if ``|x| < beta`` and ``|x| - beta/2``
    otherwise (a smoothed absolute value), with 0-based positions.

    Parameters
    ----------
    X1, X2 : array-like
        Either a single series (1-D, length n) or a set of series stored as
        rows (2-D, shape (N, n)).  Both must have the same series length n.
    m : float
        Weight of the final-position term.
    s : int
        Window length of the cumulative sums; must be at least 1.
    beta : float
        Half-width of the quadratic zone of the smoothed absolute value.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar when both inputs are 1-D; otherwise an (N1, N2) matrix whose
        entry ``[k, j]`` is the distance between row k of `X1` and row j of
        `X2` (a 1-D input counts as a single row).

    Raises
    ------
    ValueError
        If an input is not 1-D or 2-D, the series lengths differ, the series
        are empty, or ``s < 1``.
    """

    # Smooth absolute value, applied element-wise
    def smooth_abs(diff, beta):
        magnitude = np.abs(diff)
        linear = magnitude - beta / 2
        if beta <= 0:
            # |x| < beta can never hold, so only the linear branch applies
            # (also avoids dividing by zero below).
            return linear
        return np.where(magnitude < beta, diff ** 2 / (2 * beta), linear)

    # Split positive/negative parts
    def split_pos_neg(x):
        return np.maximum(x, 0), np.maximum(-x, 0)

    # Row-wise cumulative sums restricted to the trailing window of length s
    def windowed_cumsum(parts, s):
        running = np.cumsum(parts, axis=1)
        windowed = running.copy()
        if s < running.shape[1]:
            # For positions i >= s drop everything older than the window.
            windowed[:, s:] -= running[:, :-s]
        return windowed

    first = np.asarray(X1, dtype=float)
    second = np.asarray(X2, dtype=float)
    if first.ndim not in (1, 2) or second.ndim not in (1, 2):
        raise ValueError("X1 and X2 must be 1-D series or 2-D arrays of row series")
    single_pair = first.ndim == 1 and second.ndim == 1
    first = np.atleast_2d(first)
    second = np.atleast_2d(second)
    if first.shape[1] != second.shape[1]:
        raise ValueError(
            "series length mismatch: {} vs {}".format(first.shape[1], second.shape[1])
        )
    if first.shape[1] == 0:
        raise ValueError("series must contain at least one value")
    if s < 1:
        raise ValueError("window length s must be at least 1")

    distances = np.zeros((first.shape[0], second.shape[0]))
    # Accumulate the positive-part distance first, then the negative-part one.
    for part1, part2 in zip(split_pos_neg(first), split_pos_neg(second)):
        w1 = windowed_cumsum(part1, s)
        w2 = windowed_cumsum(part2, s)
        for row, w1_row in enumerate(w1):
            # One row of X1 against every row of X2 keeps memory at (N2, n).
            cost = smooth_abs(w1_row - w2, beta)
            # NOTE(review): the plain sum covers positions 1..n-1, so position 0
            # never contributes while the last position counts both here and in
            # the m-weighted term - confirm against the intended OTW definition.
            distances[row] += m * cost[:, -1] + cost[:, 1:].sum(axis=1)

    return distances[0, 0] if single_pair else distances
    

In [9]:
# Hyper-parameters passed to pairwise_otw_smooth_signed by the cells below:
#   m    - weight applied to the final-position term
#   s    - window length of the windowed cumulative sums
#   beta - threshold below which the smoothed absolute value is quadratic
m, s, beta = 1, 5, 0.1


In [10]:
# Repeat the deterministic train/validation split 10 times and record the
# 1-nearest-neighbour validation error of every run.
# NOTE(review): with the 100 training rows reported above, each validation
# block has 20 rows and the start offset is run * 160, which modulo 100
# repeats every 5 runs - runs 5-9 therefore appear to reuse the splits of
# runs 0-4. Confirm whether 10 distinct splits were intended.
error_list = []
for run_id in range(10):
    print(run_id)
    (data_training, data_validation,
     labels_training, labels_validation) = deterministic_train_test_split(
        data_train, labels_train, step=8, run=run_id, val_ratio=0.2
    )

    # Pull the rows out as NumPy arrays once instead of on every comparison.
    training_rows = data_training.to_numpy()
    results = []
    for query in data_validation.to_numpy():
        dists = [
            pairwise_otw_smooth_signed(query, candidate, m, s, beta)
            for candidate in training_rows
        ]
        # Predict the label of the closest training series.
        results.append(int(labels_training[np.argmin(dists)]))

    mismatches = np.array(results) != np.array(labels_validation)
    error_list.append(np.mean(mismatches))

0
1
2
3
4
5
6
7
8
9


In [11]:
# Summarise the per-run validation errors as mean ± 95% confidence margin.
error_list = np.array(error_list, dtype=np.float64)
print("OTW classification validation error rates over 10 runs:", error_list)
n = len(error_list)
mean_error = np.mean(error_list)
# The runs are a sample, so use the sample standard deviation (ddof=1);
# np.std's default ddof=0 underestimates the spread and narrows the interval.
# A single run has no spread estimate, hence the explicit 0.0.
std_error = np.std(error_list, ddof=1) if n > 1 else 0.0
# Normal-approximation margin (z = 1.96) of the standard error of the mean.
margin_error = 1.96 * std_error / np.sqrt(n) # 95% confidence interval
print(f"OTW classification validation error rate: {mean_error:.2f} ± {margin_error:.2f}")

OTW classification validation error rates over 10 runs: [0.35 0.35 0.25 0.35 0.45 0.35 0.35 0.25 0.35 0.45]
OTW classification validation error rate: 0.35 ± 0.04


In [12]:
# Final evaluation: label every test series with the label of its nearest
# training series (full training set as reference) and report the error rate.
train_rows = data_train.to_numpy()
test_rows = data_test.to_numpy()
results = []
for i, x in enumerate(test_rows):
    if i % 10 == 0:
        print(f"Processing test sample {i}/{len(data_test)}")
    dists = [pairwise_otw_smooth_signed(x, y, m, s, beta) for y in train_rows]
    results.append(labels_train[np.argmin(dists)])
results = np.array(results)
# Fraction of test series whose predicted label differs from the true one
error = np.mean(results != labels_test)
print("OTW classification test error rate on the test set: {:.2f}".format(error))

Processing test sample 0/100
Processing test sample 10/100
Processing test sample 20/100
Processing test sample 30/100
Processing test sample 40/100
Processing test sample 50/100
Processing test sample 60/100
Processing test sample 70/100
Processing test sample 80/100
Processing test sample 90/100
OTW classification test error rate on the test set: 0.35
