In [39]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

class WassersteinEmbeddingDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.columns = [f"hour_ratio_{str(i).zfill(2)}" for i in range(24)]
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        ratio = torch.tensor(row[self.columns].values, dtype=torch.float32)
        return idx, ratio

class WassersteinEmbeddingModel(torch.nn.Module):
    def __init__(self, n_users):
        super().__init__()
        self.emb = torch.nn.Embedding(n_users, 2)
    
    def forward(self, indices):
        return self.emb(indices)
    
def wasserstein_distance(p, q):
    """
    Compute the Wasserstein distance between two distributions p and q.
    Assumes that p and q are one-dimensional tensors with the same length.
    """
    # Sort the distributions
    p_sorted, _ = torch.sort(p)
    q_sorted, _ = torch.sort(q)
    
    # Compute the cumulative distribution function (CDF)
    p_cdf = torch.cumsum(p_sorted, dim=0)
    q_cdf = torch.cumsum(q_sorted, dim=0)
    
    # Compute Wasserstein distance
    distance = torch.sum(torch.abs(p_cdf - q_cdf), dim=1)
    return distance

def euclidean_distance(batch1, batch2):
    squared_diff = (batch1 - batch2) ** 2
    sum_squared_diff = torch.sum(squared_diff, dim=1)
    distances = torch.sqrt(sum_squared_diff)
    return distances

In [17]:
df = pd.read_csv("../data/hourly_count/processed/pivoted.csv", index_col=0)

ds = WassersteinEmbeddingDataset(df)

In [19]:
dl0 = DataLoader(ds, batch_size=64, shuffle=True)
dl1 = DataLoader(ds, batch_size=64, shuffle=True)

In [40]:
model = WassersteinEmbeddingModel(len(df))

In [20]:
for d0, d1 in zip(dl0, dl1):
    break

In [21]:
d0

[tensor([24878,  6435, 11194,  5612,  2908,  4807, 42946, 15077, 16237, 18182,
         41323, 37420, 12513, 11429, 42312,  6852,  3752, 17853, 29082, 31660,
          9122,  9816, 28528,   209, 33545,  5167,  4406, 26916, 25752, 18775,
         16361, 40377, 35687, 17447, 20730, 40789, 38170,  3990, 13372, 14977,
          7243,  9184,  4062, 26933, 13234, 32267, 25828, 28455, 34327, 27250,
         24900, 25427, 21922, 36019, 29296, 33666, 36929, 17670, 19712,  2578,
         26715, 23750, 33497, 10153]),
 tensor([[0.0244, 0.0000, 0.0000,  ..., 0.0244, 0.0000, 0.0061],
         [0.0000, 0.0000, 0.0000,  ..., 0.0095, 0.0000, 0.0000],
         [0.0264, 0.0694, 0.1058,  ..., 0.0479, 0.0678, 0.0479],
         ...,
         [0.0194, 0.0278, 0.0084,  ..., 0.0618, 0.0519, 0.0487],
         [0.0227, 0.0164, 0.0216,  ..., 0.0301, 0.0137, 0.0174],
         [0.0000, 0.0000, 0.0000,  ..., 0.0043, 0.0000, 0.0000]])]

In [30]:
def wasserstein_distance(p, q):
    """
    Compute the Wasserstein distance between two distributions p and q.
    Assumes that p and q are one-dimensional tensors with the same length.
    """
    # Sort the distributions
    p_sorted, _ = torch.sort(p)
    q_sorted, _ = torch.sort(q)
    
    # Compute the cumulative distribution function (CDF)
    p_cdf = torch.cumsum(p_sorted, dim=0)
    q_cdf = torch.cumsum(q_sorted, dim=0)
    
    # Compute Wasserstein distance
    distance = torch.sum(torch.abs(p_cdf - q_cdf), dim=1)
    return distance

In [34]:
wasserstein_distance(d0[1], d1[1])

tensor([0.5462, 0.7785, 1.0075, 0.5823, 0.4319, 0.5035, 0.3988, 0.8029, 1.0421,
        1.2216, 0.8677, 0.7130, 0.6847, 0.6378, 0.8780, 0.9448, 0.8382, 0.7317,
        0.7718, 1.1588, 0.7327, 0.5961, 1.3261, 1.1092, 1.0913, 0.9915, 1.0370,
        1.2758, 1.5383, 1.3988, 0.9882, 1.2308, 1.2335, 1.1722, 1.0965, 1.8302,
        2.0927, 1.7349, 2.5175, 1.9431, 1.6751, 1.9396, 2.4090, 2.4868, 2.4286,
        1.9901, 2.0957, 2.6550, 2.2983, 2.1089, 2.1723, 2.3441, 2.8542, 2.9156,
        2.8705, 2.8644, 2.8326, 2.8056, 2.8194, 2.7843, 2.5767, 2.5734, 2.8320,
        2.2608])