In [None]:
import logging

import numpy as np
import ot

import config
%load_ext autoreload
%autoreload 2

logging.basicConfig(level=logging.INFO)
# Print Python Version & PyTorch version
import torch
import sys
import os
print(f"Python version\t=\t{sys.version}\nPyTorch version\t=\t{torch.__version__}")
# Seed every global RNG this notebook draws from so that "Restart & Run All"
# reproduces the same numbers. NumPy's global RNG generates the synthetic point
# clouds (np.random.randn) and the random sub-samples (np.random.choice); seeding
# torch alone leaves both of them different on every run.
SEED = 0
np.random.seed(SEED)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(SEED)

In [None]:
RunningInCOLAB = 'google.colab' in str(get_ipython())
if RunningInCOLAB:
    # Move to default colab folder
    %cd /content
    # Check if repository is already cloned
    if not os.path.isdir("stg"):
        # Clone repository
        !git clone {config.GITHUB_URL} {config.MODULE_NAME}
    # Change to repository directory
    %cd {config.MODULE_NAME}
    # Only install requirements not already installed by Colab
    # !pip install opacus
    # SLOW: Only execute the following line if you encounter an error regarding a package not being installed
    # !pip install -r requirements.txt
else:
    import sys
    # Add parent directory (absolute!) to path
    sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

In [None]:
# Generate three distributions of 2D points (S, T and U)
import matplotlib.pyplot as plt

# Number of points per cloud and dimensionality of each point.
n_samples, n_features = 100_000, 2

# Three standard-normal point clouds that differ only by a constant shift:
# S is unshifted, T is shifted by (2, 1) and U by (4, 2).
# The draw order S -> T -> U is kept so the global RNG is consumed identically.
S = np.random.randn(n_samples, n_features)
T = np.array([2, 1]) + np.random.randn(n_samples, n_features)
U = np.array([4, 2]) + np.random.randn(n_samples, n_features)


# Scatter the three point clouds into one figure. s=1 keeps the individual
# markers small so that the overlap between the clouds stays visible.
for cloud_name, cloud in (("S", S), ("T", T), ("U", U)):
    plt.scatter(cloud[:, 0], cloud[:, 1], label=cloud_name, s=1)
# Title and axis labels so the figure can be read without the surrounding code.
plt.title("Samples of S, T and U")
plt.xlabel("feature 0")
plt.ylabel("feature 1")
# markerscale enlarges only the legend handles; at s=1 they would otherwise be
# too small to tell the colours apart.
plt.legend(markerscale=8)
plt.show()

In [None]:
from timeit import default_timer as timer

# The sliced Wasserstein distance averages 1D distances over randomly drawn
# projection directions. Without a seed every call draws new directions, so the
# values change from run to run and cannot be compared fairly with each other.
# A fixed seed makes POT use the same directions on every call; keep it equal to
# the seed of any other SWD value these results are compared against.
SWD_SEED = 0

# The best case is the sliced WD on all points, but it is very slow, so time it.
start = timer()
swd_st = ot.sliced_wasserstein_distance(S, T, seed=SWD_SEED)
end = timer()
print(f"Sliced Wasserstein distance between S and T: {swd_st:.2f} in {end-start:.2f} seconds")

# Reference values: S against itself and S against U.
swd_ss = ot.sliced_wasserstein_distance(S, S, seed=SWD_SEED)
swd_su = ot.sliced_wasserstein_distance(S, U, seed=SWD_SEED)

print(f"Sliced Wasserstein distance between S and S: {swd_ss:.2f}")
print(f"Sliced Wasserstein distance between S and U: {swd_su:.2f}")

In [None]:
# Now, we repeat the computation on 10k points sampled from each distribution
# and compare the result with the value obtained on all points.
n_samples = 10000

# Fixed seed for POT's random projection directions. Without it every call draws
# new directions, and the printed "Difference" would be dominated by projection
# noise rather than by the effect of sub-sampling. Keep it equal to the seed used
# for the full-data values (swd_st, swd_ss, swd_su) this cell compares against.
SWD_SEED = 0

# One independent draw of row indices (without replacement) per distribution.
S_small = S[np.random.choice(S.shape[0], n_samples, replace=False)]
T_small = T[np.random.choice(T.shape[0], n_samples, replace=False)]
U_small = U[np.random.choice(U.shape[0], n_samples, replace=False)]

swd_st_small = ot.sliced_wasserstein_distance(S_small, T_small, seed=SWD_SEED)
swd_ss_small = ot.sliced_wasserstein_distance(S_small, S_small, seed=SWD_SEED)
swd_su_small = ot.sliced_wasserstein_distance(S_small, U_small, seed=SWD_SEED)

# Report each sub-sampled distance next to its deviation from the full-data value.
for pair, swd_small, swd_full in (
    ("S and T", swd_st_small, swd_st),
    ("S and S", swd_ss_small, swd_ss),
    ("S and U", swd_su_small, swd_su),
):
    print(f"Sliced Wasserstein distance between {pair}: {swd_small:.2f};\t Difference: {swd_small-swd_full:.2f}")

## Try on Foursquare: how much are our results off if we only consider 10'000 points instead of the entire dataset?

In [None]:
import torch
from conv_gan.datasets import get_dataset, Datasets

# Load the Foursquare (FS) dataset and print a short summary of its layout.
fs_ds = get_dataset(Datasets.FS)
# The trailing spaces reproduce the separator print() would insert before "\n".
print("\n".join([
    f"Total Trajectories:\t\t\t {len(fs_ds)} ",
    f"Features per Trajectory:\t {len(fs_ds[0])} ",
    f"Shape of one location traj:\t {fs_ds[0][0].shape}",
]))

# Build a flat point dataset: take element 0 of every sample (printed above as
# the location trajectory) and stack them all along the first dimension.
trajs = [sample[0] for sample in fs_ds]
stacked_points = torch.cat(trajs, dim=0)
points = stacked_points.numpy()
print(f"Total Points:\t\t\t\t {points.shape}")

In [None]:
# Compute SWD of entire dataset and of 10k points.
# Fixed seed for POT's random projection directions: both estimates below then
# use the same directions, so the printed difference reflects the sub-sampling
# and not a different random set of projections.
SWD_SEED = 0

# Reference distribution: standard-normal noise with as many rows and columns
# as the real point set.
fake = np.random.randn(points.shape[0], points.shape[1])
swd_fs = ot.sliced_wasserstein_distance(points, fake, seed=SWD_SEED)
print(f"Sliced Wasserstein distance between FS and random: {swd_fs:.5f}")

# Same comparison on independently drawn 10k-row subsets (without replacement).
n_samples = 10000
points_small = points[np.random.choice(points.shape[0], n_samples, replace=False)]
fake_small = fake[np.random.choice(fake.shape[0], n_samples, replace=False)]
swd_fs_small = ot.sliced_wasserstein_distance(points_small, fake_small, seed=SWD_SEED)
print(f"Sliced Wasserstein distance between FS and random: {swd_fs_small:.5f};\t Difference: {swd_fs_small-swd_fs:.5f}")