# Geolife Checks

In this notebook, we will import and initialise all models to ensure that they are working correctly on the **Geolife Dataset**!
## Initialisation

In [None]:
GPU = 0
%cd ..
import logging

import numpy as np
%load_ext autoreload
%autoreload 2

logging.basicConfig(level=logging.INFO)
# Print Python Version & PyTorch version
import torch
import sys
import os
print(f"Python version\t=\t{sys.version}\nPyTorch version\t=\t{torch.__version__}")
# Make torch deterministic
torch.manual_seed(0)

In [None]:
# Environment setup: on Google Colab, clone the repository and change into it;
# otherwise extend sys.path so that project modules can be imported.
RunningInCOLAB = 'google.colab' in str(get_ipython())
if RunningInCOLAB:
    # Move to default colab folder
    %cd /content
    # Check if repository is already cloned
    # NOTE(review): the folder name "stg" is hard-coded here, while the clone target
    # below is {config.MODULE_NAME} — confirm that both refer to the same directory.
    if not os.path.isdir("stg"):
        # Clone repository
        # NOTE(review): in the visible notebook `config` is only imported in a much later
        # cell, so on a fresh kernel the {config.*} placeholders cannot be resolved here —
        # TODO confirm where GITHUB_URL / MODULE_NAME are supposed to come from.
        !git clone {config.GITHUB_URL} {config.MODULE_NAME}
    # Change to repository directory
    %cd {config.MODULE_NAME}
    # Only install requirements not already installed by Colab
    # !pip install opacus
    # SLOW: Only execute the following line if you encounter an error regarding a package not being installed
    # !pip install -r requirements.txt
else:
    import sys
    # Add parent directory (absolute!) to path
    # NOTE(review): this is the parent of the *current* working directory; the first cell
    # already executed `%cd ..`, so confirm this is the intended directory.
    sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

## Geolife Dataset

We use the Geolife dataset as a sample dataset to test the models.

In [None]:
# Load the Geolife dataset, wrap it in a zero-padding DataLoader and print the shapes.
from torch.utils.data import DataLoader

from conv_gan.datasets import get_dataset, Datasets, ZeroPadding

BATCH_SIZE = 64

ds = get_dataset(Datasets.GEOLIFE, return_labels=True)
# Print Shape of one sample (number of features, shape of the first feature)
first_sample = ds[0][0]
print(f"Sample:\t({len(first_sample)}, {first_sample[0].shape})")
print(f"Number of Trajectories:\t{len(ds)}")

# Collate function: pads the samples of a batch and additionally returns
# the lengths and the labels (see the unpacking of the first batch below).
padding_fn = ZeroPadding(
    return_len=True,
    return_labels=True,
    feature_first=True,
)
dl = DataLoader(ds, batch_size=BATCH_SIZE, collate_fn=padding_fn)
# Print Shape of one batch
batch, lengths, labels = next(iter(dl))
print(f"Batch:\t{len(batch)}")
print(f"Labels:\t{len(labels)}")
for i, batch_feature in enumerate(batch):
    print(f"Batch Feature #{i}:\t{batch_feature.shape}")
print(f"Lengths:\t\t\t{len(lengths)}: {lengths}")

In [None]:
# Visualise a random subset of real trajectories as one point cloud.
from conv_gan.utils.visualise import plot_pointclouds

# Number of trajectories to draw; indices are sampled independently, so repeats are possible
TRAJ_NUM = 1000
sample_indices = np.random.randint(0, len(ds), TRAJ_NUM)
original_gl_samples = [ds[idx][0][0] for idx in sample_indices]
# Concatenate all sampled trajectories and flatten them to rows of two values
stacked = torch.cat(original_gl_samples, dim=0)
points = stacked.view(-1, 2).cpu().numpy()
print(points.shape)
_ = plot_pointclouds(points)

## Noise-TrajGAN: Baseline Model

In [None]:
# Model constants
FEATURES = ds.features
LATENT_DIM = 256
NOISE_DIM = 28

# Training parameters
# Approximate number of optimisation steps the training should run for
TARGET_STEPS = 10_000
# Number of batches per epoch
STEPS_PER_EPOCH = len(dl)
# Choose epochs such that we have around TARGET_STEPS steps total
EPOCHS = TARGET_STEPS // STEPS_PER_EPOCH + 1
print(f"Epochs:\t{EPOCHS}\nSteps:\t{EPOCHS * STEPS_PER_EPOCH}")

WGAN = True
LP = True  # Lipschitz Penalty required!
LR_G = 0.0001
LR_D = 0.0001
N_CRITIC = 5

In [None]:
# Import Noise-TrajGAN
from conv_gan.models.noise_trajgan import Noise_TrajGAN

# Run name encoding the most important hyperparameters
name = f'NTG_GL_G{LR_G}_{N_CRITIC}xD{LR_D}_L{LATENT_DIM}_N{NOISE_DIM}_B{BATCH_SIZE}_{"WGAN" if WGAN else "GAN"}'

# Create a Noise-TrajGAN model
ntg = Noise_TrajGAN(
    features=FEATURES,
    latent_dim=LATENT_DIM,
    noise_dim=NOISE_DIM,
    wgan=WGAN,
    gradient_penalty=LP,
    lipschitz_penalty=LP,
    lr_g=LR_G,
    lr_d=LR_D,
    gpu=GPU,  # use the device selected at the top of the notebook
    name=name,
)


def count_params_torch(model):
    """Return the number of trainable parameters (``requires_grad``) of ``model``."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# Print and compare the parameter counts of generator and discriminator
n_params_gen = count_params_torch(ntg.gen)
n_params_dis = count_params_torch(ntg.dis)
print(f"Generator Parameters:\t\t{n_params_gen}")
print(f"Discriminator Parameters:\t{n_params_dis}")
print(f"Relationship [G / D]:\t\t{n_params_gen / n_params_dis * 100:.2f}%")

In [None]:
# Sample from NTG before training and plot the generated points
fake = ntg.generate(1000, 28)
# Report the nesting sizes of every generated feature
for idx, feat in enumerate(fake):
    dims = (len(feat), len(feat[0]), len(feat[0][0]))
    print(f"Feature {idx}:\t{dims}")
# Take the first feature and flatten it to rows of two values
latlon = np.array(fake[0])
points = latlon.reshape(-1, 2)
print(points.shape)
plot_pointclouds(points)

In [None]:
# Run the training loop of the baseline Noise-TrajGAN
ntg.training_loop(
    dl,
    epochs=EPOCHS,
    dataset_name=Datasets.GEOLIFE,
    n_critic=N_CRITIC,
    plot_freq=200,
    save_freq=-1,
    tensorboard=True,
    notebook=True,
)

In [None]:
# Sample from NTG again (this cell follows the training cell) and plot the generated points
fake = ntg.generate(1000, 28)
# Report the nesting sizes of every generated feature
for idx, feat in enumerate(fake):
    dims = (len(feat), len(feat[0]), len(feat[0][0]))
    print(f"Feature {idx}:\t{dims}")
# Take the first feature and flatten it to rows of two values
latlon = np.array(fake[0])
points = latlon.reshape(-1, 2)
print(points.shape)
plot_pointclouds(points)

## Noise-TrajGAN with Differential Privacy

In [None]:
# Differential-privacy settings
EPSILON = 10.0
# 'prv' is the default accountant; 'rdp' was observed to be more stable in some situations
ACCOUNTANT = 'prv'
MAX_GRAD_NORM = 0.1
# According to DPfy-ML, delta should be 1/n with n being the number of samples
DELTA = 1 / len(ds)
dp_summary = f"Epsilon:\t{EPSILON:.1f}\nDelta:\t\t{DELTA:.2e}\nMax Grad Norm:\t{MAX_GRAD_NORM}\nAccountant:\t{ACCOUNTANT}"
print(dp_summary)

# Whether DP-SGD is applied to the discriminator (True) or to the generator (False)
DP_IN_DIS = False
# WGAN Clipping does not work if DP is applied to the discriminator
LP = not DP_IN_DIS
WGAN = True
if N_CRITIC > 1 and not DP_IN_DIS:
    print("Warning: Training with DP and N_CRITIC is a bit of a gamble because we might actually be wasting privacy budget on the discriminator which does not even uses DP.")

# Compensate for the gradient clipping with a larger learning rate on the DP side.
# NOTE: this rescales LR_G / LR_D in place, so re-running this cell without first
# re-running the cell that defines them applies the scaling again.
if DP_IN_DIS:
    LR_D /= MAX_GRAD_NORM
else:
    LR_G /= MAX_GRAD_NORM

lr_summary = f"LR_G:\t{LR_G}\nLR_D:\t{LR_D}"
print(lr_summary)

# Dedicated DataLoader for DP training:
# the number of steps should match the non-DP run, but DP-SGD works better for
# large batches --> scale batch size and epochs by the same factor.
FACTOR = 10
DP_BATCH_SIZE = FACTOR * BATCH_SIZE
DP_EPOCHS = FACTOR * EPOCHS
dp_dl = DataLoader(ds, batch_size=DP_BATCH_SIZE, collate_fn=padding_fn)
print(f"Batch Size:\t{DP_BATCH_SIZE}\nEpochs:\t\t{DP_EPOCHS}\nSteps:\t\t{DP_EPOCHS * len(dp_dl)}")

In [None]:
# Initialize DP-Noise-TrajGAN
# Run name encoding the most important hyperparameters, including the clipping norm
name = f'DP-NTG_GL_G{LR_G}_{N_CRITIC}xD{LR_D}_L{LATENT_DIM}_N{NOISE_DIM}_B{DP_BATCH_SIZE}_C{MAX_GRAD_NORM}'

dp_ntg = Noise_TrajGAN(
    features=FEATURES,
    latent_dim=LATENT_DIM,
    noise_dim=NOISE_DIM,
    lr_g=LR_G,
    lr_d=LR_D,
    gpu=GPU,  # use the device selected at the top of the notebook
    name=name,
    wgan=WGAN,
    gradient_penalty=LP,
    lipschitz_penalty=LP,
    dp=True,
    dp_in_dis=DP_IN_DIS,
    privacy_accountant=ACCOUNTANT,
)

In [None]:
# Set up differential privacy on the model; the call hands back the DataLoader to train with.
# NOTE: `dp_dl` is rebound to the returned loader, so re-running this cell passes the
# already returned loader in again.
dp_dl = dp_ntg.init_dp(
    dataloader=dp_dl,
    epochs=DP_EPOCHS,
    # Privacy budget (epsilon, delta)
    target_epsilon=EPSILON,
    delta=DELTA,
    # Clipping threshold for the gradients
    max_grad_norm=MAX_GRAD_NORM,
)

In [None]:
# Print an initial output of the DP model (dp_ntg, not the baseline ntg) before training
fake = dp_ntg.generate(1000, 28)
# print Results
for i, feature in enumerate(fake):
    print(f"Feature {i}:\t{len(feature), len(feature[0]), len(feature[0][0])}")
# Take the first feature and reshape to (-1, 2)
latlon = np.array(fake[0])
points = latlon.reshape(-1, 2)
print(points.shape)
plot_pointclouds(points)

In [None]:
# Run the training loop of the DP model on the DP DataLoader
dp_ntg.training_loop(
    dp_dl,
    epochs=DP_EPOCHS,
    dataset_name=Datasets.GEOLIFE,
    n_critic=N_CRITIC,
    plot_freq=200,
    save_freq=-1,
    tensorboard=True,
    notebook=True,
)

In [None]:
# Print the output of the DP model (dp_ntg, not the baseline ntg); this cell follows DP training
fake = dp_ntg.generate(1000, 28)
# print Results
for i, feature in enumerate(fake):
    print(f"Feature {i}:\t{len(feature), len(feature[0]), len(feature[0][0])}")
# Take the first feature and reshape to (-1, 2)
latlon = np.array(fake[0])
points = latlon.reshape(-1, 2)
print(points.shape)
plot_pointclouds(points)

In [None]:
# Plot the output of the DP model (dp_ntg, not the baseline ntg) and report the privacy loss
fake = dp_ntg.generate(1000, 28)
# print Results
for i, feature in enumerate(fake):
    print(f"Feature {i}:\t{len(feature), len(feature[0]), len(feature[0][0])}")
# Take the first feature and reshape to (-1, 2)
latlon = np.array(fake[0])
points = latlon.reshape(-1, 2)
print(points.shape)
plot_pointclouds(points)

# Print resulting privacy loss
print("Final Delta:\t", dp_ntg.delta)
print("Final Epsilon:\t", dp_ntg.epsilon)

## Baseline CNN-GAN

In [None]:
# Import CNN-GAN
import os

import config
from conv_gan.models.dcgan import DCGan
import pandas as pd
from sklearn.model_selection import KFold

# Trajectories are truncated to this many points before being handed to the model
MAX_TRAJ_LEN = 144

# Hyperparameters of the CNN-GAN.
# For more info on these hyperparameters look at 5-fold.py in the root directory.
# NOTE(review): the DP CNN-GAN cell further down uses the key "n_epochs" where this
# dict uses "epochs" — confirm which key DCGan actually reads.
opt = {
    "file": 'geolife',  # Selects the CSV loaded below; also used for saving files
    "epochs": 200,
    "batch_size": 64,
    "lr": 0.0002,
    "g_factor": 1.0,
    "b1": 0.5,
    "evaluate": False,
    "b2": 0.999,
    "n_cpu": 8,
    "latent_dim": 100,
    "img_size": 24,
    "channels": 1,
    "sample_interval": 1000,
    "load_params": 0,
    "g": 0,
    "schedule": None,
    "plot_points": False,
}

# Build the path with os.path.join so that it is valid with or without a trailing
# separator on config.BASE_DIR.
if opt['file'] == "geolife":
    csv_path = os.path.join(config.BASE_DIR, "data/geolife/restricted_geolife.csv")
else:
    csv_path = os.path.join(config.BASE_DIR, "data/fs_nyc/restricted_foursquare.csv")
data = pd.read_csv(csv_path)

# One list of rows per trajectory id (maximum length is MAX_TRAJ_LEN).
# In theory the pre-processing should prevent longer trajectories, but we truncate just in case.
trajectories = [traj.values.tolist()[:MAX_TRAJ_LEN] for _, traj in data.groupby('tid')]

# Split data into 2/3 (train) and 1/3 (test): take the first of the three folds only
kf = KFold(n_splits=3)
train, test = next(iter(kf.split(trajectories)))

dcgan = DCGan(
    opt,
    mainData=[trajectories[i] for i in train],
    testData=[trajectories[i] for i in test],
    fold=0,
)
# dcgan.training_loop()

## DP CNN-GAN

In [None]:
# Learning rates start from the CNN-GAN learning rate in `opt`
LR_G = opt["lr"]
LR_D = opt["lr"]

# Differential-privacy settings
# NOTE(review): none of the visible cells pass these values to DCGan — confirm how
# the DP CNN-GAN is meant to receive them.
EPSILON = 10.0
# According to DPfy-ML, delta should be 1/n with n being the number of samples
DELTA = 1 / len(ds)
dp_summary = f"Epsilon:\t{EPSILON:.1f}\nDelta:\t\t{DELTA:.2e}"
print(dp_summary)
# 'prv' is the default accountant; 'rdp' was observed to be more stable
ACCOUNTANT = 'prv'
MAX_GRAD_NORM = 1.0

# Whether DP-SGD is applied to the discriminator (True) or to the generator (False)
DP_IN_DIS = False
# Gradient Clipping does not work if DP is applied to the discriminator
LP = not DP_IN_DIS
WGAN = True

# Compensate for the gradient clipping with a larger learning rate on the DP side
if DP_IN_DIS:
    LR_D /= MAX_GRAD_NORM
else:
    LR_G /= MAX_GRAD_NORM

lr_summary = f"LR_G:\t\t{LR_G}\nLR_D:\t\t{LR_D}"
print(lr_summary)

In [None]:
# Import CNN-GAN
import os

import config
from conv_gan.models.dcgan import DCGan
import pandas as pd
from sklearn.model_selection import KFold

# Trajectories are truncated to this many points before being handed to the model
MAX_TRAJ_LEN = 144

# Hyperparameters of the CNN-GAN.
# For more info on these hyperparameters look at 5-fold.py in the root directory.
# NOTE(review): the baseline CNN-GAN cell above uses the key "epochs" where this
# dict uses "n_epochs" — confirm which key DCGan actually reads.
opt = {
    "file": 'geolife',  # Selects the CSV loaded below; also used for saving files
    "n_epochs": 200,
    "batch_size": 64,
    "lr": 0.0002,
    "g_factor": 1.0,
    "b1": 0.5,
    "evaluate": False,
    "b2": 0.999,
    "n_cpu": 8,
    "latent_dim": 100,
    "img_size": 24,
    "channels": 1,
    "sample_interval": 1000,
    "load_params": 0,
    "g": 0,
    "schedule": None,
    "plot_points": False,
}

# Build the path with os.path.join so that it is valid with or without a trailing
# separator on config.BASE_DIR.
if opt['file'] == "geolife":
    csv_path = os.path.join(config.BASE_DIR, "data/geolife/restricted_geolife.csv")
else:
    csv_path = os.path.join(config.BASE_DIR, "data/fs_nyc/restricted_foursquare.csv")
data = pd.read_csv(csv_path)

# One list of rows per trajectory id (maximum length is MAX_TRAJ_LEN).
# In theory the pre-processing should prevent longer trajectories, but we truncate just in case.
trajectories = [traj.values.tolist()[:MAX_TRAJ_LEN] for _, traj in data.groupby('tid')]

# Split data into 2/3 (train) and 1/3 (test): take the first of the three folds only
kf = KFold(n_splits=3)
train, test = next(iter(kf.split(trajectories)))

dcgan = DCGan(
    opt,
    mainData=[trajectories[i] for i in train],
    testData=[trajectories[i] for i in test],
    fold=0,
    dp=True,
    # Use the device selected at the top of the notebook (was hard-coded to 1,
    # unlike every other model in this notebook).
    gpu=GPU,
)