<a href="https://colab.research.google.com/github/kennywchen/ch-rampup/blob/main/data_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Had issues running this locally, so it is run in a Colab notebook instead.

!pip install ydata-synthetic==1.1.0

In [None]:
import torch
import numpy as np
import torch.nn as nn
import pandas as pd
import torch.jit
from torch.utils.data import DataLoader, Dataset
from collections import defaultdict
from ydata_synthetic.synthesizers.regular import RegularSynthesizer
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters


class TabularDataset(Dataset):
    """Batched in-memory view over a 2-D feature matrix.

    The columns of ``X`` are split into numeric, categorical and embedding
    groups using the index lists on ``model_parameters``.  One item of this
    dataset is a whole *batch* of up to ``BATCH_SIZE`` rows, and ``len()`` is
    the number of batches, not the number of rows.

    ``shuffle_and_transfer`` must be called before indexing: it creates the
    shuffled ``*_dev`` tensors that ``__getitem__`` slices.

    @param X (numpy array) feature matrix, one row per sample
    @param y (numpy array) targets, one row per sample
    @param model_parameters object exposing ``BATCH_SIZE`` and the
           ``num_features_idxs`` / ``cat_features_idxs`` / ``emb_features_idxs`` lists
    """

    def __init__(self, X, y, model_parameters):
        self.sample_count = X.shape[0]
        self.num_X = torch.tensor(X[:, model_parameters.num_features_idxs].astype(np.float32))
        # Categorical columns are truncated to integer ids for embedding lookups.
        self.cat_X = torch.tensor(X[:, model_parameters.cat_features_idxs].astype(np.int64))
        self.emb_X = torch.tensor(X[:, model_parameters.emb_features_idxs].astype(np.float32))
        self.y = torch.as_tensor(y.astype(np.float32))
        self.model_parameters = model_parameters

    def shuffle_and_transfer(self):
        """Apply one random row permutation to all four tensors.

        The same permutation is used for every tensor so rows stay aligned.
        No device is passed to ``torch.as_tensor``, so the tensors are not moved.
        """
        p = np.random.permutation(self.sample_count)
        self.num_X_dev = torch.as_tensor(self.num_X[p])
        self.cat_X_dev = torch.as_tensor(self.cat_X[p])
        self.emb_X_dev = torch.as_tensor(self.emb_X[p])
        self.y_dev = torch.as_tensor(self.y[p])

    def __len__(self):
        """Return the number of batches (ceiling of rows / BATCH_SIZE)."""
        batch_size = self.model_parameters.BATCH_SIZE
        return (self.sample_count + batch_size - 1) // batch_size

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``[num_X, cat_X, emb_X, y]``.

        Negative indices count from the last batch.  An out-of-range index
        raises ``IndexError``; without it, plain ``for batch in dataset``
        iteration would never stop, because slicing past the end silently
        yields empty tensors.
        """
        batch_count = len(self)
        if idx < 0:
            idx += batch_count
        if not 0 <= idx < batch_count:
            raise IndexError(f"batch index {idx} out of range for {batch_count} batches")

        batch_size = self.model_parameters.BATCH_SIZE
        start = idx * batch_size
        stop = start + batch_size
        # Slicing clamps at the end, so the final (short) batch needs no special case.
        return [
            self.num_X_dev[start:stop],
            self.cat_X_dev[start:stop],
            self.emb_X_dev[start:stop],
            self.y_dev[start:stop],
        ]


class ModelParameters2:
    """Layout and sizing constants shared by the toy model and its datasets.

    Input columns are laid out as numeric features first, then categorical
    features, then embedding features.  The ``*_idxs`` lists hold the column
    positions of each group.
    """

    def __init__(self):
        embedding_dims = (4, 2)

        # Sizes
        self.BATCH_SIZE = 128 * 1024
        self.NUMERIC_FEATURES_COUNT = 4
        self.CATEGORICAL_FEATURES_COUNT = 2
        self.EMBEDDING_DIM_0, self.EMBEDDING_DIM_1 = embedding_dims
        self.EMBEDDING_FEATURES_COUNT = sum(embedding_dims)
        self.SAMPLES = 1 * self.BATCH_SIZE

        # Column offsets of each feature group
        cat_start = self.NUMERIC_FEATURES_COUNT
        emb_start = cat_start + self.CATEGORICAL_FEATURES_COUNT
        total_columns = emb_start + sum(embedding_dims)

        self.NUM_FEATURE_START = 0
        self.CAT_FEATURE_START = cat_start
        self.EMB_FEATURE_START = emb_start
        self.TOTAL_COLUMNS = total_columns

        # Column index lists derived from the offsets
        self.num_features_idxs = [*range(0, cat_start)]
        self.cat_features_idxs = [*range(cat_start, emb_start)]
        self.emb_features_idxs = [*range(emb_start, total_columns)]


class ToyModel(nn.Module):
    """Small MLP over numeric, categorical and pre-computed embedding features.

    Numeric inputs are batch-normalised, each categorical column goes through
    its own ``nn.Embedding``, and the results are concatenated with the
    embedding features before a three-layer feed-forward stack producing one
    output per row.

    @param model_parameters object exposing ``NUMERIC_FEATURES_COUNT``,
           ``CATEGORICAL_FEATURES_COUNT``, ``EMBEDDING_FEATURES_COUNT`` and
           ``cat_features_idxs``
    """

    def __init__(self, model_parameters):
        super().__init__()

        cat_vocab_size = 1000  # ids accepted by each categorical embedding table
        cat_embedding_dim = 3  # output width of each categorical embedding

        self.cnt_num_dim = model_parameters.NUMERIC_FEATURES_COUNT
        self.cnt_cat_emb_dim = cat_embedding_dim * model_parameters.CATEGORICAL_FEATURES_COUNT
        self.cnt_emb_dim = model_parameters.EMBEDDING_FEATURES_COUNT
        self.num_bn_layer = nn.BatchNorm1d(self.cnt_num_dim)
        # One embedding table per categorical column.
        self.emb_layers = nn.ModuleList(
            [nn.Embedding(cat_vocab_size, cat_embedding_dim) for _ in model_parameters.cat_features_idxs]
        )
        self.sequence = nn.Sequential(
            nn.Linear(self.cnt_num_dim + self.cnt_cat_emb_dim + self.cnt_emb_dim, 50),
            nn.BatchNorm1d(50),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(50, 10),
            nn.BatchNorm1d(10),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(10, 1),
        )

    def forward(self, num_x, cat_x, emb_x):
        """Return one prediction per row.

        @param num_x numeric features, one column per numeric feature
        @param cat_x integer category ids, one column per embedding table
        @param emb_x pre-computed embedding features
        """
        normalized_cont_data = self.num_bn_layer(num_x)
        # Column i of cat_x is looked up in embedding table i.
        cat_emb_op = torch.cat([emb_layer(cat_x[:, i]) for i, emb_layer in enumerate(self.emb_layers)], dim=1)
        x = torch.cat([normalized_cont_data, cat_emb_op, emb_x], dim=1)
        return self.sequence(x)


def save_model(model_name, dataset):
    """Trace a freshly initialised ``ToyModel`` and save it to ``model_name``.

    Testing helper only; per the original note, real models will be saved by
    the MLEs.

    @param model_name (String) path the traced TorchScript module is written to
    @param dataset indexable object whose first item is ``[num_x, cat_x, emb_x, y]``;
           that batch is used as the example input for tracing
    """
    num_x, cat_x, emb_x, _ = dataset[0]
    example_inputs = (num_x, cat_x, emb_x)
    model_parameters = ModelParameters2()
    toy_model = ToyModel(model_parameters).float()
    # Trace in inference mode.  In training mode Dropout is random, so the
    # tracer's re-run check reports mismatched outputs and the saved graph
    # keeps the training-mode behaviour.
    toy_model.eval()
    traced_model = torch.jit.trace(toy_model, example_inputs)
    traced_model.save(model_name)
    print("saved")


def long_tail_dataset(sample_size):
    """Build a random 12-column toy dataset mixing symmetric and skewed features.

    Columns 0-3 are kept as drawn: two normal, one exponential and one
    Poisson.  Columns 4-11 are clamped to [0, 10] and truncated to whole
    numbers.

    @param sample_size (int) number of rows to generate
    @return (Pytorch Tensor) tensor with one row per sample and 12 columns
    """
    n = sample_size

    # The draw order is kept fixed so results are reproducible under np.random.seed.
    raw_columns = [
        np.random.normal(15, 10, n),
        np.random.normal(-100, 3, n),
        np.random.exponential(scale=1, size=n),
        np.random.poisson(lam=5, size=n),
        np.random.exponential(scale=2, size=n),
        np.random.normal(5, 2, n),
        np.random.normal(4, 1.5, n),
        np.random.exponential(scale=.5, size=n),
        np.random.poisson(lam=2, size=n),
        np.random.poisson(lam=1, size=n),
        np.random.poisson(lam=2, size=n),
        np.random.poisson(lam=1, size=n),
    ]

    untouched, to_clamp = raw_columns[:4], raw_columns[4:]
    # Clamp first, then truncate.  Values are non-negative after the clamp,
    # so truncation matches int().
    clamped = [np.clip(col, 0, 10).astype(np.int64) for col in to_clamp]

    table = np.array(untouched + clamped)
    return torch.from_numpy(np.transpose(table))



#### above functions are solely for testing purposes ###

In [None]:
def synth_data_gen2(data, model_parameters, *, num_samples=1000, epochs=1, batch_size=500):
    """Create a similarly distributed synthetic dataset with a CTGAN synthesizer.

    @param data (Pytorch Tensor) 2-D tensor, one row per sample
    @param model_parameters (ModelParameters2) supplies the numeric / categorical /
           embedding column index lists used to name the columns
    @param num_samples (int) number of synthetic rows to draw (default 1000)
    @param epochs (int) training epochs for the synthesizer (default 1)
    @param batch_size (int) synthesizer batch size (default 500)
    @return the object returned by ``RegularSynthesizer.sample``
    """
    frame = pd.DataFrame(data.numpy())

    num_cols = [f'num_{i}' for i in model_parameters.num_features_idxs]
    cat_cols = [f'cat_{i}' for i in model_parameters.cat_features_idxs]
    emb_cols = [f'emb_{i}' for i in model_parameters.emb_features_idxs]

    # Positional column k receives the k-th generated name.  Any column beyond
    # the generated names keeps its integer label.
    names = num_cols + cat_cols + emb_cols
    frame = frame.rename(columns=dict(enumerate(names)))

    learning_rate = 2e-4
    beta_1 = 0.5
    beta_2 = 0.9
    ctgan_args = ModelParameters(batch_size=batch_size,
                                 lr=learning_rate,
                                 betas=(beta_1, beta_2))
    train_args = TrainParameters(epochs=epochs)
    synth = RegularSynthesizer(modelname='ctgan', model_parameters=ctgan_args)
    # NOTE(review): embedding columns are passed to the synthesizer as
    # categorical columns - confirm this is intended for continuous embeddings.
    synth.fit(data=frame, train_arguments=train_args, num_cols=num_cols, cat_cols=cat_cols + emb_cols)
    return synth.sample(num_samples)



def y_data_synth_test(model_name, model_parameters, expected_range, dataset):
    """Generate synthetic data from ``dataset`` and run the output-range test on it.

    @param model_name (String) path of the TorchScript model to load
    @param model_parameters (ModelParameters2 Object)
    @param expected_range (List) ``[lower, upper]`` bounds for model outputs
    @param dataset (Pytorch Tensor) data the synthesizer is fitted on
    """
    # 1. Load the model under test.
    loaded_model = torch.jit.load(model_name)

    # 2. Fit the synthesizer and take the sampled rows as a plain array.
    synthetic_values = synth_data_gen2(dataset, model_parameters).values

    # 3. Targets are placeholders.  low == high == 0 yields all zeros; the call
    #    is kept as an RNG draw so the random stream is consumed as before.
    row_count = synthetic_values.shape[0]
    placeholder_targets = np.random.uniform(low=0, high=0, size=(row_count, 1))

    tabular = TabularDataset(synthetic_values, placeholder_targets, model_parameters)
    tabular.shuffle_and_transfer()

    # For a local smoke test, a toy model can first be written with
    # save_model(model_name, tabular) and then loaded in place of step 1.
    gather_test_stats(loaded_model, tabular, expected_range)



def _mean_or_zero(values):
    """Return the mean of ``values`` as a float, or 0.0 when the list is empty."""
    return float(np.mean(values)) if values else 0.0


def gather_test_stats(model, tab_data, exp_range):
    """Run the model several times and print aggregate statistics.

    Each iteration calls ``run_model``, which reshuffles ``tab_data`` and
    classifies every output against ``exp_range``.

    @param model (Loaded in Model) forwarded to ``run_model``
    @param tab_data (Tabular Dataset) forwarded to ``run_model``
    @param exp_range (List) ``[lower, upper]`` bounds for a valid output
    """
    testing_iterations = 10
    num_success = 0
    failed_tests = []
    averages = []
    sds = []
    success_lows, success_highs = [], []
    lower_fail_lows, lower_fail_highs = [], []
    upper_fail_lows, upper_fail_highs = [], []

    for i in range(testing_iterations):
        res, output, s_range, f_range_l, f_range_u, upper_fails, lower_fails = run_model(model, tab_data, exp_range)

        averages.append(torch.mean(output).item())
        sds.append(np.std(output.tolist()))

        # Only average ranges from iterations that actually produced such
        # outputs, so placeholder ranges do not distort the averages.
        valid_count = output.shape[0] - upper_fails - lower_fails
        if valid_count > 0 and s_range[0] <= s_range[1]:
            success_lows.append(s_range[0])
            success_highs.append(s_range[1])
        if lower_fails:
            lower_fail_lows.append(f_range_l[0])
            lower_fail_highs.append(f_range_l[1])
        if upper_fails:
            upper_fail_lows.append(f_range_u[0])
            upper_fail_highs.append(f_range_u[1])

        # A run passes only when no output fell outside the range.  The counts
        # are checked too, so a truthy non-boolean flag cannot hide failures.
        if res and not (upper_fails or lower_fails):
            num_success += 1
        else:
            failed_tests.append("Test " + str(i))

    print()
    print("Synthetic Test Results: ")
    print("Success: ", num_success, "/", testing_iterations)
    print("Average Output: ", np.mean(averages))
    print("Average Output SD: ", np.mean(sds))
    print("Average Range of Valid Outputs: ", [_mean_or_zero(success_lows), _mean_or_zero(success_highs)])

    if not failed_tests:
        print("Errors: None")
    else:
        print()
        print("Fails: ")
        print("Tests: ", failed_tests)
        print("Average Range of Failed Outputs (Lower): ", [_mean_or_zero(lower_fail_lows), _mean_or_zero(lower_fail_highs)])
        print("Average Range of Failed Outputs (Upper): ", [_mean_or_zero(upper_fail_lows), _mean_or_zero(upper_fail_highs)])



def _span_or_zero(values):
    """Return ``[min, max]`` of ``values``, or ``[0, 0]`` when there are none."""
    return [min(values), max(values)] if values else [0, 0]


def run_model(model, torch_dataset, expected_range):
    """Run the model on the first batch and classify outputs against a range.

    @param model (Loaded in Model) callable taking ``(num_x, cat_x, emb_x)``
    @param torch_dataset (Tabular Dataset) reshuffled here, then its first batch is used
    @param expected_range (List) ``[lower, upper]`` bounds for a valid output
    @return tuple ``(success, pred, successRange, failRangeLower, failRangeUpper,
            numUpperFail, numLowerFail)``.  ``success`` is a bool that is True
            only when no output lies outside the range.  Each range is
            ``[min, max]`` of its group, or ``[0, 0]`` when the group is empty.
    """
    torch_dataset.shuffle_and_transfer()
    num_x, cat_x, emb_x, _ = torch_dataset[0]

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        pred = model(num_x, cat_x, emb_x)

    lower_bound, upper_bound = expected_range[0], expected_range[1]
    # One bulk conversion instead of one .item() call per row.
    values = pred[:, 0].detach().tolist()

    below = [v for v in values if v < lower_bound]
    above = [v for v in values if not v < lower_bound and v > upper_bound]
    inside = [v for v in values if lower_bound <= v <= upper_bound]

    success = not (below or above)
    return (
        success,
        pred,
        _span_or_zero(inside),
        _span_or_zero(below),
        _span_or_zero(above),
        len(above),
        len(below),
    )



def _find_long_tails(dataset, num_numerical, bin_num=20, threshold_percentage=0.2, percentile=99):
    """Return ``{column: [tail_start, column_max]}`` for numeric columns with a long tail.

    A column has a long tail when the bin holding its ``percentile`` value
    leaves more than ``threshold_percentage`` of the ``bin_num`` equal-width
    histogram bins at or above it.
    """
    long_tails = {}
    for i in range(num_numerical):
        numpy_col = dataset[:, i].numpy()
        _, bin_edges = np.histogram(numpy_col, bins=bin_num)
        tail_value = np.percentile(numpy_col, percentile)

        # Index k of the right-closed bin (edges[k], edges[k + 1]] holding the
        # tail value.  Comparing against the raw edges avoids matching on
        # rounded edge labels.
        tail_bin = int(np.searchsorted(bin_edges, tail_value, side="left")) - 1
        tail_bin = min(max(tail_bin, 0), bin_num - 1)

        if bin_num - tail_bin > threshold_percentage * bin_num:
            long_tails[i] = [tail_value, numpy_col.max().item()]
    return long_tails


def long_tail_test(model_name, model_parameters, expected_range, dataset):
    """Check numeric columns for long tails and run the synthetic data test on each tail.

    @param model_name (String) path of the TorchScript model to load
    @param model_parameters (ModelParameters2() Object) ``NUMERIC_FEATURES_COUNT``
           leading columns are inspected
    @param expected_range (List) ``[lower, upper]`` bounds for model outputs
    @param dataset (Pytorch Tensor) 2-D tensor, one row per sample
    """
    long_tails = _find_long_tails(dataset, model_parameters.NUMERIC_FEATURES_COUNT)

    print("Detected Long Tails at features: " + str(list(long_tails.keys())))

    for col, (tail_start, _) in long_tails.items():
        print(f"Long Tail Test For Feature: {col}")
        # Keep only rows in the tail of this feature, and test those rows
        # against the caller's expected range.
        # NOTE(review): the tail holds only about 1% of the rows - confirm the
        # synthesizer can be fitted on that few samples.
        filtered_long_data = dataset[dataset[:, col] >= float(tail_start)]

        y_data_synth_test(model_name, model_parameters, expected_range, filtered_long_data)
        print("\n\n")



def validation_data_test(model_name, dataset, expected_range, model_parameters=None):
    """Split the dataset into validation and training parts and test on the validation part.

    @param model_name (String) path of the TorchScript model to load
    @param dataset (Pytorch Tensor) 2-D tensor, one row per sample
    @param expected_range (List) ``[lower, upper]`` bounds for model outputs
    @param model_parameters (ModelParameters2 Object) column layout; defaults to
           ``ModelParameters2()`` when omitted
    """
    model = torch.jit.load(model_name)
    if model_parameters is None:
        model_parameters = ModelParameters2()

    # 20% of the rows, chosen at random, form the validation set.
    validation_split, _ = torch.utils.data.random_split(dataset, [0.2, 0.8])

    # The statistics code needs a dataset with shuffle_and_transfer(), which a
    # Subset does not have, so the selected rows are wrapped in a
    # TabularDataset.  Targets are unused placeholders.
    validation_rows = dataset[validation_split.indices].numpy()
    placeholder_targets = np.zeros((validation_rows.shape[0], 1))
    validation_data = TabularDataset(validation_rows, placeholder_targets, model_parameters)

    gather_test_stats(model, validation_data, expected_range)



def uniform_synth_data_test(model_name, model_parameters, expected_range):
    """Run the output-range test on inputs drawn uniformly from a very wide range.

    @param model_name (String) path of the TorchScript model to load
    @param model_parameters (ModelParameters2() Object) supplies the feature counts
    @param expected_range (List) ``[lower, upper]`` bounds for model outputs
    """
    # 1. Load the model.
    model = torch.jit.load(model_name)

    # 2. Number of numerical, categorical and embedding features.
    num_numerical = model_parameters.NUMERIC_FEATURES_COUNT
    num_categorical = model_parameters.CATEGORICAL_FEATURES_COUNT
    num_embedding = model_parameters.EMBEDDING_FEATURES_COUNT

    # 3. Value range and number of samples.
    lower, upper = -float(1000000000), float(1000000000)
    num_samples = 128 * 1024

    # Vocabulary size of the categorical variables, read from the first
    # embedding table.
    # NOTE(review): assumes every embedding table has this size - confirm.
    embedding_num = len(model.state_dict()['emb_layers.0.weight'])

    # 4. Draw each feature group in one call.  Appending one row at a time
    #    re-copies the whole matrix on every step.
    num = np.random.uniform(low=lower, high=upper, size=(num_samples, num_numerical))
    # Drawn as floats in [0, embedding_num); TabularDataset casts the
    # categorical columns to int64.
    cat = np.random.uniform(low=0, high=embedding_num, size=(num_samples, num_categorical))
    embed = np.random.uniform(low=lower, high=upper, size=(num_samples, num_embedding))
    X = np.hstack((num, cat, embed))

    y = np.random.uniform(low=lower, high=upper, size=(num_samples, 1))

    dataset = TabularDataset(X, y, model_parameters)
    dataset.shuffle_and_transfer()

    gather_test_stats(model, dataset, expected_range)








# Example of using data generation

def main_test():
    """Run each data test once on a toy dataset, loading the model from ``model.pt``."""
    # Example parameters
    params = ModelParameters2()
    model_path = "model.pt"
    toy_data = long_tail_dataset(1000)
    output_bounds = [-2, 1]

    shared_args = (model_path, params, output_bounds)

    # Regular synthetic-data test
    y_data_synth_test(*shared_args, toy_data)

    # Long tails test
    long_tail_test(*shared_args, toy_data)

    # Random uniform data test
    uniform_synth_data_test(*shared_args)

    # Validation test
    validation_data_test(model_path, toy_data, output_bounds)


main_test()


Detected Long Tails at features: [2]
Long Tail Test For Feature: 2
Epoch: 0 | critic_loss: 5.42899751663208 | generator_loss: 2.126983880996704
saved

Synthetic Test Results: 
Success:  0 / 10
Average Output:  0.16092152297496795
Average Output SD:  0.36658636764455316
Average Range of Valid Outputs:  [-0.4956326484680176, 0.4982394605875015]

Fails: 
Tests:  ['Test 0', 'Test 1', 'Test 2', 'Test 3', 'Test 4', 'Test 5', 'Test 6', 'Test 7', 'Test 8', 'Test 9']
Average Range of Failed Outputs (Lower):  [-0.354541015625, -0.14477730478559223]
Average Range of Failed Outputs (Upper):  [0.0325180675302233, 0.10247236335432375]





Tensor-likes are not close!

Mismatched elements: 1000 / 1000 (100.0%)
Greatest absolute difference: 1.386189341545105 at index (360, 0) (up to 1e-05 allowed)
Greatest relative difference: 273.45195752862014 at index (83, 0) (up to 1e-05 allowed)
  _check_trace(
