# Setup

## Imports

In [None]:
from importlib.metadata import version
import matplotlib.pyplot as plt
import numpy as np
from platform import python_version
import tensorflow as tf
from tensorflow.python.client import device_lib
import wandb

In [None]:
def printHeader(str):
  """
  Print a heading followed by a dashed underline of the same length
  """
  underline = "-" * len(str)
  print(str)
  print(underline)

printHeader("Versions")

# Interpreter version first, then the installed version of each package
print(f"Python: {python_version()}")
for module in ("matplotlib", "numpy", "tensorflow", "wandb"):
  print(f"{module}: {version(module)}")

## Variables

### Weights & Biases

In [None]:
# Log in to Weights & Biases before a run is created
wandb.login()

In [None]:
# Start a W&B run and register this notebook's hyperparameters with it
wandb.init(
  # TODO: add project name
  # project="",
  config={
    "epochs": 10,
    "batch_size": 256,  # read as the default batch size of the `batch` helper
  }
)

# Short alias so later cells can write config["batch_size"]
config = wandb.config

### Dangerous Parameters

These may crash your kernel/device depending on your available resources.

In [None]:
"""
Enables a *lot* of parallelism.
Speeds preprocessing & modelling up exponentially on a powerful device,
but may not work well otherwise.
"""
# When True, `map`, `cache` and `prefetch` are bound to their AUTOTUNE / caching /
# prefetching variants; when False they fall back to the plain or pass-through ones
optimize = True

#### For comparison, this notebook was run on the following devices:

In [None]:
# Print every device that TensorFlow's device_lib reports for this machine
for local_device in device_lib.list_local_devices():
  print(local_device)

### Random

For deterministic results

In [None]:
def setRng():
  """
  (Re)create the notebook-wide random generator from a fixed seed

  Rebinds the global `rng` and also returns the new generator
  """
  global rng
  fresh = np.random.default_rng(2101432)
  rng = fresh
  return fresh

# Shared generator used by `randint`
rng = setRng()

def randint():
  """
  Draw the next integer from the shared `rng`

  Bounds are 0 (inclusive) and 65535 (exclusive)
  """
  lower, upper = 0, 65535
  return rng.integers(lower, upper)

### Optimization

In [None]:
def getVisualize():
  """
  Tell whether visualizations should be run

  Returns the global `visualize` when it has been defined, otherwise True

  Does not opt out of Evaluation visualizations
  """
  return globals().get("visualize", True)

def passThrough(x, *args, **kwargs):
  """
  Return `x` unchanged

  Used as the no-op stand-in for `cacheOptimized` / `prefetchOptimized`.
  Those accept extra positional and keyword arguments, so this accepts
  (and ignores) them too; a call site then works with either binding
  """
  return x

def mapOptimized(ds: tf.data.Dataset, *args, **kwargs):
  """
  Apply `ds.map`, defaulting `num_parallel_calls` to `tf.data.AUTOTUNE`

  All other arguments are forwarded unchanged. An explicit
  `num_parallel_calls` from the caller (keyword, or second positional
  argument) takes precedence instead of raising a duplicate-keyword
  TypeError, so the same call works with `mapUnoptimized` as well
  """
  # Position 1 of Dataset.map is num_parallel_calls; only default it when absent
  if len(args) < 2:
    kwargs.setdefault("num_parallel_calls", tf.data.AUTOTUNE)
  return ds.map(*args, **kwargs)
def mapUnoptimized(ds: tf.data.Dataset, *args, **kwargs):
  """
  Apply `ds.map` with the caller's arguments forwarded unchanged
  """
  mapped = ds.map(*args, **kwargs)
  return mapped

# NOTE(review): this rebinds the builtin name `map` for the rest of the notebook
if optimize:
  map = mapOptimized
else:
  map = mapUnoptimized

def batch(ds: tf.data.Dataset, batch_size: int=config["batch_size"], *args, **kwargs):
  """
  Batch `ds` into groups of `batch_size`

  The default is read from `config["batch_size"]` once, when this function
  is defined. Extra arguments are forwarded to `ds.batch`
  """
  batched = ds.batch(batch_size, *args, **kwargs)
  return batched
def cacheOptimized(ds: tf.data.Dataset, *args, **kwargs):
  """
  Forward to `ds.cache` with any extra arguments
  """
  cached = ds.cache(*args, **kwargs)
  return cached

# Without `optimize`, caching is skipped and the dataset is returned as-is
if optimize:
  cache = cacheOptimized
else:
  cache = passThrough
def prefetchOptimized(ds: tf.data.Dataset, *args, **kwargs):
  """
  Forward to `ds.prefetch` with `tf.data.AUTOTUNE` as the first argument

  Extra arguments are passed along after it
  """
  prefetched = ds.prefetch(tf.data.AUTOTUNE, *args, **kwargs)
  return prefetched

# Without `optimize`, prefetching is skipped and the dataset is returned as-is
if optimize:
  prefetch = prefetchOptimized
else:
  prefetch = passThrough

### Dataset

In [None]:
# Seed drawn from the shared RNG; presumably meant for the dataset loader below — TODO confirm
ds_seed = randint()

In [None]:
# NOTE(review): unfinished placeholder — nothing follows `tf.keras.utils.`, so this
# cell does not parse until a dataset-loading call is filled in
ds = tf.keras.utils. # TODO: add dataset

### Helpers

In [None]:
# Scaling Factor for Visualizations
# (not referenced in this part of the notebook; presumably scales figure sizes — TODO confirm)
sf = 2.5

def extractX(x, y):
  """
  Return the first member of an `(x, y)` pair; `y` is ignored
  """
  first = x
  return first

def extractY(x, y):
  """
  Return the second member of an `(x, y)` pair; `x` is ignored
  """
  second = y
  return second

# Exploration

## Metadata

## Central Tendency

Mean & Median

## Outliers

### UMAP

Uniform Manifold Approximation and Projection

#### Visualize

Interactive Plot

# Pre-processing

## Data Augmentation

### Oversampling

## Train - Valid - Test

Also, optimize for performance.

### Training Augmentations

### Split

In [None]:
# Seed for the shuffle that precedes the train/valid/test split
split_shuffle_seed = randint()

In [None]:
# Fractions of the examples held out for validation and testing
valid_pct = .1
test_pct = .1

# `ds` is batched (it is unbatched here), so len(ds) would count batches while
# take/skip below operate on single examples. Count the examples instead;
# unbatch() leaves the cardinality unknown, so this costs one pass over the data.
ds_unbatched = ds.unbatch()
size = int(ds_unbatched.reduce(np.int64(0), lambda count, _: count + 1).numpy())

# take/skip need integer counts, and np.round returns a float
valid_size = int(np.round(valid_pct * size))
test_size = int(np.round(test_pct * size))
train_size = size - valid_size - test_size

# reshuffle_each_iteration=False freezes a single permutation. With the default
# (reshuffle on every iteration) the three subsets below would each see a
# different order, letting examples leak between train, valid and test.
ds_unbatched = ds_unbatched.shuffle(
  max(size, 1),  # buffer covers the whole dataset; shuffle rejects a zero buffer
  seed=split_shuffle_seed,
  reshuffle_each_iteration=False,
)
train_ds = ds_unbatched.take(train_size)

valid_ds = ds_unbatched.skip(train_size).take(valid_size)
test_ds = ds_unbatched.skip(train_size + valid_size)

if getVisualize():
  print(f"Train Size: {train_size}")
  print(f"Valid Size: {valid_size}")
  print(f"Test Size: {test_size}")