In [1]:
import numpy as np
import math
import xarray as xr
import dask
from sklearn.model_selection import train_test_split
import tensorflow as tf
import gc

Import the data sample. This one has already been sliced by lat/lon, and the vertical velocity has been pulled out.

In [2]:
# Location of the merged one-day sample file that this notebook preprocesses.
path = '/DFS-L/DATA/pritchard/gmooers/Workflow/MAPS/SPCAM/Small_Sample/Data_Points/One_Day_Merged_Data.nc'
# Open the file as an xarray Dataset; `real_ds` is the handle the next cell reads from.
real_ds = xr.open_dataset(filename_or_obj=path)

In [3]:
# Extract the 'CRM_W' variable as a plain NumPy array and drop every
# length-1 axis in the same step.
w_velocity = np.squeeze(real_ds['CRM_W'].values)

In [4]:
# Sanity check: show the dimensions of the squeezed array before reshaping it.
print(w_velocity.shape)

(109, 96, 30, 128)


This time I want to shuffle in both space and time, so I will combine the first two dimensions using the reshape function

In [7]:
# Read the four axis lengths directly from the array's shape
# (axis 0 -> coords, axis 1 -> t, axis 2 -> lev, axis 3 -> crm_x).
coords, t, lev, crm_x = w_velocity.shape
# Merge the two leading axes into a single sample axis so that a later
# shuffle along axis 0 mixes both of them at once.
w_new = w_velocity.reshape((coords * t, lev, crm_x))

In [9]:
# True if at least one element of the reshaped array is NaN.
np.any(np.isnan(w_new))

False

For the Morphology Tests (e.g. feeding in low-resolution image snapshots), I do not want a diurnal cycle in the sample ordering, so I will shuffle the samples:

https://www.tensorflow.org/api_docs/python/tf/random/shuffle

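The array has more than two dimensions, but the shuffle only needs to reorder it along its first (sample) axis. `tf.random.shuffle` does this; NumPy's `np.random.shuffle` and `np.random.permutation` also act along the first axis of an N-dimensional array.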

In [10]:
# Shuffle the samples along the first axis only; each (lev, crm_x) slice is
# moved as a whole and its contents are left untouched.
#
# This is done with NumPy rather than TensorFlow: indexing with a permuted
# index vector works for arrays of any rank, needs no TF session (the previous
# InteractiveSession was never closed and is a TF1-only API), and a fixed seed
# makes the train/test split below reproducible across kernel restarts.
SHUFFLE_SEED = 0
shuffle_order = np.random.RandomState(SHUFFLE_SEED).permutation(len(w_new))

# Fancy indexing returns a new array, so `w_new` keeps its original order.
w_numpy = w_new[shuffle_order]

# Alias kept so that any cell still referring to `w_shuffled` keeps working;
# it is now the shuffled NumPy array rather than a TensorFlow tensor.
w_shuffled = w_numpy

gc.collect()

10

Need to split data into training and test sections:

Will do an 80/20 split for now

In [12]:
# Index of the first test sample: the leading 4/5 of the shuffled samples go
# to training, the remainder to testing.
split_idx = int(4 * len(w_numpy) / 5)
w_train, w_test = w_numpy[:split_idx, :, :], w_numpy[split_idx:, :, :]

Must scale all array values to between 0 and 1

It seems standardization, not normalization, is appropriate
- both training and validation data

https://stats.stackexchange.com/questions/10289/whats-the-difference-between-normalization-and-standardization

Method 1:

Assign z-scores so that each sample has mean $\mu = 0$ and standard deviation $\sigma = 1$
Standardization:

$x' = \frac{x - \mu}{\sigma}$

In [13]:
def _standardize_per_sample(samples):
    """Z-score every sample of a 3-D array independently.

    The mean and standard deviation are taken over axes 1 and 2, i.e. each
    entry along axis 0 is standardized with its own statistics.

    Parameters
    ----------
    samples : numpy.ndarray
        Array with at least three axes; axis 0 indexes the samples.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``samples``. A sample whose values are all
        identical (standard deviation 0) is returned as all zeros instead of
        NaN/inf.
    """
    mean = samples.mean(axis=(1, 2), keepdims=True)
    std = samples.std(axis=(1, 2), keepdims=True)
    # Guard against division by zero for constant samples: their numerator is
    # already 0 everywhere, so dividing by 1 yields a clean all-zero sample.
    std[std == 0] = 1.0
    return (samples - mean) / std


# NOTE: the min-max normalization cell further down assigns to these same two
# names, so whichever of the two cells runs last determines what gets saved.
rescaled_train = _standardize_per_sample(w_train)
rescaled_test = _standardize_per_sample(w_test)

Method 2:

Normalization: Scale each value in the array to between 0 and 1.  This seems to be the method of choice in most "image" problems, where pixel values are divided by 255 to map them into the range 0 to 1, so I will defer to it for now.

$x' = \frac{x - \min(x)}{\max(x) - \min(x)}$

NumPy's built-in interpolation function (`np.interp`) allows this to be done in a single line of code

https://stackoverflow.com/questions/36000843/scale-numpy-array-to-certain-range

In [17]:
# Global minimum and maximum of the training set. The same pair is used for
# both splits, so the test data is scaled with training statistics only.
train_range = (w_train.min(), w_train.max())
unit_range = (0, +1)

# Linearly map train_range onto [0, 1]. np.interp returns the end-point value
# for inputs outside the given range, so test values beyond the training
# min/max come out as exactly 0 or 1.
rescaled_train = np.interp(w_train, train_range, unit_range)
rescaled_test = np.interp(w_test, train_range, unit_range)

Save as .npy files for VAE Analysis

In [19]:
# Output locations for the rescaled splits. The two file names differ in the
# capitalisation of "time"/"Time"; they are written exactly as before because
# whatever reads these files may depend on the existing names.
train_out = '/fast/gmooers/Preprocessed_Data/W_Trial/Space_time_W_Training.npy'
test_out = '/fast/gmooers/Preprocessed_Data/W_Trial/Space_Time_W_Test.npy'

# Write the training split first, then the test split, as .npy files.
np.save(train_out, rescaled_train)
np.save(test_out, rescaled_test)