# Try to predict rho with the smallest turbulence dataset

In [2]:
import numpy as np
import os
import torch
from utils import *
from athena_read import *
from _arfima import arfima

## Read the data with athena_read

In [3]:
# Directory holding the simulation snapshot files. Set the TURB_DATA_PATH
# environment variable to point at your own copy of the data; the original
# local folder is kept as the fallback so existing runs are unaffected.
data_path = os.environ.get(
    'TURB_DATA_PATH',
    'C:/Users/52673/Desktop/NYU MSDS/3-DS-1006 CAPSTONE/data_turb_dedt1_16',
)

In [4]:
# Let's take a look at what a single snapshot looks like.
# NOTE(review): the first four sorted directory entries are skipped here —
# presumably they are not snapshot files; confirm against the folder contents.
lst = sorted(os.listdir(data_path))[4:]
sample_path = f"{data_path}/{lst[4]}"
sample = athdf(sample_path)
sample['x3v']

array([-0.46875, -0.40625, -0.34375, -0.28125, -0.21875, -0.15625,
       -0.09375, -0.03125,  0.03125,  0.09375,  0.15625,  0.21875,
        0.28125,  0.34375,  0.40625,  0.46875], dtype=float32)

In [5]:
# Report the dimensions of the density array stored in the sample snapshot.
print('size of rho:', sample['rho'].shape)

size of rho: (16, 16, 16)


Q: Here the size of the matrix 'rho' for each timestep is (16,16,16). I interpret it like this: when I looked into the settings that generated the dataset, I noticed that the mesh block is set to 16 for each of the axes x1, x2 and x3, so it is like cutting a large cube along its length, width, and height, 16 times each, dividing it into $16 \times 16 \times 16$ small cubes. So the matrix 'rho' represents the density of each small cube.

This interpretation still needs to be confirmed.

## Extract rho and time from the original dataset

In [6]:
# Collect the simulation time and the density field of every snapshot,
# in the (sorted) order of the file list.
time, rho = [], []
for name in lst:
    path = '/'.join((data_path, name))
    d = athdf(path)
    rho.append(d['rho'])
    time.append(d['Time'])

# Combine the per-snapshot arrays into one array with time as the leading axis.
rho = np.array(rho)

In [7]:
# Check whether the snapshots are evenly spaced in time by differencing
# each pair of consecutive time stamps.
time_gap = [later - earlier for earlier, later in zip(time, time[1:])]

print('gap between two time steps: ', time_gap)

gap between two time steps:  [0.008732914, 0.004386274, 0.004412315, 0.0044503715, 0.0044987686, 0.004519906, 0.004550522, 0.0045853965, 0.009305038, 0.004717216, 0.0047579333, 0.0048104264, 0.0048691407, 0.0049307495, 0.0049957708, 0.005053267, 0.0051188394, 0.0051931813, 0.0052760392, 0.0053668246, 0.005464494, 0.005568497, 0.0055926517, 0.005605586, 0.005638182, 0.0056675375, 0.0056544095, 0.005655691, 0.0056724995, 0.0057038367, 0.005744025, 0.005776748, 0.005776122, 0.00578624, 0.0058072805, 0.005786702, 0.0057302415, 0.005693823, 0.0056090504, 0.005554706, 0.005525619, 0.0055146664, 0.0055193305, 0.00553903, 0.0055715293, 0.0056138337, 0.0056639016, 0.0057087243, 0.005660951, 0.0056361556, 0.005596459, 0.005533755, 0.0055256784, 0.0055434406, 0.0055660605, 0.005599469, 0.005648643, 0.005661726, 0.00566715, 0.005690187, 0.0057277083, 0.0057603717, 0.005774826, 0.0058030784, 0.0058419406, 0.005887121, 0.005936116, 0.0059885383, 0.0060466826, 0.006113082, 0.0061888993, 0.006265551, 

Q: Here we notice that the time steps are not evenly spaced, so we might want to encode the time into the transformer as well. But how?

## Preprocess on rho

In [8]:
# Unpack the four dimensions of the stacked density array:
# number of snapshots, then the extent along each of the three remaining axes.
ntime, nx1, nx2, nx3 = rho.shape
(ntime, nx1, nx2, nx3)

(161, 16, 16, 16)

In [9]:
# Collapse the 4-D array into a 1-D vector in row-major (C) order;
# flatten() returns a copy, so `rho` itself is left untouched.
rho_reshaped = rho.flatten(order='C')
rho_reshaped.shape

(659456,)

In [10]:
# Check that flattening keeps the positional information: the row-major flat
# index of (time step, x1, x2, x3) must address the same value as the 4-D index.
# The time-step index gets its own name so that the `time` list of snapshot
# times is not overwritten (re-running the time-gap check would otherwise fail).
time_idx = 131
x1 = 5
x2 = 1
x3 = 3

print(rho[time_idx][x1][x2][x3])
print(rho_reshaped[time_idx*(nx1*nx2*nx3)+x1*(nx2*nx3)+x2*(nx3)+x3])

1.0885758
1.0885758


In [18]:
# Flat indices of the first cell over ten consecutive snapshots (features),
# and the same window shifted one snapshot ahead (targets). Consecutive
# snapshots of one cell are nx1*nx2*nx3 entries apart in the flattened array.
f = [step*(nx1*nx2*nx3) for step in range(10)]
t = [start + nx1*nx2*nx3 for start in f]
print('the indices for an example pair of feature and target is:')
print('feature: ', f)
print('target: ', t)

the indices for an example pair of feature and target is:
feature:  [0, 4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768, 36864]
target:  [4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768, 36864, 40960]


In [19]:
def to_windowed(data, meshed_blocks, window_size, pred_size):
    """Build (feature, target) windows that follow single entries through time.

    ``data`` is treated as a flattened array in which two consecutive time
    steps of the same cell lie ``nx1*nx2*nx3`` entries apart. For every valid
    start index ``i`` the feature is ``data[i + n*k]`` for
    ``k = 0 .. window_size-1`` and the target is the same window shifted
    ``pred_size`` time steps ahead, ``data[i + n*(k + pred_size)]``.

    Parameters
    ----------
    data : array-like, 1-D
        Flattened values to be windowed.
    meshed_blocks : tuple of three ints
        ``(nx1, nx2, nx3)``; their product is the stride between time steps.
    window_size : int
        Number of time steps in each feature/target window.
    pred_size : int
        Number of time steps the target is shifted ahead of the feature
        (expected to be >= 1).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_windows, 2, window_size)``; ``out[:, 0]`` holds
        the features and ``out[:, 1]`` the targets. When ``data`` is too short
        for a single window the result has ``num_windows == 0``.
    """
    nx1, nx2, nx3 = meshed_blocks
    n = nx1 * nx2 * nx3
    data = np.asarray(data)

    # The furthest element a window starting at i touches is its last target,
    # i + n*(window_size - 1 + pred_size). Bounding the start index by that
    # offset keeps every index in range for any pred_size; for pred_size == 1
    # it reduces to the previous bound len(data) - n*window_size.
    num_windows = max(0, len(data) - n * (window_size - 1 + pred_size))

    # Build all index windows at once: one row per start index, one column
    # per time step inside the window.
    starts = np.arange(num_windows)[:, None]
    steps = n * np.arange(window_size)[None, :]
    feature_idx = starts + steps
    target_idx = feature_idx + n * pred_size

    return np.stack((data[feature_idx], data[target_idx]), axis=1)

In [23]:
# Window the flattened density with 10 time steps per window and a target
# shifted one time step ahead, then inspect the resulting shape.
meshed_blocks = (nx1, nx2, nx3)
to_windowed(rho_reshaped, meshed_blocks, window_size=10, pred_size=1).shape

(618496, 2, 10)

### Further check with the edited utils function

In [24]:
# Build the train / validation / test loaders.
# NOTE(review): get_data_loaders presumably comes from the star-import of
# `utils`; its internals are not visible here.
train_loader, val_loader, test_loader = get_data_loaders(
    train_proportion=0.5,
    test_proportion=0.25,
    val_proportion=0.25,
    window_size=10,
    pred_size=1,
    batch_size=16,
    num_workers=1,
    pin_memory=True,
    test_mode=False,
)

In [25]:
# Walk through the whole training loader once; after the loop `data` and
# `targets` hold the last batch that was produced.
for i, (data, targets) in enumerate(train_loader):
    pass

In [26]:
# Dimensions of the last target batch taken from the training loader.
targets.size()

torch.Size([16, 10, 1])

After adding the size of the hidden layer, this would match the input shape required by the transformer: (batch_size, sequence_length, feature_size).