# IEMS 490 Wave Training
Trains an open source version of Google's Wavenet architecture to fit our 'waves' from our LaserPowerCurrent and SignalPdInGaAs variables

In [22]:
import torch
from torch_fn.wavenet_lstm import lr_schedule, WaveNet_LSTM #LSTM of WaveNet
from torch_fn.wavenet import WaveNet # Normal WaveNet, likely from the paper. Need someone who is literate to check.
import h5py
from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import torch.nn as nn
import torch.optim as optim

### For each time step in each file, get the Laser Power and Signal
We don't want to use X, Y because that can change in each layer

In [2]:
folder_path = './DATASET' # Replace with your folder path
# This creates a list of extensions (including the dot)
file_paths = [f for f in Path(folder_path).iterdir() if f.is_file() and f.suffix == '.hdf5']

print(file_paths)

[WindowsPath('DATASET/layer1.hdf5'), WindowsPath('DATASET/layer10.hdf5'), WindowsPath('DATASET/layer100.hdf5'), WindowsPath('DATASET/layer101.hdf5'), WindowsPath('DATASET/layer102.hdf5'), WindowsPath('DATASET/layer103.hdf5'), WindowsPath('DATASET/layer104.hdf5'), WindowsPath('DATASET/layer105.hdf5'), WindowsPath('DATASET/layer106.hdf5'), WindowsPath('DATASET/layer107.hdf5'), WindowsPath('DATASET/layer108.hdf5'), WindowsPath('DATASET/layer109.hdf5'), WindowsPath('DATASET/layer11.hdf5'), WindowsPath('DATASET/layer110.hdf5'), WindowsPath('DATASET/layer111.hdf5'), WindowsPath('DATASET/layer112.hdf5'), WindowsPath('DATASET/layer113.hdf5'), WindowsPath('DATASET/layer114.hdf5'), WindowsPath('DATASET/layer115.hdf5'), WindowsPath('DATASET/layer116.hdf5'), WindowsPath('DATASET/layer117.hdf5'), WindowsPath('DATASET/layer118.hdf5'), WindowsPath('DATASET/layer119.hdf5'), WindowsPath('DATASET/layer12.hdf5'), WindowsPath('DATASET/layer120.hdf5'), WindowsPath('DATASET/layer121.hdf5'), WindowsPath('DAT

In [3]:
file1 = h5py.File(file_paths[0], 'r')
file1

<HDF5 file "layer1.hdf5" (mode r)>

In [92]:
#data = []
lazer_data = []
signal_data = []
min_rows = 32000
max_rows = 0
for file in file_paths:
    file_info = h5py.File(file, 'r')
    file_data = file_info['OpenData']
    lazer = file_data[5]
    signal = file_data[6]
    #print(len(lazer))
    #print(len(signal))
    if len(lazer) != len(signal):
        print(f"There is an issue with {file}")
    if len(lazer) < min_rows:
        min_rows = len(lazer)
    if len(lazer) > max_rows:
        max_rows = len(lazer)
    lazer_data.append(lazer)
    signal_data.append(signal)
    #data.append([lazer, signal])
print(f"The minimum number of data points in a single file is {min_rows}")
print(f"The maximum number of data points in a single file is {max_rows}")

The minimum number of data points in a single file is 9560
The maximum number of data points in a single file is 31135


### We need to get all data to the same dimensions

In [93]:
#dim_lazer_data = lazer_data
for i in range(len(lazer_data)):
    #dim_lazer_data[i] = np.array([0.0,0.0, 1.0])
    if len(lazer_data[i]) < max_rows:
        diff = max_rows - len(lazer_data[i])
        lazer_data[i]=np.concatenate((lazer_data[i],np.zeros(diff,)), axis=0)

In [94]:
min_rows = 32000
max_rows = 0
for dim in lazer_data:
    if len(dim) < min_rows:
        min_rows = len(dim)
    if len(dim) > max_rows:
        max_rows = len(dim)
    #data.append([lazer, signal])
print(f"The minimum number of data points in a single file is {min_rows}")
print(f"The maximum number of data points in a single file is {max_rows}")

The minimum number of data points in a single file is 31135
The maximum number of data points in a single file is 31135


In [95]:
# Split and turn to tensor
train_x, test_x = train_test_split(lazer_data, test_size=0.6)

In [105]:
#
# This errors because the dimensions are not consistent. There are three approaches, we can normalize the x and y and feed them as inputs and pad missing values
# OR we keep only lazer and signal and pad missing values OR we only sample ~300 points from each data sample
# 
x_train_tensor = torch.tensor(train_x)
x_test_tensor  = torch.tensor(test_x)

train_dataset = TensorDataset(x_train_tensor, x_train_tensor)
test_dataset  = TensorDataset(x_test_tensor, x_test_tensor)
batch_size = 1

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### Below code is buggy because padding is messed up

In [53]:
#import torch
from torch.nn.utils.rnn import pad_sequence

# Combine x and y into coordinate pairs for each path
# List of tensors, each of shape (length_i, 2)
path_tensors = [torch.tensor(np.stack([lazer[i], signal[i]], axis=0), dtype=torch.float32) 
                for i in range(len(lazer))]

# Pad so all paths match the longest path
padded_paths = pad_sequence(path_tensors, batch_first=True, padding_value=0.0)

print(padded_paths.shape) # Expected: (Num_Samples, Max_Length, 2)

torch.Size([30575, 2])


In [6]:
# Split and turn to tensor
train_x, test_x = train_test_split(padded_paths, test_size=0.6)

In [42]:
train_x.shape

torch.Size([12230, 2])

In [39]:
#
# This errors because the dimensions are not consistent. There are three approaches, we can normalize the x and y and feed them as inputs and pad missing values
# OR we keep only lazer and signal and pad missing values OR we only sample ~300 points from each data sample
# 
x_train_tensor = torch.tensor(train_x)
x_test_tensor  = torch.tensor(test_x)

train_dataset = TensorDataset(x_train_tensor, x_train_tensor)
test_dataset  = TensorDataset(x_test_tensor, x_test_tensor)
batch_size = 5

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  x_train_tensor = torch.tensor(train_x)
  x_test_tensor  = torch.tensor(test_x)


In [40]:
x_train_tensor.shape

torch.Size([12230, 2])

In [None]:
len(train_dataset)

12230

In [107]:
def train(model, dataloader, valid_dataloader, epochs = 10, lr=1e-3):
    #model.train()
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    train_losses = []
    valid_losses = []
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch, _ in dataloader:
            if batch.dim() == 2:
                # Turn [16, 30575] into [16, 30575, 1]
                batch = batch.unsqueeze(-1)
                print(batch.shape)
            #batch = batch.permute(0,2,1).float()
            outputs = model(batch)
            optimizer.zero_grad()
            loss = criterion(outputs, batch)
            loss.backward()
            optimizer.step()
            train_loss+= loss.item() * batch.size(0)
        epoch_loss = train_loss / len(dataloader.dataset)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}")
        train_losses.append(train_loss)
        valid_losses.append(test(model, valid_dataloader))
    return train_losses, valid_losses

def test(model, dataloader):
    model.eval()
    test_loss = 0
    criterion = nn.MSELoss()
    with torch.no_grad():
       for batch, _ in dataloader:
          batch = batch.permute(0,2,1).float()
          outputs = model(batch)
          loss = criterion(batch, outputs)
          test_loss += loss.item() * batch.size(0)
    print(f"Test loss: {test_loss:.4f}")
    return test_loss


In [52]:
wave_model = WaveNet(input_size=2, out_channels=2, kernel_size=2, n=1)
#train_loss, valid_loss = train(wave_model, train_loader, test_loader)

### We might be able to train BERT like encoder to fill in the blanks