# Data Visualization

## Imports

In [1]:
from pathlib import Path
import numpy as np

## Path

In [2]:
# replace the path with where data is stored
# data_dir = Path("../../../ddemler/dima_stuff/wa_remake/May12_full_converted_dataset")
data_dir    = Path ("../../../ddemler/dima_stuff/wa_remake/May_15_processed")

## Print

In [3]:
for f in data_dir.rglob("*"):
    print(f.relative_to(data_dir))

combined_features.npy
combined_labels.npy
May_15zip_processed.zip


### The folder contains `combined_features.npy` and `combined_labels.npy`

## Example

In [4]:
feat = np.load(data_dir / "combined_features.npy")
label  = np.load(data_dir / "combined_labels.npy")

# no truncation
np.set_printoptions(threshold=np.inf, suppress=True, precision=6)

# print a sample of each
print(" feature sample (shape={}):".format(feat[0].shape))
print(feat[1])

print("\n label sample (shape={}):".format(label[0].shape if hasattr(label[0], "shape") else ()))
print(label[1])

 feature sample (shape=(18, 16)):
[[  128.    50.     0.   125.     8.     0.     8. 24574.     1.     2.
      0.     8.     4.     1.     1.     0.]
 [  125.     8.     0.    63.    16.     0.     8. 24574.     1.     2.
      0.    16.     5.     2.     2.     0.]
 [   63.    16.     0.    63.    16.     0.     8.     1.     1.     0.
      2.     0.     0.     0.     0.     0.]
 [   63.    16.     0.    63.     8.     0.     8. 24574.     1.     2.
      0.     8.     6.     1.     2.     0.]
 [   63.     8.     0.    63.     8.     0.     8.     1.     1.     0.
      2.     0.     0.     0.     0.     0.]
 [   63.     8.     0.    63.     8.     0.     8. 24574.     1.     2.
      0.     8.     2.     1.     2.     0.]
 [   63.     8.     0.    63.     8.     0.     8.     1.     1.     0.
      4.     0.     0.     0.     0.     0.]
 [   63.     8.     0.    61.    16.     0.     8. 24574.     1.     2.
      0.    16.     3.     1.     1.     0.]
 [   61.    16.     0.   976. 

## Explanation

The feature shape dimensionality is `(18, 16)`, where `16` is the number of `input_features` and `18` is the possible `num_layers`. It follows this format:

- `input_features = [
    "d_in1": 0,
    "d_in2": 1,
    "d_in3": 2,
    "d_out1": 3,
    "d_out2": 4,
    "d_out3": 5,
    "prec": 6,
    "rf": 7,
    "strategy": 8,
    "layer_type": 9,
    "activation_type": 10,
    "filters": 11,
    "kernel_size": 12,
    "stride": 13,
    "padding": 14,
    "pooling": 15]`

This is the encoding:
- `layer_type = [na = 0, dense=1, conv1d=2, conv2d=3, separableconv1d=4, separableconv2d=5, depthwiseconv1d=6, depthwiseconv2d=7, flatten=8, maxpooling=9, averagepooling=10]`
- `activation_type = [na= 0, linear=1, relu=2, tanh=3, sigmoid=4, softmax=5]`
- `padding = [na= 0, same=1, valid=2]`
- Note: always doing pooling=2 when its a pooling layer (here the 2 is a literal representation)
- can use zero-padding so if feature is undefined for a particular layer, just set it to 0

The label shape dimensionality is `(5,)`, where `5` is the number of `output_features`. It follows this format:
- `output_features = ["WorstLatency_hls", "IntervalMax_hls", "FF_hls", "LUT_hls", "BRAM_18K_hls", "DSP_hls"]`

Meaning of each:
- `WorstLatency_hls`: “cycles_max” in latency report
- `IntervalMax_hls`: “interval_max” in latency_report (not yet)
- `FF_hls`: “ff” in resource_report
- `LUT_hls`: “lut” in resource_report
- `BRAM_18K_hls`: “bram” in resource_report
- `DSP_hls`: “dsp” in resource_report

TODO:

Normalize the data:
- take the log of things like the rf
- Do simple transformations for features with very wide ranges (that are easy to undo)
    - to get order 1 for all the input
- Normalize everything, besides the encoded things
    - embed activation type and layer type differently
        - one-hot encoding
        - its own embedding
    - try as is for now though

Normalize the outputs as well

#### This is a dense model with 4 layers and the rest is "-1" padded:

`feature sample (shape=(18, 16)):
[[ 104.    0.    0.  128.    0.    0.   10. 2047.    1.    1.    0.    0.
     0.    0.    0.    0.]
 [ 128.    0.    0.  128.    0.    0.   10.    1.    1.    0.    1.    0.
     0.    0.    0.    0.]
 [ 128.    0.    0.  104.    0.    0.   10. 2047.    1.    1.    0.    0.
     0.    0.    0.    0.]
 [ 104.    0.    0.  104.    0.    0.   10.    1.    1.    0.    1.    0.
     0.    0.    0.    0.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]
 [  -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.   -1.
    -1.   -1.   -1.   -1.]]`

#### And the model's corresponding resource utilization:
` label sample (shape=(5,)):
[ 3333. 19907. 26178.    58.    16.]`

## Normalize the data

In [5]:
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import TensorDataset, DataLoader, random_split

data_dir    = Path("../../../ddemler/dima_stuff/wa_remake/May_15_processed")
feat_np     = np.load(data_dir / "combined_features.npy")  # (N, 18, 16)
label_np    = np.load(data_dir / "combined_labels.npy")    # (N,  5)

pad_mask_np = np.all(feat_np == -1, axis=-1)               # (N,18)
feat_np     = np.where(feat_np == -1, 0, feat_np)           # replace all -1 with 0

# normalize 
skip_idxs    = {8, 9, 10, 15} # strategy, layer_type, activation_type, pooling
norm_idxs    = [i for i in range(feat_np.shape[2]) if i not in skip_idxs]

valid_rows = feat_np[~pad_mask_np].reshape(-1, feat_np.shape[2])  # (num_valid_rows, 16)

means = valid_rows[:, norm_idxs].mean(axis=0)
stds  = valid_rows[:, norm_idxs].std(axis=0)
stds[stds < 1e-5] = 1.0

# z-score norma
feat_np[..., norm_idxs] = (
    (feat_np[..., norm_idxs] - means) / stds
)

# convert to torch tensors
feat     = torch.from_numpy(feat_np).float()                
pad_mask = torch.from_numpy(pad_mask_np)                    
label    = torch.from_numpy(label_np).float()               

dataset    = TensorDataset(feat, pad_mask, label)
N          = len(dataset)
train_size = int(0.7 * N)
generator  = torch.Generator().manual_seed(42)
train_ds, test_ds = random_split(dataset, [train_size, N-train_size], generator=generator)

train_loader = DataLoader(train_ds, batch_size=512, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=512, shuffle=False)

In [6]:
if not feat_path.exists():
    print(f"File not found: {feat_path}. Please verify your data path.")
else:
    # Load and preprocess
    feat_np = np.load(feat_path)                       # (N,18,16)
    pad_mask_np = np.all(feat_np == -1, axis=-1)       # (N,18)
    feat_np = np.where(feat_np == -1, 0, feat_np)

    # Normalize all except categorical columns
    skip_idxs = {8, 9, 10, 15}
    norm_idxs = [i for i in range(feat_np.shape[2]) if i not in skip_idxs]

    valid_rows = feat_np[~pad_mask_np].reshape(-1, feat_np.shape[2])
    means = valid_rows[:, norm_idxs].mean(axis=0)
    stds = valid_rows[:, norm_idxs].std(axis=0)
    stds[stds < 1e-5] = 1.0
    feat_np[..., norm_idxs] = (feat_np[..., norm_idxs] - means) / stds

    # Column names
    columns = [
        "d_in1","d_in2","d_in3","d_out1","d_out2","d_out3",
        "prec","rf","strategy","layer_type","activation_type",
        "filters","kernel_size","stride","padding","pooling"
    ]

    # Plot histograms for each normalized numeric feature
    for idx in norm_idxs:
        data = feat_np[~pad_mask_np, idx]
        plt.figure()
        plt.hist(data, bins=50)
        plt.title(f"Normalized '{columns[idx]}' distribution")
        plt.xlabel("Value")
        plt.ylabel("Frequency")
        plt.show()

NameError: name 'feat_path' is not defined

## Load data and define the architecture in `transformer.ipynb`