In [1]:
import pickle


# display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from fastNLP import DataSet, DataSetIter, RandomSampler, SequentialSampler
from fastNLP import seq_len_to_mask


import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import numpy as np



  from .autonotebook import tqdm as notebook_tqdm


In [2]:

add_physio = "/mlodata1/hokarami/RTSGAN3/data/physio_data/full2012_v2.pkl"

# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# open this file when it comes from a trusted source.
with open(add_physio, "rb") as f:
    dataset = pickle.load(f)

train_set = dataset["train_set"]
dynamic_processor = dataset["dynamic_processor"]
static_processor = dataset["static_processor"]

# Fields that are always marked as model input, followed by the optional ones
# that are only marked when the loaded dataset actually contains them.
input_fields = ["dyn", "mask", "sta", "times", "lag",
                "seq_len", "priv", "nex", "label"]
input_fields += [name for name in ("dt", "times_raw")
                 if train_set.has_field(name)]

# One call instead of repeating the same field list in every branch.
train_set.set_input(*input_fields)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


+---------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------+---------------+
| seq_len | dyn         | lag         | mask        | sta         | times       | priv        | nex         | label | times_raw     |
+---------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------+---------------+
| 3       | [[0.0, 0... | [[0.0652... | [[0.0, 0... | [0.52   ... | [[0.0652... | [[0.0, 0... | [[0.0, 0... | 0.0   | [3.1333333... |
| 23      | [[0.0, 0... | [[0.0145... | [[0.0, 0... | [0.81333... | [[0.0145... | [[0.0, 0... | [[0.0, 0... | 0.0   | [0.7, 1.18... |
| 27      | [[0.3953... | [[0.0333... | [[1.0, 1... | [0.38666... | [[0.0333... | [[0.0, 0... | [[0.0, 0... | 0.0   | [1.6, 1.75... |
| 7       | [[0.7906... | [[0.0038... | [[1.0, 1... | [0.70666... | [[0.0038... | [[0.0, 0... | [[0.0, 0... | 0.0   | [0.1833333... |
| 7       | [[0.0, 0... | [[0.1225... | [[0.0, 0... | [0.97333

+---------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------+---------------+
| seq_len | dyn         | lag         | mask        | sta         | times       | priv        | nex         | label | times_raw     |
+---------+-------------+-------------+-------------+-------------+-------------+-------------+-------------+-------+---------------+
| 3       | [[0.0, 0... | [[0.0652... | [[0.0, 0... | [0.52   ... | [[0.0652... | [[0.0, 0... | [[0.0, 0... | 0.0   | [3.1333333... |
| 23      | [[0.0, 0... | [[0.0145... | [[0.0, 0... | [0.81333... | [[0.0145... | [[0.0, 0... | [[0.0, 0... | 0.0   | [0.7, 1.18... |
| 27      | [[0.3953... | [[0.0333... | [[1.0, 1... | [0.38666... | [[0.0333... | [[0.0, 0... | [[0.0, 0... | 0.0   | [1.6, 1.75... |
| 7       | [[0.7906... | [[0.0038... | [[1.0, 1... | [0.70666... | [[0.0038... | [[0.0, 0... | [[0.0, 0... | 0.0   | [0.1833333... |
| 7       | [[0.0, 0... | [[0.1225... | [[0.0, 0... | [0.97333

In [3]:

def pad_mask(mask, times_raw, seq_len, max_len=64):
    """Scatter per-observation rows onto a fixed-length integer time grid.

    For every sample ``i`` the first ``seq_len[i]`` rows of ``mask`` are copied
    to the grid row given by the integer part of their timestamp in
    ``times_raw``. When several observations share the same integer timestamp,
    the one with the highest position in the sequence is kept. Grid rows that
    receive no observation stay zero.

    Despite its name the function is not specific to masks: the notebook also
    applies it to the value tensor.

    Args:
        mask: tensor indexed as ``[sample, step, feature]``.
        times_raw: tensor indexed as ``[sample, step]`` holding timestamps.
            NOTE(review): the "last index per unique time" lookup assumes the
            valid timestamps of each sample are non-decreasing — confirm
            against the preprocessing.
        seq_len: per-sample number of valid steps.
        max_len: number of rows of the output grid. Defaults to 64, the value
            that used to be hard-coded.

    Returns:
        Tensor of shape ``(mask.shape[0], max_len, mask.shape[2])`` in the
        default torch dtype, allocated on ``mask.device``.

    Raises:
        IndexError: if a valid timestamp truncates to a value outside
            ``[0, max_len)``. Negative values used to wrap around silently.
    """
    grid = torch.zeros(mask.shape[0], max_len, mask.shape[2],
                       device=mask.device)

    for i in range(grid.shape[0]):
        n_valid = int(seq_len[i])
        if n_valid == 0:
            # Nothing to place for this sample; its grid stays all-zero.
            continue

        # Truncate timestamps to integer grid rows (long is the canonical
        # index dtype for tensor indexing).
        time_idx = times_raw[i][:n_valid].long()
        # torch.unique returns the values sorted, with matching counts.
        unique_times, counts = torch.unique(time_idx, return_counts=True)

        if unique_times[0] < 0 or unique_times[-1] >= max_len:
            raise IndexError(
                f"sample {i}: time index range "
                f"[{int(unique_times[0])}, {int(unique_times[-1])}] "
                f"does not fit a grid of length {max_len}"
            )

        # Position of the last observation belonging to each unique time.
        last_index = torch.cumsum(counts, dim=0) - 1
        grid[i, unique_times] = mask[i, last_index]

    return grid


In [11]:
# Walk the training set in batches of 64 and re-grid both the observation
# mask and the values with pad_mask, collecting one tensor per batch.
train_batch = DataSetIter(dataset=train_set, batch_size=64)
all_masks = []
all_dyn = []
for batch_x, batch_y in train_batch:
    # pad_mask cannot work without the raw timestamps. Fail with a clear
    # message instead of a NameError, or worse, silently reusing a stale
    # `times_raw` left in the kernel by an earlier run.
    if "times_raw" not in batch_x:
        raise KeyError(
            "batch has no 'times_raw' field; it is required by pad_mask")

    dyn = batch_x["dyn"]
    mask = batch_x["mask"]
    seq_len = batch_x["seq_len"]
    times_raw = batch_x["times_raw"]

    mask2 = pad_mask(mask, times_raw, seq_len)
    dyn2 = pad_mask(dyn, times_raw, seq_len)
    all_masks.append(mask2)
    all_dyn.append(dyn2)

In [12]:
# `mask`, `dyn`, `mask2` and `dyn2` still hold the values from the final
# iteration of the batch loop, so this cell inspects the last batch only.
mask.shape, mask2.shape
dyn.shape, dyn2.shape
# Summary statistics of the re-gridded values, as a quick sanity check.
dyn2.min(), dyn2.max(), dyn2.mean(), dyn2.std()

(torch.Size([58, 72, 23]), torch.Size([58, 64, 23]))

(torch.Size([58, 72, 23]), torch.Size([58, 64, 23]))

(tensor(0.), tensor(1.), tensor(0.0087), tensor(0.0638))

In [14]:
mask2.shape

# Merge the per-batch tensors along the sample axis. The list names are reused
# for the results, so only concatenate while they are still lists: without the
# guard, running this cell a second time fails because torch.cat is handed a
# tensor instead of a sequence of tensors.
if isinstance(all_masks, list):
    all_masks = torch.cat(all_masks, dim=0)
if isinstance(all_dyn, list):
    all_dyn = torch.cat(all_dyn, dim=0)

all_masks.shape
all_dyn.shape

torch.Size([58, 64, 23])

torch.Size([3578, 64, 23])

torch.Size([3578, 64, 23])

In [24]:
# Display the shape of `all_masks` again (same expression as in the cell above).
all_masks.shape

torch.Size([3578, 64, 23])

In [15]:
# Zero-pad the last (feature) axis on the right up to `target_dim` entries.
target_dim = 64
padding_needed = target_dim - all_masks.shape[-1]

# F.pad treats a negative width as "crop", which would silently drop features.
if padding_needed < 0:
    raise ValueError(
        f"feature axis has {all_masks.shape[-1]} entries, "
        f"more than target_dim={target_dim}")

# (0, padding_needed) = nothing added before, `padding_needed` zeros after,
# applied to the last axis only.
all_masks_padded = torch.nn.functional.pad(all_masks, (0, padding_needed))
all_dyn_padded = torch.nn.functional.pad(all_dyn, (0, padding_needed))
all_masks_padded.shape
all_dyn_padded.shape



torch.Size([3578, 64, 64])

torch.Size([3578, 64, 64])

In [16]:
# Combine values and mask into one tensor with a new axis at position 1:
# index 0 on that axis holds the padded values, index 1 the padded mask.
all_data = torch.stack((all_dyn_padded, all_masks_padded), dim=1)
all_data.shape

torch.Size([3578, 2, 64, 64])

In [17]:
class Physio(Dataset):
    """Map-style dataset over a pre-built sample tensor.

    Item ``idx`` is the pair ``(sample, label)``: ``sample`` is
    ``all_data[idx]``, passed through ``transform`` when one is given, and
    ``label`` is a placeholder integer in ``0..9`` drawn at random once, when
    the dataset is constructed. The labels are not derived from the data.
    """

    def __init__(self, all_data, transform=None):
        """Store the tensor and draw one random placeholder label per sample.

        Args:
            all_data: tensor whose first axis enumerates the samples.
            transform: optional callable applied to each sample on access.
        """
        sample_count = all_data.shape[0]

        self.data = all_data
        self.transform = transform
        self.num_samples = sample_count
        # Random integer targets; unseeded, so they differ between runs.
        self.labels = torch.randint(
            0, 10, (sample_count,), dtype=torch.int64)

    def __len__(self):
        """Return the number of samples."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for position ``idx``."""
        sample, target = self.data[idx], self.labels[idx]
        if not self.transform:
            return sample, target
        return self.transform(sample), target

In [20]:
# Wrap the stacked tensor in the dataset class; no transform is passed.
ph = Physio(all_data)

In [21]:

# Write the whole dataset object to disk with pickle.
# NOTE(review): pickle stores classes by reference, so loading this file
# presumably needs a `Physio` class importable under the same module path as
# here — confirm in the code that consumes the file.
path2save = "data/physio_data/torch_physio_full.pkl"
with open(path2save, "wb") as out_file:
    pickle.dump(ph, out_file)