creating interface for packing the samples from the dataset class

In [1]:

import numpy as np
from torch.utils.data import Dataset,DataLoader
from dataset import PhysioNet
import torch

In [2]:
from packing import ExamplePacking

In [3]:

data = PhysioNet(activity="all",include_rest=True)



.... found 1526 edf files ....
---- data loaded from total of 10 -----


In [4]:
data.eeg_raw_x.shape

(246, 64, 321)

In [5]:

class Pack(Dataset):
    """Interface for packing the examples into one sequence"""

    def __init__(self,arr1,arr2,max_seq=4*160,max_search_depth=17) -> None:

        super().__init__()

        """
        arr1 nparray : covariates of the dataset  
        arr2 nparray : lables of the dataset           
        """

        self.max_bin_size = max_seq
        self.max_search_depth = max_search_depth
        
        self.arr1 = arr1
        self.arr2 = arr2

        # print("arr1 ",arr1)
        # print("arr2 ",arr2)

        self.packed_arr1,self.packed_arr2,self.packing_identifiers =  self.pack(arr1.copy(),arr2.copy())

    def standardize_rows(self,arr):
        """
        Standardize each row of the array to have a mean of 0 and a standard deviation of 1.
        
        Parameters:
        arr (numpy.ndarray): Input array of shape (64, 80)
        
        Returns:
        numpy.ndarray: Standardized array with the same shape as input
        """
        # Calculate the mean and standard deviation for each row
        row_mean = arr.mean(axis=1, keepdims=True)
        row_std = arr.std(axis=1, keepdims=True)
        
        # Standardize each row
        standardized_arr = (arr - row_mean) / row_std
        
        return standardized_arr
    

    def pack(self,samples,labels):

        # samples = samples.copy()
        # labels = labels.copy()


        packing_identifiers = [] # used to trace back the position and length of original sequences in a packed sequence
        bins = [] # contains the packed sequences
        bin_labels = [] # contains the respective labels of the packed sequences



        while len(samples) > self.max_search_depth:
            
            #### filling a new bin

            bin_used = 0
            depth_explored = 0


            identifier = np.full(shape=(self.max_bin_size),fill_value=-1).astype(np.float16)
            # print("identifier.shape ",identifier.shape)
            
            tar_bin = np.full(shape=(self.arr1[0].shape[0],self.max_bin_size),fill_value=-1).astype(np.float32)
            tar_labels = np.full(shape=(self.max_bin_size,),fill_value=-1)

            examples_added = 0 # counter for no of examples packed


            while bin_used < self.max_bin_size and depth_explored <= self.max_search_depth and len(samples) > self.max_search_depth: 
                # print("len(samples) ",len(samples))
                # print("depth_explored",depth_explored)
                
                available_bin = self.max_bin_size - bin_used

                curr_seq_length = samples[depth_explored].shape[-1]
                # adding the sample into bin if it does not exceed the max size

                if curr_seq_length <= available_bin :

                    #### adding one sample to a bin


                    
                    


                    
                    # print("samples[depth_explored] ",samples[depth_explored])
                    # print("before tar_bin ",tar_bin[:,bin_used:bin_used+curr_seq_length])

                    tar_bin[:,bin_used:bin_used+curr_seq_length] = self.standardize_rows(samples[depth_explored])
                    
                    # print("tar_bin afters ",tar_bin[:,bin_used:bin_used+curr_seq_length])
                    # print(f"Sample shape: {samples[depth_explored].shape}, Bin shape: {tar_bin[:, bin_used:bin_used + curr_seq_length].shape}")


                    tar_labels[bin_used:bin_used+curr_seq_length] = labels[depth_explored]

                    samples.pop(depth_explored)
                    label = labels.pop(depth_explored)

                    # print(label)

                    identifier[bin_used:bin_used+curr_seq_length] = examples_added

    

                    examples_added += 1

                    depth_explored = 0
                    bin_used  += curr_seq_length

                else:

                    # if sample can not be added incrementing the depth_explored
                    depth_explored += 1
            




            packing_identifiers.append(identifier)
            bins.append(tar_bin)
            bin_labels.append(tar_labels)

        return bins,bin_labels,packing_identifiers
    
    def __len__(self):

        return len(self.packed_arr1)
    

    def __getitem__(self, idx):

        return self.packed_arr1[idx].T,self.packed_arr2[idx],self.packing_identifiers[idx]

In [6]:
total_samples = data.eeg_raw_windowed.shape[0] 
variable_lengths = (np.round(np.random.uniform(low=0.4,high=2.01,size=(total_samples)),decimals=1) * 160)


In [7]:
variable_lengths = variable_lengths.astype(np.int64)
variable_len_covariates = []
for idx,sample in enumerate(data.eeg_raw_x):    

    variable_len_covariates.append(sample[:,:variable_lengths[idx]])

In [8]:

class Pack(Dataset):
    """Interface for packing the examples into one sequence"""

    def __init__(self,arr1,arr2,max_seq=4*160,max_search_depth=17) -> None:

        super().__init__()

        """
        arr1 nparray : covariates of the dataset  
        arr2 nparray : lables of the dataset           
        """

        self.max_bin_size = max_seq
        self.max_search_depth = max_search_depth
        
        self.arr1 = arr1
        self.arr2 = arr2

        # print("arr1 ",arr1)
        # print("arr2 ",arr2)

        self.packed_arr1,self.packed_arr2,self.packing_identifiers =  self.pack(arr1.copy(),arr2.copy())

    def standardize_rows(self,arr):
        """
        Standardize each row of the array to have a mean of 0 and a standard deviation of 1.
        
        Parameters:
        arr (numpy.ndarray): Input array of shape (64, 80)
        
        Returns:
        numpy.ndarray: Standardized array with the same shape as input
        """
        # Calculate the mean and standard deviation for each row
        row_mean = arr.mean(axis=1, keepdims=True)
        row_std = arr.std(axis=1, keepdims=True)
        
        # Standardize each row
        standardized_arr = (arr - row_mean) / row_std
        
        return standardized_arr
    

    def pack(self,samples,labels):

        packing_identifiers = [] # used to trace back the position and length of original sequences in a packed sequence
        bins = [] # contains the packed sequences
        bin_labels = [] # contains the respective labels of the packed sequences


        while len(samples) > self.max_search_depth:
            
            #### filling a new bin

            bin_used = 0
            depth_explored = 0


            identifier = np.full(shape=(self.max_bin_size),fill_value=-1).astype(np.float16)
            # print("identifier.shape ",identifier.shape)
            
            tar_bin = np.full(shape=(self.arr1[0].shape[0],self.max_bin_size),fill_value=-1).astype(np.float32)
            tar_labels = np.full(shape=(self.max_bin_size,),fill_value=-1)

            examples_added = 0 # counter for no of examples packed


            while bin_used < self.max_bin_size and depth_explored <= self.max_search_depth and len(samples) > self.max_search_depth: 
                # print("len(samples) ",len(samples))
                # print("depth_explored",depth_explored)
                
                available_bin = self.max_bin_size - bin_used

                curr_seq_length = samples[depth_explored].shape[-1]
                # adding the sample into bin if it does not exceed the max size

                if curr_seq_length <= available_bin :

                    #### adding one sample to a bin


                    
                    # print("samples[depth_explored] ",samples[depth_explored])
                    # print("before tar_bin ",tar_bin[:,bin_used:bin_used+curr_seq_length])

                    tar_bin[:,bin_used:bin_used+curr_seq_length] = self.standardize_rows(samples[depth_explored])
                    
                    # print("tar_bin afters ",tar_bin[:,bin_used:bin_used+curr_seq_length])
                    # print(f"Sample shape: {samples[depth_explored].shape}, Bin shape: {tar_bin[:, bin_used:bin_used + curr_seq_length].shape}")


                    tar_labels[bin_used:bin_used+curr_seq_length] = labels[depth_explored]

                    samples.pop(depth_explored)
                    labels.pop(depth_explored)

                    # print(label)

                    identifier[bin_used:bin_used+curr_seq_length] = examples_added

                    examples_added += 1

                    depth_explored = 0
                    bin_used  += curr_seq_length

                else:

                    # if sample can not be added incrementing the depth_explored
                    depth_explored += 1
            

            packing_identifiers.append(identifier)
            bins.append(tar_bin)
            bin_labels.append(tar_labels)

        return bins,bin_labels,packing_identifiers
    
    def __len__(self):

        return len(self.packed_arr1)
    

    def __getitem__(self, idx):

        return self.packed_arr1[idx].T,self.packed_arr2[idx],self.packing_identifiers[idx]

In [9]:
for i in variable_len_covariates:
    print(i.shape)
    break

(64, 240)


In [10]:
# samples,labels,identifiers = ExamplePacking.pack(samples=variable_len_covariates,labels=data.eeg_data_y.tolist())

In [11]:
# seqs, labs =ExamplePacking.unpack(samples,labels,identifiers,max_pool=False)

In [12]:
# seqs[0][0] == samples[0][0][:160]
# seqs[1][0] == samples[0][0][160:240]

In [13]:
# len(samples),len(labels),len(identifiers)

In [14]:
packed_dataset = Pack(arr1=variable_len_covariates,arr2=list(data.eeg_y_windowed),max_seq=3*160)

In [15]:
for packed in packed_dataset:
    eeg,labels,iden = packed
    print(eeg.shape)


(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)
(480, 64)


In [16]:
len(iden)

480

In [17]:
def create_attention_mask_batch(batch_example_list):
    """
    Create an attention mask for a batch of sequences where different examples 
    within each sequence do not attend to each other.
    
    Args:
    - batch_example_list (list of lists or 2D tensor): A batch of sequences 
      representing the examples, e.g., [[1, 1, 1, 2, 2], [1, 2, 2, 3, 3]]
    
    Returns:
    - attention_mask (torch.Tensor): A 3D tensor (batch_size, seq_len, seq_len), 
                                      where 1 means positions can attend to each other,
                                      and 0 means they cannot.
    """
    # Convert the input list to a 2D tensor (batch_size, seq_len)
    batch_example_list = np.array(batch_example_list)
    sequence_tensor = torch.tensor(batch_example_list)
    
    # Create a comparison matrix for each batch: (batch_size, seq_len, seq_len)
    attention_mask = (sequence_tensor.unsqueeze(1) == sequence_tensor.unsqueeze(2))
    
    return attention_mask

In [18]:
from networks import Temporal_Encoder

In [19]:
tenc = Temporal_Encoder()

In [20]:
loader = DataLoader(dataset=packed_dataset,batch_size=8)

In [21]:
for data in loader:
    eeg, labels, identifiers = data 
    print(eeg.shape)  # Ensure it's (8, num_channels, signal_length)
    print(labels.shape)  # Ensure this shape is as expected
    print(identifiers.shape)  # Check shape of identifiers

    # Create attention mask
    masks = create_attention_mask_batch(identifiers)
    print("masks.shape", masks.shape)  # Ensure it's torch.Size([8, 480, 480])
    masks = masks.repeat_interleave(8, dim=0)

    # Forward pass with EEG data and mask
    output = tenc(eeg, masks)

    break  # Assuming you're breaking for debugging purposes


torch.Size([8, 480, 64])
torch.Size([8, 480])
torch.Size([8, 480])
masks.shape torch.Size([8, 480, 480])


In [22]:
output

tensor([[[ 4.5930e-01,  2.8274e-01, -1.1131e-01,  ...,  6.0622e-01,
           3.0650e-01,  2.6195e+00],
         [-6.1299e-01, -5.0991e-01, -7.9516e-01,  ...,  1.2648e+00,
           3.0891e-01,  2.3436e+00],
         [-7.9614e-01, -4.2468e-01, -1.3422e+00,  ...,  9.3737e-01,
           8.0326e-01,  2.3346e+00],
         ...,
         [ 1.5788e-01, -2.0893e+00,  3.3944e-01,  ..., -1.5054e+00,
           6.1906e-01,  1.4039e+00],
         [-4.7075e-01, -2.0760e+00, -2.7877e-01,  ..., -1.6566e+00,
          -2.4179e-01,  9.1007e-01],
         [ 1.2799e-01, -1.7113e+00, -3.9512e-01,  ..., -2.0863e+00,
          -2.9439e-01,  1.4370e+00]],

        [[ 9.7499e-01, -5.3609e-01, -2.3439e-01,  ..., -1.4338e+00,
          -9.0490e-01,  1.1090e+00],
         [ 1.3554e+00,  2.7740e-02,  9.1268e-02,  ..., -9.9386e-01,
          -5.8602e-01,  1.0366e+00],
         [ 1.4709e+00, -1.2734e-01,  4.1962e-01,  ..., -8.5259e-01,
          -9.7013e-02,  1.2469e+00],
         ...,
         [ 9.7218e-02, -2

In [23]:

identifiers[4],np.unique(identifiers[0],return_index=True)[1]

(tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1.,

In [24]:
def unpack(y,true_lables,identifiers,max_pool=True):
    """  
    unpacks the sequence 
    """
    
    unpacked_predictions = []    
    unpacked_lables = []

    max_pooled = []

    idx = 0
    for packed_sequence,seq_iden,labels in zip(y,identifiers,true_lables):
        
        seq_nums , locations= np.unique(seq_iden,return_index=True) 
        locations = locations.tolist() 

        # print("seq_nums ",seq_nums)
        if seq_nums[0] == -1: # if there is any padding
            # print("padding found")
            padding_start_loc = locations.pop(0)

            pass
        else:
            # print("seq_iden.shape ",seq_iden.shape)
            padding_start_loc = seq_iden.shape[0]
            

        locations.append(padding_start_loc)

        # print("locations ",locations)
        # idx += 1
        # if idx == 10:
        #     break

        ## unpacking the sequence 

        for idx in range(len(locations)-1):
            example_unpacked = packed_sequence[locations[idx]:locations[idx+1],:]
            # print("unpack_seq.shape ",example_unpacked.shape)
            unpacked_predictions.append(example_unpacked)

            if max_pool == True:

                max_pooled.append(torch.max(example_unpacked,dim=0).values)

        ## unpacking the labels correspoding to the unpacked sequences
        unpacked_lables.extend(labels[locations[:-1]].tolist())

    if max_pool == True:
        return max_pooled,unpacked_lables

    return unpacked_predictions,unpacked_lables




In [25]:
output

tensor([[[ 4.5930e-01,  2.8274e-01, -1.1131e-01,  ...,  6.0622e-01,
           3.0650e-01,  2.6195e+00],
         [-6.1299e-01, -5.0991e-01, -7.9516e-01,  ...,  1.2648e+00,
           3.0891e-01,  2.3436e+00],
         [-7.9614e-01, -4.2468e-01, -1.3422e+00,  ...,  9.3737e-01,
           8.0326e-01,  2.3346e+00],
         ...,
         [ 1.5788e-01, -2.0893e+00,  3.3944e-01,  ..., -1.5054e+00,
           6.1906e-01,  1.4039e+00],
         [-4.7075e-01, -2.0760e+00, -2.7877e-01,  ..., -1.6566e+00,
          -2.4179e-01,  9.1007e-01],
         [ 1.2799e-01, -1.7113e+00, -3.9512e-01,  ..., -2.0863e+00,
          -2.9439e-01,  1.4370e+00]],

        [[ 9.7499e-01, -5.3609e-01, -2.3439e-01,  ..., -1.4338e+00,
          -9.0490e-01,  1.1090e+00],
         [ 1.3554e+00,  2.7740e-02,  9.1268e-02,  ..., -9.9386e-01,
          -5.8602e-01,  1.0366e+00],
         [ 1.4709e+00, -1.2734e-01,  4.1962e-01,  ..., -8.5259e-01,
          -9.7013e-02,  1.2469e+00],
         ...,
         [ 9.7218e-02, -2

In [26]:
idx = 0

x,y = unpack(y=output,true_lables=labels,identifiers=identifiers)
idx += 1

# if idx == 3:
# break

In [27]:
y

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]

In [28]:
for i in x:
    print(i.shape)

torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])


In [29]:
rand = torch.rand(80,64)
torch.max(rand,dim=0).values.shape

torch.Size([64])

In [30]:
len(x),len(y)

(20, 20)

In [31]:
for i in x:
    print(i.shape)

torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])
torch.Size([64])


In [32]:
labels[[1,2,3]]

tensor([[ 0,  0,  0,  ..., -1, -1, -1],
        [ 0,  0,  0,  ..., -1, -1, -1],
        [ 0,  0,  0,  ..., -1, -1, -1]])

In [33]:
np.unique(identifiers[0],return_index=True,axis=0)

(array([-1.,  0.,  1.], dtype=float16), array([464,   0, 240]))

In [34]:
labels[5]

tensor([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1, 

In [35]:
output.shape

torch.Size([8, 480, 64])

In [36]:
import torch.nn as nn
import torch
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
src_mask = torch.rand(32, 10, 10)

# Repeat src_mask for the number of attention heads
src_mask=torch.rand(32,10,10)
src_mask = src_mask.repeat_interleave(8, dim=0)
src = torch.rand(32,10 ,512)
out = encoder_layer(src,src_mask)

In [37]:
tenc(torch.rand(size=(8,300,64)),torch.rand(8,300,300)).shape

RuntimeError: The shape of the 3D attn_mask is torch.Size([8, 300, 300]), but should be (64, 300, 300).

In [44]:
from sklearn.model_selection import train_test_split

def random_split_tensors(split_ratio, *tensors, seed=None):


    assert all(tensor.size(0) == tensors[0].size(0) for tensor in tensors), "All tensors must have the same first dimension."
    for elem in tensors:
        print("type(elem) ",type(elem))

    if seed is not None:
        torch.manual_seed(seed)
    
    indices = list(range(tensors[0].size(0)))
    
    # Split the indices for train and validation based on split_ratio
    train_indices, val_indices = train_test_split(indices, train_size=split_ratio, random_state=seed)
    
    # Create train and validation splits for each tensor
    split1 = []
    split2 = []

    for tensor in tensors:
        train_split = tensor[train_indices]
        val_split = tensor[val_indices]
        
        split1.append(train_split)
        split2.append(val_split)
    
    return tuple((split1,split2))

# Example usage:
# tensor1 and tensor2 are example tensors with corresponding elements
tensor1 = torch.randn(100, 5)  # e.g., 100 samples, 5 features each
tensor2 = torch.randint(0, 2, (100,))  # e.g., 100 labels

train_val_splits = random_split_tensors(0.8, tensor1, tensor2, seed=42)
train_tensor1, val_tensor1 = train_val_splits[0]
train_tensor2, val_tensor2 = train_val_splits[1]

print(train_tensor1.shape, val_tensor1.shape)  # Train and val shapes for tensor1
print(train_tensor2.shape, val_tensor2.shape)  # Train and val shapes for tensor2



type(elem)  <class 'torch.Tensor'>
type(elem)  <class 'torch.Tensor'>
torch.Size([80, 5]) torch.Size([80])
torch.Size([20, 5]) torch.Size([20])
