In [88]:
import pandas as pd
import numpy as np
import pickle
from torch.utils.data import Dataset

# Absolute path to a local MIMIC-III 1.4 directory (inferred from the path itself).
# NOTE(review): machine-specific and not referenced by any cell shown here;
# consider a configurable, relative data directory instead.
BASEDIR_MIMIC = '/Volumes/ExternalData/Data/mimiciii/1.4'

In [89]:
class MimicDataSet(Dataset):
    """Map-style dataset backed by a pickled split stored in the working directory.

    The object unpickled from ``./full_<dataset>.pickle`` is loaded once at
    construction and must support lookup by the keys ``'inputs'``,
    ``'labels'`` and ``'notes'``. Each item is the triple found at a given
    index of those three containers.

    Args:
        dataset: Split name interpolated into the file name, e.g.
            ``'train_data'`` reads ``./full_train_data.pickle``.

    Raises:
        FileNotFoundError: If the pickle for the requested split is missing.
        KeyError: If the loaded object lacks one of the expected keys.
    """

    def __init__(self, dataset='train_data'):
        self.dataset = dataset
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # split files that were produced by a trusted pipeline.
        # The context manager closes the file handle as soon as loading ends
        # instead of leaving it open until garbage collection.
        with open(f'./full_{self.dataset}.pickle', 'rb') as handle:
            self.df = pickle.load(handle)
        self.inputs = self.df['inputs']
        self.labels = self.df['labels']
        self.notes = self.df['notes']

    def __len__(self):
        """Return the number of examples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(input, label, note)`` triple stored at ``idx``."""
        # Named ``features`` so the built-in ``input`` is not shadowed.
        features = self.inputs[idx]
        label = self.labels[idx]
        note = self.notes[idx]
        return features, label, note
        

In [90]:
# Build the three splits in the same order (train, test, val); each name is
# bound to the dataset loaded from ./full_<split>.pickle.
train_data, test_data, val_data = (
    MimicDataSet(split) for split in ('train_data', 'test_data', 'val_data')
)

In [97]:
from torch.utils.data import DataLoader

batch_size = 32

# Only the training loader shuffles; the evaluation loaders keep dataset order.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
test_dataloader, val_dataloader = (
    DataLoader(split, shuffle=False, batch_size=batch_size)
    for split in (test_data, val_data)
)

In [98]:
# Pull the first batch from a fresh iterator over each loader and unpack it
# into (features, labels, notes) for inspection in the cells below.
# The training loader shuffles, so its sample batch differs between runs.
train_features, train_labels, train_notes = next(iter(train_dataloader))
test_features, test_labels, test_notes = next(iter(test_dataloader))
val_features, val_labels, val_notes = next(iter(val_dataloader))

In [99]:
# Shapes of the sampled training batch: features, labels, and the number of notes.
train_features.shape, train_labels.shape, len(train_notes)

(torch.Size([32, 48, 390]), torch.Size([32]), 32)

In [100]:
# Shapes of the sampled test batch: features, labels, and the number of notes.
test_features.shape, test_labels.shape, len(test_notes)

(torch.Size([32, 48, 390]), torch.Size([32]), 32)

In [101]:
# Shapes of the sampled validation batch: features, labels, and the number of notes.
val_features.shape, val_labels.shape, len(val_notes)

(torch.Size([32, 48, 390]), torch.Size([32]), 32)

In [102]:
# Number of examples in each split (MimicDataSet.__len__ counts the labels).
len(train_data), len(test_data), len(val_data)

(15018, 4711, 4695)

In [110]:
# Show whatever batch of labels is currently bound to `test_labels`
# (the first test batch when the cells above were run in order).
test_labels

tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [132]:
# Sum of the labels over the whole training split divided by its size
# (the positive fraction when labels are 0/1).
s = 0
# Dedicated loop names keep this pass from overwriting the sample batch held in
# train_features / train_labels / train_notes.
for _batch_features, batch_labels, _batch_notes in train_dataloader:
    s += batch_labels.sum()

print(f"Ratio labels distribution={s/len(train_data):0.2f}")

Ratio labels distribution=0.50


In [133]:
# Sum of the labels over the whole test split divided by its size
# (the positive fraction when labels are 0/1).
s = 0
# Dedicated loop names keep this pass from overwriting the sample batch held in
# test_features / test_labels / test_notes.
for _batch_features, batch_labels, _batch_notes in test_dataloader:
    s += batch_labels.sum()

print(f"Ratio labels distribution={s/len(test_data):0.2f}")

Ratio labels distribution=0.20


In [134]:
# Sum of the labels over the whole validation split divided by its size
# (the positive fraction when labels are 0/1).
s = 0
# Dedicated loop names keep this pass from overwriting the sample batch held in
# val_features / val_labels / val_notes.
for _batch_features, batch_labels, _batch_notes in val_dataloader:
    s += batch_labels.sum()

print(f"Ratio labels distribution={s/len(val_data):0.2f}")

Ratio labels distribution=0.19
