## Load libraries

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import matplotlib.pyplot as plt
import torch

## Define data to build a dataset from

In [73]:
import mltools.dataset as dtools

class RandomDataset(torch.utils.data.Dataset):
    """Map-style dataset of uniformly random (input, target) pairs.

    All samples are drawn once, at construction time, with ``torch.rand``
    (uniform on [0, 1)). Inputs are stored as an ``[N, n]`` tensor and
    targets as an ``[N, m]`` tensor.

    Args:
        n: Number of values per input sample.
        m: Number of values per target sample.
        N: Number of samples in the dataset.
        device: Device on which both tensors are allocated.
        seed: Optional seed. When given, a dedicated generator seeded with
            this value is used, so two datasets built with the same
            arguments hold identical data. When ``None`` (default) the
            global torch RNG is used, as before.
    """

    def __init__(self, n: int, m: int, N: int, device: str = 'cpu', seed: "int | None" = None):
        super().__init__()
        self._n = n
        self._m = m
        self._N = N
        self.device = device
        self._seed = seed
        self.__make()

    def __make(self):
        """Allocate the random input (``_X``) and target (``_Y``) tensors."""
        generator = None
        if self._seed is not None:
            # The generator must live on the same device as the tensors it fills.
            generator = torch.Generator(device=self.device).manual_seed(self._seed)
        self._X = torch.rand([self._N, self._n], generator=generator, device=self.device)
        self._Y = torch.rand([self._N, self._m], generator=generator, device=self.device)

    def __len__(self):
        """Return the number of samples."""
        return self._N

    def __getitem__(self, idx: int):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self._X[idx, :], self._Y[idx, :]


# 1000 random samples, each with 5 input values and 2 target values.
unbatched_dset = RandomDataset(n=5, m=2, N=1000)
print(unbatched_dset)


# # unbatched_dataset = tf.data.Dataset.from_tensor_slices((tf.random.uniform([100, 2]), tf.random.uniform([100, 2])))
# unbatched_dataset = tf.data.Dataset.from_tensor_slices(tf.random.uniform([100, 2]))

# # Create a batched dataset
# batched_dataset = unbatched_dataset.batch(16)

# print(dtools.tf_is_dataset_batched(unbatched_dataset))
# print(dtools.tf_is_dataset_batched(batched_dataset))

# print(dtools.tf_get_random_element_from_dataset(unbatched_dataset))
# print(dtools.tf_get_random_element_from_dataset(batched_dataset))

<__main__.RandomDataset object at 0x0000017E5429BD60>


In [74]:
# Reproducible 80/20 train/validation split, followed by batching.
batch_size = 32
seed = 42
generator = torch.Generator().manual_seed(seed)
train_set, val_set = torch.utils.data.random_split(
    unbatched_dset, [0.8, 0.2], generator=generator
)
# Hand the seeded generator to the training loader as well: with shuffle=True
# and no generator the shuffle order comes from the global RNG and changes
# between runs even though the split itself is seeded.
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=batch_size, shuffle=True, generator=generator
)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, shuffle=False)

In [53]:
# Query the mltools helper for val_loader; presumably it reports whether the
# loader yields batches rather than single samples — verify against mltools.
print(dtools.torch_is_dataloader_batched(val_loader))

True


In [71]:
# Fetch one element from the validation loader through the mltools helper and
# show it (the recorded output is a single (input, target) pair of tensors).
elem = dtools.torch_get_random_element_from_dataloader(val_loader)
print(elem)

(tensor([0.1667, 0.6292, 0.4831, 0.2742, 0.7753]), tensor([0.7983, 0.7667]))


In [3]:
import mltools.dataset as dtools

# Example inputs and labels, drawn from a seeded generator so that the cell
# produces the same arrays on every run.
length = 120
rng = np.random.default_rng(42)
inputs = rng.random((length, 2, 2, 1))  # 120 samples, each of shape 2x2x1
labels = rng.random((length, 4))  # 120 target vectors with 4 values each

splitter_func = dtools.tf_make_datasets_from_tensor_slices
# splitter_func = dtools.tf_make_datasets_from_sklearn_arrays

train_dataset, val_dataset, test_dataset = splitter_func(
    inputs=inputs,
    outputs=labels,
    train_ratio=0.8,
    val_ratio=0.15,
    seed=42,
    batch_size=16,
    avoid_leakage=False,
)

Train size:  96
Validation size:  18
Test size:  6


In [7]:
import mltools.dataset as dtools

# Example inputs and labels, drawn from a seeded generator so that the cell
# produces the same arrays on every run.
length = 120
rng = np.random.default_rng(42)
inputs = rng.random((length, 2, 2, 1))  # 120 samples, each of shape 2x2x1
labels = rng.random((length, 4))  # 120 target vectors with 4 values each

# splitter_func = dtools.tf_make_datasets_from_tensor_slices
splitter_func = dtools.tf_make_datasets_from_sklearn_arrays

train_dataset, val_dataset, test_dataset = splitter_func(
    inputs=inputs,
    outputs=labels,
    train_ratio=0.8,
    val_ratio=0.15,
    seed=42,
    batch_size=None,
    # avoid_leakage=False,
)

print(
    dtools.tf_get_random_element_from_dataset(train_dataset)
)

Train size:  96
Validation size:  16
Test size:  8
(<tf.Tensor: shape=(2, 2, 1), dtype=float64, numpy=
array([[[0.95652186],
        [0.48453802]],

       [[0.59427153],
        [0.06836859]]])>, <tf.Tensor: shape=(4,), dtype=float64, numpy=array([0.28196447, 0.10134534, 0.30883358, 0.37356605])>)


In [8]:
# Run the mltools leakage check over the three TensorFlow splits, keyed by name.
print(
    dtools.tf_are_datasets_leaking(
        {
            "train": train_dataset,
            "val": val_dataset,
            "test": test_dataset,
        }
    )
)

No data leakage detected between train and val.
No data leakage detected between train and test.
No data leakage detected between val and test.
False


In [76]:
# The mltools helper currently fails on these DataLoaders with
# "TypeError: unhashable type: 'list'". Catch and display that error so the
# notebook still survives a full "Restart & Run All"; any other exception
# type is left to propagate.
try:
    print(
        dtools.torch_are_datasets_leaking(
            {
                "train": train_loader,
                "val": val_loader,
            }
        )
    )
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: unhashable type: 'list'

In [79]:
def get_all_elements(dataloader):
    """Collect every sample yielded by ``dataloader`` into a set.

    Args:
        dataloader: Iterable of ``(inputs, labels)`` batches, where both
            members are tensors whose first dimension indexes the samples.

    Returns:
        A set of ``(input_values, label_values)`` pairs. Each member of a pair
        is a flat tuple of Python floats/ints, which makes the pair hashable
        and comparable across loaders. Duplicate samples collapse into a
        single entry.
    """

    def _as_hashable(tensor):
        # detach() allows tensors that track gradients; tolist() copies the
        # values to host memory, so no explicit NumPy round trip is needed.
        return tuple(tensor.detach().flatten().tolist())

    all_elements = set()
    for inputs, labels in dataloader:
        # Pair the i-th input of the batch with the i-th label.
        all_elements.update(
            (_as_hashable(sample), _as_hashable(label))
            for sample, label in zip(inputs, labels)
        )

    return all_elements

# Extract elements from train and val loaders
train_elements = get_all_elements(train_loader)
val_elements = get_all_elements(val_loader)

# Check for intersection
intersection = train_elements & val_elements

if intersection:
    print(f"Warning: Data leakage detected! {len(intersection)} elements are present in both train and validation sets.")
    print("Sample of overlapping elements:")
    for overlapping in list(intersection)[:5]:  # Print first 5 overlapping elements
        print(overlapping)
else:
    print("No data leakage detected. Train and validation sets are disjoint.")

# Calculate percentage of overlap relative to the number of DISTINCT elements
# (the union). Dividing by len(train) + len(val) counts shared elements twice,
# so two identical sets would report 50% instead of 100%.
total_elements = len(train_elements | val_elements)
# Guard against empty loaders, which would otherwise divide by zero.
overlap_percentage = len(intersection) / total_elements * 100 if total_elements else 0.0

print(f"Percentage of overlapping elements: {overlap_percentage:.2f}%")


No data leakage detected. Train and validation sets are disjoint.
Percentage of overlapping elements: 0.00%
