# Getting Started with ncrsh DataLoader

This notebook provides a quick introduction to using the ncrsh DataLoader with various types of datasets.

In [None]:
# Import required libraries
import sys
import numpy as np
from pathlib import Path

# Add the project root (two directory levels above the current working
# directory) to the Python path so the in-repo `ncrsh` package is importable.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
project_root = str(Path().absolute().parent.parent)
if project_root not in sys.path:
    sys.path.append(project_root)

# These imports must come after the sys.path update above.
from ncrsh.data import DataLoader, Dataset, TensorDataset
from ncrsh.tensor import Tensor

## 1. Using a Custom Dataset

Let's create a simple custom dataset and use it with the DataLoader.

In [None]:
class CustomDataset(Dataset):
    """Synthetic in-memory dataset of random inputs paired with random integer labels.

    Args:
        size: Number of samples to generate.
        input_shape: Shape of a single input sample.
        num_classes: Labels are drawn from the half-open range ``[0, num_classes)``.
    """

    def __init__(self, size=100, input_shape=(3, 32, 32), num_classes=10):
        # Inputs are drawn before labels so the sequence of draws from
        # NumPy's global random state stays the same.
        samples = np.random.randn(size, *input_shape)
        labels = np.random.randint(0, num_classes, size=size, dtype=np.int64)

        self.size = size
        self.input_shape = input_shape
        self.data = samples.astype(np.float32)
        self.targets = labels

    def __len__(self):
        """Return the number of samples requested at construction time."""
        return self.size

    def __getitem__(self, idx):
        """Return the ``(input, target)`` Tensor pair stored at position ``idx``.

        The target is wrapped in a one-element list before being converted
        to a Tensor.
        """
        sample = Tensor(self.data[idx])
        label = Tensor([self.targets[idx]])
        return sample, label

# Build a 1000-sample synthetic dataset and serve it in shuffled batches of 32
dataset = CustomDataset(size=1000)
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=32,
)

# Preview only the first 3 batches: range(3) is exhausted first, so zip ends
# the loop without requesting a fourth batch from the dataloader
for batch_idx, (inputs, targets) in zip(range(3), dataloader):
    print(f"Batch {batch_idx}: inputs shape={inputs.shape}, targets shape={targets.shape}")

## 2. Using TensorDataset

For simple cases where you already have your data in arrays, you can wrap the arrays in `Tensor` objects and pass them to `TensorDataset`.

In [None]:
# Generate 100 random float32 inputs of shape (3, 32, 32) and 100 int64 labels in [0, 10)
x = np.random.randn(100, 3, 32, 32).astype(np.float32)
y = np.random.randint(0, 10, size=100, dtype=np.int64)

# Wrap both arrays as Tensors and serve them in shuffled batches of 16
dataset = TensorDataset(Tensor(x), Tensor(y))
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=16,
)

# Preview only the first 3 batches: range(3) is exhausted first, so zip ends
# the loop without requesting a fourth batch from the dataloader
for batch_idx, (inputs, targets) in zip(range(3), dataloader):
    print(f"Batch {batch_idx}: inputs shape={inputs.shape}, targets shape={targets.shape}")

## 3. Using Multiple Workers

The DataLoader supports multi-process data loading for better performance; set `num_workers` to the number of worker processes to use.

In [None]:
# Build a fresh synthetic dataset and a dataloader backed by worker processes
dataset = CustomDataset(size=1000)
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=32,
    # Load batches using 2 worker processes
    num_workers=2,
    # NOTE(review): intended to speed up transfers to CUDA devices -- confirm
    # how ncrsh's DataLoader handles pin_memory
    pin_memory=True,
)

# Preview only the first 3 batches: range(3) is exhausted first, so zip ends
# the loop without requesting a fourth batch from the dataloader
for batch_idx, (inputs, targets) in zip(range(3), dataloader):
    print(f"Batch {batch_idx}: inputs shape={inputs.shape}, targets shape={targets.shape}")