In [None]:
# import all libraries
import torch
import torch.nn as nn
from torch.utils.data import random_split
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import matplotlib.pyplot as plt

import torchvision
import torchvision.transforms as transforms

import os
import argparse


In [None]:
# Input pipelines for CIFAR-10.
# Training images get the usual light augmentation (random 32x32 crop after
# 4-pixel padding, random horizontal flip); test images are left untouched.
# Both pipelines finish by standardizing each RGB channel with the fixed
# per-channel mean/std constants below.
cifar_normalize = transforms.Normalize(
    mean=(0.4914, 0.4822, 0.4465),
    std=(0.2023, 0.1994, 0.2010),
)

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    cifar_normalize,
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    cifar_normalize,
])

# Download (if needed) and wrap the official train / test partitions.
data_root = './data'
trainset = torchvision.datasets.CIFAR10(
    root=data_root, train=True, transform=transform_train, download=True)
testset = torchvision.datasets.CIFAR10(
    root=data_root, train=False, transform=transform_test, download=True)

# Worker count shared by both loaders.
loader_workers = 2
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=loader_workers)
# The test loader uses a larger batch: evaluation is meant to run without
# storing intermediate activations for backprop, which leaves more memory.
testloader = torch.utils.data.DataLoader(
    testset, batch_size=256, shuffle=False, num_workers=loader_workers)

# Human-readable class names, indexed by CIFAR-10 label id.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

In [None]:
# Report how many samples the training and test datasets contain.
for split_label, split_data in (("Training", trainset), ("Test", testset)):
    print(f"{split_label} set size: {len(split_data)}")

Training set size: 50000
Test set size: 10000




---



Train-Val split

In [None]:
# Split the CIFAR-10 training images into train (40k) and validation (10k).
train_size = 40000
val_size = 10000
split_seed = 42  # fixed seed so this cell yields the same split on every run

# Always split the full training dataset. If this cell has already run,
# `trainset` is the Subset created below, so recover the dataset it wraps;
# otherwise a re-run would try to split the 40k subset and raise an error.
full_trainset = (
    trainset.dataset if isinstance(trainset, torch.utils.data.Subset) else trainset
)
assert train_size + val_size == len(full_trainset), \
    "train_size + val_size must equal the size of the full training set"

# A Subset fetches items through its parent dataset, so a validation subset of
# `full_trainset` would receive the random crop/flip from `transform_train`.
# Build a second view of the same images that uses the deterministic
# `transform_test` pipeline and draw the validation samples from it instead.
full_evalset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_test)

# Seeded generator -> reproducible permutation of the sample indices.
split_generator = torch.Generator().manual_seed(split_seed)
trainset, val_split = random_split(
    full_trainset, [train_size, val_size], generator=split_generator)
valset = torch.utils.data.Subset(full_evalset, val_split.indices)

# Sanity check: no image may appear in both train and validation.
assert set(trainset.indices).isdisjoint(valset.indices)

# Check sizes
print(f"Training set: {len(trainset)}")      # 40,000
print(f"Validation set: {len(valset)}")      # 10,000
print(f"Test set: {len(testset)}")           # 10,000 - unchanged

# Create DataLoaders (only the training loader shuffles)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2)
valloader = torch.utils.data.DataLoader(
    valset, batch_size=128, shuffle=False, num_workers=2)

Training set: 40000
Validation set: 10000
Test set: 10000


In [None]:
import pickle

# Persist the train/val/test datasets as pickle files so other runs can load
# exactly the same objects, then offer them for download when in Colab.
# NOTE(review): pickling a Subset also serializes the dataset it wraps, so
# these files are likely large. Only unpickle them from a trusted source,
# since pickle.load can execute arbitrary code.
split_files = {
    './data/train_dataset.pkl': trainset,   # the train Subset object
    './data/val_dataset.pkl': valset,       # the validation Subset object
    './data/test_dataset.pkl': testset,     # the full test dataset
}

print("Saving train/val/test datasets...")
os.makedirs('./data', exist_ok=True)  # no-op when the dataset download already created it
for path, dataset in split_files.items():
    with open(path, 'wb') as f:
        pickle.dump(dataset, f)
    print(f"✅ Saved {os.path.basename(path)}: {len(dataset)} samples")

# google.colab only exists inside Colab; elsewhere skip the browser download
# instead of crashing after the files have already been written.
try:
    from google.colab import files
except ImportError:
    files = None

if files is None:
    print("\nNot running in Google Colab; skipping download (files are in ./data/).")
else:
    print("\nNow downloading files...")
    for path in split_files:
        files.download(path)

In [None]:
# Reason: every experiment needs the same train/val/test split. Each experiment
# is run separately on a GPU cluster so that the best parameters for each section can be chosen.