In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

In [2]:
print("Loading MNIST dataset...")
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

print(f"Original training data shape: {x_train.shape}")
print(f"Original training labels shape: {y_train.shape}")
print(f"Original test data shape: {x_test.shape}")
print(f"Original test labels shape: {y_test.shape}")
print(f"Data type: {x_train.dtype}")
print(f"Pixel value range: [{x_train.min()}, {x_train.max()}]")

Loading MNIST dataset...
Original training data shape: (60000, 28, 28)
Original training labels shape: (60000,)
Original test data shape: (10000, 28, 28)
Original test labels shape: (10000,)
Data type: uint8
Pixel value range: [0, 255]


In [3]:
# Normalize pixel values to [0, 1] range
x_train_norm = x_train.astype('float32') / 255.0
x_test_norm = x_test.astype('float32') / 255.0

In [4]:
# Combine training and test data for balanced splitting
x_combined = np.concatenate([x_train_norm, x_test_norm], axis=0)
y_combined = np.concatenate([y_train, y_test], axis=0)

print(f"Combined data shape: {x_combined.shape}")
print(f"Combined labels shape: {y_combined.shape}")

# Shuffle the combined data
from sklearn.utils import shuffle
x_combined, y_combined = shuffle(x_combined, y_combined, random_state=42)
print("Data shuffled successfully!")

Combined data shape: (70000, 28, 28)
Combined labels shape: (70000,)
Data shuffled successfully!


In [5]:

from sklearn.model_selection import StratifiedKFold

# Two equal, disjoint, stratified parts
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

parts_x, parts_y, fold_indices = [], [], []
for fold, (train_idx, test_idx) in enumerate(skf.split(x_combined, y_combined), start=1):
    parts_x.append(x_combined[test_idx])
    parts_y.append(y_combined[test_idx])
    fold_indices.append(test_idx)
    print(f"Part {fold} total samples: {parts_x[-1].shape[0]}")

# Optional: assign to named variables if you prefer
x_part1_all, y_part1_all = parts_x[0], parts_y[0]
x_part2_all, y_part2_all = parts_x[1], parts_y[1]

# Sanity check (coverage and no overlap)
concat_idx = np.concatenate(fold_indices)
print(f"Covered: {np.unique(concat_idx).size} of {x_combined.shape[0]} samples")


Part 1 total samples: 35000
Part 2 total samples: 35000
Covered: 70000 of 70000 samples


In [6]:
original_train_ratio = 60000 / (60000 + 10000)
test_ratio = 1 - original_train_ratio  # ~0.142857

x_parts_train, x_parts_test, y_parts_train, y_parts_test = [], [], [], []

for i, (px, py) in enumerate(zip(parts_x, parts_y), start=1):
    x_tr, x_te, y_tr, y_te = train_test_split(
        px, py,
        test_size=test_ratio,
        stratify=py,
        random_state=42  # keep reproducible
    )
    x_parts_train.append(x_tr); x_parts_test.append(x_te)
    y_parts_train.append(y_tr); y_parts_test.append(y_te)
    print(f"Part {i}: train={x_tr.shape[0]}, test={x_te.shape[0]}")

Part 1: train=29999, test=5001
Part 2: train=29999, test=5001


In [8]:
import os
save_dir = "mnist_split_data_2"
os.makedirs(save_dir, exist_ok=True)

for i in range(2):
    out_path = os.path.join(save_dir, f"mnist_part{i+1}.npz")
    np.savez_compressed(
        out_path,
        x_train=x_parts_train[i],
        y_train=y_parts_train[i],
        x_test=x_parts_test[i],
        y_test=y_parts_test[i],
    )
    size_mb = os.path.getsize(out_path) / (1024*1024)
    print(f"Saved Part {i+1} to {out_path} ({size_mb:.2f} MB)")

Saved Part 1 to mnist_split_data_2\mnist_part1.npz (8.78 MB)
Saved Part 2 to mnist_split_data_2\mnist_part2.npz (8.78 MB)
