In [31]:
import numpy as np
import os

In [32]:
# Load all 5 split data files.
#
# Each archive ./data/5_split_data/mnist_part{i}.npz (i = 1..5) is read for its
# 'x_train', 'y_train', 'x_test' and 'y_test' arrays; the arrays are collected
# into four lists, one entry per part, in part order.
train_data_list = []
train_labels_list = []
test_data_list = []
test_labels_list = []

for i in range(1, 6):
    # np.load on an .npz returns an NpzFile that keeps the file open; the
    # with-block closes it as soon as the arrays have been pulled out.
    with np.load(f"./data/5_split_data/mnist_part{i}.npz") as data:
        # NpzFile loads lazily on every key lookup, so fetch each array exactly
        # once and reuse it for both the list append and the shape report.
        part_x_train = data['x_train']
        part_y_train = data['y_train']
        part_x_test = data['x_test']
        part_y_test = data['y_test']

    train_data_list.append(part_x_train)
    train_labels_list.append(part_y_train)
    test_data_list.append(part_x_test)
    test_labels_list.append(part_y_test)
    print(f"Part {i}: Train shape = {part_x_train.shape}, Test shape = {part_x_test.shape}")

Part 1: Train shape = (11999, 28, 28), Test shape = (2001, 28, 28)
Part 2: Train shape = (11999, 28, 28), Test shape = (2001, 28, 28)
Part 3: Train shape = (11999, 28, 28), Test shape = (2001, 28, 28)
Part 4: Train shape = (11999, 28, 28), Test shape = (2001, 28, 28)
Part 5: Train shape = (11999, 28, 28), Test shape = (2001, 28, 28)


In [33]:
# Concatenate all training data
all_train_data = np.concatenate(train_data_list, axis=0)
all_train_labels = np.concatenate(train_labels_list, axis=0)

print(f"Total training samples: {all_train_data.shape[0]}")
print(f"Total training labels: {all_train_labels.shape[0]}")

Total training samples: 59995
Total training labels: 59995


In [34]:
# Shuffle the training data
indices = np.random.permutation(len(all_train_data))
shuffled_train_data = all_train_data[indices]
shuffled_train_labels = all_train_labels[indices]

print("Training data shuffled!")

Training data shuffled!


In [35]:
# Split the shuffled data back into 5 equal parts
samples_per_client = len(shuffled_train_data) // 5

new_train_data = []
new_train_labels = []

for i in range(5):
    start_idx = i * samples_per_client
    if i == 4:  # Last client gets remaining samples
        end_idx = len(shuffled_train_data)
    else:
        end_idx = (i + 1) * samples_per_client
    
    new_train_data.append(shuffled_train_data[start_idx:end_idx])
    new_train_labels.append(shuffled_train_labels[start_idx:end_idx])
    print(f"Client {i+1}: {new_train_data[i].shape[0]} training samples")

Client 1: 11999 training samples
Client 2: 11999 training samples
Client 3: 11999 training samples
Client 4: 11999 training samples
Client 5: 11999 training samples


In [36]:
# Create directory for shuffled data
output_dir = 'data/5_split_shuffled4'
os.makedirs(output_dir, exist_ok=True)
print(f"Created directory: {output_dir}")

Created directory: data/5_split_shuffled4


In [37]:
# Save each part with shuffled training data and original test data
for i in range(5):
    np.savez(
        f'{output_dir}/mnist_part{i+1}.npz',
        x_train=new_train_data[i],
        y_train=new_train_labels[i],
        x_test=test_data_list[i],  # Keep original test data
        y_test=test_labels_list[i]  # Keep original test labels
    )
    print(f"Saved shuffled mnist_part{i+1}.npz")

print("\nAll files saved successfully!")

Saved shuffled mnist_part1.npz
Saved shuffled mnist_part2.npz
Saved shuffled mnist_part3.npz
Saved shuffled mnist_part4.npz
Saved shuffled mnist_part5.npz

All files saved successfully!


In [38]:
# Verify by loading and checking one file
verify_data = np.load(f'{output_dir}/mnist_part1.npz')
print("\nVerification:")
print(f"Part 1 - Train: {verify_data['x_train'].shape}, Test: {verify_data['x_test'].shape}")
print(f"First 10 training labels: {verify_data['y_train'][:10]}")
print(f"First 10 test labels: {verify_data['y_test'][:10]}")


Verification:
Part 1 - Train: (11999, 28, 28), Test: (2001, 28, 28)
First 10 training labels: [1 5 5 0 3 5 1 7 4 8]
First 10 test labels: [3 1 1 2 9 8 6 3 1 8]
