# Get the MNIST data and save it as separate serialized numpy arrays

In [1]:
# File names of the raw MNIST IDX inputs (images + labels for each split).
# They are relative, so they are looked up in the notebook's working directory.
TRAIN_IMAGES, TRAIN_LABELS = "train-images.idx3-ubyte", "train-labels.idx1-ubyte"
TEST_IMAGES, TEST_LABELS = "t10k-images.idx3-ubyte", "t10k-labels.idx1-ubyte"

### Load the data from disk

In [3]:
import idx2numpy
import numpy as np
import os

# Show the working directory: the relative IDX file names are resolved against it.
print(os.getcwd())

# Decode each IDX file into a NumPy array.
data_train = idx2numpy.convert_from_file(TRAIN_IMAGES)
# data_train is now an np.ndarray; expected shape for MNIST train is (60000, 28, 28)
labels_train = idx2numpy.convert_from_file(TRAIN_LABELS)
data_test = idx2numpy.convert_from_file(TEST_IMAGES)
labels_test = idx2numpy.convert_from_file(TEST_LABELS)

# Images and labels are paired purely by position later on (image i <-> label row i),
# so a truncated or mismatched download must fail here rather than silently
# producing misaligned files.
assert len(data_train) == len(labels_train), (
    f"train images/labels length mismatch: {len(data_train)} vs {len(labels_train)}"
)
assert len(data_test) == len(labels_test), (
    f"test images/labels length mismatch: {len(data_test)} vs {len(labels_test)}"
)

/home/xavier/PycharmProjects/mnist-overkill/experiments/raw_data


### Create label DataFrames (written to CSV files further below)

In [4]:
import pandas as pd

# Wrap the train labels in a one-column DataFrame whose column is named 'label'
df_train = pd.DataFrame({'label': labels_train})
print(df_train)

# Same layout for the test labels
df_test = pd.DataFrame({'label': labels_test})
print(df_test)


       label
0          5
1          0
2          4
3          1
4          9
...      ...
59995      8
59996      3
59997      5
59998      6
59999      8

[60000 rows x 1 columns]
      label
0         7
1         2
2         1
3         0
4         4
...     ...
9995      2
9996      3
9997      4
9998      5
9999      6

[10000 rows x 1 columns]


### Make 6 different subsets of the data, increasing by 10k each time

In [5]:
# Sizes of the training-label subsets, growing in steps of 10k rows.
subset_lengths = [10000, 20000, 30000, 40000, 50000, 60000]

# Each subset is an independent copy of the first `n_rows` rows of df_train,
# so the subsets are nested: every smaller one is a prefix of the larger ones.
subsets = [df_train[:n_rows].copy() for n_rows in subset_lengths]

# Report how many rows each subset actually holds (numbered from 1)
for position, frame in enumerate(subsets, start=1):
    print(f"Subset {position}: Length = {len(frame)}")

# Peek at the smallest subset
print(subsets[0])

Subset 1: Length = 10000
Subset 2: Length = 20000
Subset 3: Length = 30000
Subset 4: Length = 40000
Subset 5: Length = 50000
Subset 6: Length = 60000
      label
0         5
1         0
2         4
3         1
4         9
...     ...
9995      5
9996      8
9997      6
9998      9
9999      7

[10000 rows x 1 columns]


### Write the images out to an output directory

In [6]:
import os

# Everything this notebook exports goes under <home directory>/MNIST,
# with one sub-folder per split for the image files.
OUTPUT_DIR = os.path.join(os.path.expanduser('~'), 'MNIST')
output_dir_train, output_dir_test = (
    os.path.join(OUTPUT_DIR, split_name) for split_name in ('train', 'test')
)
# Echo the resolved locations, one per line
print(output_dir_train, output_dir_test, sep='\n')

/home/xavier/MNIST/train
/home/xavier/MNIST/test


In [8]:
import os

# Save every image as its own .npy file, named after its position within the
# split (train/0.npy, train/1.npy, ...). The same position is the row number of
# the matching label in the label DataFrames.
for split_dir, split_images in (
    (output_dir_train, data_train),  # train
    (output_dir_test, data_test),    # test
):
    # np.save does not create missing folders; without this a fresh
    # Restart & Run All fails with FileNotFoundError on the first image.
    os.makedirs(split_dir, exist_ok=True)
    for i, img in enumerate(split_images):
        savepath = os.path.join(split_dir, f'{i}.npy')
        np.save(savepath, img)

### Write the train label subsets to CSV files

In [9]:
# Write one CSV per training-label subset, named after the subset's row count
# in thousands (labels_10k.csv, labels_20k.csv, ...). to_csv keeps its default
# of writing the row index as the first column.
os.makedirs(OUTPUT_DIR, exist_ok=True)  # to_csv will not create a missing folder
for df_subset in subsets:
    # Take the size from the frame itself rather than from the loop position,
    # so the file name stays truthful if subset_lengths is ever edited.
    # Floor division: a size that is not a multiple of 1000 is rounded down.
    size_k = len(df_subset) // 1000
    savepath = os.path.join(OUTPUT_DIR, f'labels_{size_k}k.csv')
    df_subset.to_csv(savepath)

### Write the test labels to CSV file

In [10]:
# Export the test labels into the same output folder as the training subsets.
test_labels_path = os.path.join(OUTPUT_DIR, 'labels_test.csv')
df_test.to_csv(test_labels_path)