# Dataset pre-processing for HPCC Systems
## Some datasets require slight modification before they can be easily sprayed onto an HPCC Systems Platform cluster.

### Fashion MNIST
Uses the same format as MNIST

In [1]:
# For Fashion MNIST:
# strip the header bytes from the label and image files and combine them,
# producing two output files: a test dataset and a training dataset.
# The same code works for regular MNIST too.
import numpy as np


# Folder containing the raw Fashion MNIST files. Paths are built by plain
# string concatenation, so the trailing slash matters.
# NOTE: this name shadows the builtin dir() for the rest of the notebook.
dir = 'fashion_mnist/'

# Each list is [image file, label file].
testFiles = [
    't10k-images-idx3-ubyte',
    't10k-labels-idx1-ubyte',
]
trainFiles = [
    'train-images-idx3-ubyte',
    'train-labels-idx1-ubyte',
]

# Names of the combined output files, written inside `dir`.
newTestFile = 'fashion_mnist_test_noheader'
newTrainFile = 'fashion_mnist_train_noheader'

In [2]:
def _read32(bytestream):
  dt = np.dtype(np.uint32).newbyteorder('>')
  return np.frombuffer(bytestream.read(4), dtype=dt)[0]

In [3]:
def readLabels(file):
    """Load the labels stored in a label file.

    The file is read as two big-endian 32-bit header fields (a magic number,
    which is consumed but not checked, then the item count) followed by one
    unsigned byte per label.

    Returns a 1-D ``uint8`` array holding the bytes that were read.
    """
    with open(file, 'rb') as stream:
        _read32(stream)  # magic number: skipped
        count = _read32(stream)
        raw = stream.read(count)
    return np.frombuffer(raw, dtype=np.uint8)

def readImages(file):
    """Load the images stored in an image file as flat pixel rows.

    The file is read as four big-endian 32-bit header fields (magic number,
    image count, rows, cols) followed by ``rows * cols`` unsigned bytes per
    image.

    Args:
        file: Path of the image file to read.

    Returns:
        A ``uint8`` array of shape ``(num_images, rows * cols)``; for 28x28
        images that is ``(num_images, 784)``.

    Raises:
        ValueError: If the file holds fewer pixel bytes than the header
            announces (raised by the reshape).
    """
    with open(file, 'rb') as f:
        _read32(f)  # magic number: consumed but not checked
        # Convert to Python ints so the size arithmetic below cannot wrap
        # around the way fixed-width uint32 scalars can.
        num_images = int(_read32(f))
        rows = int(_read32(f))
        cols = int(_read32(f))
        pixels_per_image = rows * cols
        buf = f.read(pixels_per_image * num_images)
    data = np.frombuffer(buf, dtype=np.uint8)
    # Use the row length from the header instead of assuming 28 * 28 = 784.
    return data.reshape(num_images, pixels_per_image)

In [4]:
def writeNewFile(file, labels, images):
    """Write a header-less binary file of (label, image) records.

    For every position in ``labels`` the label is written first, followed
    immediately by ``images`` at the same position. Both must be objects the
    binary file can write directly (e.g. NumPy ``uint8`` scalars and rows).
    An existing ``file`` is overwritten.
    """
    with open(file, 'wb') as out:
        for position, label in enumerate(labels):
            out.write(label)
            out.write(images[position])

In [13]:
# Decode the raw files: index 0 of each file list is the image file,
# index 1 is the matching label file.
testImages = readImages(dir + testFiles[0])
testLabels = readLabels(dir + testFiles[1])
trainImages = readImages(dir + trainFiles[0])
trainLabels = readLabels(dir + trainFiles[1])

In [14]:
# Show the shape of each loaded array, one per line (labels first, then images).
print(
    testLabels.shape,
    trainLabels.shape,
    testImages.shape,
    trainImages.shape,
    sep='\n',
)


(10000,)
(60000,)
(10000, 784)
(60000, 784)


In [15]:
# Write the combined label+image files next to the raw inputs.
writeNewFile(file=dir + newTestFile, labels=testLabels, images=testImages)
writeNewFile(file=dir + newTrainFile, labels=trainLabels, images=trainImages)

### MNIST

In [17]:
# Folder containing the raw MNIST files (trailing slash required, since
# paths are built by string concatenation).
dir = 'mnist/'

# Each list is [image file, label file].
testFiles = [
    't10k-images-idx3-ubyte',
    't10k-labels-idx1-ubyte',
]
trainFiles = [
    'train-images-idx3-ubyte',
    'train-labels-idx1-ubyte',
]

# Names of the combined output files, written inside `dir`.
newTestFile = 'mnist_test_noheader'
newTrainFile = 'mnist_train_noheader'

In [18]:
# Decode the raw MNIST files (index 0 = images, index 1 = labels).
testImages = readImages(dir + testFiles[0])
testLabels = readLabels(dir + testFiles[1])
trainImages = readImages(dir + trainFiles[0])
trainLabels = readLabels(dir + trainFiles[1])

In [19]:
# Write the combined label+image files for MNIST.
writeNewFile(file=dir + newTestFile, labels=testLabels, images=testImages)
writeNewFile(file=dir + newTrainFile, labels=trainLabels, images=trainImages)

### BIG MNIST
Makes MNIST dataset arbitrarily large for testing the memory limitations within HPCC/ECL code

In [10]:
# The enlarged dataset is built from the regular MNIST files.
dir = 'mnist/'

# Each list is [image file, label file].
testFiles = [
    't10k-images-idx3-ubyte',
    't10k-labels-idx1-ubyte',
]
trainFiles = [
    'train-images-idx3-ubyte',
    'train-labels-idx1-ubyte',
]

# Names of the enlarged output files, written inside `dir`.
newTestFile_big = 'mnist_test_noheader_big'
newTrainFile_big = 'mnist_train_noheader_big'

In [11]:
# Load the base (1x) arrays that will be repeated to build the big dataset.
testImages = readImages(dir + testFiles[0])
testLabels = readLabels(dir + testFiles[1])
trainImages = readImages(dir + trainFiles[0])
trainLabels = readLabels(dir + trainFiles[1])

print(
    'Base Shapes: ',
    testLabels.shape,
    trainLabels.shape,
    testImages.shape,
    trainImages.shape,
)

Base Shapes:  (10000,) (60000,) (10000, 784) (60000, 784)


In [12]:
# Number of back-to-back copies of the base dataset in the enlarged one.
# A multiplier of 100 produces roughly a 750 MB test file and a 4.4 GB train file.
multiplier = 9

# Build each enlarged array with a single concatenation. Appending inside a
# loop re-copies the ever-growing array on every pass, which is quadratic in
# the multiplier. A multiplier below 1 still yields one copy of the base data.
copies = max(multiplier, 1)
testLabels_big = np.concatenate([testLabels] * copies)
trainLabels_big = np.concatenate([trainLabels] * copies)
testImages_big = np.concatenate([testImages] * copies, axis=0)
trainImages_big = np.concatenate([trainImages] * copies, axis=0)

In [13]:
# Confirm that every array grew by the requested factor.
print(
    'Final shapes after resize: ',
    testLabels_big.shape,
    trainLabels_big.shape,
    testImages_big.shape,
    trainImages_big.shape,
)

Final shapes after resize:  (90000,) (540000,) (90000, 784) (540000, 784)


In [14]:
# Write the enlarged files, then drop the large arrays so the kernel does not
# keep holding them.
writeNewFile(file=dir + newTestFile_big, labels=testLabels_big, images=testImages_big)
writeNewFile(file=dir + newTrainFile_big, labels=trainLabels_big, images=trainImages_big)

del testLabels_big
del trainLabels_big
del testImages_big
del trainImages_big
print('MNIST_BIG files created')

MNIST_BIG files created


### IMDB Sentiment | Binary classification

In [113]:
# Location of the IMDB archive.
imdbpath = 'imdb/imdb.npz'

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code - only use this on an archive from a trusted source.
with np.load(imdbpath, allow_pickle=True) as f:
    x_train = f['x_train']
    labels_train = f['y_train']
    x_test = f['x_test']
    labels_test = f['y_test']

In [59]:
# Sizes of the test split, one per line: number of samples, length of the
# first sample, number of labels (binary classification).
print(len(x_test), len(x_test[0]), len(labels_test), sep='\n')


25000
152
25000


array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [115]:
import csv

def imdbCSV(file, label, data):
    """Write labelled samples to a CSV file, one sample per row.

    Every row consists of the sample's label followed by the sample's values.

    Args:
        file: Path of the CSV file to create (overwritten if it exists).
        label: Indexable collection of labels; ``label[i]`` belongs to ``data[i]``.
        data: Sized, indexable collection of samples, each an iterable of values.

    The rows of ``data`` are not modified, so the function can be called
    repeatedly on the same data and always produces the same file.
    """
    with open(file, 'w', newline='\n') as myfile:
        wr = csv.writer(myfile)  # one writer serves the whole file
        for i in range(len(data)):
            # Build a fresh row instead of insert()-ing the label into the
            # caller's list, which would add it again on every re-run.
            wr.writerow([label[i], *data[i]])

In [116]:
# Export both IMDB splits as CSV (label first, then the sample values).
imdbCSV(file='imdb/imdb_train.csv', label=labels_train, data=x_train)
imdbCSV(file='imdb/imdb_test.csv', label=labels_test, data=x_test)

### Boston Housing Dataset 

In [200]:
# Location of the Boston Housing archive.
bhpath = 'boston_housing/boston_housing.npz'

# Fraction of the rows (taken from the end) that goes into the test set.
test_split = 0.2

In [201]:
# NOTE(review): allow_pickle=True permits unpickling of object arrays - only
# load archives from a trusted source.
with np.load(bhpath, allow_pickle=True) as f:
    x, y = f['x'], f['y']

# The first (1 - test_split) share of the rows is the training set; the
# remainder is the test set. The cut point is derived from len(x) for both
# the features and the targets.
idx = int(len(x) * (1 - test_split))
x_train, x_test = np.array(x[:idx]), np.array(x[idx:])
y_train, y_test = np.array(y[:idx]), np.array(y[idx:])

In [202]:
def bhCSV(file, data):
    """Write ``data`` to ``file`` as CSV, one row of ``data`` per line.

    ``data`` is a sequence of rows (e.g. a 2-D array); each row is written
    as-is, so any label column must already be part of the row.
    An existing ``file`` is overwritten.
    """
    with open(file, 'w', newline='\n') as out:
        writer = csv.writer(out)
        for row in data:
            writer.writerow(row)

In [203]:
# Put the target in the first column, followed by the feature columns.
train = np.column_stack((y_train, x_train))
test = np.column_stack((y_test, x_test))

In [None]:
# Export both Boston Housing splits as CSV.
bhCSV(file='boston_housing/boston_housing_test.csv', data=test)
bhCSV(file='boston_housing/boston_housing_train.csv', data=train)

### Reuters Dataset

In [221]:
# Location of the Reuters archive.
rpath = 'reuters/reuters.npz'

# Fraction of the samples (taken from the end) that goes into the test set.
test_split = 0.2

In [222]:
# NOTE(review): allow_pickle=True permits unpickling of object arrays - only
# load archives from a trusted source.
with np.load(rpath, allow_pickle=True) as f:
    xs = f['x']
    labels = f['y']

# Everything before `idx` is training data, the rest is test data.
idx = int(len(xs) * (1 - test_split))
x_train = np.array(xs[:idx])
y_train = np.array(labels[:idx])
x_test = np.array(xs[idx:])
y_test = np.array(labels[idx:])
print(y_test)

[19  8  3 ... 13 30 25]


In [223]:
def reutersCSV(file, label, data):
    """Write labelled samples to a CSV file, one sample per row.

    Every row consists of the sample's label followed by the sample's values.

    Args:
        file: Path of the CSV file to create (overwritten if it exists).
        label: Indexable collection of labels; ``label[i]`` belongs to ``data[i]``.
        data: Sized, indexable collection of samples, each an iterable of values.

    The rows of ``data`` are not modified, so the function can be called
    repeatedly on the same data and always produces the same file.
    """
    with open(file, 'w', newline='\n') as myfile:
        wr = csv.writer(myfile)  # one writer serves the whole file
        for i in range(len(data)):
            # Build a fresh row instead of insert()-ing the label into the
            # caller's list, which would add it again on every re-run.
            wr.writerow([label[i], *data[i]])

In [224]:
# Export both Reuters splits as CSV (label first, then the sample values).
reutersCSV(file='reuters/reuters_test.csv', label=y_test, data=x_test)
reutersCSV(file='reuters/reuters_train.csv', label=y_train, data=x_train)