# Test Preprocess & Repeat

In [3]:
import numpy as np
import tensorflow as tf
import hfl_util

# --- Configuration for every test cell in this notebook ---------------------
MODEL_NAME = 'sr'
DATA_NAME = 'mnist-o' # mnist-o / mnist-f / cifar10

# numTotalClasses is 2 for the 'svm' model and 10 for every other model name.
numTotalClasses = 2 if MODEL_NAME == 'svm' else 10

# Map each supported DATA_NAME to the tf.keras dataset module that provides it.
_DATASET_MODULES = {
    'mnist-o': tf.keras.datasets.mnist,
    'mnist-f': tf.keras.datasets.fashion_mnist,
    'cifar10': tf.keras.datasets.cifar10,
}
if DATA_NAME not in _DATASET_MODULES:
    # Fail with an actionable message instead of a bare Exception carrying only the value.
    raise ValueError('Unknown DATA_NAME {!r}; expected one of {}'.format(
        DATA_NAME, sorted(_DATASET_MODULES)))
trainData, testData = _DATASET_MODULES[DATA_NAME].load_data()
print(len(trainData[0]), len(trainData[1]))

# Only the 'cnn' model is given un-flattened inputs; every other model gets flatten=True.
flatten = MODEL_NAME != 'cnn'

# Preprocess the full training set; the result is indexed by node id and
# element 0 exposes 'x' and 'y' arrays (see the prints below).
trainData_by1Nid = hfl_util.preprocess(MODEL_NAME, DATA_NAME, trainData, flatten)
print(len(trainData_by1Nid[0]), trainData_by1Nid[0]['x'].shape)
print(len(trainData_by1Nid[0]), trainData_by1Nid[0]['y'].shape, np.unique(trainData_by1Nid[0]['y']))

# Sample with an argument of 1000 and confirm 'x' and 'y' have matching lengths.
trainData_by1Nid_sampled = hfl_util.sample(trainData_by1Nid, 1000)
print(len(trainData_by1Nid_sampled[0]['x']), len(trainData_by1Nid_sampled[0]['y']))

60000 60000
2 (60000, 784)
2 (60000,) [0 1 2 3 4 5 6 7 8 9]
1000 1000


# Test Node Non-IID

In [2]:
# Node count passed to hfl_util.groupByNode below; the grouped result is
# asserted to contain exactly this many entries.
NUM_NODES = 150
# Number of labels in the preprocessed training set (trainData_by1Nid comes from
# the preprocessing cell above); grouping is asserted to preserve this total.
NUM_TOTAL_SAMPLES = len(trainData_by1Nid[0]['y'])

def assert_groupByNode(dataByNid, numNodes_, numTotalSamples_, numClassesPerNode_):
    """Print summary statistics of a per-node dataset and assert they match expectations.

    Args:
        dataByNid: indexable collection of per-node mappings, each with 'x' and 'y' entries.
        numNodes_: expected number of entries in ``dataByNid``.
        numTotalSamples_: expected sum of ``len(node['y'])`` over all nodes.
        numClassesPerNode_: expected mean count of distinct labels per node.

    Raises:
        AssertionError: on the first statistic that differs from its expected value,
            or when the first node's sample / pixel / label types are not
            ``np.ndarray`` / ``np.float32`` / ``np.int32`` respectively.
    """
    nodeCount = len(dataByNid)

    # Collect each node's labels once, then derive both aggregate statistics from them.
    labelsPerNode = [ node['y'] for node in dataByNid ]
    sampleCount = sum(map(len, labelsPerNode))
    distinctCounts = [ len(np.unique(labels)) for labels in labelsPerNode ]
    meanClasses = np.mean(distinctCounts)

    # Type checks only inspect the first sample of the first node.
    firstNode = dataByNid[0]
    firstSample = firstNode['x'][0]
    firstLabel = firstNode['y'][0]
    sampleType = type(firstSample)
    pixelType = type(firstSample.flatten()[0])
    labelType = type(firstLabel.flatten()[0])

    print(nodeCount, sampleCount, meanClasses, sampleType, pixelType, labelType)

    # (actual, expected) pairs, checked in a fixed order so the first mismatch raises.
    checks = (
        (nodeCount, numNodes_),
        (sampleCount, numTotalSamples_),
        (meanClasses, numClassesPerNode_),
        (sampleType, np.ndarray),
        (pixelType, np.float32),
        (labelType, np.int32),
    )
    for actual, expected in checks:
        assert actual == expected

# Each case pairs a node-level grouping mode with the mean number of distinct
# classes every node is then expected to hold.
for nodeMode, expectedClassesPerNode in (
    ('one', 1),
    ('half', numTotalClasses/2),
    ('all', numTotalClasses),
):
    trainData_byNid = hfl_util.groupByNode(trainData_by1Nid, nodeMode, NUM_NODES)
    assert_groupByNode(trainData_byNid, NUM_NODES, NUM_TOTAL_SAMPLES, expectedClassesPerNode)

150 50000 1.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
150 50000 5.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
150 50000 10.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>


# Test Node & Edge Non-IID

In [3]:
# Node and edge counts passed to hfl_util.groupByEdge in the calls below.
NUM_NODES = 400
NUM_EDGES = 20
# True division, so this is a float (20.0); assert_groupByEdge compares it
# against an np.mean of per-edge node counts.
NUM_NODES_PER_EDGE = NUM_NODES / NUM_EDGES

def assert_groupByEdge(dataByNid, z, numNodes_, numEdges_, numNodesPerEdge_, numClassesPerNode_, numClassesPerEdge_):
    """Print summary statistics of a node/edge grouping and assert they match expectations.

    Args:
        dataByNid: indexable collection of per-node mappings, each with 'x' and 'y' entries.
        z: edge assignment; its distinct values are counted as edges and it is
            converted to per-edge node-id groups via ``hfl_util.to_nids_byGid``.
        numNodes_: expected number of entries in ``dataByNid``.
        numEdges_: expected number of distinct values in ``z``.
        numNodesPerEdge_: expected mean number of node ids per edge group.
        numClassesPerNode_: expected mean (over edges) of the mean number of
            distinct labels per node within each edge.
        numClassesPerEdge_: expected mean number of distinct labels per edge,
            counted over the concatenated labels of the edge's nodes.

    Raises:
        AssertionError: on the first statistic that differs from its expected value,
            or when the first node's sample / pixel / label types are not
            ``np.ndarray`` / ``np.float32`` / ``np.int32`` respectively.
    """
    def countClasses(labels):
        # Number of distinct label values.
        return len(np.unique(labels))

    nodeCount = len(dataByNid)
    edgeCount = len(np.unique(z))
    nids_byGid = hfl_util.to_nids_byGid(z)

    # Pass 1: group sizes.
    groupSizes = []
    for nids in nids_byGid:
        groupSizes.append(len(nids))
    meanNodesPerEdge = np.mean(groupSizes)

    # Pass 2: within each edge, average the per-node distinct-class counts.
    nodeClassMeans = []
    for nids in nids_byGid:
        perNode = [ countClasses(dataByNid[nid]['y']) for nid in nids ]
        nodeClassMeans.append(np.mean(perNode))
    meanClassesPerNode = np.mean(nodeClassMeans)

    # Pass 3: pool every node's labels within an edge and count distinct classes.
    edgeClassCounts = []
    for nids in nids_byGid:
        pooled = np.concatenate([ dataByNid[nid]['y'] for nid in nids ])
        edgeClassCounts.append(countClasses(pooled))
    meanClassesPerEdge = np.mean(edgeClassCounts)

    # Type checks only inspect the first sample of the first node.
    firstNode = dataByNid[0]
    firstSample = firstNode['x'][0]
    firstLabel = firstNode['y'][0]
    sampleType = type(firstSample)
    pixelType = type(firstSample.flatten()[0])
    labelType = type(firstLabel.flatten()[0])

    print(nodeCount, edgeCount, meanNodesPerEdge, meanClassesPerNode, meanClassesPerEdge, sampleType, pixelType, labelType)

    # (actual, expected) pairs, checked in a fixed order so the first mismatch raises.
    checks = (
        (nodeCount, numNodes_),
        (edgeCount, numEdges_),
        (meanNodesPerEdge, numNodesPerEdge_),
        (meanClassesPerNode, numClassesPerNode_),
        (meanClassesPerEdge, numClassesPerEdge_),
        (sampleType, np.ndarray),
        (pixelType, np.float32),
        (labelType, np.int32),
    )
    for actual, expected in checks:
        assert actual == expected

# Each case: (node-level mode, edge-level mode, expected classes per node,
# expected classes per edge). The two mode strings are forwarded to
# hfl_util.groupByEdge in this order.
for nodeMode, edgeMode, expectedPerNode, expectedPerEdge in (
    ('one', 'one', 1, 1),
    ('one', 'half', 1, numTotalClasses/2),
    ('one', 'all', 1, numTotalClasses),
    ('half', 'half', numTotalClasses/2, numTotalClasses/2),
    ('half', 'all', numTotalClasses/2, numTotalClasses),
    ('all', 'all', numTotalClasses, numTotalClasses),
):
    trainData_byNid, train_z = hfl_util.groupByEdge(
        MODEL_NAME, DATA_NAME, trainData, nodeMode, edgeMode, NUM_NODES, NUM_EDGES, flatten)
    assert_groupByEdge(
        trainData_byNid, train_z, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE,
        expectedPerNode, expectedPerEdge)

400 20 20.0 1.0 1.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
400 20 20.0 1.0 5.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
400 20 20.0 1.0 10.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
400 20 20.0 5.0 5.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
400 20 20.0 5.0 10.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
400 20 20.0 10.0 10.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>


In [4]:
# Same edge-grouping checks as the previous cell, with 600 nodes over 20 edges.
NUM_NODES, NUM_EDGES = 600, 20
NUM_NODES_PER_EDGE = NUM_NODES / NUM_EDGES

# Each case: (node-level mode, edge-level mode, expected classes per node,
# expected classes per edge). The two mode strings are forwarded to
# hfl_util.groupByEdge in this order.
for nodeMode, edgeMode, expectedPerNode, expectedPerEdge in (
    ('one', 'one', 1, 1),
    ('one', 'half', 1, numTotalClasses/2),
    ('one', 'all', 1, numTotalClasses),
    ('half', 'half', numTotalClasses/2, numTotalClasses/2),
    ('half', 'all', numTotalClasses/2, numTotalClasses),
    ('all', 'all', numTotalClasses, numTotalClasses),
):
    trainData_byNid, train_z = hfl_util.groupByEdge(
        MODEL_NAME, DATA_NAME, trainData, nodeMode, edgeMode, NUM_NODES, NUM_EDGES, flatten)
    assert_groupByEdge(
        trainData_byNid, train_z, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE,
        expectedPerNode, expectedPerEdge)

600 20 30.0 1.0 1.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 1.0 5.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 1.0 10.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 5.0 5.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 5.0 10.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 10.0 10.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
