# Test Preprocess & Repeat

In [1]:
import numpy as np
import os

import fl_data
import fl_util

# Experiment configuration.
MODEL_NAME = 'sr'
DATA_NAME = 'mnist-o' # mnist-o / mnist-f / cifar10

# Load the training data through fl_util.deserialize from the path built out of
# '..', 'data', DATA_NAME and 'train'.
trainDataPath = os.path.join('..', 'data', DATA_NAME, 'train')
trainData_by1Nid = fl_util.deserialize(trainDataPath)

# Inspect entry 0 of the loaded data: its length plus the shapes of 'x' and 'y'
# and the distinct values found in 'y'.
firstEntry = trainData_by1Nid[0]
print(len(firstEntry), firstEntry['x'].shape)
labelValues = np.unique(firstEntry['y'])
print(len(firstEntry), firstEntry['y'].shape, labelValues)
numTotalClasses = len(labelValues)

# Number of 'x' entries in each group returned by fl_data.groupByClass.
examplesPerGroup = [ len(group['x']) for group in fl_data.groupByClass(trainData_by1Nid) ]
print(examplesPerGroup)

# Draw a sample with argument 1000 and report the 'x' / 'y' lengths of its entry 0.
trainData_by1Nid_sampled = fl_data.sample(trainData_by1Nid, 1000)
sampledEntry = trainData_by1Nid_sampled[0]
print(len(sampledEntry['x']), len(sampledEntry['y']))

2 (42000, 28, 28)
2 (42000,) [0 1 2 3 4 5 6 7 8 9]
[4206, 4700, 4182, 4271, 4068, 3791, 4160, 4360, 4127, 4135]
1000 1000


# Test Node Non-IID

In [2]:
# Number of nodes requested from fl_data.groupByNode in this cell.
NUM_NODES = 150
# Length of 'y' in entry 0 of the loaded training data; passed to
# assert_groupByNode as the expected total number of examples across all nodes.
NUM_TOTAL_EXAMPLES = len(trainData_by1Nid[0]['y'])

def assert_groupByNode(dataByNid, numNodes_, numTotalExamples_, numClassesPerNode_):
    """Print summary statistics of a per-node dataset and assert its basic invariants.

    dataByNid is indexed by node; each entry is subscripted with 'x' and 'y'.

    Printed, in order: the node count, the total number of 'x' entries, the mean
    number of unique 'y' values per node, and three types sampled from node 0
    (its first example, that example's first flattened element, and the first
    flattened element of its first label).

    Asserted: every node holds more than one example, the node count equals
    numNodes_, the total example count equals numTotalExamples_, and the sampled
    types are np.ndarray, np.float32 and np.int32 respectively.

    NOTE: numClassesPerNode_ is accepted but deliberately not asserted, because
    the measured classes-per-node value is stochastic.
    """
    exampleCounts = [ len(node['x']) for node in dataByNid ]
    classCounts = [ len(np.unique(node['y'])) for node in dataByNid ]

    # Only node 0 is sampled for the type checks.
    firstNode = dataByNid[0]
    firstExample = firstNode['x'][0]
    observed = (
        len(dataByNid),
        sum(exampleCounts),
        np.mean(classCounts),
        type(firstExample),
        type(firstExample.flatten()[0]),
        type(firstNode['y'][0].flatten()[0]),
    )
    print(*observed)

    numNodes, numTotalExamples, _, xType, xPixelType, yType = observed
    assert all(count > 1 for count in exampleCounts)
    assert numNodes == numNodes_
    assert numTotalExamples == numTotalExamples_
    assert xType is np.ndarray
    assert xPixelType is np.float32
    assert yType is np.int32

# Each case: (second argument to fl_data.groupByNode, nominal classes per node).
# The meaning of the grouping argument is not visible here -- presumably it
# controls how many classes each node receives; TODO confirm against fl_data.
# The nominal value is forwarded to assert_groupByNode, which does not assert on it.
nodeCases = [
    (2, 1),
    (4, numTotalClasses/5),
    (6, numTotalClasses/2),
    (8, numTotalClasses),
]

for groupArg, nominalClassesPerNode in nodeCases:
    trainData_byNid = fl_data.groupByNode(trainData_by1Nid, groupArg, NUM_NODES)
    assert_groupByNode(trainData_byNid, NUM_NODES, NUM_TOTAL_EXAMPLES, nominalClassesPerNode)

150 42000 2.1533333333333333 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
150 42000 3.96 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
150 42000 5.96 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
150 42000 7.94 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>


# Test Node & Edge Non-IID

In [3]:
# Number of nodes and edges requested from fl_data.groupByEdge in this cell.
NUM_NODES = 100
NUM_EDGES = 10
# True division, so this is a float (10.0); assert_groupByEdge compares it with
# an np.mean of per-edge node counts.
NUM_NODES_PER_EDGE = NUM_NODES / NUM_EDGES

def assert_groupByEdge(dataByNid, z, numNodes_, numEdges_, numNodesPerEdge_, numClassesPerNode_, numClassesPerEdge_):
    """Print summary statistics of a node/edge grouping and assert its basic invariants.

    dataByNid is indexed by node id; each entry is subscripted with 'x' and 'y'.
    z is passed to np.unique (to count edges) and to fl_data.to_nids_byGid,
    whose result is iterated as one collection of node ids per edge group.

    Printed, in order: the node count, the number of unique values in z, the mean
    number of nodes per group, the mean over groups of the per-group mean number
    of unique 'y' values per node, the mean number of unique 'y' values per group
    (labels of all its nodes concatenated), and three types sampled from node 0.

    Asserted: every node holds more than one example, the node / edge / nodes-per-
    edge figures equal numNodes_ / numEdges_ / numNodesPerEdge_, and the sampled
    types are np.ndarray, np.float32 and np.int32 respectively.

    NOTE: numClassesPerNode_ and numClassesPerEdge_ are accepted but deliberately
    not asserted; the measured classes-per-node value is stochastic.
    """
    numNodes = len(dataByNid)
    numEdges = len(np.unique(z))

    # Gather the per-group statistics in one pass over the edge groups.
    groupSizes = []
    meanNodeClassCounts = []
    groupClassCounts = []
    for nids in fl_data.to_nids_byGid(z):
        labelsPerNode = [ dataByNid[nid]['y'] for nid in nids ]
        groupSizes.append(len(nids))
        meanNodeClassCounts.append(np.mean([ len(np.unique(labels)) for labels in labelsPerNode ]))
        groupClassCounts.append(len(np.unique(np.concatenate(labelsPerNode))))

    numNodesPerEdge = np.mean(groupSizes)
    numClassesPerNode = np.mean(meanNodeClassCounts)
    numClassesPerEdge = np.mean(groupClassCounts)

    # Only node 0 is sampled for the type checks.
    firstNode = dataByNid[0]
    firstExample = firstNode['x'][0]
    xType = type(firstExample)
    xPixelType = type(firstExample.flatten()[0])
    yType = type(firstNode['y'][0].flatten()[0])

    print(numNodes, numEdges, numNodesPerEdge, numClassesPerNode, numClassesPerEdge, xType, xPixelType, yType)

    assert all(len(node['x']) > 1 for node in dataByNid)
    assert numNodes == numNodes_
    assert numEdges == numEdges_
    assert numNodesPerEdge == numNodesPerEdge_
    assert xType is np.ndarray
    assert xPixelType is np.float32
    assert yType is np.int32

# Each case: (2nd argument to fl_data.groupByEdge, 3rd argument to fl_data.groupByEdge,
#             nominal classes per node, nominal classes per edge).
# The meaning of the two grouping arguments is not visible here -- presumably
# they set the class spread per node and per edge; TODO confirm against fl_data.
# The nominal values are forwarded to assert_groupByEdge, which does not assert on them.
edgeCases = [
    (2, 2, 1, 1),
    (2, 4, 1, numTotalClasses/5),
    (3, 4, 1, numTotalClasses/2),
    (4, 4, 1, numTotalClasses),
    (2, 6, numTotalClasses/5, numTotalClasses/5),
    (4, 6, numTotalClasses/5, 6),
    (6, 6, numTotalClasses/5, numTotalClasses),
    (2, 8, numTotalClasses/2, numTotalClasses/2),
    (4, 8, numTotalClasses/2, numTotalClasses),
    (8, 8, numTotalClasses, numTotalClasses),
]

for groupArg1, groupArg2, nominalPerNode, nominalPerEdge in edgeCases:
    (trainData_byNid, train_z) = fl_data.groupByEdge(trainData_by1Nid, groupArg1, groupArg2, NUM_NODES, NUM_EDGES)
    assert_groupByEdge(trainData_byNid, train_z, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, nominalPerNode, nominalPerEdge)

100 10 10.0 1.77 2.0 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.0 2.2 4.1 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.0 2.8899999999999997 4.1 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.0 3.65 4.1 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.0 2.2199999999999998 6.1 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.0 4.17 6.1 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.0 5.8100000000000005 6.1 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.0 2.29 8.1 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.0 4.15 8.1 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.0 7.7 8.1 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>


In [None]:
# Repeat the node/edge grouping checks with a larger topology.
NUM_NODES = 600
NUM_EDGES = 20
NUM_NODES_PER_EDGE = NUM_NODES / NUM_EDGES  # true division: a float

# Each case: (2nd argument to fl_data.groupByEdge, 3rd argument to fl_data.groupByEdge,
#             nominal classes per node, nominal classes per edge).
# The meaning of the two grouping arguments is not visible here -- presumably
# they set the class spread per node and per edge; TODO confirm against fl_data.
# The nominal values are forwarded to assert_groupByEdge, which does not assert on them.
largeEdgeCases = [
    (2, 2, 1, 1),
    (2, 4, 1, numTotalClasses/5),
    (3, 4, 1, numTotalClasses/2),
    (4, 4, 1, numTotalClasses),
    (2, 6, numTotalClasses/5, numTotalClasses/5),
    (4, 6, numTotalClasses/5, 6),
    (6, 6, numTotalClasses/5, numTotalClasses),
    (2, 8, numTotalClasses/2, numTotalClasses/2),
    (4, 8, numTotalClasses/2, numTotalClasses),
    (8, 8, numTotalClasses, numTotalClasses),
]

for groupArg1, groupArg2, nominalPerNode, nominalPerEdge in largeEdgeCases:
    (trainData_byNid, train_z) = fl_data.groupByEdge(trainData_by1Nid, groupArg1, groupArg2, NUM_NODES, NUM_EDGES)
    assert_groupByEdge(trainData_byNid, train_z, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, nominalPerNode, nominalPerEdge)

600 20 30.0 1.7449999999999999 2.05 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 2.1866666666666665 4.05 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 2.9233333333333333 4.05 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 3.633333333333333 4.05 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 2.1566666666666667 5.9 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 3.893333333333333 5.9 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
600 20 30.0 5.525 5.9 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
