# Test Preprocess

In [1]:
import numpy as np
import os

import fl_const
import fl_data
import fl_util

fl_util.initialize()

# Dataset under test; one of: mnist-o / mnist-f / femnist / celeba
DATA_NAME = 'femnist'

# Load the training split stored under <DATA_DIR_PATH>/<DATA_NAME>/train.
trainDataPath = os.path.join(fl_const.DATA_DIR_PATH, DATA_NAME, 'train')
trainData_by1Nid = fl_util.deserialize(trainDataPath)

# Inspect the first entry (index 0), which is indexed by 'x' and 'y'.
firstNode = trainData_by1Nid[0]
classLabels = np.unique(firstNode['y'])
print(len(firstNode), firstNode['x'].shape)
print(len(firstNode), firstNode['y'].shape, classLabels)
numTotalClasses = len(classLabels)

# Number of examples held by each group returned by groupByClass.
examplesPerClass = [ len(classData['x']) for classData in fl_data.groupByClass(trainData_by1Nid) ]
print(examplesPerClass)

# Sampling with 1000 should leave matching 'x' / 'y' lengths in the first entry.
trainData_by1Nid_sampled = fl_data.sample(trainData_by1Nid, 1000)
sampledNode = trainData_by1Nid_sampled[0]
print(len(sampledNode['x']), len(sampledNode['y']))

# Group Representation Test
# Round trip: z -> node ids per group -> z must reproduce the original z.
NUM_NODES = 100
NUM_GROUPS = 10
z1 = fl_data.groupRandomly(NUM_NODES, NUM_GROUPS)
nids_byGid = fl_data.to_nids_byGid(z1)
z2 = fl_data.to_z(NUM_NODES, nids_byGid)
assert z1 == z2

2 (47011, 28, 28, 1)
2 (47011,) [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61]
[2320, 2628, 2334, 2459, 2350, 2162, 2382, 2440, 2385, 2287, 422, 250, 662, 307, 290, 608, 174, 207, 776, 261, 174, 356, 590, 557, 1632, 574, 188, 329, 1408, 583, 814, 321, 303, 209, 337, 191, 646, 337, 179, 649, 1639, 181, 261, 576, 179, 127, 144, 933, 150, 793, 185, 180, 200, 933, 214, 1130, 189, 191, 199, 174, 163, 189]
1000 1000


# Test Node Non-IID

In [2]:
# Number of nodes the training data is split into for the node-level checks below.
NUM_NODES = 150
# Total example count, taken from the label array of the first entry of the loaded data.
NUM_TOTAL_EXAMPLES = len(trainData_by1Nid[0]['y'])

def assert_groupByNode(dataByNid, numNodes_, numTotalExamples_, numClassesPerNode_):
    """Check that a per-node split of the data matches the expected statistics.

    Prints one summary line (node count, total example count, mean number of
    distinct labels per node, and the types of a sample, a pixel and a label
    taken from the first node), then asserts every expectation below.

    Args:
        dataByNid: Sequence of per-node mappings with 'x' (samples) and
            'y' (labels) entries.
        numNodes_: Expected number of nodes.
        numTotalExamples_: Expected number of examples summed over all nodes.
        numClassesPerNode_: Expected mean number of distinct labels per node.
            Checked with a 15% relative tolerance; the check is skipped when
            the module-level DATA_NAME is 'celeba'.

    Raises:
        AssertionError: If dataByNid is empty or any expectation is violated.
            The message names the failed check and the offending values.
    """
    numNodes = len(dataByNid)
    # Guard first: the type probes below index dataByNid[0].
    assert numNodes > 0, 'dataByNid is empty'
    numTotalExamples = sum( len(n['x']) for n in dataByNid )
    numClassesPerNode = np.mean([ len(np.unique(n['y'])) for n in dataByNid ])
    xType = type(dataByNid[0]['x'][0])
    xPixelType = type(dataByNid[0]['x'][0].flatten()[0])
    yType = type(dataByNid[0]['y'][0].flatten()[0])
    print('%d %d %.2f %s %s %s' % (numNodes, numTotalExamples, numClassesPerNode, xType, xPixelType, yType))

    if DATA_NAME != 'celeba': # Due to the lack of number of classes in 'celeba'
        relErr = abs((numClassesPerNode - numClassesPerNode_) / numClassesPerNode_)
        assert relErr < 0.15, (
            'numClassesPerNode: got %.2f, expected %.2f (relative error %.3f, limit 0.15)'
            % (numClassesPerNode, numClassesPerNode_, relErr))
    # Every node must hold more than one example.
    smallNids = [ nid for nid, n in enumerate(dataByNid) if not len(n['x']) > 1 ]
    assert not smallNids, 'nodes with fewer than 2 examples: %s' % (smallNids,)
    assert numNodes == numNodes_, 'numNodes: got %d, expected %s' % (numNodes, numNodes_)
    assert numTotalExamples == numTotalExamples_, (
        'numTotalExamples: got %d, expected %s' % (numTotalExamples, numTotalExamples_))
    assert xType == np.ndarray, 'xType: got %s, expected numpy.ndarray' % (xType,)
    assert xPixelType == np.float32, 'xPixelType: got %s, expected numpy.float32' % (xPixelType,)
    assert yType == np.int32, 'yType: got %s, expected numpy.int32' % (yType,)

# Each grouping mode is paired with the mean number of classes per node it is
# expected to produce: 't' -> total/10, 'q' -> total/4, 'h' -> total/2, 'a' -> total.
nodeModeExpectations = (
    ('t', numTotalClasses/10),
    ('q', numTotalClasses/4),
    ('h', numTotalClasses/2),
    ('a', numTotalClasses),
)
for nodeMode, expectedClassesPerNode in nodeModeExpectations:
    trainData_byNid = fl_data.groupByNode(trainData_by1Nid, nodeMode, NUM_NODES)
    assert_groupByNode(trainData_byNid, NUM_NODES, NUM_TOTAL_EXAMPLES, expectedClassesPerNode)

150 47011 6.28 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
150 47011 15.51 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
150 47011 30.96 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
150 47011 55.03 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>


# Test Node & Edge Non-IID

In [3]:
# Node and edge counts used by the node-and-edge grouping checks below.
NUM_NODES = 100
NUM_EDGES = 10
# True division, so this is a float (10.0); it is compared against the mean
# group size computed inside assert_groupByEdge.
NUM_NODES_PER_EDGE = NUM_NODES / NUM_EDGES

def assert_groupByEdge(dataByNid, z, numNodes_, numEdges_, numNodesPerEdge_, numClassesPerNode_, numClassesPerEdge_):
    """Check that a node-and-edge split of the data matches the expected statistics.

    Prints one summary line (node count, edge count, mean nodes per edge, mean
    distinct labels per node, mean distinct labels per edge, and the types of
    a sample, a pixel and a label taken from the first node), then asserts
    every expectation below.

    Args:
        dataByNid: Sequence of per-node mappings with 'x' (samples) and
            'y' (labels) entries.
        z: Group assignment passed to fl_data.to_nids_byGid; its number of
            distinct values is taken as the edge count.
            NOTE(review): presumably z[nid] is the edge id of node nid - confirm.
        numNodes_: Expected number of nodes.
        numEdges_: Expected number of distinct edges in z.
        numNodesPerEdge_: Expected mean number of nodes per edge (exact match).
        numClassesPerNode_: Expected mean number of distinct labels per node,
            averaged within each edge and then across edges.
        numClassesPerEdge_: Expected mean number of distinct labels per edge.
            Both class expectations use a 20% relative tolerance and are
            skipped when the module-level DATA_NAME is 'celeba'.

    Raises:
        AssertionError: If dataByNid is empty or any expectation is violated.
            The message names the failed check and the offending values.
    """
    numNodes = len(dataByNid)
    # Guard first: the type probes below index dataByNid[0].
    assert numNodes > 0, 'dataByNid is empty'
    numEdges = len(np.unique(z))
    nids_byGid = fl_data.to_nids_byGid(z)
    numNodesPerEdge = np.mean([ len(nids) for nids in nids_byGid ])
    numClassesPerNode = np.mean([ np.mean([ len(np.unique(dataByNid[nid]['y'])) for nid in nids ]) for nids in nids_byGid ])
    numClassesPerEdge = np.mean([ len(np.unique(np.concatenate([dataByNid[nid]['y'] for nid in nids]))) for nids in nids_byGid ])
    xType = type(dataByNid[0]['x'][0])
    xPixelType = type(dataByNid[0]['x'][0].flatten()[0])
    yType = type(dataByNid[0]['y'][0].flatten()[0])
    print('%d %d %.2f %.2f %.2f %s %s %s' % (numNodes, numEdges, numNodesPerEdge, numClassesPerNode, numClassesPerEdge, xType, xPixelType, yType))
    if DATA_NAME != 'celeba': # Due to the lack of number of classes in 'celeba'
        nodeRelErr = abs((numClassesPerNode - numClassesPerNode_) / numClassesPerNode_)
        assert nodeRelErr < 0.2, (
            'numClassesPerNode: got %.2f, expected %.2f (relative error %.3f, limit 0.2)'
            % (numClassesPerNode, numClassesPerNode_, nodeRelErr))
        edgeRelErr = abs((numClassesPerEdge - numClassesPerEdge_) / numClassesPerEdge_)
        assert edgeRelErr < 0.2, (
            'numClassesPerEdge: got %.2f, expected %.2f (relative error %.3f, limit 0.2)'
            % (numClassesPerEdge, numClassesPerEdge_, edgeRelErr))
    # Every node must hold more than one example.
    smallNids = [ nid for nid, n in enumerate(dataByNid) if not len(n['x']) > 1 ]
    assert not smallNids, 'nodes with fewer than 2 examples: %s' % (smallNids,)
    assert numNodes == numNodes_, 'numNodes: got %d, expected %s' % (numNodes, numNodes_)
    assert numEdges == numEdges_, 'numEdges: got %d, expected %s' % (numEdges, numEdges_)
    assert numNodesPerEdge == numNodesPerEdge_, (
        'numNodesPerEdge: got %s, expected %s' % (numNodesPerEdge, numNodesPerEdge_))
    assert xType == np.ndarray, 'xType: got %s, expected numpy.ndarray' % (xType,)
    assert xPixelType == np.float32, 'xPixelType: got %s, expected numpy.float32' % (xPixelType,)
    assert yType == np.int32, 'yType: got %s, expected numpy.int32' % (yType,)

# Each grouping mode is paired with the mean class count it is expected to
# produce: 't' -> total/10, 'q' -> total/4, 'h' -> total/2, 'a' -> total.
modeExpectations = [
    ('t', numTotalClasses/10),
    ('q', numTotalClasses/4),
    ('h', numTotalClasses/2),
    ('a', numTotalClasses),
]
# The first mode applies at node level, the second at edge level; the edge
# mode only ranges over modes at or after the node mode in the table above.
for modeIdx, (nodeMode, expectedClassesPerNode) in enumerate(modeExpectations):
    for edgeMode, expectedClassesPerEdge in modeExpectations[modeIdx:]:
        (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, nodeMode, edgeMode, NUM_NODES, NUM_EDGES)
        assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, expectedClassesPerNode, expectedClassesPerEdge)

100 10 10.00 5.96 6.30 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.00 6.30 15.40 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.00 6.25 31.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.00 6.24 62.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.00 15.09 15.50 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.00 15.47 31.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.00 15.49 62.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.00 30.59 31.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.00 30.92 62.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
100 10 10.00 58.37 59.80 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>


In [4]:
# Disabled: larger-scale variant (600 nodes, 20 edges) of the node-and-edge checks above.
# NUM_NODES = 600
# NUM_EDGES = 20
# NUM_NODES_PER_EDGE = NUM_NODES / NUM_EDGES

# (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, 't', 't', NUM_NODES, NUM_EDGES)
# assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, numTotalClasses/10, numTotalClasses/10)

# (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, 't', 'q', NUM_NODES, NUM_EDGES)
# assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, numTotalClasses/10, numTotalClasses/4)

# (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, 't', 'h', NUM_NODES, NUM_EDGES)
# assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, numTotalClasses/10, numTotalClasses/2)

# (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, 't', 'a', NUM_NODES, NUM_EDGES)
# assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, numTotalClasses/10, numTotalClasses)

# (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, 'q', 'q', NUM_NODES, NUM_EDGES)
# assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, numTotalClasses/4, numTotalClasses/4)

# (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, 'q', 'h', NUM_NODES, NUM_EDGES)
# assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, numTotalClasses/4, numTotalClasses/2)

# (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, 'q', 'a', NUM_NODES, NUM_EDGES)
# assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, numTotalClasses/4, numTotalClasses)

# (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, 'h', 'h', NUM_NODES, NUM_EDGES)
# assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, numTotalClasses/2, numTotalClasses/2)

# (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, 'h', 'a', NUM_NODES, NUM_EDGES)
# assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, numTotalClasses/2, numTotalClasses)

# (trainData_byNid, z_edge) = fl_data.groupByEdge(trainData_by1Nid, 'a', 'a', NUM_NODES, NUM_EDGES)
# assert_groupByEdge(trainData_byNid, z_edge, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE, numTotalClasses, numTotalClasses)