# Test Preprocess & Repeat

In [1]:
import numpy as np
import os

import fl_data
import fl_util

# Dataset under test. Supported values: mnist-o / mnist-f / femnist / celeba
DATA_NAME = 'celeba'

# Load the training split; element 0 is used below as a dict with 'x' and 'y' arrays.
trainDir = os.path.join('..', 'data', DATA_NAME, 'train')
trainData_by1Nid = fl_util.deserialize(trainDir)

# Report the entry count of element 0 together with the shapes of its
# examples and labels, and list the distinct label values.
wholeTrain = trainData_by1Nid[0]
distinctLabels = np.unique(wholeTrain['y'])
print(len(wholeTrain), wholeTrain['x'].shape)
print(len(wholeTrain), wholeTrain['y'].shape, distinctLabels)
numTotalClasses = len(distinctLabels)

# Number of examples in each group returned by groupByClass.
classSizes = [ len(group['x']) for group in fl_data.groupByClass(trainData_by1Nid) ]
print(classSizes)

# Draw a sample with the requested size of 1000 and confirm that 'x' and 'y'
# have matching lengths.
trainData_by1Nid_sampled = fl_data.sample(trainData_by1Nid, 1000)
sampledTrain = trainData_by1Nid_sampled[0]
print(len(sampledTrain['x']), len(sampledTrain['y']))

2 (6008, 84, 84, 3)
2 (6008,) [0 1]
[3099, 2909]
1000 1000


# Test Node Non-IID

In [2]:
# Number of nodes the training set is partitioned across in this section.
NUM_NODES = 150
# Total label count of the unpartitioned set; every partitioning is checked
# against this so that no example is lost or duplicated.
NUM_TOTAL_EXAMPLES = trainData_by1Nid[0]['y'].shape[0]

def assert_groupByNode(dataByNid, numNodes_, numTotalExamples_, numClassesPerNode_):
    """Sanity-check a per-node partitioning of the training set.

    Prints a one-line summary (node count, total examples, mean number of
    distinct labels per node, and the example / pixel / label types taken
    from the first node) and then asserts the invariants listed under Raises.

    Args:
        dataByNid: Non-empty sequence of per-node dicts, each with an 'x'
            array of examples and a 'y' array of labels.
        numNodes_: Expected number of nodes.
        numTotalExamples_: Expected number of examples summed over all nodes.
        numClassesPerNode_: Expected mean number of distinct labels per node.
            Currently informational only; the tolerance check is disabled.

    Raises:
        AssertionError: If dataByNid is empty, any node holds fewer than two
            examples, the node or example counts differ from the expected
            values, or the first node's types are not ndarray / float32 / int32.
    """
    # Guard first: the statistics and type probes below index dataByNid[0].
    assert len(dataByNid) > 0, 'dataByNid is empty; nothing to validate'

    numNodes = len(dataByNid)
    numTotalExamples = sum( len(n['x']) for n in dataByNid )
    numClassesPerNode = np.mean([ len(np.unique(n['y'])) for n in dataByNid ])
    # Types are sampled from the first example of the first node only.
    xType = type(dataByNid[0]['x'][0])
    xPixelType = type(dataByNid[0]['x'][0].flatten()[0])
    yType = type(dataByNid[0]['y'][0].flatten()[0])
    print('%d %d %.2f %s %s %s' % (numNodes, numTotalExamples, numClassesPerNode, xType, xPixelType, yType))

    undersized = [ nid for (nid, n) in enumerate(dataByNid) if len(n['x']) <= 1 ]
    assert not undersized, 'nodes with fewer than 2 examples: %s' % undersized
    assert numNodes == numNodes_, 'expected %s nodes, got %s' % (numNodes_, numNodes)
    assert numTotalExamples == numTotalExamples_, \
        'expected %s total examples, got %s' % (numTotalExamples_, numTotalExamples)
    # NOTE(review): tolerance check on classes-per-node is intentionally left
    # disabled, as in the original; presumably it cannot hold for datasets
    # with very few classes -- confirm before re-enabling.
#     assert( abs((numClassesPerNode - numClassesPerNode_) / numClassesPerNode_) < 0.1 )
    assert xType == np.ndarray, 'expected examples of type numpy.ndarray, got %s' % xType
    assert xPixelType == np.float32, 'expected numpy.float32 pixels, got %s' % xPixelType
    assert yType == np.int32, 'expected numpy.int32 labels, got %s' % yType

# Partition the full training set under each node-level mode and validate it.
# Each entry pairs a mode code with the expected mean classes per node
# (presumably one / quarter / half / all of the classes, judging by the
# expected values -- confirm against fl_data.groupByNode).
for (nodeMode, expectedClassesPerNode) in [
    ('o', 1),
    ('q', numTotalClasses/4),
    ('h', numTotalClasses/2),
    ('a', numTotalClasses),
]:
    trainData_byNid = fl_data.groupByNode(trainData_by1Nid, nodeMode, NUM_NODES)
    assert_groupByNode(trainData_byNid, NUM_NODES, NUM_TOTAL_EXAMPLES, expectedClassesPerNode)

1 2
150 6008 1.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
1.0 2
150 6008 1.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
1.0 2


KeyboardInterrupt: 

# Test Node & Edge Non-IID

In [4]:
# Topology for the node-and-edge tests: total node count and edge count.
NUM_NODES, NUM_EDGES = 400, 20
# True division, so this is a float; it is compared against the mean nodes per edge.
NUM_NODES_PER_EDGE = NUM_NODES / NUM_EDGES

def assert_groupByEdge(dataByNid, z, numNodes_, numEdges_, numNodesPerEdge_, numClassesPerNode_, numClassesPerEdge_):
    """Sanity-check a node-and-edge partitioning of the training set.

    Prints a one-line summary (node count, edge count, mean nodes per edge,
    mean distinct labels per node and per edge, and the example / pixel /
    label types taken from the first node) and then asserts the invariants
    listed under Raises.

    Args:
        dataByNid: Non-empty sequence of per-node dicts, each with an 'x'
            array of examples and a 'y' array of labels.
        z: Edge assignment. Its distinct values are counted as edges, and it
            is passed to fl_data.to_nids_byGid, whose result is iterated as
            one collection of node ids per edge.
        numNodes_: Expected number of nodes.
        numEdges_: Expected number of distinct edges.
        numNodesPerEdge_: Expected mean number of nodes per edge.
        numClassesPerNode_: Expected mean distinct labels per node. Currently
            informational only; the tolerance check is disabled.
        numClassesPerEdge_: Expected mean distinct labels per edge. Currently
            informational only; the tolerance check is disabled.

    Raises:
        AssertionError: If dataByNid is empty, any node holds fewer than two
            examples, the node / edge / nodes-per-edge figures differ from
            the expected values, or the first node's types are not
            ndarray / float32 / int32.
    """
    # Guard first: the type probes below index dataByNid[0].
    assert len(dataByNid) > 0, 'dataByNid is empty; nothing to validate'

    numNodes = len(dataByNid)
    numEdges = len(np.unique(z))
    nids_byGid = fl_data.to_nids_byGid(z)
    numNodesPerEdge = np.mean([ len(nids) for nids in nids_byGid ])
    # Mean over edges of the mean distinct-label count of that edge's nodes.
    numClassesPerNode = np.mean([ np.mean([ len(np.unique(dataByNid[nid]['y'])) for nid in nids ]) for nids in nids_byGid ])
    # Mean over edges of the distinct-label count of the edge's pooled labels.
    numClassesPerEdge = np.mean([ len(np.unique(np.concatenate([dataByNid[nid]['y'] for nid in nids]))) for nids in nids_byGid ])
    # Types are sampled from the first example of the first node only.
    xType = type(dataByNid[0]['x'][0])
    xPixelType = type(dataByNid[0]['x'][0].flatten()[0])
    yType = type(dataByNid[0]['y'][0].flatten()[0])
    print('%d %d %.2f %.2f %.2f %s %s %s' % (numNodes, numEdges, numNodesPerEdge, numClassesPerNode, numClassesPerEdge, xType, xPixelType, yType))

    undersized = [ nid for (nid, n) in enumerate(dataByNid) if len(n['x']) <= 1 ]
    assert not undersized, 'nodes with fewer than 2 examples: %s' % undersized
    assert numNodes == numNodes_, 'expected %s nodes, got %s' % (numNodes_, numNodes)
    assert numEdges == numEdges_, 'expected %s edges, got %s' % (numEdges_, numEdges)
    # Both sides are floats (a mean and a true division), so compare with a
    # tolerance rather than exact equality.
    assert np.isclose(numNodesPerEdge, numNodesPerEdge_), \
        'expected %s nodes per edge, got %s' % (numNodesPerEdge_, numNodesPerEdge)
    # NOTE(review): tolerance checks on class counts are intentionally left
    # disabled, as in the original -- confirm before re-enabling.
#     assert( abs((numClassesPerNode - numClassesPerNode_) / numClassesPerNode_) < 0.2 )
#     assert( abs((numClassesPerEdge - numClassesPerEdge_) / numClassesPerEdge_) < 0.2 )
    assert xType == np.ndarray, 'expected examples of type numpy.ndarray, got %s' % xType
    assert xPixelType == np.float32, 'expected numpy.float32 pixels, got %s' % xPixelType
    assert yType == np.int32, 'expected numpy.int32 labels, got %s' % yType

# Expected mean class count for each distribution mode code (presumably
# one / quarter / half / all of the classes -- confirm against fl_data).
expectedClasses = {
    'o': 1,
    'q': numTotalClasses/4,
    'h': numTotalClasses/2,
    'a': numTotalClasses,
}

# (node mode, edge mode) combinations to exercise, in execution order.
modePairs = [
    ('o', 'o'), ('o', 'q'), ('o', 'h'), ('o', 'a'),
    ('q', 'q'), ('q', 'h'), ('q', 'a'),
    ('h', 'h'), ('h', 'a'),
    ('a', 'a'),
]

for (nodeMode, edgeMode) in modePairs:
    (trainData_byNid, train_z) = fl_data.groupByEdge(trainData_by1Nid, nodeMode, edgeMode, NUM_NODES, NUM_EDGES)
    assert_groupByEdge(trainData_byNid, train_z, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE,
                       expectedClasses[nodeMode], expectedClasses[edgeMode])

1 1
400 20 20.00 1.00 1.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
1 1.0
400 20 20.00 1.00 1.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
1 1.0
400 20 20.00 1.00 1.00 <class 'numpy.ndarray'> <class 'numpy.float32'> <class 'numpy.int32'>
1 2


ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
# Repeat the node-and-edge checks with a larger node count.
NUM_NODES, NUM_EDGES = 600, 20
# True division, so this is a float; it is compared against the mean nodes per edge.
NUM_NODES_PER_EDGE = NUM_NODES / NUM_EDGES

# Expected mean class count for each distribution mode code (presumably
# one / quarter / half / all of the classes -- confirm against fl_data).
expectedClasses = {
    'o': 1,
    'q': numTotalClasses/4,
    'h': numTotalClasses/2,
    'a': numTotalClasses,
}

# (node mode, edge mode) combinations to exercise, in execution order.
modePairs = [
    ('o', 'o'), ('o', 'q'), ('o', 'h'), ('o', 'a'),
    ('q', 'q'), ('q', 'h'), ('q', 'a'),
    ('h', 'h'), ('h', 'a'),
    ('a', 'a'),
]

for (nodeMode, edgeMode) in modePairs:
    (trainData_byNid, train_z) = fl_data.groupByEdge(trainData_by1Nid, nodeMode, edgeMode, NUM_NODES, NUM_EDGES)
    assert_groupByEdge(trainData_byNid, train_z, NUM_NODES, NUM_EDGES, NUM_NODES_PER_EDGE,
                       expectedClasses[nodeMode], expectedClasses[edgeMode])