In [1]:
import pandas as pd
import math

# Loading data
dataset = pd.read_csv('data/data.csv')

# get randomly training set
mec_dataset = dataset.sample(639)
mec_dataset

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,279,280,281,282,283,284,285,286,287,288
557,0,41,13,33,28,14,57,19,7,15,...,0,0,0,0,0,0,0,0,0,0
333,1,67,22,17,41,36,69,114,85,66,...,0,0,0,0,0,0,0,0,0,0
581,1,15,66,27,25,41,20,36,42,46,...,0,0,0,0,0,0,0,0,0,0
95,0,60,68,55,42,74,46,42,20,54,...,0,0,0,0,0,0,0,0,0,0
35,1,28,45,56,64,60,28,51,50,60,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
668,1,103,51,30,85,80,95,90,93,58,...,0,0,0,0,0,0,0,0,0,0
114,0,16,47,50,20,11,139,36,40,38,...,0,0,0,0,0,0,0,0,0,0
141,1,141,113,72,62,65,42,37,45,43,...,0,0,0,0,0,0,0,0,0,0
229,1,35,81,148,71,45,64,77,29,43,...,0,0,0,0,0,0,0,0,0,0


In [2]:
def mec_calculator(df, flag):
    # flag == True: calculate MEC
    # flag == False: calculate Capacity progression
    
    # amount of dataset
    num_rows = len(df)                
    
    # Initial
    # df_sum: 1-weighted sums of the sample features
    # df_sum[0]: sum, df_sum[1]: label
    # only few row has more than 48 column, truncate
    trunc_dim = 48 # number of dimension
    df_sum = pd.DataFrame()
    df_sum['d_sum'] = df.iloc[:,1:trunc_dim].sum(axis=1) #first column: label
    df_sum['label'] = df.iloc[:,0]
    
    # Sort table
    # df_sorted: sorted df_sum by the dimension sum column (d_sum)
    df_sorted = df_sum.sort_values(by='d_sum')
    
    # Count thresholds
    # iterate table from top to bottom
    cur_class = -1 
    threshold = 0
    for cur_label in df_sorted['label']:
        if cur_label != cur_class:
            cur_class = cur_label
            threshold += 1
    
    # Expected Capacity Requirement
    exp_mec = math.ceil(math.log(threshold + 1, 2) * trunc_dim) 
    
    if flag:
        # Calculate Max Capacity
        max_cap_req = threshold * (trunc_dim + 1) + threshold
        print("Dimensions:", trunc_dim)
        print("Amount of dataset:", num_rows)
        print("Number of Thresholds:", threshold, "bits")
        print("Max Capacity Requirement:", max_cap_req, "bits")
    
    print("MEC:", exp_mec, "bits")

In [4]:
mec_calculator(mec_dataset, True)

Dimensions: 48
Amount of dataset: 639
Number of Thresholds: 303 bits
Max Capacity Requirement: 15150 bits
MEC: 396 bits


In [5]:
sizes = [5, 10, 20, 40, 80, 90, 100]
sizes = [i for i in range(0,101,5)]
training_num = 639
for size in sizes:
    num = int(training_num * size/100)
    subset = mec_dataset.sample(num)
    mec_calculator(subset,False)

MEC: 0 bits
MEC: 197 bits
MEC: 247 bits
MEC: 267 bits
MEC: 291 bits
MEC: 309 bits
MEC: 312 bits
MEC: 326 bits
MEC: 330 bits
MEC: 341 bits
MEC: 351 bits
MEC: 362 bits
MEC: 361 bits
MEC: 369 bits
MEC: 370 bits
MEC: 378 bits
MEC: 381 bits
MEC: 387 bits
MEC: 387 bits
MEC: 392 bits
MEC: 396 bits
