In [69]:
from utils import prepare_jupyter
prepare_jupyter()

import os
import pandas as pd
import numpy as np

from thesis.io import read_csv, NAMES

# Root directory of the experiment results; `base_path` is the sub-directory
# of the run that the cells below read from.
DATA_DIR_BASE = '/Users/tomek/University/MgrThesis/EnsembleDiversityResults'
base_path = os.path.join(DATA_DIR_BASE, 'diversity-22-09')

# Measures labelled "STRUCT." in the notebook (presumably structural ones).
_STRUCT_COLUMNS = [
    'node_diversity',
    'used_attributes_ratio',
    'avg_node_count',
    'avg_attributes_used',
]

# Measures labelled "BEHAV." in the notebook (presumably behavioural ones).
_BEHAV_COLUMNS = [
    'corr',
    'df',
    'entropy',
    'kw',
    'q',
    'coverage_minmax',
    'coverage_std',
]

# Full, ordered list of measure columns: STRUCT. group first, then BEHAV. group.
COLUMNS = _STRUCT_COLUMNS + _BEHAV_COLUMNS

In [70]:
def save_table_csv(table_df, name, pretty=False,
                   tables_dir='/Users/tomek/University/MgrThesis/EnsembleDiversityResults/tables-15-09'):
    """Write a table to ``<tables_dir>/<name>.csv``.

    Parameters
    ----------
    table_df : pandas.DataFrame
        Table to save. It is copied first, so the caller's frame is never
        modified.
    name : str
        File name without the ``.csv`` extension.
    pretty : bool, default False
        When true, known measure columns and the ``dataset_name`` index
        header are replaced by human-readable labels before writing.
        Columns without an entry in the label map keep their original name.
    tables_dir : str, optional
        Directory the CSV file is written to. Defaults to the directory the
        notebook originally hard-coded; it must already exist.
    """
    table_path = os.path.join(tables_dir, f'{name}.csv')
    table_to_save = table_df.copy()

    if pretty:
        index_name_map = {
            'dataset_name': 'Dataset'
        }

        column_name_map = {
            'used_attributes_ratio': 'Used attr. ratio',
            'node_diversity': 'Node diversity',
            'corr': 'Correlation',
            'entropy': 'Entropy',
            'kw': 'Kohavi-Wolpert variance',
            'q': 'Q-statistic',
            'coverage_minmax': 'Coverage (minmax)',
            'coverage_std': 'Coverage (std. dev.)'
        }

        # `rename(index=...)` maps individual row labels, so it never touched
        # the index *name*. `rename_axis` relabels the name itself, which is
        # what ends up as the header of the first CSV column.
        table_to_save = (
            table_to_save
            .rename(columns=column_name_map)
            .rename_axis(index=index_name_map)
        )

    table_to_save.to_csv(table_path)

def get_table(df, ensemble_name):
    """Correlate every measure in ``COLUMNS`` with accuracy, per dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``name``, ``dataset_name``, ``accuracy``
        and every column listed in ``COLUMNS``.
    ensemble_name : str
        Only rows whose ``name`` equals this value are used.

    Returns
    -------
    pandas.DataFrame
        One row per dataset (index named ``dataset_name``), one ``float32``
        column per entry of ``COLUMNS``, holding the correlation (as computed
        by ``DataFrame.corr``) between that measure and ``accuracy``, rounded
        to three decimals. The frame is empty, but still has all columns,
        when no row matches ``ensemble_name``.
    """
    ensemble_df = df[df['name'] == ensemble_name]

    results = {}
    # Group by a scalar key rather than a one-element list: with a list,
    # recent pandas versions yield 1-tuples such as ('SPECT',) as group ids,
    # which would end up as the index labels.
    for dataset_name, group_df in ensemble_df.groupby(by='dataset_name'):
        results[dataset_name] = [
            # Off-diagonal element of the 2x2 correlation matrix;
            # [0, 1] and [1, 0] are equal.
            group_df[[column, 'accuracy']].corr().iat[0, 1]
            for column in COLUMNS
        ]

    # Build the frame directly from the numeric results instead of going
    # through a numpy string array and parsing the numbers back.
    table_df = (
        pd.DataFrame.from_dict(results, orient='index', columns=COLUMNS)
        .rename_axis('dataset_name')
        .astype(np.float32)
    )

    return table_df.round(3)

## Tabele (średnie)

1. Średnie wyniki dla każdej miary (wiersz to zbiór danych, kolumna to nazwa klasyfikatora).
2. ...

In [96]:
def df_from_groups(groups, with_avg=True):
    """Pivot ``{(dataset_name, name): value}`` into a dataset-by-name table.

    Parameters
    ----------
    groups : dict
        Maps ``(dataset_name, name)`` pairs to a numeric value.
    with_avg : bool, default True
        When true (and there is at least one dataset), append a final row
        labelled ``'AVERAGE'`` holding the mean of every value column.

    Returns
    -------
    pandas.DataFrame
        First column ``'index'`` holds the dataset names (plus ``'AVERAGE'``);
        it is followed by one ``float32`` column per distinct ``name``, in
        order of first appearance. A ``(dataset_name, name)`` pair missing
        from ``groups`` is stored as NaN, which also makes that column's
        average NaN.
    """
    value_columns = []
    rows = {}

    for (dataset_name, name), value in groups.items():
        if name not in value_columns:
            value_columns.append(name)
        # Store values keyed by column name so that a missing combination
        # cannot shift later values into the wrong column.
        rows.setdefault(dataset_name, {})[name] = value

    labels = list(rows)
    # The reshape keeps the array two-dimensional when `groups` is empty.
    matrix = np.array(
        [[row.get(name, np.nan) for name in value_columns] for row in rows.values()],
        dtype=np.float64,
    ).reshape(len(rows), len(value_columns))

    if with_avg and labels:
        matrix = np.vstack([matrix, matrix.mean(axis=0)])
        labels.append('AVERAGE')

    # Cast every value column, whatever the ensembles are called, instead of
    # relying on a fixed list of ensemble names.
    df = pd.DataFrame(matrix.astype(np.float32), columns=value_columns)
    df.insert(0, 'index', labels)
    return df


def get_avg_table(measure):
    """Build the table of mean ``measure`` values per dataset and ensemble.

    Reads ``allexperiment-ensemble.csv`` from ``base_path``, averages the
    ``measure`` column within every ``(dataset_name, name)`` group and hands
    the resulting ``{(dataset_name, name): mean}`` mapping to
    ``df_from_groups``.

    Parameters
    ----------
    measure : str
        Name of the column to average.

    Returns
    -------
    Whatever ``df_from_groups`` returns for the computed means.
    """
    df_path = os.path.join(base_path, 'allexperiment-ensemble.csv')
    all_df = pd.read_csv(df_path, index_col=0)

    # Select the measure once; iterating then yields (group key, Series).
    measure_by_group = all_df.groupby(by=['dataset_name', 'name'])[measure]
    results = {
        group_name: np.mean(group_values.values)
        for group_name, group_values in measure_by_group
    }

    return df_from_groups(results)

In [106]:
# Show floats in rendered tables with a thousands separator and three decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

# Mean `node_diversity` for each dataset (rows) and ensemble (columns).
get_avg_table('node_diversity')

Unnamed: 0,index,Adaboost,Bagging,CatboostEnsemble,LightGBM,RandomForest,XGBoostEnsemble
0,SPECT,6.706,4.535,13.029,1.19,4.894,3.177
1,boston,6.147,4.138,0.883,3.819,5.14,5.111
2,cmc,7.714,4.71,2.818,3.83,7.42,6.218
3,flags,7.247,5.257,2.89,0.818,6.775,3.328
4,glass,5.6,3.967,0.665,1.091,4.824,3.272
5,ionosphere,6.186,3.721,0.0,1.991,5.986,3.333
6,isolet,0.0,1.752,0.334,2.84,5.75,1.467
7,lymph,4.882,3.766,11.405,0.94,5.295,2.79
8,oil_spill,4.642,3.533,0.0,3.693,5.066,4.625
9,pollution,0.0,2.152,5.163,0.0,2.929,1.618


In [107]:
# Mean `used_attributes_ratio` for each dataset (rows) and ensemble (columns).
get_avg_table('used_attributes_ratio')

Unnamed: 0,index,Adaboost,Bagging,CatboostEnsemble,LightGBM,RandomForest,XGBoostEnsemble
0,SPECT,0.994,0.992,0.972,0.848,0.999,0.9
1,boston,0.981,0.986,0.978,0.892,0.995,0.923
2,cmc,1.0,0.998,1.0,0.963,1.0,0.985
3,flags,0.974,0.962,0.901,0.614,0.984,0.824
4,glass,0.999,0.996,1.0,0.956,0.999,0.941
5,ionosphere,0.942,0.893,0.894,0.841,0.953,0.802
6,isolet,0.008,0.118,0.23,0.174,0.357,0.045
7,lymph,0.979,0.972,0.892,0.648,0.993,0.789
8,oil_spill,0.897,0.847,0.837,0.755,0.919,0.676
9,pollution,0.348,0.959,0.969,0.449,0.995,0.72


In [108]:
# Mean `corr` (labelled "Correlation" in the pretty tables) for each dataset (rows) and ensemble (columns).
get_avg_table('corr')

Unnamed: 0,index,Adaboost,Bagging,CatboostEnsemble,LightGBM,RandomForest,XGBoostEnsemble
0,SPECT,0.237,0.461,0.155,0.302,0.437,0.406
1,boston,0.154,0.453,0.211,0.261,0.33,0.319
2,cmc,0.045,0.539,0.113,0.285,0.43,0.337
3,flags,0.125,0.289,0.058,0.224,0.219,0.297
4,glass,0.154,0.383,0.167,0.178,0.303,0.316
5,ionosphere,0.181,0.498,0.118,0.245,0.325,0.358
6,isolet,0.0,0.486,0.07,0.411,0.243,0.409
7,lymph,0.169,0.312,0.182,0.197,0.212,0.278
8,oil_spill,0.279,0.596,0.13,0.259,0.501,0.485
9,pollution,0.0,0.26,0.176,0.228,0.192,0.391


In [109]:
# Mean of the `df` measure for each dataset (rows) and ensemble (columns).
get_avg_table('df')

Unnamed: 0,index,Adaboost,Bagging,CatboostEnsemble,LightGBM,RandomForest,XGBoostEnsemble
0,SPECT,0.152,0.171,0.164,0.169,0.169,0.169
1,boston,0.119,0.108,0.109,0.125,0.11,0.127
2,cmc,0.218,0.286,0.223,0.22,0.296,0.248
3,flags,0.195,0.186,0.196,0.204,0.186,0.21
4,glass,0.12,0.141,0.141,0.162,0.132,0.146
5,ionosphere,0.073,0.074,0.089,0.081,0.064,0.081
6,isolet,0.0,0.019,0.058,0.018,0.016,0.027
7,lymph,0.096,0.101,0.103,0.14,0.106,0.105
8,oil_spill,0.049,0.043,0.078,0.097,0.043,0.052
9,pollution,0.0,0.138,0.14,0.156,0.139,0.19


In [110]:
# Mean `entropy` for each dataset (rows) and ensemble (columns).
get_avg_table('entropy')

Unnamed: 0,index,Adaboost,Bagging,CatboostEnsemble,LightGBM,RandomForest,XGBoostEnsemble
0,SPECT,0.507,0.295,0.566,0.454,0.297,0.359
1,boston,0.513,0.25,0.455,0.453,0.335,0.4
2,cmc,0.826,0.276,0.653,0.51,0.352,0.457
3,flags,0.64,0.472,0.724,0.559,0.533,0.494
4,glass,0.502,0.329,0.531,0.556,0.379,0.415
5,ionosphere,0.358,0.161,0.476,0.36,0.225,0.291
6,isolet,0.0,0.044,0.459,0.078,0.094,0.131
7,lymph,0.441,0.329,0.453,0.518,0.426,0.383
8,oil_spill,0.259,0.095,0.504,0.392,0.146,0.192
9,pollution,0.0,0.442,0.531,0.518,0.501,0.39


In [111]:
# Mean `kw` (labelled "Kohavi-Wolpert variance" in the pretty tables) for each dataset (rows) and ensemble (columns).
get_avg_table('kw')

Unnamed: 0,index,Adaboost,Bagging,CatboostEnsemble,LightGBM,RandomForest,XGBoostEnsemble
0,SPECT,0.16,0.104,0.189,0.183,0.107,0.161
1,boston,0.098,0.047,0.09,0.097,0.064,0.089
2,cmc,0.043,0.018,0.039,0.036,0.022,0.034
3,flags,0.291,0.218,0.31,0.277,0.245,0.264
4,glass,0.224,0.147,0.224,0.251,0.169,0.212
5,ionosphere,0.095,0.045,0.128,0.115,0.064,0.103
6,isolet,0.0,0.008,0.078,0.019,0.017,0.036
7,lymph,0.288,0.211,0.282,0.343,0.27,0.286
8,oil_spill,0.03,0.01,0.053,0.045,0.017,0.028
9,pollution,0.0,0.666,0.791,0.805,0.759,0.719


In [112]:
# Mean `coverage_minmax` for each dataset (rows) and ensemble (columns).
get_avg_table('coverage_minmax')

Unnamed: 0,index,Adaboost,Bagging,CatboostEnsemble,LightGBM,RandomForest,XGBoostEnsemble
0,SPECT,0.065,0.047,0.074,0.033,0.048,0.075
1,boston,0.08,0.059,0.123,0.099,0.079,0.064
2,cmc,0.209,0.072,0.144,0.137,0.094,0.15
3,flags,0.068,0.052,0.126,0.018,0.073,0.037
4,glass,0.073,0.088,0.11,0.028,0.086,0.076
5,ionosphere,0.1,0.051,0.102,0.081,0.101,0.097
6,isolet,0.0,0.02,0.17,0.037,0.037,0.072
7,lymph,0.064,0.048,0.087,0.028,0.063,0.062
8,oil_spill,0.144,0.054,0.145,0.107,0.139,0.116
9,pollution,0.0,0.06,0.085,0.049,0.073,0.068


In [113]:
# Mean `coverage_std` for each dataset (rows) and ensemble (columns).
get_avg_table('coverage_std')

Unnamed: 0,index,Adaboost,Bagging,CatboostEnsemble,LightGBM,RandomForest,XGBoostEnsemble
0,SPECT,0.018,0.013,0.029,0.011,0.013,0.025
1,boston,0.017,0.014,0.019,0.032,0.017,0.017
2,cmc,0.054,0.014,0.026,0.039,0.018,0.038
3,flags,0.016,0.014,0.025,0.007,0.021,0.013
4,glass,0.015,0.019,0.016,0.011,0.02,0.019
5,ionosphere,0.026,0.024,0.015,0.029,0.037,0.031
6,isolet,0.0,0.013,0.027,0.022,0.032,0.037
7,lymph,0.014,0.013,0.025,0.013,0.016,0.019
8,oil_spill,0.03,0.023,0.021,0.031,0.036,0.04
9,pollution,0.0,0.026,0.014,0.024,0.028,0.028


In [114]:
# Mean `q` (labelled "Q-statistic" in the pretty tables) for each dataset (rows) and ensemble (columns).
get_avg_table('q')

Unnamed: 0,index,Adaboost,Bagging,CatboostEnsemble,LightGBM,RandomForest,XGBoostEnsemble
0,SPECT,0.367,0.77,0.278,0.468,0.711,0.605
1,boston,0.315,0.782,0.383,0.422,0.613,0.539
2,cmc,0.085,0.837,0.207,0.471,0.726,0.533
3,flags,0.233,0.511,0.103,0.353,0.397,0.472
4,glass,0.297,0.667,0.298,0.259,0.554,0.516
5,ionosphere,0.381,0.853,0.226,0.437,0.64,0.583
6,isolet,0.0,0.773,0.086,0.56,0.409,0.481
7,lymph,0.301,0.555,0.304,0.258,0.388,0.414
8,oil_spill,0.601,0.909,0.269,0.481,0.813,0.763
9,pollution,0.0,0.334,0.244,0.187,0.255,0.479


In [115]:
# Mean `accuracy` for each dataset (rows) and ensemble (columns).
get_avg_table('accuracy')

Unnamed: 0,index,Adaboost,Bagging,CatboostEnsemble,LightGBM,RandomForest,XGBoostEnsemble
0,SPECT,0.781,0.761,0.793,0.778,0.741,0.787
1,boston,0.85,0.851,0.857,0.846,0.841,0.849
2,cmc,0.623,0.637,0.637,0.641,0.616,0.655
3,flags,0.666,0.704,0.682,0.662,0.692,0.663
4,glass,0.83,0.789,0.816,0.813,0.814,0.801
5,ionosphere,0.911,0.904,0.909,0.913,0.919,0.901
6,isolet,0.968,0.975,0.978,0.98,0.979,0.977
7,lymph,0.881,0.844,0.841,0.838,0.848,0.873
8,oil_spill,0.936,0.941,0.943,0.942,0.95,0.934
9,pollution,0.681,0.786,0.793,0.75,0.779,0.756
