In [65]:
from utils import prepare_jupyter
prepare_jupyter()

import os
import pandas as pd
import numpy as np

# Classifier keys; they match the file-name prefixes of the per-ensemble
# CSV files listed for the experiments directory.
CLFS = [
    'bagging',
    'adaboost',
    'randomforest',
    'lgb',
    'catboost',
    'xgboost',
]

# Root directory that get_data_path() joins sub-directory names onto.
DATA_DIR_BASE = '/Users/tomek/University/MgrThesis/EnsembleDiversityResults'

# Measure columns that get_table() correlates with 'accuracy', in output order.
COLUMNS = (
    # STRUCT.
    ['node_diversity', 'used_attributes_ratio']
    # BEHAV.
    + ['corr', 'df', 'entropy', 'kw', 'q', 'coverage_minmax', 'coverage_std']
)

def get_data_path(which: str) -> str:
    """Return the path of sub-directory ``which`` under ``DATA_DIR_BASE``.

    Args:
        which: Name joined onto ``DATA_DIR_BASE``.

    Returns:
        The joined path, once it has been confirmed to exist.

    Raises:
        FileNotFoundError: If the joined path does not exist on disk.
    """
    resolved = os.path.join(DATA_DIR_BASE, which)
    if os.path.exists(resolved):
        return resolved
    raise FileNotFoundError(f'{resolved} doesnt exist.')


In [11]:
!ls ../../EnsembleDiversityResults/experiments-10-08

adaboostexperiment-ensemble.csv     lgbexperiment-ensemble.csv
allexperiment-ensemble.csv          randomforestexperiment-ensemble.csv
baggingexperiment-ensemble.csv      xgboostexperiment-ensemble.csv
catboostexperiment-ensemble.csv


In [96]:
def save_table_csv(table_df, name, pretty=False, tables_dir=None):
    """Write ``table_df`` to ``<tables_dir>/<name>.csv``.

    Args:
        table_df: DataFrame to save. It is copied first and never modified.
        name: Output file name without the ``.csv`` extension.
        pretty: When True, known measure columns and the ``dataset_name``
            index name are replaced by human-readable labels before writing.
        tables_dir: Directory to write into. Defaults to the
            ``tables-15-09`` results directory. The directory must already
            exist.
    """
    if tables_dir is None:
        tables_dir = '/Users/tomek/University/MgrThesis/EnsembleDiversityResults/tables-15-09'
    table_path = os.path.join(tables_dir, f'{name}.csv')
    table_to_save = table_df.copy()

    if pretty:
        index_name_map = {
            'dataset_name': 'Dataset'
        }

        # NOTE: 'df' has no entry here, so that column keeps its raw name.
        column_name_map = {
            'used_attributes_ratio': 'Used attr. ratio',
            'node_diversity': 'Node diversity',
            'corr': 'Correlation',
            'entropy': 'Entropy',
            'kw': 'Kohavi-Wolpert variance',
            'q': 'Q-statistic',
            'coverage_minmax': 'Coverage (minmax)',
            'coverage_std': 'Coverage (std. dev.)'
        }

        # 'dataset_name' is the *name* of the index, not a row label, so
        # DataFrame.rename(index=...) would leave it untouched. rename_axis
        # maps axis names instead.
        table_to_save = (
            table_to_save
            .rename(columns=column_name_map)
            .rename_axis(index=index_name_map)
        )

    table_to_save.to_csv(table_path)
    
def save_table_html(table_df, name, tables_dir=None):
    """Render ``table_df`` as an HTML table in ``<tables_dir>/<name>.html``.

    Args:
        table_df: DataFrame to render. It is not modified.
        name: Output file name without the ``.html`` extension.
        tables_dir: Directory to write into. Defaults to the
            ``tables-15-09`` results directory. The directory must already
            exist.
    """
    if tables_dir is None:
        tables_dir = '/Users/tomek/University/MgrThesis/EnsembleDiversityResults/tables-15-09'
    table_path = os.path.join(tables_dir, f'{name}.html')

    # to_html() returns the markup as a string when no buffer is given, so it
    # has to be written out explicitly.
    with open(table_path, 'w', encoding='utf-8') as fp:
        fp.write(table_df.to_html())

def get_table(df, ensemble_name, columns=None):
    """Correlate each measure with accuracy, per dataset, for one ensemble.

    Rows of ``df`` whose ``'name'`` equals ``ensemble_name`` are grouped by
    ``'dataset_name'``. Within each group, the correlation between every
    requested column and ``'accuracy'`` is taken from ``DataFrame.corr()``
    (Pearson by default).

    Args:
        df: Frame with ``'name'``, ``'dataset_name'``, ``'accuracy'`` and the
            measure columns.
        ensemble_name: Value of the ``'name'`` column to select.
        columns: Measure columns to correlate. Defaults to the module-level
            ``COLUMNS``.

    Returns:
        A float32 DataFrame indexed by ``dataset_name`` with one column per
        measure, rounded to 3 decimals. It is empty, with the measure
        columns, when no row matches ``ensemble_name``.
    """
    columns = list(COLUMNS if columns is None else columns)

    # Group by a scalar key. Grouping by a one-element list makes
    # pandas >= 2.0 yield 1-tuples as group ids, which would end up as the
    # dataset labels.
    groups = df[df['name'] == ensemble_name].groupby('dataset_name')

    results = {
        dataset_name: [
            # 2x2 correlation matrix; the off-diagonal entry is the coefficient.
            group_df[[column, 'accuracy']].corr().iat[0, 1]
            for column in columns
        ]
        for dataset_name, group_df in groups
    }

    # Build the frame directly from the numeric rows rather than through a
    # mixed string/float ndarray.
    table_df = pd.DataFrame(
        list(results.values()),
        index=pd.Index(list(results), name='dataset_name'),
        columns=columns,
    ).astype(np.float32)

    return table_df.round(3)
    
# Build and save the accuracy-correlation table of every ensemble.
path = get_data_path('experiments-10-08')
# Every ensemble is selected from the same CSV, so read it once instead of
# re-reading it on each iteration.
all_results = pd.read_csv(os.path.join(path, 'allexperiment-ensemble.csv'))
for clf in ['Bagging', 'Adaboost', 'RandomForest', 'LightGBM', 'XGBoostEnsemble', 'CatboostEnsemble']:
    print(clf)
    table = get_table(all_results, clf)
    save_table_csv(table, clf, pretty=False)

Bagging


TypeError: save_table_csv() missing 2 required positional arguments: 'table_df' and 'name'

In [101]:
# `display` and `HTML` are imported from IPython.display; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Preview the per-dataset correlation table for the 'Adaboost' ensemble.
base_path = get_data_path('experiments-10-08')
df_path = os.path.join(base_path, 'allexperiment-ensemble.csv')
df = get_table(pd.read_csv(df_path), 'Adaboost')
df

Unnamed: 0_level_0,node_diversity,used_attributes_ratio,corr,df,entropy,kw,q,coverage_minmax,coverage_std
dataset_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SPECT,-0.199,-0.225,-0.397,-0.008,0.368,0.159,-0.385,0.255,0.225
boston,0.365,0.239,0.209,-0.368,-0.311,0.133,0.248,-0.423,-0.448
cmc,-0.766,-0.549,0.032,-0.04,0.417,-0.161,-0.629,0.28,0.634
flags,0.244,0.423,0.371,0.169,-0.095,0.28,0.374,-0.27,-0.273
glass,-0.307,-0.37,-0.374,-0.894,-0.885,-0.613,-0.354,-0.934,-0.94
ionosphere,0.26,0.763,-0.396,0.382,0.705,0.68,-0.321,0.628,0.539
isolet,0.334,0.32,0.318,0.318,0.399,0.394,0.336,0.293,0.337
lymph,0.092,0.106,0.061,0.121,0.144,0.232,0.06,0.111,0.093
oil_spill,0.55,0.871,0.057,0.709,0.807,0.679,0.334,0.944,0.888
pollution,0.699,0.695,0.692,0.701,0.698,0.692,0.699,0.703,0.702


In [104]:
# Write the table currently held in `df` to test.tex as LaTeX.
with open('test.tex', mode='w') as tex_file:
    df.to_latex(buf=tex_file)