In [61]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import os
import re
from matplotlib import rc
rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 12})
rc('text', usetex=True)

# Convert a Series-object with type string to tuples
def string_to_tuple(input):
    output = input.strip('()')
    return  pd.Series(map(float, output.split(', ')))

# Expand each string in a series into multiple columns
def expand_col(df, col):
    if type(df.loc[0, col]) == str:
        col_expanded = df[col].apply(string_to_tuple)
    col_expanded.columns = [f'{col}_{i}' for i in range(len(col_expanded.columns))]
    df_new = pd.concat([df, col_expanded], axis=1)
    return df_new.drop(col, axis=1)

# Create dict that indexes unique values of a dataframe column
def index_unique_vals(df, col):
    return dict(zip(df[col].unique(), np.arange(df[col].nunique())))

def lineplot(data, ax, x, y, label=None):
    data.sort_values(by=x, inplace=True)
    ax.plot(data[x], data[y], label=label)

def scatterplot(data, ax, x, y, label=None, regression=True):
    x, y = data[x], data[y]
    x_expanded = np.expand_dims(x, -1)
    ax.scatter(x, y, label=label)
    
    if regression:
        reg = LinearRegression()
        reg.fit(x_expanded, y)
        r2 = r2_score(y, reg.predict(x_expanded))
        r2_str = '{0:.2f}'.format(r2)
        ax.plot(x, reg.predict(x_expanded), label= r'$R^2$ = {}'.format(r2_str))
        
def smooth(data, ax, x, y, label=None):
    spl = make_interp_spline(data[x], data[y], k=7) 
    x_new = np.linspace(0.05, 2, 200)
    y_new = spl(x_new)
    ax.plot(x_new, y_new, label=label)
    
def bar(data, ax, x, y, label=None):
    for idx, y_i in enumerate(y):
        data.sort_values(by=x, inplace=True)
        bottom = data[y[idx-1]].to_numpy() if idx > 0 else None
        ax.bar(x=data[x].to_numpy(), height=data[y_i].to_numpy(), bottom=bottom, width=10, label=y_i.split('_')[-1].upper())
    
def plot_by_group(data, x, y, vars_group, var_colors=None, plot_f=lineplot, figsize=(4.5, 3), xlabel=None, ylabel=None, scale=None, baseline=None, ftype='svg'):
    for group_params, data_plot in data.groupby(vars_group):
        fig, ax = plt.subplots(figsize=figsize)
        if var_colors:
            for label, data_color in data_plot.groupby(var_colors):
                plot_f(data_color, ax, x, y, label) 
        else:
            plot_f(data_plot, ax, x, y)
        ax.grid()

        if not xlabel:
            xlabel = x
        if not ylabel:
            ylabel = y
        if isinstance(baseline, dict):
            y_val = baseline[group_params]
            ax.axhline(y=y_val, label='Baseline', color='black', linestyle='--')

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.legend()

        folder = f'plots/{y}_vs_{x}'
        if not os.path.exists(folder):
            os.makedirs(folder)
        fname = str(group_params).strip('()').replace(', ', '_') + x[0]

        if scale:
            plt.xscale(scale)
        plt.tight_layout()
        plt.savefig(f'{folder}/{fname}.{ftype}', dpi=300)
        plt.close(fig)

## Plots: [Metric] vs. compression

In [8]:
df = pd.read_csv('csv/accuracy_vs_compression.csv')
df_baseline = pd.read_csv('csv/accuracy_no_encoder.csv')
df_baseline.index = df_baseline.dataset_id
dict_baseline = df_baseline['mean_test_accuracy'].to_dict()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,dataset_id,mean_fit_time,std_fit_time,mean_score_time,std_score_time,mean_test_accuracy,hidden_dims,type
0,0,0,40996,24.806013,1.946357,0.139366,0.007231,0.831271,0.05,PCA
1,1,1,40996,35.934472,0.528053,0.148693,0.003383,0.846971,0.10,PCA
2,2,2,40996,37.455692,0.285998,0.135313,0.026119,0.852200,0.15,PCA
3,3,3,40996,38.582446,0.441768,0.167858,0.000404,0.854057,0.20,PCA
4,4,4,40996,40.641660,0.448354,0.162723,0.001400,0.853500,0.25,PCA
...,...,...,...,...,...,...,...,...,...,...
1035,635,155,44,8.703304,0.142838,0.010182,0.003382,0.909588,1.80,VAE
1036,636,156,44,6.649810,0.764583,0.007433,0.000636,0.906326,1.85,VAE
1037,637,157,44,7.658982,1.605296,0.007608,0.000953,0.911975,1.90,VAE
1038,638,158,44,8.388769,1.489005,0.018548,0.017608,0.913064,1.95,VAE


In [13]:
# Plot testing accuracy
plot_by_group(df, x='compression', y='mean_test_accuracy', vars_group='dataset_id', var_colors='type', xlabel='Compression [\%]', ylabel='Accuracy', baseline=dict_baseline)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
findfont: Font family ['serif'] not found. Falling back to DejaVu Sans.


In [62]:
# Plot training times for different classifiers
df_rt = pd.read_csv('csv/runtime_vs_compression.csv')
plot_by_group(df_rt, x='compression', y=['mean_fit_time_ae', 'mean_fit_time_clf'], vars_group=['dataset_id', 'clf'], xlabel='Compression [\%]', ylabel='Fit Time (s)', plot_f=bar)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [84]:
# Plot reconstruction error vs. compression
df_rec = pd.read_csv('csv/reconstruction_error_vs_compression.csv')
plot_by_group(df_rec, x='compression', y='mean_test_reconstruction_error', vars_group='dataset_id', var_colors='type', xlabel='Compression [\%]', ylabel='Reconstruction Error')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [48]:
# Plot inference time vs. compression
plot_by_group(df, x='compression', y='mean_score_time', vars_group='dataset_id', var_colors='type', xlabel='Compression [\%]', ylabel='Inference Time')

In [49]:
# Plot testing accuracy vs. reconstruction error
plot_by_group(df, x='mean_test_reconstruction_error', y='mean_test_accuracy', vars_group='dataset_id', var_colors='type', plot_f=scatterplot, xlabel='Test Reconstruction Error', ylabel='Test Accuracy')

### Plots for Presentation

In [18]:
plot_by_group(df, x='hidden_dims', y='mean_test_accuracy', baseline=dict_baseline, vars_group='dataset_id', var_colors='type', xlabel='Latent Dimension', ylabel='Accuracy', ftype='png', figsize=(8,5))
plot_by_group(df[df['type'] != 'VAE'], x='hidden_dims', y='mean_test_reconstruction_error', vars_group='dataset_id', var_colors='type', xlabel='Latent Dimension', ylabel='Reconstruction Error', ftype='png', figsize=(8,5))

## Tests for variable network depth

In [10]:
df_d = pd.read_csv('csv/depth.csv')

Unnamed: 0,dataset_id,mean_fit_time,std_fit_time,mean_score_time,std_score_time,activation,n_layers,type,params,split0_test_accuracy,...,split2_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_test_reconstruction_error,split1_test_reconstruction_error,split2_test_reconstruction_error,mean_test_reconstruction_error,std_test_reconstruction_error,rank_test_reconstruction_error
0,40996,277.144556,31.530760,0.773189,0.010745,SELU,1,ae,"{'ae__activation': 'selu', 'ae__n_layers': 1, ...",0.866718,...,0.870655,0.868843,0.001622,44,0.258833,0.259524,0.259324,0.259227,0.000290,65
1,40996,133.690559,19.950555,0.697017,0.004993,SELU,1,vae,"{'ae__activation': 'selu', 'ae__n_layers': 1, ...",0.855318,...,0.854412,0.853586,0.001847,60,0.293928,0.294088,0.293280,0.293765,0.000349,17
2,40996,222.651826,42.597901,0.690339,0.005188,SELU,1,dae,"{'ae__activation': 'selu', 'ae__n_layers': 1, ...",0.873661,...,0.874770,0.874457,0.000567,25,0.263685,0.264642,0.264687,0.264338,0.000462,56
3,40996,291.483475,37.470845,0.709755,0.017324,SELU,1,sae,"{'ae__activation': 'selu', 'ae__n_layers': 1, ...",0.866761,...,0.871598,0.868914,0.002010,43,0.254613,0.255527,0.255861,0.255334,0.000528,77
4,40996,150.914428,26.367808,0.697845,0.012661,SELU,2,ae,"{'ae__activation': 'selu', 'ae__n_layers': 2, ...",0.869889,...,0.872027,0.871529,0.001188,34,0.256969,0.256273,0.255555,0.256266,0.000577,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,44,8.063330,1.203862,0.012793,0.001114,Sigmoid,4,sae,"{'ae__activation': 'sigmoid', 'ae__n_layers': ...",0.712516,...,0.831703,0.744640,0.062263,71,0.074668,0.075335,0.074352,0.074785,0.000410,26
316,44,7.361194,0.425325,0.011443,0.002116,Sigmoid,5,ae,"{'ae__activation': 'sigmoid', 'ae__n_layers': ...",0.584094,...,0.592955,0.589003,0.003680,80,0.074676,0.075307,0.074364,0.074782,0.000392,27
317,44,8.954783,0.204199,0.017500,0.004005,Sigmoid,5,vae,"{'ae__activation': 'sigmoid', 'ae__n_layers': ...",0.605606,...,0.606001,0.605955,0.000268,75,0.074672,0.075329,0.074366,0.074789,0.000402,20
318,44,7.601244,1.899434,0.043610,0.047276,Sigmoid,5,dae,"{'ae__activation': 'sigmoid', 'ae__n_layers': ...",0.605606,...,0.606001,0.605955,0.000268,75,0.074688,0.075343,0.074346,0.074792,0.000414,18


In [13]:
plot_by_group(df_d, x='n_layers', y='mean_test_accuracy', vars_group=['dataset_id', 'type'], var_colors='activation', xlabel='Hidden layers', ylabel='Accuracy')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## SSL Plots

In [45]:
df_ssl = pd.read_csv('../ssl.csv')

In [53]:
plot_by_group(df_ssl, x='labeled_data', y='mean_test_accuracy', vars_group='dataset_id', var_colors='transformer', xlabel='Labeled Data', ylabel='Accuracy')

## Plot Robustness

In [3]:
def get_type(s):
    if 'vae' in s:
        return 'VAE'
    elif 'sae' in s:
        return 'SAE'
    elif 'dae' in s:
        return 'DAE'
    elif 'Identity' in s:
        return 'None'
    else:
        return 'AE'

def preprocess(df):
    df['transformer'] = df['transformer'].apply(get_type)
    df['clf'] = df['clf'].apply(lambda x: x.split('(')[0])

In [85]:
df_te = pd.read_csv('csv/robustness_testing_all.csv')

In [94]:
# Plot robustness to corruption of testing data
df_ae = df_te[(df_te.transformer != 'CSAE') & (df_te.transformer != 'CVAE') & (df_te.transformer != 'CAE') & (df_te.transformer != 'CDAE')]
plot_by_group(df_ae, x='noise_level', y='mean_test_accuracy', vars_group=['dataset_id', 'noise_type', 'clf'], var_colors='transformer', xlabel='Level of Corruption [\%]', ylabel='Accuracy')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [99]:
df_tr = pd.read_csv('csv/robustness_training_snp.csv')
plot_by_group(df_tr, x='noise_level', y='mean_test_accuracy', vars_group=['dataset_id', 'clf'], var_colors='transformer', xlabel='Level of Corruption [\%]', ylabel='Accuracy')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Robustness of CAE

In [88]:
[('cae', 'ae'), ('cvae', 'vae'), ('csae', 'sae'), ('cdae', 'dae')]
df_i = df_te[(df_te.transformer == 'CVAE') | (df_te.transformer == 'VAE') | (df_te.transformer == 'None')]
plot_by_group(df_i, x='noise_level', y='mean_test_accuracy', vars_group=['dataset_id', 'noise_type', 'clf'], var_colors='transformer', xlabel='Level of Corruption [\%]', ylabel='Accuracy')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Plot TPOT Robusuniques with and without autoencoder

In [92]:
df_tpot_all = pd.read_csv('csv/robustness_testing_tpot_w_wo_ae.csv')

In [95]:
plot_by_group(df_tpot_all, x='noise_level', y='mean_test_accuracy', vars_group=['dataset_id', 'noise_type'], var_colors='transformer', xlabel='Level of Corruption [\%]', ylabel='Accuracy')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
