In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import os
import re

def string_to_tuple(input):
    """Parse the text form of a numeric tuple into a float Series.

    Parameters
    ----------
    input : str
        Text such as ``'(1.0, 2.5)'``.  Surrounding whitespace and
        parentheses are ignored, components are separated by commas with
        optional whitespace, and a single trailing comma (as produced by
        ``str((1.0,))``) is accepted.

    Returns
    -------
    pandas.Series
        One float64 entry per tuple component, in order.  An empty tuple
        (``'()'``) yields an empty Series.

    Raises
    ------
    ValueError
        If a component cannot be converted by ``float``.
    """
    # NOTE: the parameter name shadows the ``input`` builtin; it is kept so
    # that existing keyword callers continue to work.
    body = input.strip().strip('()').strip()
    # ``str((1.0,))`` renders as '(1.0,)'; drop that one trailing comma so a
    # 1-tuple parses instead of failing on an empty component.
    if body.endswith(','):
        body = body[:-1]
    if not body.strip():
        return pd.Series(dtype=float)
    # Split on the bare comma: ``float`` already tolerates surrounding
    # whitespace, so both '1, 2' and '1,2' are handled.
    return pd.Series([float(part) for part in body.split(',')], dtype=float)

def expand_col(df, col):
    """Replace a tuple-valued column by one numeric column per component.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``.  It is not modified.
    col : str
        Name of the column to expand.  Each value may be the string form of
        a tuple (parsed with ``string_to_tuple``) or an already-materialised
        sequence such as a tuple or list.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``col`` and with added columns ``{col}_0``,
        ``{col}_1``, ... holding the components.  If ``df`` has no rows the
        column is simply dropped, since the number of components is unknown.
    """
    values = df[col]
    if values.empty:
        return df.drop(col, axis=1)

    # Decide per value instead of inspecting ``df.loc[0, col]``: the previous
    # version required an index label 0 and left ``col_expanded`` unbound
    # (UnboundLocalError) whenever the column did not hold strings.
    rows = [
        list(string_to_tuple(value)) if isinstance(value, str) else list(value)
        for value in values
    ]
    # Reuse df's index so the column-wise concat below aligns row for row.
    expanded = pd.DataFrame(rows, index=df.index)
    expanded.columns = [f'{col}_{i}' for i in range(expanded.shape[1])]
    return pd.concat([df, expanded], axis=1).drop(col, axis=1)

def index_unique_vals(df, col):
    """Map every distinct non-missing value of ``df[col]`` to a running index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``.
    col : str
        Column whose distinct values are enumerated.

    Returns
    -------
    dict
        ``{value: position}`` with positions 0, 1, 2, ... assigned in order of
        first appearance.  Missing values (NaN/None) are not given an index.
    """
    # ``unique()`` counts NaN as a value while ``nunique()`` skips it, so
    # zipping the two (as was done before) silently dropped the last distinct
    # value whenever the column contained NaN.  Derive both sides of the zip
    # from the same NaN-free array instead.
    distinct = df[col].dropna().unique()
    return dict(zip(distinct, np.arange(len(distinct))))

def lineplot(data, ax, x, y, label=None):
    """Draw ``y`` against ``x`` as a line, ordered by ascending ``x``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ``x`` and ``y`` columns.  It is left untouched.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    x, y : str
        Column names for the horizontal and vertical values.
    label : optional
        Legend label passed through to ``ax.plot``.
    """
    # Sort a copy rather than in place: ``data`` is usually a groupby slice of
    # the caller's frame, and an in-place sort both reorders the caller's data
    # and triggers pandas' SettingWithCopyWarning.
    ordered = data.sort_values(by=x)
    ax.plot(ordered[x], ordered[y], label=label)

def scatterplot(data, ax, x, y, label=None, regression=True):
    """Scatter ``y`` against ``x`` and optionally overlay a least-squares line.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ``x`` and ``y`` columns.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    x, y : str
        Column names for the horizontal and vertical values.
    label : optional
        Legend label for the scatter points.
    regression : bool
        When true, fit ``LinearRegression`` on the points and plot its
        predictions, labelled with the R^2 score rounded to two decimals.
    """
    xs = data[x]
    ys = data[y]
    # scikit-learn expects a 2-D feature matrix, hence the added last axis.
    features = np.expand_dims(xs, -1)
    ax.scatter(xs, ys, label=label)

    if not regression:
        return

    model = LinearRegression()
    model.fit(features, ys)
    fitted = model.predict(features)
    r2_text = '{0:.2f}'.format(r2_score(ys, fitted))
    ax.plot(xs, fitted, label=r'$R^2$ = {}'.format(r2_text))
        
def smooth(data, ax, x, y, label=None, k=7, x_range=(0.05, 2), n_points=200):
    """Plot a B-spline interpolation of ``y`` over ``x`` on an even grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the ``x`` and ``y`` columns.  It is not modified.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    x, y : str
        Column names for the horizontal and vertical values.
    label : optional
        Legend label passed through to ``ax.plot``.
    k : int
        Spline degree (default 7, the previously hard-coded value).
    x_range : tuple of float
        ``(start, stop)`` of the evaluation grid (default ``(0.05, 2)``, the
        previously hard-coded range).
    n_points : int
        Number of grid points (default 200).

    Raises
    ------
    ValueError
        Propagated from ``make_interp_spline``, e.g. for duplicate ``x``
        values or too few points for the requested degree.
    """
    # Imported locally: the notebook's import cell does not bring in
    # ``make_interp_spline``, so the previous body failed with NameError.
    from scipy.interpolate import make_interp_spline

    # The interpolator requires ascending abscissas; group slices read from a
    # CSV are not guaranteed to be ordered.
    ordered = data.sort_values(by=x)
    spl = make_interp_spline(ordered[x].to_numpy(), ordered[y].to_numpy(), k=k)
    x_new = np.linspace(x_range[0], x_range[1], n_points)
    ax.plot(x_new, spl(x_new), label=label)
    
def bar(data, ax, x, y, label=None, width=0.1):
    """Draw the columns in ``y`` as stacked bars over ``x``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``x`` and every column named in ``y``.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    x : str
        Column giving the bar positions.
    y : str or sequence of str
        Column(s) to stack, bottom first.  A single name is treated as a
        one-element list.
    label : optional
        Unused; accepted so the function has the same signature as the other
        ``plot_f`` helpers.
    width : float
        Bar width in x units (default 0.1, the previously hard-coded value).

    Notes
    -----
    Each series is labelled with the upper-cased second ``_``-separated token
    of its column name (``'mean_ae_fit_time'`` -> ``'AE'``); names without an
    underscore are upper-cased as a whole.
    """
    if isinstance(y, str):
        y = [y]

    # Running total of everything already drawn.  The previous version used
    # only the immediately preceding column as ``bottom``, which is correct
    # for two series but makes a third one overlap the second.
    bottom = None
    for column in y:
        tokens = column.split('_')
        legend = (tokens[1] if len(tokens) > 1 else column).upper()
        ax.bar(data[x], data[column], bottom=bottom, width=width, label=legend)
        bottom = data[column] if bottom is None else bottom + data[column]
    
def plot_by_group(data, x, y, vars_group, var_colors=None, plot_f=lineplot, figsize=(4.5, 3), xlabel=None, ylabel=None, scale=None, baseline=None, ftype='svg'):
    """Save one figure per group of ``data`` under ``plots/{y}_vs_{x}/``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to plot.
    x, y
        Column name(s) handed to ``plot_f``; also used to build the output
        folder name and as default axis labels.
    vars_group : str or list of str
        Column(s) to group by; each group produces its own figure.
    var_colors : str or list of str, optional
        If given, each group is split again by these column(s) and ``plot_f``
        is called once per sub-group with the sub-group key as label.
    plot_f : callable
        Called as ``plot_f(frame, ax, x, y[, label])``.
    figsize : tuple
        Passed to ``plt.subplots``.
    xlabel, ylabel : str, optional
        Axis labels; default to ``x`` and ``y``.
    scale : str, optional
        x-axis scale name (e.g. ``'log'``).
    baseline : dict, optional
        Maps a group key to a y value drawn as a dashed black horizontal
        line.  Ignored unless it is a ``dict``.
    ftype : str
        File extension / format of the saved figure.

    Raises
    ------
    KeyError
        If ``baseline`` is a dict that lacks the key of a plotted group.

    Notes
    -----
    The file name is the group key with parentheses removed and ``', '``
    replaced by ``'_'``, followed by the first character of ``x``.  Output
    paths are unchanged from the previous implementation.
    """
    xlabel = xlabel or x
    ylabel = ylabel or y

    # NOTE(review): when ``y`` is a list its repr (brackets and quotes) ends
    # up in the folder name; kept as-is so existing output paths stay valid.
    folder = f'plots/{y}_vs_{x}'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)

    for group_params, data_plot in data.groupby(vars_group):
        fig, ax = plt.subplots(figsize=figsize)
        try:
            if var_colors:
                for label, data_color in data_plot.groupby(var_colors):
                    plot_f(data_color, ax, x, y, label)
            else:
                plot_f(data_plot, ax, x, y)
            ax.grid()

            if isinstance(baseline, dict):
                y_val = baseline[group_params]
                ax.axhline(y=y_val, label='Baseline', color='black', linestyle='--')

            ax.set_xlabel(xlabel)
            ax.set_ylabel(ylabel)
            # Only draw a legend when something is labelled; otherwise
            # matplotlib warns and adds an empty legend box.
            handles, _ = ax.get_legend_handles_labels()
            if handles:
                ax.legend()

            # Operate on this figure's own axes instead of pyplot's implicit
            # "current" figure.
            if scale:
                ax.set_xscale(scale)
            fig.tight_layout()

            fname = str(group_params).strip('()').replace(', ', '_') + x[0]
            fig.savefig(f'{folder}/{fname}.{ftype}', dpi=300)
        finally:
            # Release the figure even if plotting or saving failed, so a
            # broken group does not leak open figures in the kernel.
            plt.close(fig)

## Bar-Plots for training times

In [4]:
# Load the runtime measurements used for the fit-time bar plots below.
df_rt = pd.read_csv('../runtimes.csv')

In [41]:
# Stack the two fit-time columns per latent dimension; one figure is written
# for every (dataset_id, clf) combination.
plot_by_group(
    df_rt,
    x='hidden_dims',
    y=['mean_ae_fit_time', 'mean_clf_fit_time'],
    vars_group=['dataset_id', 'clf'],
    xlabel='Latent Dimension',
    ylabel='Fit Time (s)',
    plot_f=bar,
)

In [None]:
# Same stacked fit-time bars as the previous cell, with an explicit figure size.
# NOTE(review): this cell was never executed and originally passed
# figsize='', which is not a valid (width, height) pair for plt.subplots.
# (8, 5) is the size used by the presentation plots in this notebook -
# confirm the intended value.
plot_by_group(
    df_rt,
    x='hidden_dims',
    y=['mean_ae_fit_time', 'mean_clf_fit_time'],
    vars_group=['dataset_id', 'clf'],
    xlabel='Latent Dimension',
    ylabel='Fit Time (s)',
    plot_f=bar,
    figsize=(8, 5),
)

## Test weighting of latent loss in VAE

In [39]:
# Accuracy as a function of beta on a log-scaled x axis: one figure per
# dataset_id, one line per value of the 'sampling' column.
# NOTE(review): the saved output of this cell is a FileNotFoundError for
# '../vae_beta.csv', and several other cells read from '../csv/' - confirm
# where this file actually lives.
df = pd.read_csv('../vae_beta.csv')
plot_by_group(df, x='beta', y='mean_test_accuracy', vars_group='dataset_id', var_colors='sampling', xlabel=r'$\beta$', ylabel='Accuracy', scale='log')

FileNotFoundError: [Errno 2] No such file or directory: '../vae_beta.csv'

## Tests for variable latent dim

In [32]:
# Results for the latent-dimension experiments.
df = pd.read_csv('../csv/latent_dim.csv')

# Baseline accuracies, indexed by dataset so they can be looked up by the
# group key of each figure.
df_baseline = pd.read_csv('../csv/baseline.csv')
df_baseline.index = df_baseline['dataset_id']
dict_baseline = df_baseline['mean_test_accuracy'].to_dict()

In [35]:
# Test accuracy vs. latent dim, one line per 'type', with the per-dataset
# baseline accuracy drawn as a dashed horizontal line.
plot_by_group(
    df,
    x='hidden_dims',
    y='mean_test_accuracy',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Compression Ratio',
    ylabel='Accuracy',
    baseline=dict_baseline,
)

In [36]:
# Test reconstruction error vs. latent dim, one line per 'type'.
plot_by_group(
    df,
    x='hidden_dims',
    y='mean_test_reconstruction_error',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Compression Ratio',
    ylabel='Reconstruction Error',
)

In [17]:
# Training (fit) time vs. latent dim, one line per 'type'.
# NOTE(review): the saved output of this cell is KeyError: 'dataset_id', so
# `df` was presumably bound to a different frame when it last ran - confirm
# with Restart & Run All.
plot_by_group(
    df,
    x='hidden_dims',
    y='mean_fit_time',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Latent Dimension',
    ylabel='Training Time',
)

KeyError: 'dataset_id'

In [48]:
# Inference (score) time vs. latent dim, one line per 'type'.
plot_by_group(
    df,
    x='hidden_dims',
    y='mean_score_time',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Latent Dimension',
    ylabel='Inference Time',
)

In [49]:
# Test accuracy against test reconstruction error as a scatter plot with a
# fitted regression line per 'type'.
plot_by_group(
    df,
    x='mean_test_reconstruction_error',
    y='mean_test_accuracy',
    vars_group='dataset_id',
    var_colors='type',
    plot_f=scatterplot,
    xlabel='Test Reconstruction Error',
    ylabel='Test Accuracy',
)

### Plot with PCA:

In [33]:
# Latent-dimension results that additionally include PCA, plotted against the
# same per-dataset baseline as above.
df_wpca = pd.read_csv('../csv/latent_dim_wpca.csv')
plot_by_group(
    df_wpca,
    x='hidden_dims',
    y='mean_test_accuracy',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Compression Ratio',
    ylabel='Accuracy',
    baseline=dict_baseline,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Plots for Presentation

In [18]:
# Larger PNG versions of the accuracy plot for slides.
plot_by_group(
    df,
    x='hidden_dims',
    y='mean_test_accuracy',
    baseline=dict_baseline,
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Latent Dimension',
    ylabel='Accuracy',
    ftype='png',
    figsize=(8, 5),
)

# Reconstruction error for slides, leaving out the rows whose type is 'VAE'.
plot_by_group(
    df[df['type'] != 'VAE'],
    x='hidden_dims',
    y='mean_test_reconstruction_error',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Latent Dimension',
    ylabel='Reconstruction Error',
    ftype='png',
    figsize=(8, 5),
)

## Tests for variable network depth

In [42]:
# Load the results of the network-depth experiments.
df_d = pd.read_csv('../depth.csv')

In [44]:
# Accuracy vs. number of hidden layers: one figure per (dataset_id, type),
# one line per activation.
plot_by_group(
    df_d,
    x='n_layers',
    y='mean_test_accuracy',
    vars_group=['dataset_id', 'type'],
    var_colors='activation',
    xlabel='# of hidden layers',
    ylabel='Accuracy',
)

## SSL Plots

In [45]:
# Load the results of the SSL experiments.
df_ssl = pd.read_csv('../ssl.csv')

In [53]:
# Accuracy vs. amount of labeled data, one line per transformer.
plot_by_group(
    df_ssl,
    x='labeled_data',
    y='mean_test_accuracy',
    vars_group='dataset_id',
    var_colors='transformer',
    xlabel='Labeled Data',
    ylabel='Accuracy',
)

## Plot Robustness

In [3]:
def get_type(s):
    """Collapse a transformer description string into a short label.

    Parameters
    ----------
    s : str
        Raw transformer text, or a label previously returned by this
        function.

    Returns
    -------
    str
        ``'VAE'``, ``'SAE'`` or ``'DAE'`` if ``s`` contains the lower-case
        marker ``'vae'``, ``'sae'`` or ``'dae'`` (checked in that order),
        ``'None'`` if it contains ``'Identity'``, otherwise ``'AE'``.
        A value that already is one of these labels is returned unchanged.
    """
    # The markers are matched case-sensitively, so without this guard a second
    # pass would turn 'VAE', 'SAE', 'DAE' and 'None' into 'AE'.  That happens
    # when the preprocessing cell is re-run, or when a CSV already stores the
    # short labels.
    if s in ('VAE', 'SAE', 'DAE', 'None', 'AE'):
        return s

    # Order matters only if a string contains several markers; the first
    # match wins, as in the original if/elif chain.
    for marker, label in (('vae', 'VAE'), ('sae', 'SAE'), ('dae', 'DAE'), ('Identity', 'None')):
        if marker in s:
            return label
    return 'AE'

def preprocess(df):
    """Normalise the ``transformer`` and ``clf`` columns of ``df`` in place.

    ``clf`` keeps only the text before the first ``'('``; ``transformer`` is
    replaced by the label returned by ``get_type``.  The frame is modified
    directly and nothing is returned.
    """
    def _name_only(text):
        # Drop everything from the first opening parenthesis onwards.
        return text.split('(')[0]

    df['clf'] = df['clf'].apply(_name_only)
    df['transformer'] = df['transformer'].apply(get_type)

In [42]:
# Load the robustness test results.
# NOTE(review): preprocess() is defined in the cell above, but no call to it
# is visible in this notebook - confirm whether df_te should be passed
# through it before plotting.
df_te = pd.read_csv('../csv/robustness_testing.csv')

In [48]:
# Accuracy vs. noise level: one figure per (dataset_id, noise_type, clf),
# one line per transformer.
plot_by_group(
    df_te,
    x='noise_level',
    y='mean_test_accuracy',
    vars_group=['dataset_id', 'noise_type', 'clf'],
    var_colors='transformer',
    xlabel='% Corruption',
    ylabel='Accuracy',
)

### Plot TPOT Robustness

In [28]:
# Load the TPOT robustness results.
# NOTE(review): the frame displayed below has 'Unnamed: 0' and
# 'Unnamed: 0.1' columns, which look like index columns written out on
# earlier saves - confirm and drop them if they are not needed.
df_tpot_all = pd.read_csv('../csv/tpot_w_wo_ae.csv')

In [29]:
# Display the loaded frame for a visual check (pandas truncates long frames).
df_tpot_all

Unnamed: 0.1,Unnamed: 0,clf,dataset_id,mean_test_accuracy,noise_level,noise_type,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,transformer
0,0,TPOT,44,0.950012,0.0,SnP,0.953716,0.942634,0.953686,
1,1,TPOT,44,0.950012,0.0,Gaussian,0.953716,0.942634,0.953686,
2,2,TPOT,44,0.921324,5.0,SnP,0.919166,0.913950,0.930855,
3,3,TPOT,44,0.848302,20.0,Gaussian,0.827901,0.829205,0.887802,
4,4,TPOT,44,0.897201,10.0,SnP,0.880052,0.891786,0.919765,
...,...,...,...,...,...,...,...,...,...,...
83,83,TPOT,44,0.829169,160.0,Gaussian,0.832464,0.819426,0.835616,DAE
84,84,TPOT,44,0.681796,45.0,SnP,0.721643,0.698175,0.625571,DAE
85,85,TPOT,44,0.823519,180.0,Gaussian,0.833768,0.803129,0.833659,DAE
86,86,TPOT,44,0.670058,50.0,SnP,0.716428,0.685137,0.608611,DAE


In [30]:
# Accuracy vs. noise level for TPOT: one figure per (dataset_id, noise_type),
# one line per transformer.
# NOTE(review): rows with an empty 'transformer' (see the table displayed
# above) are left out, because pandas groupby drops missing keys by default.
# Confirm whether those rows should appear as their own line.
plot_by_group(
    df_tpot_all,
    x='noise_level',
    y='mean_test_accuracy',
    vars_group=['dataset_id', 'noise_type'],
    var_colors='transformer',
    xlabel='% Corruption',
    ylabel='Accuracy',
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
