In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import os
import re

In [3]:
def string_to_tuple(input):
    """Parse a tuple-like string such as ``'(1.0, 2.5)'`` into a Series of floats.

    Parameters
    ----------
    input : str
        Textual tuple, optionally wrapped in parentheses and padded with
        whitespace. Elements are separated by commas; spacing around the
        commas is irrelevant and a trailing comma (``'(4.0,)'``) is accepted.

    Returns
    -------
    pandas.Series
        One float per element, in the order they appear in the string.

    Raises
    ------
    ValueError
        If any non-empty element cannot be converted to ``float``.
    """
    # Drop outer padding first so the parentheses are actually at the edges.
    inner = input.strip().strip('()')
    # Split on the bare comma instead of ', ' so '(1,2)' parses as well;
    # empty fields (trailing comma, empty tuple) are skipped.
    fields = (field.strip() for field in inner.split(','))
    return pd.Series([float(field) for field in fields if field], dtype=float)

def expand_col(df, col):
    """Replace a tuple-valued column by one numeric column per tuple element.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the column to expand. Its values are either tuple-like
        strings (parsed with ``string_to_tuple``) or already sequences.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``col`` and with new columns ``{col}_0``,
        ``{col}_1``, ... appended.
    """
    # Nothing to inspect or expand in an empty frame.
    if df.empty:
        return df.drop(col, axis=1)

    values = df[col]
    # Look at the first row positionally: the index need not contain label 0.
    if isinstance(values.iloc[0], str):
        col_expanded = values.apply(string_to_tuple)
    else:
        # Values are already sequences (or scalars); previously this path left
        # `col_expanded` undefined and raised UnboundLocalError.
        col_expanded = pd.DataFrame(values.tolist(), index=df.index)

    col_expanded.columns = [f'{col}_{i}' for i in range(len(col_expanded.columns))]
    df_new = pd.concat([df, col_expanded], axis=1)
    return df_new.drop(col, axis=1)

def index_unique_vals(df, col):
    """Map every distinct value of ``df[col]`` to a consecutive integer index.

    Indices follow the order of first appearance. Missing values (NaN/None)
    are treated as a value of their own and receive an index too.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        Column whose distinct values are indexed.

    Returns
    -------
    dict
        ``{value: index}`` with indices ``0 .. n_unique - 1``.
    """
    uniques = df[col].unique()
    # Size the index range from `uniques` itself: `nunique()` ignores missing
    # values while `unique()` keeps them, which made zip() drop the last entry.
    return dict(zip(uniques, np.arange(len(uniques))))

In [4]:
def lineplot(data, ax, x, y, label=None):
    """Draw ``data[y]`` against ``data[x]`` as a line, ordered by ``x``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns; it is left untouched.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    x, y : str
        Column names for the horizontal and vertical axis.
    label : str, optional
        Legend label for the line.
    """
    # Sort a copy: `data` is usually a groupby slice of the caller's frame, and
    # sorting it in place mutates caller state (and triggers pandas warnings).
    ordered = data.sort_values(by=x)
    ax.plot(ordered[x], ordered[y], label=label)

def scatterplot(data, ax, x, y, label=None, regression=True):
    """Scatter ``data[y]`` against ``data[x]`` and optionally overlay a linear fit.

    Parameters
    ----------
    data : pandas.DataFrame
    ax : matplotlib.axes.Axes
        Axes to draw on.
    x, y : str
        Column names for the horizontal and vertical axis.
    label : str, optional
        Legend label for the scatter points.
    regression : bool, default True
        When true, fit a ``LinearRegression`` of y on x and draw the fitted
        line, labelled with its R^2 rounded to two decimals.
    """
    xs = data[x]
    ys = data[y]
    # scikit-learn expects a 2-D feature matrix, hence the trailing axis.
    features = np.expand_dims(xs, -1)
    ax.scatter(xs, ys, label=label)

    if not regression:
        return

    model = LinearRegression()
    model.fit(features, ys)
    fitted = model.predict(features)
    score = r2_score(ys, fitted)
    ax.plot(xs, fitted, label=r'$R^2$ = ' + f'{score:.2f}')
        
def smooth(data, ax, x, y, label=None, k=7, x_range=(0.05, 2), n_points=200):
    """Draw a smooth interpolating B-spline through ``(data[x], data[y])``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns; it is left untouched. The ``x`` values
        must be distinct, and there must be at least ``k + 1`` rows.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    x, y : str
        Column names for the horizontal and vertical axis.
    label : str, optional
        Legend label for the curve.
    k : int, default 7
        Degree of the interpolating spline.
    x_range : tuple of float, default (0.05, 2)
        Lower and upper bound of the grid on which the spline is evaluated.
    n_points : int, default 200
        Number of evenly spaced evaluation points within ``x_range``.
    """
    # Imported here because the notebook's import cell does not provide it;
    # without this the function raised NameError on first use.
    from scipy.interpolate import make_interp_spline

    # The spline fit needs increasing abscissas, so order a copy by x first.
    ordered = data.sort_values(by=x)
    spl = make_interp_spline(ordered[x].to_numpy(), ordered[y].to_numpy(), k=k)
    x_new = np.linspace(x_range[0], x_range[1], n_points)
    y_new = spl(x_new)
    ax.plot(x_new, y_new, label=label)

In [22]:
def plot_by_group(data, x, y, vars_group, var_colors=None, plot_f=lineplot, figsize=(4.5, 3), xlabel=None, ylabel=None, scale=None):
    """Save one SVG figure of ``y`` against ``x`` for every group in ``data``.

    Figures are written to ``plots/{y}_vs_{x}/``; each file is named after the
    group key (tuple keys are joined with underscores).

    Parameters
    ----------
    data : pandas.DataFrame
    x, y : str
        Column names for the horizontal and vertical axis.
    vars_group : str or list of str
        Column(s) to group by; each group yields its own figure.
    var_colors : str or list of str, optional
        Column(s) that split a figure into separately drawn, labelled series.
        A legend is added only when this is given.
    plot_f : callable, default lineplot
        Drawing function called as ``plot_f(frame, ax, x, y[, label])``.
    figsize : tuple, default (4.5, 3)
        Figure size passed to ``plt.subplots``.
    xlabel, ylabel : str, optional
        Axis labels; default to the column names ``x`` and ``y``.
    scale : str, optional
        Scale for the x-axis (e.g. ``'log'``); left at the default when falsy.
    """
    # Label fallbacks and the target folder do not depend on the group.
    xlabel = xlabel if xlabel else x
    ylabel = ylabel if ylabel else y

    folder = f'plots/{y}_vs_{x}'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder, exist_ok=True)

    for group_params, data_plot in data.groupby(vars_group):
        fig, ax = plt.subplots(figsize=figsize)
        try:
            if var_colors:
                for label, data_color in data_plot.groupby(var_colors):
                    plot_f(data_color, ax, x, y, label)
                ax.legend()
            else:
                plot_f(data_plot, ax, x, y)

            ax.grid()
            ax.set_xlabel(xlabel)
            ax.set_ylabel(ylabel)

            # Group keys may be tuples, e.g. (40996, 'xgb') -> "40996_'xgb'".
            fname = str(group_params).strip('()').replace(', ', '_')

            # Address this figure explicitly rather than pyplot's "current" one.
            if scale:
                ax.set_xscale(scale)
            fig.tight_layout()
            fig.savefig(f'{folder}/{fname}.svg')
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

## Test weighting of latent loss in VAE

In [23]:
# Test accuracy against beta on a log-scaled x-axis:
# one figure per dataset, one line per sampling value.
df = pd.read_csv('../vae_beta.csv')
plot_by_group(
    df,
    x='beta',
    y='mean_test_accuracy',
    vars_group='dataset_id',
    var_colors='sampling',
    xlabel=r'$\beta$',
    ylabel='Test Accuracy',
    scale='log',
)

## Tests for variable latent dim

In [6]:
# Load the latent-dim results: upper-case the `type` values, drop `params`,
# fill missing values with 0 and average the three per-split test accuracies.
df = (
    pd.read_csv('../latent_dim.csv')
    .assign(type=lambda frame: frame['type'].apply(lambda name: name.upper()))
    .drop(columns='params')
    .fillna(0)
    .assign(
        mean_test_accuracy=lambda frame: frame[
            [f'split{i}_test_accuracy' for i in range(3)]
        ].mean(axis=1)
    )
)

In [45]:
# Test accuracy vs. latent dimension: one figure per dataset, one line per type.
plot_by_group(
    df,
    x='hidden_dims',
    y='mean_test_accuracy',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Latent Dimension',
    ylabel='Test Accuracy',
)

In [8]:
# Training time vs. latent dimension: one figure per dataset, one line per type.
plot_by_group(
    df,
    x='hidden_dims',
    y='mean_fit_time',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Latent Dimension',
    ylabel='Training Time',
)

In [None]:
# Inference time vs. latent dimension: one figure per dataset, one line per type.
plot_by_group(
    df,
    x='hidden_dims',
    y='mean_score_time',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Latent Dimension',
    ylabel='Inference Time',
)

In [10]:
# Plot reconstruction error vs. latent dim
# Average the error over all remaining rows (e.g. the different `clf` values)
# for each dataset / latent dim / type, so every coloured line has exactly one
# value per x. The averaged frame was previously computed without `type` and
# then never used; the raw `df` was plotted instead.
df_clf_averages = (
    df.groupby(['dataset_id', 'hidden_dims', 'type'])
    .aggregate({'mean_test_reconstruction_error': 'mean'})
    .reset_index()
)
plot_by_group(
    df_clf_averages,
    x='hidden_dims',
    y='mean_test_reconstruction_error',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Latent Dimension',
    ylabel='Test Reconstruction Error',
)

In [71]:
# Test accuracy vs. reconstruction error as a scatter plot with a linear fit:
# one figure per dataset, one point cloud per classifier.
plot_by_group(
    df,
    x='mean_test_reconstruction_error',
    y='mean_test_accuracy',
    vars_group='dataset_id',
    var_colors='clf',
    plot_f=scatterplot,
    xlabel='Test Reconstruction Error',
    ylabel='Test Accuracy',
)

## Tests for variable network depth

In [6]:
# Load the network-depth results; rows without an activation are labelled 'linear'.
df_d = pd.read_csv('gridsearchcv_results_010.csv')
# Assign the filled column back: calling fillna(inplace=True) on the selected
# column is a chained in-place update, which pandas deprecates and which does
# not reach `df_d` under Copy-on-Write.
df_d['activation'] = df_d['activation'].fillna('linear')

In [43]:
# Test accuracy vs. number of hidden layers:
# one figure per (dataset, classifier), one line per activation.
plot_by_group(
    df_d,
    x='n_layers',
    y='mean_test_accuracy',
    vars_group=['dataset_id', 'clf'],
    var_colors='activation',
    xlabel='# of hidden layers',
    ylabel='Test Accuracy',
)

## Add baselines 

In [33]:
# Baseline rows are entered as n_layers == 0 and repeated once per activation
# found in `df_d`, so each activation's series gets the same baseline rows.
df_baselines = pd.read_csv('gridsearchcv_results_011.csv')
df_baselines['n_layers'] = 0

activations = df_d.activation.unique()
baselines_dict = {activation: df_baselines for activation in activations}

# Concatenating a dict yields a two-level index whose outer level is the
# activation; name that level so it can be turned into a regular column.
df_baselines_expanded = pd.concat(baselines_dict)
df_baselines_expanded.index = df_baselines_expanded.index.set_names(['activation', None])
df_baselines_expanded = (
    df_baselines_expanded
    .reset_index(level=0)
    .reset_index(drop=True)
)

df_d_wbaselines = pd.concat([df_baselines_expanded, df_d])

Unnamed: 0,activation,dataset_id,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,mean_test_accuracy,...,rank_test_accuracy,clf,n_layers,hidden_dims,split0_test_reconstruction_error,split1_test_reconstruction_error,split2_test_reconstruction_error,mean_test_reconstruction_error,std_test_reconstruction_error,rank_test_reconstruction_error
0,linear,40996,232.430733,18.668728,256.604804,6.405878,0.778906,0.763468,0.776240,0.772871,...,3,svm_rbf,0,,,,,,,
1,linear,40996,62.133320,0.568851,0.036274,0.001740,0.852019,0.854498,0.846655,0.851057,...,2,log_reg,0,,,,,,,
2,linear,40996,968.237081,16.391548,1.780693,0.462679,0.900789,0.905242,0.897570,0.901200,...,1,xgb,0,,,,,,,
3,linear,40668,12.693643,0.448393,6.994535,0.257711,0.685599,0.427284,0.567787,0.560223,...,3,svm_rbf,0,,,,,,,
4,linear,40668,5.883384,0.221668,0.004485,0.000176,0.734358,0.707580,0.724100,0.722013,...,1,log_reg,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,sigmoid,44,2.141612,0.294542,0.010668,0.000288,0.605606,0.606258,0.606001,0.605955,...,66,log_reg,4,0.4,0.003015,0.003881,0.002781,0.003226,0.000473,3.0
296,sigmoid,44,2.388092,0.144338,0.007318,0.000170,0.757497,0.819426,0.746249,0.774391,...,60,xgb,4,0.4,0.003016,0.003881,0.002780,0.003226,0.000473,4.0
297,sigmoid,44,2.412390,0.244413,0.059005,0.000413,0.597132,0.627771,0.606001,0.610301,...,65,svm_rbf,5,0.4,0.003021,0.003880,0.002775,0.003225,0.000474,6.0
298,sigmoid,44,1.918400,0.276939,0.011987,0.000183,0.605606,0.606258,0.606001,0.605955,...,66,log_reg,5,0.4,0.003017,0.003885,0.002788,0.003230,0.000473,2.0


In [44]:
# Same accuracy-vs-depth plot, now including the baseline rows at n_layers == 0.
plot_by_group(
    df_d_wbaselines,
    x='n_layers',
    y='mean_test_accuracy',
    vars_group=['dataset_id', 'clf'],
    var_colors='activation',
    xlabel='# of hidden layers',
    ylabel='Test Accuracy',
)

In [41]:
# Reconstruction error vs. number of hidden layers, averaged over the
# remaining rows for each dataset / depth / activation.
df_d_clf_averages = (
    df_d.groupby(['dataset_id', 'n_layers', 'activation'])
    .aggregate({'mean_test_reconstruction_error': 'mean'})
    .reset_index()
)
# The x-axis shows `n_layers`, so label it like the other depth plots
# (it was previously labelled 'Latent Dimension').
plot_by_group(
    df_d_clf_averages,
    x='n_layers',
    y='mean_test_reconstruction_error',
    var_colors='activation',
    vars_group='dataset_id',
    xlabel='# of hidden layers',
    ylabel='Test Reconstruction Error',
)

In [8]:
# Test accuracy vs. reconstruction error as a scatter plot with a linear fit:
# one figure per (dataset, activation), one point cloud per classifier.
plot_by_group(
    df_d,
    x='mean_test_reconstruction_error',
    y='mean_test_accuracy',
    plot_f=scatterplot,
    var_colors='clf',
    vars_group=['dataset_id', 'activation'],
    xlabel='Test Reconstruction Error',
    ylabel='Test Accuracy',
)

## Plot Test Results of AE Variants with Variable Latent Dim

In [48]:
# Load the results, strip everything matching "param.*__" from the column
# names, and drop the `params` column.
df = (
    pd.read_csv('../gridsearchcv_results_026.csv')
    .rename(columns=lambda name: re.sub("param.*__", "", name))
    .drop(columns='params')
)

In [49]:
# Accuracy vs. latent dimensionality: one figure per dataset, one line per type.
plot_by_group(
    df,
    x='hidden_dims',
    y='mean_test_accuracy',
    vars_group='dataset_id',
    var_colors='type',
    xlabel='Latent Dimensionality',
    ylabel='Accuracy',
)

## Playground

In [9]:
# Load the CSV whose columns are plotted in the next cell.
df = pd.read_csv('../sag_lbfgs_max_iter.csv')

In [10]:
# Mean test score vs. max_iter: one figure per dataset, one line per solver.
plot_by_group(
    df,
    x='param_max_iter',
    y='mean_test_score',
    vars_group='dataset_id',
    var_colors='param_solver',
)