In [8]:
import os
import random
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from himalaya.backend import set_backend
from matplotlib import pyplot as plt

from compare_variance_residual.residual import residual_method
from compare_variance_residual.simulation import generate_dataset, save_scores
from compare_variance_residual.variance_partitioning import variance_partitioning

In [9]:
def get_path(alphas, cv, n_targets):
    path = os.path.join("results", f"targets={n_targets}", f"cv={cv}",
                        f"alphas={alphas.min()},{alphas.max()},{len(alphas)}", "varying dimensions")
    os.makedirs(path, exist_ok=True)
    return path

# Save scores for varying Dimensions

In [10]:
backend = set_backend("cupy", on_error="warn")
warnings.filterwarnings("ignore")
random.seed(42)

In [11]:
d_list = [100, 100, 100]
n_targets = 10000
n_samples_train = 10000
n_samples_test = 100
n_samples = n_samples_train + n_samples_test
noise_target = 0.1
scalars = [1 / 3, 1 / 3, 1 / 3]

cv = 10
alphas = np.logspace(-5, 5, 10)
path = get_path(alphas, cv, n_targets)

In [12]:
varying_dim = np.logspace(1, 3, 10, dtype=int)
varying_dim = list(map(int, varying_dim))
varying_dim

[10, 16, 27, 46, 77, 129, 215, 359, 599, 1000]

## Shared dimension

In [13]:
d_list_list = [[int(dim), d_list[1], d_list[2]] for dim in varying_dim]

In [14]:
save_scores(path, d_list_list, scalars, n_targets, n_samples, noise_target, cv, alphas)

TypeError: save_scores() missing 1 required positional argument: 'alphas'

## Unique dimension

In [9]:
d_list_list = [[d_list[0], int(dim), d_list[2]] for dim in varying_dim]

In [10]:
save_scores(d_list_list, d_list, n_targets, n_samples, noise_target, cv, alphas)

[100, 10, 100]
skipping, already exists
[100, 16, 100]
skipping, already exists
[100, 27, 100]
skipping, already exists
[100, 46, 100]
skipping, already exists
[100, 77, 100]
data generated
variance partitioning done
residual method done
[100, 129, 100]
data generated
variance partitioning done
residual method done
[100, 215, 100]
data generated
variance partitioning done
residual method done
[100, 359, 100]
data generated
variance partitioning done
residual method done
[100, 599, 100]
data generated
variance partitioning done
residual method done
[100, 1000, 100]
data generated
variance partitioning done
residual method done


# Plot scores

## Shared dimension

In [11]:
vp = pd.DataFrame()
rm = pd.DataFrame()

for i, dim in enumerate(varying_dim):
    shared_dims = [int(dim), d_list[1], d_list[2]]
    scores = pd.read_csv(os.path.join(get_path(alphas, cv, n_targets), f"scores_{shared_dims}.csv"))

    vp_scores = pd.DataFrame({
        'shared_dim': dim,
        'vp_x1_unique_score': scores['vp_x1_unique_score'],
    }, index=[i])

    rm_scores = pd.DataFrame({
        'shared_dim': dim,
        'rm_x1_unique_score': scores['rm_x1_unique_score'],
    }, index=[i])

    vp = pd.concat([vp, vp_scores], ignore_index=True)
    rm = pd.concat([rm, rm_scores], ignore_index=True)
vp.head()
rm.head()

FileNotFoundError: [Errno 2] No such file or directory: 'results/targets=10000/cv=10/alphas=1e-05,100000.0,10/varying dimensions/scores_[np.int64(10), 100, 100].csv'

In [None]:
sns.lineplot(data=vp, x='shared_dim', y='vp_x1_unique_score', label='Variance Partitioning', palette='C2')
plt.fill_between(vp['shared_dim'], vp['vp_x1_lower_se'], vp['vp_x1_upper_se'], alpha=0.3)
sns.lineplot(data=rm, x='shared_dim', y='rm_x1_unique_score', label='Residual Method')
plt.fill_between(rm['shared_dim'], rm['rm_x1_lower_se'], rm['rm_x1_upper_se'], alpha=0.3)
plt.axhline(scalars[1], linestyle='--', label='true unique variance')
plt.xlabel(r"$d_\mathbf{A}$")
plt.ylabel(r"Predicted Unique Variance (avg. $R^2$)")
plt.xscale('log')

## Unique dimension

In [None]:
vp = pd.DataFrame()
rm = pd.DataFrame()

for i, dim in enumerate(varying_dim):
    unique_dims = [d_list[0], dim, d_list[2]]
    scores_unique = pd.read_csv(os.path.join(get_path(alphas, cv, n_targets), f"scores_{unique_dims}.csv"))

    vp_scores = pd.DataFrame({
        'unique_dim': dim,
        'vp_x1_unique_score': scores_unique['vp_x1_unique_score'],
    }, index=[i])

    rm_scores = pd.DataFrame({
        'unique_dim': dim,
        'rm_x1_unique_score': scores_unique['rm_x1_unique_score'],
    }, index=[i])

    vp = pd.concat([vp, vp_scores], ignore_index=True)
    rm = pd.concat([rm, rm_scores], ignore_index=True)

In [None]:
sns.lineplot(data=vp, x='unique_dim', y='vp_x1_unique_score', label='Variance Partitioning')
plt.fill_between(vp['unique_dim'], vp['vp_x1_lower_se'], vp['vp_x1_upper_se'], alpha=0.3)
sns.lineplot(data=rm, x='unique_dim', y='rm_x1_unique_score', label='Residual Method')
plt.fill_between(rm['unique_dim'], rm['rm_x1_lower_se'], rm['rm_x1_upper_se'], alpha=0.3)
plt.axhline(scalars[1], linestyle='--', label='true unique variance')
plt.xlabel(r"$d_\mathbf{B}$")
plt.ylabel(r"Predicted Unique Variance (avg. $R^2$)")
plt.xscale('log')