In [1]:
import os.path
import random
import warnings

import numpy as np
import pandas as pd
from himalaya.backend import set_backend

from compare_variance_residual.plotting import plot_variance_partitioning_results, plot_residual_method_results, plot_variance_vs_residual_joint
from compare_variance_residual.residual import residual_method
from compare_variance_residual.simulation import generate_dataset
from compare_variance_residual.variance_partitioning import variance_partitioning

In [2]:
# Select the himalaya compute backend. With on_error="warn" a backend that
# cannot be loaded produces a warning instead of an exception.
backend = set_backend("cupy", on_error="warn")
# NOTE: this silences *every* warning emitted from here on.
warnings.filterwarnings("ignore")
# Seed both Python's and NumPy's global RNGs so a fresh "Restart & Run All"
# is repeatable. NOTE(review): whether generate_dataset draws from either of
# these generators is not visible in this notebook -- confirm.
random.seed(42)
np.random.seed(42)

In [3]:
# Simulation sizes. n_samples (train + test) is what generate_dataset receives;
# n_samples_train is what the two methods receive.
n_targets = 10_000
n_samples_train, n_samples_test = 10_000, 1_000
n_samples = n_samples_train + n_samples_test
noise_scalar = 0.1

# Settings forwarded to variance_partitioning and residual_method.
cv = 20
alphas = np.logspace(start=-4, stop=4, num=10)

In [4]:
# Root results directory: one path component per configuration value, so runs
# with different settings are stored in different directories.
alphas_tag = f"alphas={alphas.min()},{alphas.max()},{len(alphas)}"
path = os.path.join(
    "results",
    f"targets={n_targets}",
    f"samples={n_samples}",
    f"noise={noise_scalar}",
    f"cv={cv}",
    alphas_tag,
    "varying scalars",
)

In [5]:
# When False, the compute loops below skip any scalar setting whose scores CSV
# already exists on disk; set to True to recompute and overwrite those files.
OVERWRITE = False

# Testing proportions of contribution

In [6]:
# Entries of d_list, in order: shared, unique 0, unique 1.
# (Presumably the dimensionality of each component -- passed as-is to generate_dataset.)
d_list = [100, 100, 100]

# Ten evenly spaced values each: one grid runs 0 -> 1, the other 0.5 -> 0.
varying_scalar = np.linspace(start=0, stop=1, num=10)
other_scalars = np.linspace(start=1, stop=0, num=10) / 2

## Test different $a_\mathbf{A}$ (shared contribution)

In [7]:
# Directory that holds this sweep's score CSVs; created if it does not exist.
experiment_path = os.path.join(path, "shared contribution")
os.makedirs(experiment_path, exist_ok=True)

# One [first, second, third] scalar triple per grid point: the first entry
# follows varying_scalar, the second and third both take the paired
# other_scalars value.
scalars_list = [
    [swept, fixed, fixed]
    for swept, fixed in zip(varying_scalar, other_scalars)
]
scalars_list

[[np.float64(0.0), np.float64(0.5), np.float64(0.5)],
 [np.float64(0.1111111111111111),
  np.float64(0.4444444444444444),
  np.float64(0.4444444444444444)],
 [np.float64(0.2222222222222222),
  np.float64(0.3888888888888889),
  np.float64(0.3888888888888889)],
 [np.float64(0.3333333333333333),
  np.float64(0.33333333333333337),
  np.float64(0.33333333333333337)],
 [np.float64(0.4444444444444444),
  np.float64(0.2777777777777778),
  np.float64(0.2777777777777778)],
 [np.float64(0.5555555555555556),
  np.float64(0.2222222222222222),
  np.float64(0.2222222222222222)],
 [np.float64(0.6666666666666666),
  np.float64(0.16666666666666669),
  np.float64(0.16666666666666669)],
 [np.float64(0.7777777777777777),
  np.float64(0.11111111111111116),
  np.float64(0.11111111111111116)],
 [np.float64(0.8888888888888888),
  np.float64(0.05555555555555558),
  np.float64(0.05555555555555558)],
 [np.float64(1.0), np.float64(0.0), np.float64(0.0)]]

In [None]:
# For every scalar setting: simulate a dataset, run variance partitioning and
# the residual method on it, and cache all scores in one CSV. Settings whose
# CSV already exists are skipped unless OVERWRITE is set.
for scalars in scalars_list:
    print(scalars)
    csv_path = os.path.join(experiment_path, f"scores_{scalars}.csv")
    if os.path.exists(csv_path) and not OVERWRITE:
        print("skipping, already exists")
        continue

    Xs, Y = generate_dataset(d_list, scalars, n_targets, n_samples, noise_scalar)

    vp_results = variance_partitioning(Xs, Y, n_samples_train, alphas, cv)
    x1_score, x2_score, joint_score, x1_and_x2_score, vp_x1_unique_score, vp_x2_unique_score = vp_results
    print("variance partitioning done")

    # The first two values returned by residual_method are not used here.
    rm_results = residual_method(Xs, Y, n_samples_train, alphas, cv)
    _, _, x2_to_x1_score, x1_to_x2_score, rm_x1_unique_score, rm_x2_unique_score = rm_results
    print("residual method done")

    # The feature-to-feature scores are right-padded with NaN up to the length
    # of the X_1 / X_2 score vectors so they fit in the same table.
    x2_to_x1_padding = np.full(len(x1_score) - len(x2_to_x1_score), np.nan)
    x1_to_x2_padding = np.full(len(x2_score) - len(x1_to_x2_score), np.nan)

    # Column order matters: the plotting cell selects columns by position.
    columns = {
        r"$f(X_1\cup X_2)\approx Y$": joint_score,
        r"$g(X_1)\approx Y$": x1_score,
        r"$h(X_2)\approx Y$": x2_score,
        r"$i(X_1)\cap h(X_2)$": x1_and_x2_score,
        r"$j(X_1) \setminus h(X_2)$": vp_x1_unique_score,
        r"$k(X_2) \setminus g(X_1)$": vp_x2_unique_score,
        r"$l(X_2)\approx X_1$": np.concatenate([x2_to_x1_score, x2_to_x1_padding]),
        r"$m(X_1)\approx X_2$": np.concatenate([x1_to_x2_score, x1_to_x2_padding]),
        r"$n(X_1 - l(X_2)) \approx Y$": rm_x1_unique_score,
        r"$o(X_2 - m(X_1)) \approx Y$": rm_x2_unique_score,
    }
    scores = pd.DataFrame()
    for label, values in columns.items():
        scores[label] = values
    print(scores.head())

    scores.to_csv(csv_path, index=False)

[np.float64(0.0), np.float64(0.5), np.float64(0.5)]
skipping, already exists
[np.float64(0.1111111111111111), np.float64(0.4444444444444444), np.float64(0.4444444444444444)]


In [None]:
# Load the cached scores of every scalar setting and plot the variance
# partitioning results, the residual method results, and the error of both
# methods' unique-contribution estimates.
for scalars in scalars_list:
    scores_path = os.path.join(experiment_path, f"scores_{scalars}.csv")
    scores = pd.read_csv(scores_path)
    # Columns 0-5 are the variance partitioning scores (joint, X_1, X_2,
    # intersection, X_1 unique, X_2 unique), in the order the compute cell
    # wrote them.
    vp_scores = scores.iloc[:, :6]
    plot_variance_partitioning_results(scalars, vp_scores)
    # Residual-method table: the X_1 and X_2 full-model scores (columns 1-2)
    # next to the four residual-method columns (6-9). axis=1 joins the two
    # slices column-wise; the default axis=0 would stack them as extra rows
    # and fill every non-overlapping cell with NaN.
    rm_scores = pd.concat([scores.iloc[:, 1:3], scores.iloc[:, 6:]], axis=1)
    print(rm_scores.head())
    plot_residual_method_results(scalars, d_list, rm_scores)

    # Error of each unique-contribution estimate relative to the scalar the
    # dataset was generated with (scalars[1] for X_1, scalars[2] for X_2).
    vp_x1_unique_error = scores.iloc[:, 4] - scalars[1]
    vp_x2_unique_error = scores.iloc[:, 5] - scalars[2]
    residual_x1_unique_error = scores.iloc[:, 8] - scalars[1]
    residual_x2_unique_error = scores.iloc[:, 9] - scalars[2]

    # Long-format table with the X_1 rows followed by the X_2 rows.
    # Columns: Voxel Index, VP Error, Residual Error, Feature Space.
    voxel_index = np.arange(len(vp_x1_unique_error))
    error = pd.DataFrame({
        "Voxel Index": np.concatenate(
            [voxel_index, voxel_index]),
        "VP Error": np.concatenate(
            [vp_x1_unique_error, vp_x2_unique_error]),
        "Residual Error": np.concatenate([residual_x1_unique_error, residual_x2_unique_error]),
        "Feature Space": [r"$X_1$"] * len(vp_x1_unique_error) + [r"$X_2$"] * len(vp_x2_unique_error)
    })

    plot_variance_vs_residual_joint(scalars, error)

## Test different $a_\mathbf{B}$ (unique contribution of $X_1$)

In [None]:
# Give this sweep its own results directory. Without this, experiment_path
# still points at the "shared contribution" directory, so this sweep's CSVs
# would be written into (and cache-checked against) the other experiment's
# folder.
experiment_path = os.path.join(path, "unique 0 contribution")
os.makedirs(experiment_path, exist_ok=True)
# One scalar triple per grid point: the second entry follows varying_scalar,
# the first and third both take the paired other_scalars value.
scalars_list = [[other, varying, other] for varying, other in zip(varying_scalar, other_scalars)]
scalars_list

In [None]:
# Compute-and-cache loop for the current scalars_list: simulate, score with
# both methods, and write one CSV per scalar setting (existing CSVs are reused
# unless OVERWRITE is set).
for scalars in scalars_list:
    print(scalars)
    csv_path = os.path.join(experiment_path, f"scores_{scalars}.csv")
    already_cached = os.path.exists(csv_path)
    if already_cached and not OVERWRITE:
        print("skipping, already exists")
        continue

    Xs, Y = generate_dataset(d_list, scalars, n_targets, n_samples, noise_scalar)

    (x1_score, x2_score, joint_score,
     x1_and_x2_score, vp_x1_unique_score, vp_x2_unique_score) = variance_partitioning(
        Xs, Y, n_samples_train, alphas, cv)
    print("variance partitioning done")

    # Only the last four values returned by residual_method are used.
    (_, _, x2_to_x1_score, x1_to_x2_score,
     rm_x1_unique_score, rm_x2_unique_score) = residual_method(
        Xs, Y, n_samples_train, alphas, cv)
    print("residual method done")

    # Right-pad the feature-to-feature scores with NaN up to the length of the
    # X_1 / X_2 score vectors so every column has the same number of rows.
    x2_to_x1_padded = np.concatenate(
        [x2_to_x1_score, np.full(len(x1_score) - len(x2_to_x1_score), np.nan)])
    x1_to_x2_padded = np.concatenate(
        [x1_to_x2_score, np.full(len(x2_score) - len(x1_to_x2_score), np.nan)])

    # (label, values) pairs in the exact column order of the output CSV.
    labelled_scores = [
        (r"$f(X_1\cup X_2)\approx Y$", joint_score),
        (r"$g(X_1)\approx Y$", x1_score),
        (r"$h(X_2)\approx Y$", x2_score),
        (r"$i(X_1)\cap h(X_2)$", x1_and_x2_score),
        (r"$j(X_1) \setminus h(X_2)$", vp_x1_unique_score),
        (r"$k(X_2) \setminus g(X_1)$", vp_x2_unique_score),
        (r"$l(X_2)\approx X_1$", x2_to_x1_padded),
        (r"$m(X_1)\approx X_2$", x1_to_x2_padded),
        (r"$n(X_1 - l(X_2)) \approx Y$", rm_x1_unique_score),
        (r"$o(X_2 - m(X_1)) \approx Y$", rm_x2_unique_score),
    ]
    scores = pd.DataFrame()
    for column_label, column_values in labelled_scores:
        scores[column_label] = column_values
    print(scores.head())

    scores.to_csv(csv_path, index=False)