In [9]:
import os
import random
import warnings

import matplotlib.pyplot as plt
import simplstyles
import numpy as np
import pandas as pd
import seaborn as sns
from himalaya.backend import set_backend

from compare_variance_residual.simulation.residual_method import residual_method
from compare_variance_residual.simulation.dataset import generate_dataset
from compare_variance_residual.simulation.variance_partitioning import variance_partitioning

In [10]:
# Apply the 'nord-light-talk' Matplotlib style sheet to every figure drawn below.
# NOTE(review): presumably this style is made available by the `simplstyles`
# import at the top of the notebook — confirm before removing that import.
plt.style.use('nord-light-talk')

In [11]:
def get_path():
    """Return the directory that holds this experiment's result files.

    The directory is ``results/varying noise`` relative to the current working
    directory. It is created (including parents) if it does not exist yet, so
    the returned path can be written to immediately.
    """
    segments = ("results", "varying noise")
    results_dir = os.path.join(*segments)
    # exist_ok makes repeated calls harmless.
    os.makedirs(results_dir, exist_ok=True)
    return results_dir

In [12]:
def get_experiments():
    """List the noise levels for which a score file has already been saved.

    Only files named ``scores_<noise>.csv`` inside the results directory are
    considered, where ``<noise>`` parses as a float. Any other file (other
    extensions, unrelated CSVs, non-numeric suffixes) is ignored instead of
    raising, so stray files in the directory cannot break the notebook.

    Returns:
        list[float]: the parsed noise levels in ascending order. Sorting makes
        the result independent of the arbitrary order of ``os.listdir``.
    """
    prefix, suffix = "scores_", ".csv"
    experiments = []
    for file in os.listdir(get_path()):
        if not (file.startswith(prefix) and file.endswith(suffix)):
            continue
        # Everything between the fixed prefix and the extension is the noise level.
        variable = file[len(prefix):-len(suffix)]
        try:
            experiments.append(float(variable))
        except ValueError:
            # e.g. "scores_final.csv" — not one of our per-noise-level files.
            continue
    return sorted(experiments)

In [13]:
def save_scores(noise_levels, n_samples=20000, n_samples_train=10000):
    """Compute the scores for each noise level and cache them as CSV files.

    For every entry of ``noise_levels`` a dataset is generated and scored with
    variance partitioning, the residual method (default settings) and the
    residual method with ``use_ols=False``. The resulting columns are written to
    ``scores_<noise>.csv`` in the results directory. Levels whose CSV already
    exists are skipped, so the function can be re-run to resume a sweep.

    Args:
        noise_levels: iterable of noise targets passed to ``generate_dataset``.
        n_samples: number of samples requested from ``generate_dataset``.
        n_samples_train: forwarded as third argument to both scoring methods.

    Returns:
        None. Progress is printed and results are written to disk.
    """

    def _nan_padded(values, target_length):
        # Right-pad `values` with NaN up to `target_length` so it fits next to
        # the longer per-target score columns in the same frame.
        return np.concatenate([values, np.full(target_length - len(values), np.nan)])

    out_dir = get_path()
    for noise_target in noise_levels:
        print(noise_target)
        csv_path = os.path.join(out_dir, f"scores_{noise_target}.csv")
        if os.path.exists(csv_path):
            print("skipping, already exists")
            continue

        table = pd.DataFrame()
        Xs, Y = generate_dataset(n_samples=n_samples, noise_target=noise_target)
        print("data generated")

        # --- variance partitioning -------------------------------------------
        single_x1, single_x2, joint, shared, vp_unique_x1, vp_unique_x2 = variance_partitioning(
            Xs, Y, n_samples_train)
        print("variance partitioning done")
        table["x1_score"] = single_x1
        table["x2_score"] = single_x2
        table["vp_joint_score"] = joint
        table["vp_shared_score"] = shared
        table["vp_x1_unique_score"] = vp_unique_x1
        table["vp_x2_unique_score"] = vp_unique_x2
        # Drop the references early to release memory before the next method runs.
        del single_x1, single_x2, joint, shared, vp_unique_x1, vp_unique_x2
        print(table.head())

        # --- residual method: default call first, then the ridge variant -----
        for prefix, extra_kwargs in (("rm", {}), ("rm_ridge", {"use_ols": False})):
            cross_x2_to_x1, cross_x1_to_x2, unique_x1, unique_x2 = residual_method(
                Xs, Y, n_samples_train, **extra_kwargs)
            print("residual method done")
            # The feature-to-feature scores can be shorter than the unique
            # scores; they are padded to the length of the x1-unique column.
            table[f"{prefix}_x2_to_x1_score"] = _nan_padded(cross_x2_to_x1, len(unique_x1))
            table[f"{prefix}_x1_to_x2_score"] = _nan_padded(cross_x1_to_x2, len(unique_x1))
            table[f"{prefix}_x1_unique_score"] = unique_x1
            table[f"{prefix}_x2_unique_score"] = unique_x2
            del cross_x2_to_x1, cross_x1_to_x2, unique_x1, unique_x2

        del Xs, Y
        table.to_csv(csv_path, index=False)

# Save scores for varying noise levels

In [14]:
# Global run configuration: compute backend, warning policy, RNG seeds, plot style.
SEED = 42

# Select himalaya's "torch_cuda" backend for the solvers used below.
# NOTE(review): presumably this needs a CUDA-capable GPU — confirm on the target machine.
backend = set_backend("torch_cuda")

# NOTE(review): this silences *all* warnings, including ones that may point at
# real numerical problems; consider narrowing the filter.
warnings.filterwarnings("ignore")

# Seed every RNG this notebook can reach. Previously only the stdlib generator
# was seeded, which leaves NumPy-based sampling non-reproducible.
# NOTE(review): if generate_dataset draws from torch, seed torch as well — confirm.
random.seed(SEED)
np.random.seed(SEED)

plt.style.use('nord-light-talk')

In [15]:
# Noise sweep: `nstep` evenly spaced target noise levels covering [0, 1] inclusive.
nstep = 21
noise_levels = np.linspace(start=0.0, stop=1.0, num=nstep)
noise_levels

array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
       0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])

In [None]:
# Compute and cache the scores for every configured noise level. save_scores
# skips levels whose CSV already exists, so re-running this cell only fills in
# the missing ones.
save_scores(noise_levels)

0.0


# Plot scores

In [None]:
# Gather the x1-unique scores of every cached experiment into one long-format
# frame per method: one row per score, tagged with the noise level it came from.
results_dir = get_path()
all_levels = sorted(get_experiments())
print(all_levels)

score_columns = ("vp_x1_unique_score", "rm_x1_unique_score", "rm_ridge_x1_unique_score")
parts_by_column = {column: [] for column in score_columns}
for noise_level in all_levels:
    scores = pd.read_csv(os.path.join(results_dir, f"scores_{noise_level}.csv"))
    for column in score_columns:
        # The scalar noise level is broadcast over the rows of the score column.
        parts_by_column[column].append(
            pd.DataFrame({"noise_target": noise_level, column: scores[column]}))

# Concatenate once per method instead of growing a frame inside the loop, which
# re-copies all previously loaded rows on every iteration.
stacked = {
    column: (pd.concat(parts, ignore_index=True) if parts
             else pd.DataFrame(columns=["noise_target", column]))  # no cached results yet
    for column, parts in parts_by_column.items()
}
vp = stacked["vp_x1_unique_score"]
rm = stacked["rm_x1_unique_score"]
rm_ridge = stacked["rm_ridge_x1_unique_score"]
vp.head()

In [None]:
# Compare the estimated unique contribution of x1 across methods as noise grows.
# Each curve shows the mean over scores with a +/- one standard deviation band.
ax = plt.gca()
curves = (
    (rm_ridge, 'rm_ridge_x1_unique_score', 'Residual Method (Ridge)'),
    (rm, 'rm_x1_unique_score', 'Residual Method (OLS)'),
    (vp, 'vp_x1_unique_score', 'Variance Partitioning'),
)
for frame, score_column, curve_label in curves:
    sns.lineplot(data=frame, x='noise_target', y=score_column, label=curve_label,
                 errorbar='sd', ax=ax)

# Reference line over the configured noise grid.
# NOTE(review): presumably x1's unique share is one third of the non-noise
# variance in generate_dataset — confirm against the simulation settings.
expected_values = 1 / 3 * (1 - noise_levels)
ax.plot(noise_levels, expected_values, linestyle='--', label='Expected Value')

ax.set_xlabel(r"$a_E$")
ax.set_ylabel(r"$R^2$ (mean and sd)")
ax.legend(bbox_to_anchor=(0.3, 1), loc='upper left')