In [1]:
import os
import random
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from himalaya.backend import set_backend

from compare_variance_residual.residual import residual_method
from compare_variance_residual.simulation import generate_dataset
from compare_variance_residual.variance_partitioning import variance_partitioning


def get_path(alphas, cv, n_samples, n_targets, noise_target):
    path = os.path.join("results", f"targets={n_targets}", f"samples={n_samples}", f"noise={noise_target}", f"cv={cv}",
                        f"alphas={alphas.min()},{alphas.max()},{len(alphas)}", "varying dimensions")
    os.makedirs(path, exist_ok=True)
    return path

In [2]:
def save_scores(d_list_list, scalars, n_targets, n_samples, noise_target, cv, alphas):
    path = get_path(alphas, cv, n_samples, n_targets, noise_target)
    for d_list in d_list_list:
        print(d_list)
        csv_path = os.path.join(path, f"scores_{d_list}.csv")
        scores = pd.DataFrame()
        if os.path.exists(csv_path):
            print("skipping, already exists")
            continue
        Xs, Y = generate_dataset(d_list, scalars, n_targets, n_samples, noise_target)
        print("data generated")
        x1_score, x2_score, joint_score, x1_and_x2_score, vp_x1_unique_score, vp_x2_unique_score = variance_partitioning(
            Xs, Y, n_samples_train, alphas, cv)
        print("variance partitioning done")

        scores["x1_score"] = x1_score
        scores["x2_score"] = x2_score
        scores["vp_joint_score"] = joint_score
        scores["vp_shared_score"] = x1_and_x2_score
        scores["vp_x1_unique_score"] = vp_x1_unique_score
        scores["vp_x2_unique_score"] = vp_x2_unique_score
        del x1_score, x2_score, joint_score, x1_and_x2_score, vp_x1_unique_score, vp_x2_unique_score
        print(scores.head())

        _, _, x2_to_x1_score, x1_to_x2_score, rm_x1_unique_score, rm_x2_unique_score = residual_method(
            Xs, Y, n_samples_train, alphas, cv)
        print("residual method done")
        scores["rm_x2_to_x1_score"] = np.concatenate(
            [x2_to_x1_score, np.full(len(rm_x1_unique_score) - len(x2_to_x1_score), np.nan)])
        scores["rm_x1_to_x2_score"] = np.concatenate(
            [x1_to_x2_score, np.full(len(rm_x1_unique_score) - len(x1_to_x2_score), np.nan)])
        scores["rm_x1_unique_score"] = rm_x1_unique_score
        scores["rm_x2_unique_score"] = rm_x2_unique_score
        del x2_to_x1_score, x1_to_x2_score, rm_x1_unique_score, rm_x2_unique_score
        del Xs, Y
        scores.to_csv(csv_path, index=False)

# Save scores for varying Dimensions

In [3]:
backend = set_backend("cupy", on_error="warn")
warnings.filterwarnings("ignore")
random.seed(42)

In [4]:
d_list = [100, 100, 100]
n_targets = 10000
n_samples_train = 10000
n_samples_test = 100
n_samples = n_samples_train + n_samples_test
noise_target = 0.1
scalars_list = [1 / 3, 1 / 3, 1 / 3]

cv = 10
alphas = np.logspace(-5, 5, 10)

In [5]:
varying_dim = np.logspace(1, 3, 10, dtype=int)


## Shared dimension

In [6]:
d_list_list = [[dim, d_list[1], d_list[2]] for dim in varying_dim]

In [None]:
save_scores(d_list_list, d_list, n_targets, n_samples, noise_target, cv, alphas)

[np.int64(10), 100, 100]
data generated
variance partitioning done
   x1_score  x2_score  vp_joint_score  vp_shared_score  vp_x1_unique_score  \
0  0.541758  0.687327        0.919659         0.309426            0.232332   
1  0.470424  0.549045        0.863074         0.156395            0.314029   
2  0.583584  0.681983        0.895606         0.369961            0.213623   
3  0.596878  0.602543        0.884479         0.314942            0.281936   
4  0.538621  0.614795        0.879366         0.274050            0.264572   

   vp_x2_unique_score  
0            0.377901  
1            0.392650  
2            0.312022  
3            0.287601  
4            0.340745  
residual method done
[np.int64(16), 100, 100]
data generated
variance partitioning done
   x1_score  x2_score  vp_joint_score  vp_shared_score  vp_x1_unique_score  \
0  0.633584  0.495717        0.900534         0.228766            0.404818   
1  0.635178  0.667356        0.898666         0.403869            0.231310  

In [None]:
vp_x1_unique_predicted = []
vp_x2_unique_predicted = []
rm_x1_unique_predicted = []
rm_x2_unique_predicted = []

for d_list in d_list_list:
    scores = pd.read_csv(os.path.join(get_path(alphas, cv, n_samples, n_targets, noise_target), f"scores_{d_list}.csv"))
    vp_x1_unique_predicted.append(scores["vp_x1_unique_score"])
    vp_x2_unique_predicted.append(scores["vp_x2_unique_score"])
    rm_x1_unique_predicted.append(scores["rm_x1_unique_score"])
    rm_x2_unique_predicted.append(scores["rm_x2_unique_score"])

In [None]:
sns.catplot(data=pd.DataFrame(vp_x1_unique_predicted).T)