In [3]:
%retina

UsageError: Line magic function `%retina` not found.


In [4]:
%matplotlib inline
from base import B2B, Forward, CrossDecomp, Backward, r_score
from sklearn.cross_decomposition import CCA, PLSRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import scale

import numpy as np
import matplotlib.pyplot as plt
from tqdm import trange
import pickle
import time
import submitit
import seaborn as sns
import pandas as pd
from tqdm import trange
from itertools import product

In [5]:
class Synthetic(object):
    def __init__(self,
                 dim_x=50,         # number of features
                 dim_y=30,         # number of sensors
                 nc=5,             # number of selected features
                 snr=1.0,          # signal-to-noise ratio
                 nonlinear=False):  # number of selected features
        
        # linear transformation
        self.F = np.random.randn(dim_x, dim_y) / np.sqrt(dim_x)

        # masking transformation
        self.E = np.array([0] * (dim_x - nc) + [1] * (nc))

        # features covariance
        self.cov_X = np.random.randn(dim_x, dim_x) / np.sqrt(dim_x)
        self.cov_X = self.cov_X @ self.cov_X.T
        
        # noise covariance
        self.cov_N = np.random.randn(dim_x, dim_x) / np.sqrt(dim_x)
        self.cov_N = self.cov_N @ self.cov_N.T
        
        self.dim_x = dim_x
        self.dim_y = dim_y
        self.nonlinear = nonlinear
        self.snr = snr

    def sample(self, n_samples=1000):
        X = np.random.multivariate_normal(np.zeros(self.dim_x),
                                          self.cov_X, n_samples)
        N = np.random.multivariate_normal(np.zeros(self.dim_x),
                                          self.cov_N, n_samples)

        # observed sensor data
        Y = (self.snr * X @ np.diag(self.E) + N) @ self.F

        if self.nonlinear:
            Y = 1. / (1. + np.exp(-Y))

        # return inputs, outputs, and solution
        return scale(X), scale(Y)


In [6]:
def GridCCA(n_components):
    grid = dict(n_components=np.unique(np.floor(np.linspace(1, n_components, 10)).astype(int)))
    return CrossDecomp(GridSearchCV(CCA(max_iter=1000), grid, cv=5))


def GridPLS(n_components):
    grid = dict(n_components=np.unique(np.floor(np.linspace(1, n_components, 10)).astype(int)))
    return CrossDecomp(GridSearchCV(PLSRegression(max_iter=1000), grid, cv=5))


models = {
    "B2B": B2B,
    "Forward": Forward,
    "Backward": Backward,
    "GridCCA": GridCCA,
    "GridPLS": GridPLS,
}

In [11]:
def run(args=dict()):
    import warnings
    warnings.filterwarnings("ignore")
    
    n_samples = args.get('n_samples', 1000)
    dim_x = args.get('dim_x', 100)
    dim_y = args.get('dim_y', 100)
    snr = args.get('.snr', 1)
    nc = args.get('nc', 5)
    nonlinear = args.get('nonlinear', 0)
    n_seeds = args.get('n_seeds', 10)
        
    results = []

    for seed in range(n_seeds):
        np.random.seed(seed)

        # Make environment
        synthetic = Synthetic(dim_x, dim_y, nc,
                              snr, nonlinear)

        # Make data
        X_train, Y_train = synthetic.sample(n_samples)
        X_test, Y_test = synthetic.sample(n_samples * 10)

        for m, Model in models.items():
            
            model = Model(min(dim_x, dim_y)) if 'Grid' in m else Model()

            # fit model on training data
            start = time.time()
            model.fit(X_train, Y_train)
            duration = time.time() - start

            # Estimate effect from model parameters
            auc = roc_auc_score(synthetic.E, model.E_)

            # Estimate effect from prediction reliability on held-out data
            r_full = model.score(X_test, Y_test)
            r_ko = model.score_knockout(X_test, Y_test)

            r_delta = r_full - r_ko

            r_in = r_delta[synthetic.E==1].mean()
            r_out = r_delta[synthetic.E==0].mean()

            # Store results
            result = dict(dim_x=dim_x, dim_y=dim_y, nc=nc, snr=snr, 
                          nonlinear=nonlinear)
            result["model"] = m
            result["seed"] = seed
            result["r_in"] = r_in
            result["r_out"] = r_out
            result["auc"] = auc
            result['id'] = '_'.join(map(str, [dim_x, dim_y, nc, snr, nonlinear, 
                                              m, seed]))
            result['duration'] = duration
            print(result)
            results.append(result)
    return results

In [8]:
executor = submitit.AutoExecutor(folder='./synthetic/')
executor.update_parameters(timeout_min=60, 
                           partition='learnfair,uninterrupted,scavenge',
                           constraint='pascal', cpus_per_task=12)

In [12]:
jobs = list()
snrs = np.logspace(-3, 3, 10)
dim_xs = np.logspace(1, 3, 10).astype(int)
dim_ys = np.logspace(1, 3, 10).astype(int)
ncs = np.logspace(0, 2, 10).astype(int)

for snr, dim_x, dim_y, nc in product(snrs, dim_xs, dim_ys, ncs):
    args = dict(snr=snr, n_seeds=5,
                dim_x=dim_x, dim_y=dim_y, nc=nc)
    jobs.append([args,])
jobs = executor.map_array(run, *zip(*jobs))

In [13]:
print(len(jobs))

10000


In [16]:
j = jobs[0]

In [None]:
j.re

In [None]:
results = list()
for job in jobs:
    print(len(results))
    try:
        results.extend(job.results())
    except:
        pass

0
1
2
3
4
5
5
5
5
