In [1]:
1*2

2

In [2]:
import sys
sys.path.insert(1, '../')

import os
import GPUtil
import importlib

def sgpu():
    GPUtil.showUtilization()

def rl(module):
    importlib.reload(module)

In [3]:
sgpu()

| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |
|  1 |  0% |  0% |
|  2 |  0% |  0% |
|  3 |  0% |  0% |
|  4 |  0% |  0% |
|  5 |  0% |  0% |
|  6 |  0% |  0% |
|  7 |  0% |  0% |


In [4]:
import pandas as pd
import os
import gc
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

import torch
import gpytorch

from scipy.stats import pearsonr
from scipy.special import binom as binom
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

In [5]:
import EpiK.models as models

In [6]:
output_device = 0
n_devices = torch.cuda.device_count()
models.set_params(output_device, n_devices)
print("number of GPUs = {}; output device = {}".
      format(n_devices, torch.cuda.current_device()))

number of GPUs = 8; output device = 0


In [7]:
from EpiK.functions import get_data, get_envs, set_data_path
set_data_path("../matsui_data/")

In [8]:
# training sizes
props = [.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95]

In [9]:
# check_point sizes
partitions = [2, 2, 2, 2, 4, 4, 4, 4, 16, 32, 64, 120]
pd.DataFrame({"props":props, "partitions":partitions}).to_csv("partition_sizes.csv", index=None)

In [10]:
partition_sizes = pd.read_csv("partition_sizes.csv")

### Data

In [11]:
env_list = get_envs()
env = env_list[5]

In [12]:
geno_t, pheno = get_data(env)

  geno_t = torch.tensor(geno_t, dtype=torch.float)


In [13]:
inds_sub = np.where(np.array(pheno.pheno < -0.6) == False)[0]

### Loops to get R2 scores

In [14]:
import EpiK.functions
rl(EpiK.functions)
from EpiK.functions import train_model_cv

EpiK.functions.output_device = output_device
EpiK.functions.n_devices = n_devices

In [2]:
results = pd.read_csv("r2s_epik.csv")
results

NameError: name 'pd' is not defined

In [16]:
from EpiK.kernels import DiKernel
ker = DiKernel()
ker.raw_lda = torch.nn.Parameter(torch.tensor(-8.))
ker.raw_eta = torch.nn.Parameter(torch.tensor(-12.))

In [17]:
i = 11
np.random.seed(100)
train_size = np.round(props[i]*len(inds_sub)).astype('int')
sub = np.random.choice(inds_sub, train_size)
sub_t = np.random.choice(list(set(inds_sub).difference(sub)), 2000)
train_x = geno_t[sub]
train_y = torch.tensor(np.array(pheno.pheno[sub]), dtype=torch.float32)
test_x = geno_t[sub_t]
test_y = torch.tensor(np.array(pheno.pheno[sub_t]), dtype=torch.float32)
train_x, train_y = train_x.contiguous(), train_y.contiguous()
test_x, test_y = test_x.contiguous(), test_y.contiguous()
train_x, train_y = train_x.to(output_device), train_y.to(output_device)
test_x, test_y = test_x.to(output_device), test_y.to(output_device)

# train model
print("training GP model using CV")
ker, likelihood = train_model_cv(ker, train_x, train_y, 50, .1)


# make predictions - build model
torch.cuda.empty_cache()
model = models.ExactGPModel(train_x, train_y, likelihood, ker).to(output_device)


training GP model using CV
working on iteration 0
working on iteration 10
working on iteration 20
working on iteration 30
working on iteration 40


In [None]:
partition_size = partition_sizes.partitions[i]

model.eval()
likelihood.eval()
with gpytorch.beta_features.checkpoint_kernel(train_x.shape[0]//int(partition_size)):
    f_preds = model(test_x)

f_mean = f_preds.mean.cpu().detach().numpy()
y_test = test_y.detach().cpu().numpy()
r2_score = r2(y_test, f_mean)                
results.iloc[i, 2] = r2_score

results.to_csv("r2s_epik_220809.csv", index=False)

In [None]:
for i in range(len(props)):
    
    print("working on training data proportion = {}".format(props[i]))
    
    if results.iloc[i].r2 != 0.:
        print("r2_score found, skipping to next")
    
    else:
        print("no r2_score recorded, proceeding to calculate")
        np.random.seed(100)
        train_size = np.round(props[i]*len(inds_sub)).astype('int')
        sub = np.random.choice(inds_sub, train_size)
        sub_t = np.random.choice(list(set(inds_sub).difference(sub)), 4000)
        train_x = geno_t[sub]
        train_y = torch.tensor(np.array(pheno.pheno[sub]), dtype=torch.float32)
        test_x = geno_t[sub_t]
        test_y = torch.tensor(np.array(pheno.pheno[sub_t]), dtype=torch.float32)
        train_x, train_y = train_x.contiguous(), train_y.contiguous()
        test_x, test_y = test_x.contiguous(), test_y.contiguous()
        train_x, train_y = train_x.to(output_device), train_y.to(output_device)
        test_x, test_y = test_x.to(output_device), test_y.to(output_device)

        # train model
        print("training GP model using CV")
        ker, likelihood = train_model_cv(ker, train_x, train_y, 50, .1)
        

        # make predictions - build model
        torch.cuda.empty_cache()
        model = models.ExactGPModel(train_x, train_y, likelihood, ker).to(output_device)

        # make predictions - loop to increase partition_size until passes
        loop = True
        while loop:
            try: 
                partition_size = partition_sizes.iloc[i, 1]
                print("try doing inference under partition size = {}".format(partition_size))
                import gc
                gc.collect()
                torch.cuda.empty_cache()                

                model.eval()
                likelihood.eval()
                with gpytorch.beta_features.checkpoint_kernel(train_x.shape[0]//int(partition_size)):
                    f_preds = model(test_x)
                    
                f_mean = f_preds.mean.cpu().detach().numpy()
                y_test = test_y.detach().cpu().numpy()
                r2_score = r2(y_test, f_mean)                
                results.iloc[i, 2] = r2_score
                loop = False            

            except: 
                print("failed on current partition_size, increasing by 5")
                partition_sizes.iloc[i, 1] = partition_sizes.iloc[i,1] + 5
                partition_sizes.to_csv("partition_sizes.csv")            

working on training data proportion = 0.01
r2_score found, skipping to next
working on training data proportion = 0.05
r2_score found, skipping to next
working on training data proportion = 0.1
r2_score found, skipping to next
working on training data proportion = 0.2
r2_score found, skipping to next
working on training data proportion = 0.3
r2_score found, skipping to next
working on training data proportion = 0.4
r2_score found, skipping to next
working on training data proportion = 0.5
r2_score found, skipping to next
working on training data proportion = 0.6
r2_score found, skipping to next
working on training data proportion = 0.7
r2_score found, skipping to next
working on training data proportion = 0.8
r2_score found, skipping to next
working on training data proportion = 0.9
r2_score found, skipping to next
working on training data proportion = 0.99
no r2_score recorded, proceeding to calculate
training GP model using CV
working on iteration 0
working on iteration 10
working on



failed on current partition_size, increasing by 5
try doing inference under partition size = 105
failed on current partition_size, increasing by 5
try doing inference under partition size = 110
failed on current partition_size, increasing by 5
try doing inference under partition size = 115
failed on current partition_size, increasing by 5
try doing inference under partition size = 120
failed on current partition_size, increasing by 5
try doing inference under partition size = 125
failed on current partition_size, increasing by 5
try doing inference under partition size = 130
failed on current partition_size, increasing by 5
try doing inference under partition size = 135
failed on current partition_size, increasing by 5
try doing inference under partition size = 140
failed on current partition_size, increasing by 5
try doing inference under partition size = 145
failed on current partition_size, increasing by 5
try doing inference under partition size = 150
failed on current partition_si

In [23]:
results.to_csv("r2_epik.csv", index=None)