In [1]:
# Install required packages
%%capture
!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
!pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
!pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
!pip uninstall cupy-cuda115 -y
!pip install cupy-cuda112
!pip install anndata

In [2]:
import cupy
import gc
import logging
import os
import time
import tqdm

import anndata as ad
import pandas as pd
import numpy as np

from datetime import datetime
from google.colab import drive
from scipy import stats
from cuml.ensemble import RandomForestRegressor
from sklearn.feature_selection import r_regression, SelectKBest
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
## MOUNT DRIVE
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [18]:
## DATA DIRECTORY CONSTANT
DATA_DIR = '/content/gdrive/My Drive/Thesis/dance/dance/data/'
PREDS_DIR = '/content/gdrive/My Drive/Thesis/dance/dance/data/predictions'
SUBTASK = 'openproblems_competition_cite_fltr_prep_altstrat_denoised_rna'
RANDOM_SEED = 123

In [19]:
## FILE LOAD AND TRAIN VAL SPLIT CREATION
# Load anndata files
input_train_mod1 = ad.read_h5ad(os.path.join(
    DATA_DIR, SUBTASK, f'{SUBTASK}.censor_dataset.output_train_mod1.h5ad'))
input_train_mod2 = ad.read_h5ad(os.path.join(
    DATA_DIR, SUBTASK, f'{SUBTASK}.censor_dataset.output_train_mod2.h5ad'))
input_test_mod1 = ad.read_h5ad(os.path.join(
    DATA_DIR, SUBTASK, f'{SUBTASK}.censor_dataset.output_test_mod1.h5ad'))
input_test_mod2 = ad.read_h5ad(os.path.join(
    DATA_DIR, SUBTASK, f'{SUBTASK}.censor_dataset.output_test_mod2.h5ad'))

In [20]:
# Save gene and protein names before deleting variables
gene_names = input_train_mod1.var_names
prot_names = input_train_mod2.var_names

In [21]:
# Create random indexes for splits if task is not altstrat
if 'altstrat' not in SUBTASK:
  if RANDOM_SEED:
    np.random.seed(RANDOM_SEED)
  idx = np.random.permutation(input_train_mod1.shape[0])
  train_idx = idx[:int(idx.shape[0] * 0.88889)]
  val_idx = idx[int(idx.shape[0] * 0.88889):]

else:
  train_bool = input_train_mod1.obs['cell_type'] != 'MkP'
  val_bool = input_train_mod1.obs['cell_type'] == 'MkP'

  train_idx = input_train_mod1.obs.index[train_bool]
  val_idx = input_train_mod1.obs.index[val_bool]

train_mod1 = cupy.asarray(input_train_mod1[train_idx,:].X.toarray())
train_mod2 = cupy.asarray(input_train_mod2[train_idx,:].X.toarray())

val_mod1 = cupy.asarray(input_train_mod1[val_idx,:].X.toarray())
val_mod2 = cupy.asarray(input_train_mod2[val_idx,:].X.toarray())

test_mod1 = cupy.asarray(input_test_mod1.X.toarray())
test_mod2 = cupy.asarray(input_test_mod2.X.toarray())

In [22]:
del input_train_mod1
del input_train_mod2
gc.collect()

1133

In [23]:
start = time.process_time()
coefs = cupy.corrcoef(train_mod1, train_mod2, rowvar=False)
coefs = coefs[-train_mod2.shape[1]:, :-train_mod2.shape[1]]
coefs[np.isnan(coefs)] = 0.0
coefs = np.abs(coefs)
sorted_coefs = np.argsort(coefs)[:, -100:]
# Create empty array to insert predictions
pred_arr = cupy.zeros(input_test_mod2.shape)

start = time.process_time()
for idx in range(input_test_mod2.shape[1]):

    # Create dataset splits based on protein index and associated coefficients
    y_train = train_mod2[:, idx].ravel()
    X_train = train_mod1[:, sorted_coefs[idx, -100:]]
    X_test = test_mod1[:, sorted_coefs[idx, -100:]]

    # Create RF based on parameters found by Xu et al. (2021)
    rf = RandomForestRegressor(n_estimators=100, max_depth=20)
    rf.fit(X_train, y_train)
    pred_arr[:, idx] = rf.predict(X_test)

end = time.process_time()
print('Time elapsed:%.2f seconds'%(end-start))

Time elapsed:939.44 seconds


In [24]:
np.save(os.path.join(PREDS_DIR, f'rf_{SUBTASK}'), pred_arr.get())