In [4]:
%%capture
# Install required packages
!pip install hdf5plugin
!pip install anndata
!pip install scanpy

In [5]:
# Import required packages
import os
import h5py
import hdf5plugin
import shutil

import pandas as pd
import anndata as ad
import numpy as np
import scanpy as sc

from google.colab import drive
from scipy.sparse import csc_matrix

In [6]:
# Mount drive
# DATA_DIR (declared further down) points below this /content/drive mount point,
# so this cell has to run before any file is read or written.
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# Declare useful constants
# Root folder that holds the input file and every subtask directory
DATA_DIR = '/content/drive/MyDrive/Thesis/dance/dance/data'
# HDF5 file with the denoised values; whether its name contains 'mouse' or
# 'human' decides which reference dataset is used for metadata and ADT files
INPUT_FILE = 'mouse_data_denoised_transp.h5'
# Name of the output directory and prefix of every file written into it
SUBTASK_NAME = 'mouse_liver_cite_fltr_denoised_transp_rna'
# Seed handed to np.random.seed before the train/test permutation is drawn
RANDOM_SEED = 123

In [18]:
# Load HDF5 file and extract values as ndarray
denoised_vals = h5py.File(os.path.join(DATA_DIR, INPUT_FILE))
# denoised_vals = denoised_vals['dat']['x.autoencoder'][:].T

In [19]:
# Put numpy array into AnnData object
denoised_mod1 = ad.AnnData(denoised_vals['dat'][:].T)

  denoised_mod1 = ad.AnnData(denoised_vals['dat'][:].T)


In [10]:
# Perform GEX preprocessing steps using the scanpy package
# NOTE(review): this cell carries execution count 10 while its neighbours carry
# 19 and 20, so it was apparently not re-run after denoised_mod1 was last
# rebuilt. A fresh "Run All" WILL execute it -- confirm whether the exported
# data is meant to include these steps.
# NOTE(review): scanpy's documentation lists normalize_per_cell as deprecated in
# favour of normalize_total -- verify before relying on it.
sc.pp.normalize_per_cell(denoised_mod1)
sc.pp.log1p(denoised_mod1)

In [20]:
# Convert data to csc matrix
# (scipy.sparse.csc_matrix; the result replaces the matrix stored in .X)
denoised_mod1.X = csc_matrix(denoised_mod1.X)

In [21]:
# Create shuffled indexes in order to partition the data into train and test set.
# The legacy np.random.seed / np.random.permutation pair is kept on purpose: the
# split has to line up with the reference AnnData the metadata is taken from.
# Compare against None so that a seed of 0 is honoured as well.
if RANDOM_SEED is not None:
    np.random.seed(RANDOM_SEED)
n_cells = denoised_mod1.shape[0]
ten_percent_part = round(n_cells * 0.1)
idx = np.random.permutation(n_cells)
# Split at an explicit position: slicing with -ten_percent_part would leave the
# train set empty and put every cell into the test set whenever
# ten_percent_part rounds to 0.
n_train_cells = n_cells - ten_percent_part
train_idx = sorted(idx[:n_train_cells])
test_idx = sorted(idx[n_train_cells:])

In [22]:
# Split the AnnData object into its train and test subsets.
# Indexing an AnnData returns a view; .copy() makes each subset an independent
# object, so assigning metadata to it later does not have to convert a view
# implicitly.
denoised_train_mod1 = denoised_mod1[train_idx].copy()
denoised_test_mod1 = denoised_mod1[test_idx].copy()

In [23]:
# Extract metadata from existing annData subtask
# Make sure that the train and test indices are the same as the annData from
# which to extract metadata!
# The reference subtask is chosen from the species named in INPUT_FILE.
if 'mouse' in INPUT_FILE:
  metadata_subtask = 'mouse_liver_cite_fltr_rna'
elif 'human' in INPUT_FILE:
  metadata_subtask = 'openproblems_competition_cite_fltr_prep_rna'
else:
  # Without this branch train_mod1 / test_mod1 would stay undefined and the
  # notebook would only fail later with a NameError.
  raise ValueError(
      f"INPUT_FILE {INPUT_FILE!r} contains neither 'mouse' nor 'human'; "
      "cannot choose a reference dataset for the metadata")

train_mod1 = ad.read_h5ad(os.path.join(
    DATA_DIR,
    metadata_subtask,
    f'{metadata_subtask}.censor_dataset.output_train_mod1.h5ad'))

test_mod1 = ad.read_h5ad(os.path.join(
    DATA_DIR,
    metadata_subtask,
    f'{metadata_subtask}.censor_dataset.output_test_mod1.h5ad'))

In [24]:
# Show the (rows, columns) shape of the reference training metadata table
train_mod1.obs.shape

(22103, 6)

In [25]:
# Copy metadata
# Each denoised split takes over obs and var from the matching reference split.
# The shapes are compared first so that a misaligned split fails with a clear
# message, and the tables are copied so the denoised objects do not share
# DataFrames with the reference objects.
# NOTE(review): equal shapes do not prove equal cell order -- the split above
# must use the same seed and procedure as the reference dataset.
for split_name, denoised_split, reference_split in (
    ('train', denoised_train_mod1, train_mod1),
    ('test', denoised_test_mod1, test_mod1),
):
  if denoised_split.shape != reference_split.shape:
    raise ValueError(
        f'{split_name} split has shape {denoised_split.shape} but the '
        f'reference metadata has shape {reference_split.shape}')
  denoised_split.obs = reference_split.obs.copy()
  denoised_split.var = reference_split.var.copy()

In [26]:
# Persist both mod1 splits as .h5ad files inside the subtask directory,
# creating that directory first when it does not exist yet.
subtask_dir = os.path.join(DATA_DIR, SUBTASK_NAME)
os.makedirs(subtask_dir, exist_ok=True)

train_mod1_path = os.path.join(
    subtask_dir, f'{SUBTASK_NAME}.censor_dataset.output_train_mod1.h5ad')
test_mod1_path = os.path.join(
    subtask_dir, f'{SUBTASK_NAME}.censor_dataset.output_test_mod1.h5ad')

denoised_train_mod1.write(train_mod1_path)
denoised_test_mod1.write(test_mod1_path)

In [27]:
# Copy ADT datasets over to new directory
# The mod2 train and test files of the reference subtask are copied unchanged
# and renamed with the SUBTASK_NAME prefix.
if 'mouse' in INPUT_FILE:
  adt_source_subtask = 'mouse_liver_cite_fltr_rna'
elif 'human' in INPUT_FILE:
  adt_source_subtask = 'openproblems_competition_cite_fltr_prep_rna'
else:
  # Previously nothing was copied in this case, leaving the new subtask
  # directory without its mod2 files and without any error.
  raise ValueError(
      f"INPUT_FILE {INPUT_FILE!r} contains neither 'mouse' nor 'human'; "
      "cannot choose the ADT datasets to copy")

# Harmless when the directory already exists; lets this cell run on its own.
os.makedirs(os.path.join(DATA_DIR, SUBTASK_NAME), exist_ok=True)

for split_name in ('train', 'test'):
  mod2_suffix = f'.censor_dataset.output_{split_name}_mod2.h5ad'
  shutil.copy(os.path.join(DATA_DIR,
                           adt_source_subtask,
                           adt_source_subtask + mod2_suffix),

              os.path.join(DATA_DIR,
                           SUBTASK_NAME,
                           SUBTASK_NAME + mod2_suffix))