In [1]:
# Install required packages
%%capture
!pip install anndata
!pip install muon
!pip install scanpy

In [2]:
# Import required modules
import pandas as pd
import numpy as np
import anndata as ad
import scanpy as sc

import os

from google.colab import drive
from muon import prot
from scipy.sparse import csc_matrix

In [3]:
# Mount drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# Declare constants
DATA_DIR = '/content/gdrive/MyDrive/Thesis/dance/dance/data'
INPUT_SUBTASK = 'openproblems_competition_cite_fltr_raw_rna'
OUTPUT_SUBTASK = 'openproblems_competition_cite_fltr_prep_rna'
RANDOM_SEED = 123

In [5]:
## GENE EXPRESSION PREPROCESSING
# Read h5ad files containing raw GEX counts
input_train_mod1 = ad.read_h5ad(os.path.join(DATA_DIR, 
                                             INPUT_SUBTASK, 
                                             f'{INPUT_SUBTASK}.censor_dataset.output_train_mod1.h5ad'))

input_test_mod1 = ad.read_h5ad(os.path.join(DATA_DIR, 
                                            INPUT_SUBTASK, 
                                            f'{INPUT_SUBTASK}.censor_dataset.output_test_mod1.h5ad'))

In [6]:
# Concatenate train and test sets for preprocessing
input_mod1 = ad.concat([input_train_mod1, input_test_mod1])

In [8]:
input_mod1.var['feature_types'] = 'GEX'

In [10]:
# Perform GEX preprocessing steps using the scanpy package
sc.pp.normalize_per_cell(input_mod1)
sc.pp.log1p(input_mod1)

In [11]:
# Split the data again into the output train and test data
output_train_mod1 = input_mod1[:input_train_mod1.shape[0] , :]
output_test_mod1 = input_mod1[input_train_mod1.shape[0]: , :]

In [12]:
# Check output
print(output_train_mod1)
print(output_test_mod1)

View of AnnData object with n_obs × n_vars = 63889 × 10000
    obs: 'batch', 'donor', 'cell_type', 'n_counts'
    var: 'feature_types'
    uns: 'log1p'
View of AnnData object with n_obs × n_vars = 7099 × 10000
    obs: 'batch', 'donor', 'cell_type', 'n_counts'
    var: 'feature_types'
    uns: 'log1p'


In [13]:
# Output transformed GEX data
os.makedirs(os.path.join(DATA_DIR, OUTPUT_SUBTASK), exist_ok=True)

output_train_mod1.write_h5ad(os.path.join(DATA_DIR, 
                                          OUTPUT_SUBTASK, 
                                          f'{OUTPUT_SUBTASK}.censor_dataset.output_train_mod1.h5ad'))
output_test_mod1.write_h5ad(os.path.join(DATA_DIR, 
                                         OUTPUT_SUBTASK, 
                                         f'{OUTPUT_SUBTASK}.censor_dataset.output_test_mod1.h5ad'))

  df[key] = c


In [14]:
## PROTEIN PREPROCESSING
# Load AnnData file containg raw protein counts
input_train_mod2 = ad.read_h5ad(os.path.join(DATA_DIR, 
                                             INPUT_SUBTASK, 
                                             f'{INPUT_SUBTASK}.censor_dataset.output_train_mod2.h5ad'))
input_test_mod2 = ad.read_h5ad(os.path.join(DATA_DIR, 
                                            INPUT_SUBTASK, 
                                            f'{INPUT_SUBTASK}.censor_dataset.output_test_mod2.h5ad'))

In [15]:
# Concatenate train and test sets for preprocessing
input_mod2 = ad.concat([input_train_mod2, input_test_mod2])

In [20]:
input_mod2.var['feature_types'] = 'ADT'

In [21]:
# Perform CLR transformation on protein AnnData using the muon package
prot.pp.clr(input_mod2)

  warn("adata.X is sparse but not in CSC format. Converting to CSC.")


In [22]:
# Split the data again into the output train and test data
output_train_mod2 = input_mod2[:input_train_mod2.shape[0], :]
output_test_mod2 = input_mod2[input_train_mod2.shape[0]:, :]

In [23]:
# Output transformed protein data
output_train_mod2.write_h5ad(os.path.join(DATA_DIR, 
                                          OUTPUT_SUBTASK, 
                                          f'{OUTPUT_SUBTASK}.censor_dataset.output_train_mod2.h5ad'))
output_test_mod2.write_h5ad(os.path.join(DATA_DIR, 
                                         OUTPUT_SUBTASK, 
                                         f'{OUTPUT_SUBTASK}.censor_dataset.output_test_mod2.h5ad'))

  df[key] = c


In [24]:
# Check output
print(output_train_mod2)
print(output_test_mod2)

AnnData object with n_obs × n_vars = 63889 × 140
    obs: 'batch', 'donor', 'cell_type'
    var: 'feature_types'
AnnData object with n_obs × n_vars = 7099 × 140
    obs: 'batch', 'donor', 'cell_type'
    var: 'feature_types'
