In [1]:
# Install required packages
!pip install anndata

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting anndata
  Downloading anndata-0.8.0-py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.1/96.1 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: anndata
Successfully installed anndata-0.8.0


In [2]:
# Import required modules
import pandas as pd
import numpy as np
import anndata as ad

import os

from google.colab import drive

In [3]:
# Mount drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Declare constants
DATA_DIR = '/content/drive/MyDrive/Thesis/dance/dance/data'
INPUT_SUBTASK = 'openproblems_competition_cite_fltr_prep_denoised_rna'
OUTPUT_SUBTASK = 'openproblems_competition_cite_fltr_prep_altstrat_denoised_rna'
RANDOM_SEED = 123

In [5]:
# Load AnnData file containing GEX counts
input_train_mod1 = ad.read_h5ad(os.path.join(DATA_DIR, 
                                             INPUT_SUBTASK, 
                                             f'{INPUT_SUBTASK}.censor_dataset.output_train_mod1.h5ad'))

input_test_mod1 = ad.read_h5ad(os.path.join(DATA_DIR, 
                                            INPUT_SUBTASK, 
                                            f'{INPUT_SUBTASK}.censor_dataset.output_test_mod1.h5ad'))

In [6]:
# Load AnnData file containing protein counts
input_train_mod2 = ad.read_h5ad(os.path.join(DATA_DIR, 
                                             INPUT_SUBTASK, 
                                             f'{INPUT_SUBTASK}.censor_dataset.output_train_mod2.h5ad'))
input_test_mod2 = ad.read_h5ad(os.path.join(DATA_DIR, 
                                            INPUT_SUBTASK, 
                                            f'{INPUT_SUBTASK}.censor_dataset.output_test_mod2.h5ad'))

In [7]:
# Concatenate train and test sets for alternative stratification
input_mod1 = ad.concat([input_train_mod1, input_test_mod1])
input_mod2 = ad.concat([input_train_mod2, input_test_mod2])

In [8]:
input_mod1.var['feature_types'] = 'GEX'
input_mod2.var['feature_types'] = 'ADT'

In [9]:
# Perform sanity check and assess whether observation annotations are equal
assert(input_mod1.obs.index.equals(input_mod2.obs.index))
assert(input_mod1.obs['cell_type'].equals(input_mod2.obs['cell_type']))

In [10]:
# Perform alternative stratification based on cell type
output_train_mod1 = input_mod1[input_mod1.obs['cell_type'] != 'MasP', :]
output_test_mod1 = input_mod1[input_mod1.obs['cell_type'] == 'MasP', :]
output_train_mod2 = input_mod2[input_mod2.obs['cell_type'] != 'MasP', :]
output_test_mod2 = input_mod2[input_mod2.obs['cell_type'] == 'MasP', :]

In [11]:
# Check output
print(output_train_mod1)
print(output_test_mod1)
print(output_train_mod2)
print(output_test_mod2)

View of AnnData object with n_obs × n_vars = 62746 × 10000
    obs: 'batch', 'donor', 'cell_type', 'n_counts'
    var: 'feature_types'
View of AnnData object with n_obs × n_vars = 8242 × 10000
    obs: 'batch', 'donor', 'cell_type', 'n_counts'
    var: 'feature_types'
View of AnnData object with n_obs × n_vars = 62746 × 140
    obs: 'batch', 'donor', 'cell_type'
    var: 'feature_types'
View of AnnData object with n_obs × n_vars = 8242 × 140
    obs: 'batch', 'donor', 'cell_type'
    var: 'feature_types'


In [12]:
# Output selected data
os.makedirs(os.path.join(DATA_DIR, OUTPUT_SUBTASK), exist_ok=True)

output_train_mod1.write_h5ad(os.path.join(DATA_DIR, 
                                          OUTPUT_SUBTASK, 
                                          f'{OUTPUT_SUBTASK}.censor_dataset.output_train_mod1.h5ad'))
output_test_mod1.write_h5ad(os.path.join(DATA_DIR, 
                                         OUTPUT_SUBTASK, 
                                         f'{OUTPUT_SUBTASK}.censor_dataset.output_test_mod1.h5ad'))

output_train_mod2.write_h5ad(os.path.join(DATA_DIR, 
                                          OUTPUT_SUBTASK, 
                                          f'{OUTPUT_SUBTASK}.censor_dataset.output_train_mod2.h5ad'))
output_test_mod2.write_h5ad(os.path.join(DATA_DIR, 
                                         OUTPUT_SUBTASK, 
                                         f'{OUTPUT_SUBTASK}.censor_dataset.output_test_mod2.h5ad'))

  df[key] = c
