In [None]:
!pip install anndata

Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 5, in <module>
    from pip._internal.cli.main import main
ModuleNotFoundError: No module named 'pip'


In [None]:
!pip install scanpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scanpy
  Downloading scanpy-1.9.1-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 25.2 MB/s 
Collecting matplotlib>=3.4
  Downloading matplotlib-3.6.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 36.4 MB/s 
Collecting session-info
  Downloading session_info-1.0.0.tar.gz (24 kB)
Collecting umap-learn>=0.3.10
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 8.3 MB/s 
Collecting fonttools>=4.22.0
  Downloading fonttools-4.38.0-py3-none-any.whl (965 kB)
[K     |████████████████████████████████| 965 kB 48.7 MB/s 
Collecting contourpy>=1.0.1
  Downloading contourpy-1.0.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (295 kB)
[K     |████████████████████████████████| 295 kB 83.3 MB/s 
Collecting pynndescent>=0.5
  Downloading pyn

In [None]:
!pip install hdf5plugin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hdf5plugin
  Downloading hdf5plugin-4.0.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.9 MB)
[K     |████████████████████████████████| 16.9 MB 20.7 MB/s 
Installing collected packages: hdf5plugin
Successfully installed hdf5plugin-4.0.1


In [None]:
!pip install scikit-misc

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-misc
  Downloading scikit_misc-0.1.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (8.8 MB)
[K     |████████████████████████████████| 8.8 MB 29.2 MB/s 
Installing collected packages: scikit-misc
Successfully installed scikit-misc-0.1.4


In [None]:
# Import required packages
import os
import h5py
import hdf5plugin
import shutil

import pandas as pd
import anndata as ad
import numpy as np
import scanpy as sc

from google.colab import drive
from scipy.sparse import csc_matrix

In [None]:
# Attach Google Drive to the Colab filesystem; DATA_DIR below lives under this
# mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Notebook-wide configuration.

# Seed value; not referenced by any of the visible cells.
RANDOM_SEED = 123

# Sub-directory (and file-name prefix) of the raw input dataset, and of the
# filtered dataset this notebook produces.
INPUT_SUBTASK_NAME = 'openproblems_competition_cite_raw_rna'
OUTPUT_SUBTASK_NAME = 'openproblems_competition_cite_fltr_raw_rna'

# Root directory on the mounted Drive that holds one folder per subtask.
DATA_DIR = '/content/drive/MyDrive/Thesis/dance/dance/data'

In [None]:
# Fail fast, naming the missing path, if the raw input dataset directory is not
# there (e.g. Drive not mounted or DATA_DIR pointing at the wrong place).
input_subtask_dir = os.path.join(DATA_DIR, INPUT_SUBTASK_NAME)
assert os.path.exists(input_subtask_dir), \
    f'Input dataset directory not found: {input_subtask_dir}'

In [None]:
# Load the train and test AnnData files holding the raw GEX counts (modality 1)
# from the input subtask directory.
gex_input_dir = os.path.join(DATA_DIR, INPUT_SUBTASK_NAME)

input_train_mod1 = ad.read_h5ad(
    os.path.join(gex_input_dir,
                 f'{INPUT_SUBTASK_NAME}.censor_dataset.output_train_mod1.h5ad')
)
input_test_mod1 = ad.read_h5ad(
    os.path.join(gex_input_dir,
                 f'{INPUT_SUBTASK_NAME}.censor_dataset.output_test_mod1.h5ad')
)

In [None]:
# Stack train and test cells into one AnnData so that highly variable genes are
# selected across both splits. Train comes first; the later split back into
# train/test relies on this row order.
input_mod1 = ad.concat(
    [input_train_mod1, input_test_mod1],
    axis=0,
    join='inner',
)

In [None]:
# DANCE expects the day column to be called 'batch', so relabel the obs columns.
# NOTE(review): the assignment is positional -- it assumes obs has exactly three
# columns ordered (day, donor, cell_type); confirm against the input files.
input_mod1.obs.columns = pd.Index(['batch', 'donor', 'cell_type'])

In [None]:
# Add a 'feature_types' column to var, set to 'GEX' for every feature, to ensure
# compatibility with DANCE
input_mod1.var['feature_types'] = 'GEX'

In [None]:
# Flag highly variable genes on the combined train+test data, treating each
# value of the 'batch' obs column as a separate batch. The later split selects
# genes through the 'highly_variable' column of input_mod1.var.
N_TOP_GENES = 10000

sc.pp.highly_variable_genes(
    input_mod1,
    batch_key='batch',
    n_top_genes=N_TOP_GENES,
    flavor='seurat_v3',
)

In [None]:
# Split the combined data back into train and test rows, keeping only the genes
# flagged as highly variable. The concatenation put the train cells first, so
# the first n_train rows are train and the remainder are test.
# .copy() makes each subset an independent AnnData instead of a view of
# input_mod1; later cells assign to .X, which on a view would trigger an
# implicit copy.
n_train = input_train_mod1.shape[0]
hvg_mask = input_mod1.var['highly_variable']

output_train_mod1 = input_mod1[:n_train, hvg_mask].copy()
output_test_mod1 = input_mod1[n_train:, hvg_mask].copy()

In [None]:
# Build compressed-sparse-column copies of the filtered GEX matrices
# (train first, then test).
output_train_mod1_sparse, output_test_mod1_sparse = (
    csc_matrix(gex_split.X)
    for gex_split in (output_train_mod1, output_test_mod1)
)

In [None]:
# Swap each filtered GEX matrix for its CSC version. X is cleared before the new
# matrix is assigned, as in the original cell.
# NOTE(review): the reason for the intermediate `X = None` is not visible here.
output_train_mod1.X = None
output_train_mod1.X = output_train_mod1_sparse

output_test_mod1.X = None
output_test_mod1.X = output_test_mod1_sparse

In [None]:
# Create the output subtask directory (no error if it already exists) and save
# the filtered GEX train/test datasets into it as h5ad files.
gex_output_dir = os.path.join(DATA_DIR, OUTPUT_SUBTASK_NAME)
os.makedirs(gex_output_dir, exist_ok=True)

output_train_mod1.write(
    os.path.join(gex_output_dir,
                 f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_train_mod1.h5ad')
)
output_test_mod1.write(
    os.path.join(gex_output_dir,
                 f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_test_mod1.h5ad')
)

In [None]:
# Load the train and test ADT (modality 2) AnnData files from the input subtask
# directory.
adt_input_dir = os.path.join(DATA_DIR, INPUT_SUBTASK_NAME)

input_train_mod2 = ad.read_h5ad(
    os.path.join(adt_input_dir,
                 f'{INPUT_SUBTASK_NAME}.censor_dataset.output_train_mod2.h5ad')
)
input_test_mod2 = ad.read_h5ad(
    os.path.join(adt_input_dir,
                 f'{INPUT_SUBTASK_NAME}.censor_dataset.output_test_mod2.h5ad')
)

In [None]:
# DANCE expects the day column to be called 'batch', so relabel the obs columns
# of both ADT splits.
# NOTE(review): the assignment is positional -- it assumes obs has exactly three
# columns ordered (day, donor, cell_type); confirm against the input files.
adt_obs_columns = ['batch', 'donor', 'cell_type']
input_train_mod2.obs.columns = adt_obs_columns
input_test_mod2.obs.columns = adt_obs_columns

In [None]:
# Tag every feature of both ADT splits with feature_types == 'ADT', the var
# column added here for DANCE compatibility.
for adt_split in (input_train_mod2, input_test_mod2):
    adt_split.var['feature_types'] = 'ADT'

In [None]:
# Build compressed-sparse-column copies of the ADT matrices
# (train first, then test).
input_train_mod2_sparse, input_test_mod2_sparse = (
    csc_matrix(adt_split_data.X)
    for adt_split_data in (input_train_mod2, input_test_mod2)
)

In [None]:
# Swap each ADT matrix for its CSC version. X is cleared before the new matrix
# is assigned, as in the original cell.
# NOTE(review): the reason for the intermediate `X = None` is not visible here.
input_train_mod2.X = None
input_train_mod2.X = input_train_mod2_sparse

input_test_mod2.X = None
input_test_mod2.X = input_test_mod2_sparse

In [None]:
# Write the modified ADT train/test datasets into the output subtask directory.
# The directory is created here as well (a no-op when it already exists) so this
# cell does not depend on the GEX-writing cell having run first.
adt_output_dir = os.path.join(DATA_DIR, OUTPUT_SUBTASK_NAME)
os.makedirs(adt_output_dir, exist_ok=True)

input_train_mod2.write(
    os.path.join(adt_output_dir,
                 f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_train_mod2.h5ad')
)
input_test_mod2.write(
    os.path.join(adt_output_dir,
                 f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_test_mod2.h5ad')
)

In [None]:
# Disabled alternative, kept for reference only: copy the raw ADT files
# unchanged into the output directory. It is written as comments rather than a
# bare string literal, which the notebook would evaluate and echo as the cell's
# output. The preceding cell writes the ADT files to the same output paths.
#
# shutil.copyfile(os.path.join(DATA_DIR,
#                              INPUT_SUBTASK_NAME,
#                              f'{INPUT_SUBTASK_NAME}.censor_dataset.output_train_mod2.h5ad'),
#                 os.path.join(DATA_DIR,
#                              OUTPUT_SUBTASK_NAME,
#                              f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_train_mod2.h5ad'))
#
# shutil.copyfile(os.path.join(DATA_DIR,
#                              INPUT_SUBTASK_NAME,
#                              f'{INPUT_SUBTASK_NAME}.censor_dataset.output_test_mod2.h5ad'),
#                 os.path.join(DATA_DIR,
#                              OUTPUT_SUBTASK_NAME,
#                              f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_test_mod2.h5ad'))

"\n# Copy the raw ADT datasets to the new directory\nshutil.copyfile(os.path.join(DATA_DIR, \n                             INPUT_SUBTASK_NAME, \n                             f'{INPUT_SUBTASK_NAME}.censor_dataset.output_train_mod2.h5ad'),\n                os.path.join(DATA_DIR, \n                             OUTPUT_SUBTASK_NAME, \n                             f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_train_mod2.h5ad'))\n\nshutil.copyfile(os.path.join(DATA_DIR, \n                             INPUT_SUBTASK_NAME, \n                             f'{INPUT_SUBTASK_NAME}.censor_dataset.output_test_mod2.h5ad'),\n                os.path.join(DATA_DIR, \n                             OUTPUT_SUBTASK_NAME, \n                             f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_test_mod2.h5ad'))\n"