In [None]:
# Install required packages
%%capture
!pip install anndata
!pip install hdf5plugin
!pip install muon
!pip install scikit_misc

In [None]:
import pandas as pd
import numpy as np
import anndata as ad

import h5py
import hdf5plugin
import os
import scanpy

from google.colab import drive
from muon import prot
from scipy.sparse import csc_matrix

In [None]:
## MOUNT GOOGLE DRIVE
# Mount the user's Google Drive at /content/drive; DATA_DIR below points inside this mount.
drive.mount('/content/drive')

In [None]:
## CONSTANTS
# Folder (on the mounted drive) holding the input CSV files; outputs are written beneath it.
DATA_DIR = '/content/drive/My Drive/Thesis/dance/dance/data'
# Used both as the output sub-folder name and as the prefix of every written .h5ad file.
OUTPUT_SUBTASK_NAME = 'mouse_liver_cite_fltr_rna'
# Seed for NumPy's global RNG, which drives the train/test permutation.
RANDOM_SEED = 123

In [None]:
# Load the three raw CITE-seq tables (annotations, RNA, ADT) from DATA_DIR.
csv_file_names = (
    'citeseq_mouse_annot.csv',
    'citeseq_mouse_RNA.csv',
    'citeseq_mouse_ADT.csv',
)
annot_df, rna_df, adt_df = [
    pd.read_csv(os.path.join(DATA_DIR, file_name)) for file_name in csv_file_names
]

In [None]:
# Sanity check: (rows, columns) of the RNA table as loaded, i.e. before it is transposed.
rna_df.shape

In [None]:
# Sanity check: (rows, columns) of the ADT table as loaded, i.e. before it is transposed.
adt_df.shape

In [None]:
# Realign the annotation table's columns with its header, then add a numeric batch label.
# NOTE(review): presumably the CSV header is shorter than its data rows, so pandas moved the
# leading fields into the index — confirm against the file. The arithmetic below only lines
# up if reset_index() contributes exactly two columns.
# NOTE: not safe to re-run on an already-fixed frame; reload annot_df first.
cols = annot_df.columns
# Turn the index back into ordinary columns (leaving a 0..n-1 index) and drop the last four columns.
annot_df = annot_df.reset_index().iloc[:, :-4]
# Re-label the remaining columns with the original header minus its last two names.
annot_df.columns = cols[:-2]
# Every row gets batch 1, except rows whose 'sample' is 'WT4', which get batch 2.
annot_df['batch'] = 1
annot_df.loc[annot_df['sample'] == 'WT4', 'batch'] = 2

In [None]:
# Transpose the RNA table so that the values of its first column become the column labels.
# The label column is moved into the index *before* transposing: transposing with that
# (string) column still among the data forces the whole matrix to object dtype, which is
# memory hungry and is not reliably converted to a numeric matrix by AnnData.
# The resulting row/column labels are the same as with the transpose-then-promote approach.
# NOTE: not idempotent — re-run the CSV loading cell before re-running this one.
rna_df = rna_df.set_index(rna_df.columns[0]).T

In [None]:
# Transpose the ADT table so that the values of its first column become the column labels.
# The label column is moved into the index *before* transposing: transposing with that
# (string) column still among the data forces the whole matrix to object dtype, which is
# memory hungry and is not reliably converted to a numeric matrix by AnnData.
# The resulting row/column labels are the same as with the transpose-then-promote approach.
# NOTE: not idempotent — re-run the CSV loading cell before re-running this one.
adt_df = adt_df.set_index(adt_df.columns[0]).T

In [None]:
# Create shuffled row positions in order to partition the data into a train set (~90%)
# and a test set (~10%). Both index lists are sorted so each subset keeps the original row order.
# Compare against None (not truthiness) so that a seed of 0 is honoured as well.
if RANDOM_SEED is not None:
    np.random.seed(RANDOM_SEED)
n_rows = rna_df.shape[0]
ten_percent_part = round(n_rows * 0.1)
n_train_rows = n_rows - ten_percent_part
idx = np.random.permutation(n_rows)
# Slice at a non-negative boundary: with negative offsets a test size of 0 would flip the
# split (idx[:-0] is empty and idx[-0:] is the whole permutation).
train_idx = sorted(idx[:n_train_rows])
test_idx = sorted(idx[n_train_rows:])

In [None]:
# Wrap the train and test rows of the RNA DataFrame in AnnData objects (modality 1)
rna_train_rows = rna_df.iloc[train_idx]
rna_test_rows = rna_df.iloc[test_idx]
input_train_mod1 = ad.AnnData(rna_train_rows)
input_test_mod1 = ad.AnnData(rna_test_rows)

In [None]:
# Wrap the same train and test rows of the ADT DataFrame in AnnData objects (modality 2)
adt_train_rows = adt_df.iloc[train_idx]
adt_test_rows = adt_df.iloc[test_idx]
input_train_mod2 = ad.AnnData(adt_train_rows)
input_test_mod2 = ad.AnnData(adt_test_rows)

In [None]:
# Add axis names: attach the annotation rows as .obs and the RNA column labels as var_names.
# train_idx / test_idx hold row *positions* (they come from a permutation of range(n)), so the
# annotation rows are selected positionally with .iloc — the same way the ADT objects are
# annotated — instead of by label with .loc, which only matches while the index is 0..n-1.
# NOTE(review): assumes annot_df rows are in the same order as the rows of rna_df — confirm.
input_train_mod1.obs = annot_df.iloc[train_idx, :]
input_train_mod1.var_names = rna_df.columns
input_test_mod1.obs = annot_df.iloc[test_idx, :]
input_test_mod1.var_names = rna_df.columns

In [None]:
# Attach the positionally selected annotation rows as .obs of the ADT objects ...
input_train_mod2.obs = annot_df.iloc[train_idx]
input_test_mod2.obs = annot_df.iloc[test_idx]
# ... and label their variables with the ADT column names.
adt_feature_names = adt_df.columns
input_train_mod2.var_names = adt_feature_names
input_test_mod2.var_names = adt_feature_names

In [None]:
# Stack the RNA train and test objects (train rows first) into a single AnnData so the
# following gene selection and normalisation steps are computed over all cells at once.
input_mod1 = ad.concat([input_train_mod1, input_test_mod1])

In [None]:
# Flag the 6000 most variable genes with the 'seurat_v3' flavour, treating each value of
# obs['batch'] as a separate batch. The result is stored in input_mod1.var['highly_variable'].
scanpy.pp.highly_variable_genes(
    input_mod1,
    flavor='seurat_v3',
    n_top_genes=6000,
    batch_key='batch',
)

In [None]:
# Keep only the genes flagged as highly variable. The flag column is already boolean, so it
# is used directly as the mask. .copy() turns the resulting view into a standalone AnnData,
# because the next cells modify it in place.
input_mod1 = input_mod1[:, input_mod1.var['highly_variable']].copy()

In [None]:
# Normalise the total counts of every cell (in place).
# min_counts=0 disables the function's default behaviour of silently dropping cells with
# fewer than one count: a dropped cell would shift the positional train/test split below
# and break the one-to-one pairing with the ADT cells.
# NOTE(review): normalize_per_cell is deprecated in scanpy in favour of normalize_total.
scanpy.pp.normalize_per_cell(input_mod1, min_counts=0)

In [None]:
# Apply the log(1 + x) transform to the normalised RNA matrix (modifies input_mod1).
scanpy.pp.log1p(input_mod1)

In [None]:
# Split the processed RNA data back into its train and test parts.
# The train rows were concatenated first, so the train row count is the boundary.
n_train_mod1 = input_train_mod1.shape[0]
output_train_mod1 = input_mod1[:n_train_mod1, :]
output_test_mod1 = input_mod1[n_train_mod1:, :]

In [None]:
# Stack the ADT train and test objects (train rows first) into a single AnnData so the
# transform in the next cell is computed over all cells at once.
input_mod2 = ad.concat([input_train_mod2, input_test_mod2])

In [None]:
# Apply muon's CLR (centered log-ratio) normalisation to the ADT data.
# NOTE(review): the return value is discarded, so this presumably modifies input_mod2 in place — confirm.
prot.pp.clr(input_mod2)

In [None]:
# Split the processed ADT data back into its train and test parts.
# The train rows were concatenated first, so the train row count is the boundary.
n_train_mod2 = input_train_mod2.shape[0]
output_train_mod2 = input_mod2[:n_train_mod2, :]
output_test_mod2 = input_mod2[n_train_mod2:, :]

In [None]:
# Build a CSC sparse copy of each output matrix (train/test for both modalities)
(
    output_train_mod1_sparse,
    output_test_mod1_sparse,
    output_train_mod2_sparse,
    output_test_mod2_sparse,
) = [
    csc_matrix(split.X)
    for split in (output_train_mod1, output_test_mod1, output_train_mod2, output_test_mod2)
]

In [None]:
# Drop the current matrix from every output object before the sparse versions are attached
for _split in (output_train_mod1, output_test_mod1, output_train_mod2, output_test_mod2):
    _split.X = None

In [None]:
# Attach each CSC matrix to its corresponding output object
for _split, _sparse_matrix in (
    (output_train_mod1, output_train_mod1_sparse),
    (output_test_mod1, output_test_mod1_sparse),
    (output_train_mod2, output_train_mod2_sparse),
    (output_test_mod2, output_test_mod2_sparse),
):
    _split.X = _sparse_matrix

In [None]:
# Tag every variable with its modality: 'GEX' for the mod1 objects, 'ADT' for the mod2 objects
for _split, _feature_type in (
    (output_train_mod1, 'GEX'),
    (output_test_mod1, 'GEX'),
    (output_train_mod2, 'ADT'),
    (output_test_mod2, 'ADT'),
):
    _split.var['feature_types'] = _feature_type

In [None]:
# Create the sub-task folder under DATA_DIR (no error if it already exists) and
# write the filtered GEX train/test datasets into it
gex_output_dir = os.path.join(DATA_DIR, OUTPUT_SUBTASK_NAME)
os.makedirs(gex_output_dir, exist_ok=True)

train_mod1_path = os.path.join(
    gex_output_dir,
    f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_train_mod1.h5ad',
)
output_train_mod1.write(train_mod1_path)

test_mod1_path = os.path.join(
    gex_output_dir,
    f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_test_mod1.h5ad',
)
output_test_mod1.write(test_mod1_path)

In [None]:
# Write the processed ADT train/test datasets into the sub-task folder under DATA_DIR
adt_output_dir = os.path.join(DATA_DIR, OUTPUT_SUBTASK_NAME)

train_mod2_path = os.path.join(
    adt_output_dir,
    f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_train_mod2.h5ad',
)
output_train_mod2.write(train_mod2_path)

test_mod2_path = os.path.join(
    adt_output_dir,
    f'{OUTPUT_SUBTASK_NAME}.censor_dataset.output_test_mod2.h5ad',
)
output_test_mod2.write(test_mod2_path)

In [None]:
# Load an existing dataset from a different sub-task folder for inspection.
# NOTE(review): this file is not produced by this notebook — presumably a format reference.
reference_h5ad_path = DATA_DIR + '/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod1.h5ad'
test = ad.read_h5ad(reference_h5ad_path)

In [None]:
# Display the observation (row) names of the dataset loaded in the previous cell.
test.obs_names