# Cell-type Classification on BMNC CITE-Seq Data

The CITE-Seq data set consists of two modalities: RNA (Gene Expression) and Protein (Antibody-Derived Tags \[ADTs\]).

Here, we have also predicted ADT data with XGBoost and cTPnet, based on the measured gene expression.

In this notebook, cell-type classification will be done with the following as input data:
1. Measured gene expression only
2. Measured gene expression and ADT
3. Measured gene expression and XGBoost predicted ADT
4. Measured gene expression and cTPnet predicted ADT

(1) is the control.  
(2) uses a multi-omic approach to determine if the ADT data adds anything of value to the classification task.  
(3) and (4) are used to infer whether the ADT data can be predicted from the gene expression itself, in which case measuring ADTs would become unnecessary. Also, (3) and (4) can be compared to determine which method is better for predicting ADT values.

In [1]:
# Loading necessary libraries and functions
import sys
sys.path.append('../')
from main import train_model
from experimental import process, train
import pandas as pd
import numpy as np

# Input file locations.
# NOTE(review): rna.csv is read from a different directory tree than every
# other input below -- confirm that this is intentional.
rna_file = "../../CITEseq_BMNC/rna.csv"

# All remaining inputs live in one shared directory.
_BMNC_DIR = "../R/CITE_Seq_BMNC"
adt_file = f"{_BMNC_DIR}/adt.csv"
adt_xgboost_file = f"{_BMNC_DIR}/adt_xgboost.csv"
adt_ctpnet_file = f"{_BMNC_DIR}/adt_ctpnet.csv"
meta_file = f"{_BMNC_DIR}/cell_type.csv"
trte_partition_file = f"{_BMNC_DIR}/trte_partition.txt"

## Measured Gene Expression Only

In [2]:
# Run configuration shared by the cells below.
# SEED can be "random" or an integer; if an integer, it will be used as the seed for random, numpy, torch, and cuda
SEED = 42

# Map every text cell type to an integer id.
# The unique cell types are sorted before being numbered: iterating a set of
# strings yields a different order in every Python session (hash
# randomization), which would make the class ids irreproducible across kernel
# restarts even with a fixed SEED.
labels = pd.read_csv(meta_file, index_col="cell_id")
cell_types = sorted(labels["cell_type"].unique())
label_dict = {cell_type: idx for idx, cell_type in enumerate(cell_types)}

# COMBINER is forwarded to train(); doSMOTE is not referenced by any other
# visible cell.
COMBINER = False
doSMOTE = False

# Training parameters
num_epoch = 100
test_interval = 10
lr = 5e-4
weight_decay = 1e-3
dropout = 0.25
adj_parameter = 8  # average number of edges per node in the adjacency matrix

VERBOSE = 1  # 0: only print final result; 1: only testing result; 2: training and testing result
OUTPUT_FILES = False  # whether to output loss and metrics as csv files
MAKE_PLOTS = False  # whether to output loss and metrics as plots in png format
REPEATS = 1  # integer, how many times to independently train the model
feature_extract = []

In [5]:
# Names for this run: one graph-network input ("RNA") and the titles that are
# later handed to train().
GCN_names = ["RNA"]
RUN_TITLE_SHORT = "RNA Only"
RUN_TITLE = "BMNC CITE-Seq RNA Only"

# Files belonging to this run (this list is not used again in the visible cells).
load_list = [rna_file, meta_file, trte_partition_file]

# Load the expression table, then the cell-type table; both are indexed by cell id.
rna, meta = (
    pd.read_csv(path, index_col="cell_id") for path in (rna_file, meta_file)
)

In [34]:
# Getting labels
# Map every text cell type to an integer id. The unique cell types are sorted
# before being numbered because set iteration order over strings changes
# between Python sessions, which would make the ids irreproducible.
label_dict = {
    cell_type: idx
    for idx, cell_type in enumerate(sorted(meta["cell_type"].unique()))
}

# Getting training and testing indices
# The partition file is read as text: its second line is split on commas to
# give the training cell ids and its fourth line to give the testing cell ids.
with open(trte_partition_file, 'r') as f:
    lines = f.readlines()

tr_patient_id_list = lines[1].strip().split(',')
te_patient_id_list = lines[3].strip().split(',')

# Build a cell id -> row position lookup in one pass instead of scanning the
# whole index once per id. setdefault keeps the first occurrence of an id.
patient_id = rna.index.to_numpy()
row_of_id = {}
for row, pid in enumerate(patient_id):
    row_of_id.setdefault(pid, row)

missing_ids = [
    pid for pid in tr_patient_id_list + te_patient_id_list if pid not in row_of_id
]
if missing_ids:
    raise KeyError(
        f"{len(missing_ids)} cell id(s) from {trte_partition_file} are not in the "
        f"rna index (first few: {missing_ids[:5]})"
    )

tr_idx = [row_of_id[pid] for pid in tr_patient_id_list]
te_idx = sorted(row_of_id[pid] for pid in te_patient_id_list)

rna_train = rna.iloc[tr_idx, :]
rna_test = rna.iloc[te_idx, :]

# Select the metadata by cell id rather than by row position, so the labels
# stay attached to the right cells even if the two CSV files list the cells in
# a different order.
meta_train = meta.loc[rna_train.index, :]
meta_test = meta.loc[rna_test.index, :]
assert meta_train.index.equals(rna_train.index)
assert meta_test.index.equals(rna_test.index)

In [38]:
# A mixture of oversampling and undersampling: every cell type in the training
# set ends up with exactly CELLS_PER_TYPE samples.
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

CELLS_PER_TYPE = 200

# Step 1: cell types with more than CELLS_PER_TYPE samples are reduced to
# CELLS_PER_TYPE; smaller cell types keep their current size.
type_counts = meta_train["cell_type"].value_counts().to_dict()
under_target = {
    cell_type: min(count, CELLS_PER_TYPE) for cell_type, count in type_counts.items()
}
under = RandomUnderSampler(random_state=SEED, sampling_strategy=under_target)
rna_under, meta_under = under.fit_resample(rna_train, meta_train["cell_type"])

# Step 2: cell types with fewer than CELLS_PER_TYPE samples are oversampled up
# to CELLS_PER_TYPE. A separate dict is used so the strategy held by the fitted
# under-sampler is not modified afterwards.
# NOTE(review): SMOTE builds synthetic samples from nearest neighbours within
# a class, so very rare cell types may make this step fail -- confirm that
# every cell type has enough training samples.
target = dict.fromkeys(under_target, CELLS_PER_TYPE)
smote = SMOTE(random_state=SEED, sampling_strategy=target)

rna_smol, meta_smol = smote.fit_resample(rna_under, meta_under)

In [None]:
# TODO: rebuild the labels, tr_idx and te_idx for the resampled data (the range() function is enough for the indices)
# TODO: combine rna_smol and rna_test into a single table

In [19]:
# Assemble the inputs for process(): the feature tables, the labels and the
# (train, test) row positions.
data_list = [rna]

# tr_idx / te_idx are row positions within rna, so the labels are taken from
# the cell-type table re-ordered by cell id to follow rna's rows. Deriving
# them from meta (instead of converting a `labels` variable left over from an
# earlier cell) also makes this cell safe to re-run on its own.
labels = np.array(meta.loc[rna.index])
indices = (tr_idx, te_idx)

GCN_names = ["RNA"]
COMBINER = False

# NOTE(review): the full rna table is used here; the resampled rna_smol /
# meta_smol tables are not consumed yet.
data = process(data_list, labels, indices, SEED=SEED)

SEED =  42


In [None]:
# Train on the processed RNA-only data with the settings from the
# configuration cells. train() returns four values; only the first two
# (loss and metrics) are kept.
train_kwargs = dict(
    label_dict=label_dict,
    GCN_names=GCN_names,
    COMBINER=COMBINER,
    SEED=SEED,
    num_epoch=num_epoch,
    test_interval=test_interval,
    lr=lr,
    weight_decay=weight_decay,
    dropout=dropout,
    adj_parameter=adj_parameter,
    VERBOSE=VERBOSE,
    RUN_TITLE=RUN_TITLE,
    RUN_TITLE_SHORT=RUN_TITLE_SHORT,
    OUTPUT_FILES=OUTPUT_FILES,
    feature_extract=feature_extract,
)
loss_rna, metrics_rna, _, _ = train(data, **train_kwargs)

# To keep the results on disk (assuming both objects provide to_csv -- TODO confirm):
#loss_rna.to_csv("loss_rna.csv")
#metrics_rna.to_csv("metric_rna.csv")

2021-05-22 23:21:24.464730 

BMNC CITE-Seq RNA Only
SEED =  42
