## PBMC preprocessing

This notebook follows the preprocessing code from the scVI reproducibility repository (https://github.com/romain-lopez/scVI-reproducibility/blob/master/CORTEX-prepro.ipynb). 

Before running this notebook, download the data by following the instructions in the scVI reproducibility repository (https://github.com/romain-lopez/scVI-reproducibility/blob/master/CORTEX-prepro.ipynb). 

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import scipy.sparse
import scipy.io
import matplotlib.pyplot as plt
%matplotlib inline
# Use font type 42 (TrueType, per the matplotlib rcParams documentation) for both
# the PDF and PostScript backends.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

## Load expression data

In [8]:
# Read the space-delimited expression table. The recorded output shows shape
# (12039, 10310) with Ensembl gene IDs (ENSG...) as column names.
# NOTE(review): absolute, machine-specific path - point it at your local copy of the data.
expression = pd.read_csv('/home/jzhaoaz/jiazhao/scPI_v2/package/datasets/pbmc/expression.txt', sep=" ")

In [9]:
# Preview the first rows of the raw expression table.
expression.head()

Unnamed: 0,ENSG00000279457,ENSG00000228463,ENSG00000237491,ENSG00000230368,ENSG00000188976,ENSG00000188290,ENSG00000187608,ENSG00000186891,ENSG00000186827,ENSG00000078808,...,ENSG00000198899,ENSG00000198938,ENSG00000198840,ENSG00000212907,ENSG00000198886,ENSG00000198786,ENSG00000198695,ENSG00000198727,ENSG00000273748,ENSG00000278817
0,0,0,0,0,0,0,1,0,4,0,...,1,14,6,1,8,8,0,5,0,0
1,0,0,0,0,0,0,0,0,0,1,...,3,6,6,0,4,2,1,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,10,23,19,0,19,8,0,7,0,0
3,0,0,0,0,1,0,0,0,0,1,...,16,22,16,2,15,2,0,6,0,0
4,0,0,0,0,0,0,0,0,0,0,...,10,13,8,0,17,6,0,8,0,0


In [11]:
# (rows, columns) of the raw expression table.
expression.shape

(12039, 10310)

## Load selected genes

In [10]:
# Per-gene table; its "ENSG" column is used later to choose which expression columns to keep.
# NOTE(review): absolute, machine-specific path - point it at your local copy of the data.
micro_array_result = pd.read_csv('/home/jzhaoaz/jiazhao/scPI_v2/package/datasets/pbmc/gene_info.csv')

In [13]:
# Preview the first rows of the gene table.
micro_array_result.head()

Unnamed: 0.1,Unnamed: 0,ENSG,GS,CD_logFC,CD_AveExpr,CD_t,CD_P.Value,CD_adj.P.Val,CD_B,BDC_logFC,...,BDC_t,BDC_P.Value,BDC_adj.P.Val,BDC_B,BDC2_logFC,BDC2_AveExpr,BDC2_t,BDC2_P.Value,BDC2_adj.P.Val,BDC2_B
0,5,ENSG00000188976,NOC2L,0.248976,4.19323,1.308282,0.201059,0.478547,-5.465606,0.023141,...,0.199196,0.844018,0.936848,-6.964569,0.532702,5.985696,1.671825,0.116222,0.148453,-6.166109
1,7,ENSG00000187608,ISG15,-0.036463,5.769094,-0.21699,0.829737,0.919775,-6.27281,0.987879,...,3.449654,0.002391,0.019573,-2.095965,0.933572,6.303954,2.202136,0.044485,0.062851,-5.300666
2,36,ENSG00000149527,PLCH2,0.5056,3.225463,1.834575,0.076854,0.327638,-4.707168,-0.119499,...,-1.014781,0.321715,0.598441,-6.463128,-0.977612,4.232389,-3.119639,0.007349,0.012455,-3.569136
3,37,ENSG00000157881,PANK4,-0.093142,4.319701,-0.772409,0.446126,0.686981,-6.001137,0.024259,...,0.111785,0.912052,0.963969,-6.978668,0.183735,6.453772,1.306024,0.2121,0.252955,-6.668252
4,40,ENSG00000157873,TNFRSF14,0.073292,4.949553,0.353608,0.726192,0.865967,-6.233949,-0.377352,...,-2.465162,0.022361,0.106855,-4.215261,-0.075492,6.920026,-0.540324,0.597263,0.639609,-7.373536


In [12]:
# (rows, columns) of the gene table; one row per selected gene.
micro_array_result.shape

(3346, 21)

## Subset expression data to the selected genes

In [14]:
# Keep only the columns whose Ensembl IDs are listed in the gene table, in that table's order.
# A listed ID that is missing from the expression columns raises a KeyError.
selected_genes = micro_array_result["ENSG"]
expression = expression.loc[:, selected_genes]

In [15]:
# Preview the expression table after restricting it to the selected genes.
expression.head()

Unnamed: 0,ENSG00000188976,ENSG00000187608,ENSG00000149527,ENSG00000157881,ENSG00000157873,ENSG00000130764,ENSG00000198912,ENSG00000162408,ENSG00000204859,ENSG00000162413,...,ENSG00000160193,ENSG00000160201,ENSG00000160213,ENSG00000160216,ENSG00000160224,ENSG00000183255,ENSG00000160255,ENSG00000160299,ENSG00000160305,ENSG00000160307
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,3,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,4,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,3,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,3,0,0,0


In [16]:
# The column count should now equal the number of rows in the gene table.
expression.shape

(12039, 3346)

## Load QC covariates, batch labels, and cell-type annotations

In [17]:
# Directory holding the PBMC input files; later cells build file paths by string
# concatenation, so the trailing slash is required.
# NOTE(review): absolute, machine-specific path - point it at your local copy of the data.
data_path = "/home/jzhaoaz/jiazhao/scPI_v2/package/datasets/pbmc/"

In [18]:
# Read the per-cell tables. barcodes.csv is indexed by its second column; the
# other three tables are indexed by their first column.
design = pd.read_csv(data_path + "design.csv", index_col=0)
normalized_qc = pd.read_csv(data_path + "full_qc.csv", index_col=0)
barcodes = pd.read_csv(data_path + "barcodes.csv", index_col=1)
batch = pd.read_csv(data_path + "batch.csv", index_col=0)

# Keep the WPC1 ... WPC7 columns of the design table as a plain NumPy array.
dim_names = [f"WPC{i}" for i in range(1, 8)]
qc_pc = design.loc[:, dim_names].values

In [19]:
# Load the raw QC tables of the two batches (tab-separated, indexed by their first
# column) and reindex each one by the matching entries of `barcodes`.
# This cell needs `batch` to still be the DataFrame read from batch.csv: it reads
# batch["x"], which no longer exists once `batch` is replaced by integer codes.
raw_qc_8 = pd.read_csv(data_path + "molecule_qc_8k.txt", index_col=0, sep="\t")
# Append "-1" so the 8k index can be looked up with the entries of `barcodes`
# (presumably those carry this suffix - the lookup below relies on it).
raw_qc_8.index = raw_qc_8.index.map(lambda s: s + "-1")
raw_qc_4 = pd.read_csv(data_path + "molecule_qc_4k.txt", index_col=0, sep="\t")

# Barcodes of each batch, selected by row position; assumes `batch` and
# `barcodes` list the cells in the same order - TODO confirm.
barcodes_8k = barcodes.index[np.where(batch["x"] == "pbmc8k")[0]]
barcodes_4k = barcodes.index[np.where(batch["x"] == "pbmc4k")[0]]

raw_qc_8 = raw_qc_8.loc[barcodes_8k.values]

# Split each 4k barcode at "-": look rows up by the part before the dash, then
# put the part after the dash back so the index matches `barcodes` again.
number_suffix = barcodes_4k.map(lambda s: s.split("-")[1]).values
barcode_prefix = barcodes_4k.map(lambda s: s.split("-")[0]).values
raw_qc_4 = raw_qc_4.loc[barcode_prefix]
raw_qc_4.index = raw_qc_4.index + "-" + number_suffix

In [20]:
# Stack the two per-batch QC tables, then reorder the rows to follow `barcodes`.
qc_both_batches = pd.concat([raw_qc_8, raw_qc_4])
raw_qc = qc_both_batches.loc[barcodes.index]
# Sanity check: all four tables should report the same number of rows.
tuple(table.shape for table in (qc_pc, normalized_qc, raw_qc, batch))

((12039, 7), (12039, 9), (12039, 9), (12039, 1))

In [21]:
# Per-cell annotation table; column "x" holds the cluster name of each cell.
bio = pd.read_csv(data_path + "bio.csv", index_col=0)
# Sorted unique cluster names; a name's position in this array is its integer code.
list_clusters = np.unique(bio["x"])
def string_to_cluster(string):
    """Return the position of `string` in the module-level `list_clusters` array.

    Raises IndexError when `string` is not one of the known cluster names.
    """
    matching_positions = np.flatnonzero(list_clusters == string)
    return matching_positions[0]
# Replace every cluster name by its integer code (its position in `list_clusters`).
data_bio = bio["x"].apply(string_to_cluster)
# Plain NumPy copy of the codes, used as labels when splitting the data.
clusters = data_bio.values
# Number of cells per cluster code.
data_bio.value_counts()

2    5024
1    2237
0    1625
3    1452
8     464
7     459
5     351
4     339
6      88
Name: x, dtype: int64

In [22]:
# Cluster names; a name's position in this array is its integer code (0 to 8).
list_clusters

array(['B cells', 'CD14+ Monocytes', 'CD4 T cells', 'CD8 T cells',
       'Dendritic Cells', 'FCGR3A+ Monocytes', 'Megakaryocytes',
       'NK cells', 'Other'], dtype=object)

In [23]:
# Integer cluster code of every cell, indexed like `bio`.
data_bio

1        2
2        2
3        1
4        1
5        3
        ..
12035    1
12036    3
12037    7
12038    0
12039    1
Name: x, Length: 12039, dtype: int64

In [24]:
# Replace the batch labels by integer codes (position in the sorted unique labels).
# Flatten to 1-D first: for a 2-D input such as the one-column batch table,
# NumPy >= 2.0 returns the inverse with the input's shape, (n, 1), while older
# releases return shape (n,). Flattening gives shape (n,) on every version and
# lets this cell be re-run after `batch` has already become an array.
_, batch = np.unique(np.asarray(batch).ravel(), return_inverse=True)

## Separate training and testing data

In [25]:
# Convert the gene-subsetted expression table to a NumPy array (index and column labels are dropped).
expression_np = np.array(expression)

In [26]:
# Should match the shape of the subsetted expression table.
expression_np.shape

(12039, 3346)

In [27]:
# Split all per-cell arrays in a single call with a fixed seed (random_state=0).
# Results come back as (train, test) pairs in argument order:
#   X <- expression_np, c <- clusters, r <- qc_pc, b <- batch, qc <- raw_qc.values
split_arrays = train_test_split(
    expression_np, clusters, qc_pc, batch, raw_qc.values,
    random_state=0,
)
(X_train, X_test,
 c_train, c_test,
 r_train, r_test,
 b_train, b_test,
 qc_train, qc_test) = split_arrays

In [28]:
# Shapes of the training and test expression matrices.
X_train.shape, X_test.shape

((9029, 3346), (3010, 3346))

In [29]:
# Shapes of the training and test cluster-label vectors; lengths should match the matrices above.
c_train.shape, c_test.shape

((9029,), (3010,))

In [30]:
# Write every split array as a plain-text file (np.savetxt defaults) into the data directory.
# NOTE(review): absolute, machine-specific path - point it at your local copy of the data.
data_path = "/home/jzhaoaz/jiazhao/scPI_v2/package/datasets/pbmc/"

# (file name, array) pairs, listed in the order they are written.
outputs = [
    ("data_train", X_train),
    ("data_test", X_test),
    ("label_train", c_train),
    ("label_test", c_test),
    ("b_train", b_train),
    ("b_test", b_test),
    ("design_train", r_train),
    ("design_test", r_test),
    ("qc_train", qc_train),
    ("qc_test", qc_test),
]
for file_name, array in outputs:
    np.savetxt(data_path + file_name, array)