# CellCNN data generation

source: https://github.com/eiriniar/CellCnn

In this example, we use CellCnn to analyze a mass cytometry dataset acquired to characterize human natural killer (NK) cell diversity and associate NK cell subsets with genetic and environmental factors, namely prior Cytomegalovirus (CMV) infection [1]. This dataset comprises mass cytometry measurements of 36 markers, including 28 NK cell receptors, for PBMC samples of 20 donors with varying serology for CMV. 

We will train CellCnn to identify CMV seropositivity-associated cell populations within the **manually gated NK cell compartment**. To run this example, please download the [NK cell dataset](http://www.imsb.ethz.ch/research/claassen/Software/cellcnn.html) and place the decompressed folder in the cellCnn/examples directory. The cells below look for the `gated_NK` folder and the `NK_fcs_samples_with_labels.csv` file relative to the working directory (`WDIR`, empty by default).

[1] Horowitz, A. et al. Genetic and environmental determinants of human NK cell diversity revealed by mass cytometry. Sci. Transl. Med. 5 (2013).


In [20]:
#!pip install FlowIO

In [1]:
import os, sys, errno, glob
import numpy as np
import pandas as pd
import cellCNN_utils  
from cellCNN_utils import loadFCS, ftrans, mkdir_p, get_items, generate_data, generate_normalized_data

%pylab inline


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Input and output locations, both resolved relative to the working directory WDIR.
WDIR = ''
FCS_DATA_PATH, OUTDIR = (
    os.path.join(WDIR, folder) for folder in ('gated_NK', 'output_NK')
)

# Hand the output directory to mkdir_p, then show where the FCS files are read from.
mkdir_p(OUTDIR)
print(FCS_DATA_PATH)

gated_NK


In [3]:
# look at the measured markers
# The matches are sorted so that the inspected file does not depend on the
# (unspecified) order in which glob lists the directory, and a missing or
# empty data folder is reported explicitly instead of as a bare IndexError.
fcs_files = sorted(glob.glob(os.path.join(FCS_DATA_PATH, '*.fcs')))
if not fcs_files:
    raise FileNotFoundError("No .fcs files found in '" + FCS_DATA_PATH + "'")

data_fcs = loadFCS(fcs_files[0], transform=None, auto_comp=False)
print(data_fcs.channels)
# np.shape is used explicitly rather than the bare `shape` injected by %pylab.
print(np.shape(data_fcs.channels))

['Time', 'Cell_length', 'CD3', 'Dead', '(La139)Dd', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57', '2DL1-S1', 'TRAIL', '2DL2-L3-S2', 'CD16', 'CD10', '3DL1-S1', 'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D', 'NKG2C', '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1', 'NKp44', 'CD127', '2DL1', 'CD94', 'CD34', 'CCR7', '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5', 'CD25', 'DNA1', 'DNA2']
(43,)


In [4]:
# Markers retained for the downstream analysis (a subset of data_fcs.channels).
markers = [
    'CD3', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57',
    '2DL1-S1', 'TRAIL', '2DL2-L3-S2', 'CD16', 'CD10', '3DL1-S1',
    'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D', 'NKG2C',
    '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1',
    'NKp44', 'CD127', '2DL1', 'CD94', 'CD34', 'CCR7',
    '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5',
    'CD25',
]
nmark = len(markers)

# Position of every selected marker in the channel list of the inspected FCS file;
# a marker missing from the channels raises ValueError.
marker_idx = list(map(data_fcs.channels.index, markers))

In [5]:
# Sample file names and their labels (0: CMV-, 1: CMV+) are read from a CSV file.
# The prior CMV infection status comes from the original study (Horowitz et al. 2013).
csv_file = 'NK_fcs_samples_with_labels.csv'
fcs_table = pd.read_csv(csv_file, sep=',')
fcs_info = np.array(fcs_table)

# First column: sample file names; second column: labels, cast to int.
sample_ids, raw_labels = fcs_info[:, 0], fcs_info[:, 1]
sample_labels = raw_labels.astype(int)

print(sample_ids)
print(len(sample_labels))

['a_001_NK.fcs' 'a_002_NK.fcs' 'a_003_NK.fcs' 'a_004_NK.fcs'
 'a_005_NK.fcs' 'a_006_NK.fcs' 'a_007_NK.fcs' 'a_009_NK.fcs'
 'a_010_NK.fcs' 'a_011_NK.fcs' 'a_012_NK.fcs' 'a_1a_NK.fcs' 'a_2a_NK.fcs'
 'a_2b_NK.fcs' 'a_3a_NK.fcs' 'a_3b_NK.fcs' 'a_4a_NK.fcs' 'a_4b_NK.fcs'
 'a_5a_NK.fcs' 'a_5b_NK.fcs']
20


In [6]:
# Here we load the samples of each class and split them into training and validation sets (no test set is produced).

def _load_transformed_samples(indices, cofactor):
    """Load the FCS files at ``indices`` and return their transformed marker matrices.

    Every index refers to an entry of ``sample_ids``. For each file only the
    columns listed in ``marker_idx`` are kept, and ``ftrans`` is applied to them
    with ``cofactor`` (the cofactor of the arcsinh transformation).
    """
    samples = []
    for idx in indices:
        fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
        x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
        samples.append(ftrans(x_full[:, marker_idx], cofactor))
    return samples


def train_valid_split(train_idx1=None, train_idx2=None, valid=True):
    """Load the samples of both classes and split them into training and validation sets.

    Parameters
    ----------
    train_idx1 : sequence of int, optional
        Indices into ``sample_ids`` of the samples to load as class 0.
        If None or empty, every sample with ``sample_labels == 0`` is used.
    train_idx2 : sequence of int, optional
        Indices into ``sample_ids`` of the samples to load as class 1.
        If None or empty, every sample with ``sample_labels == 1`` is used.
    valid : bool
        If True, the first ``int(0.2 * n)`` samples of each class are held out
        for validation, where ``n`` is the number of loaded class-0 samples
        (the same count is applied to both classes). If False, the validation
        lists are empty.

    Returns
    -------
    tuple
        ``(train_samples, train_phenotypes, valid_samples, valid_phenotypes)``;
        the sample lists hold one transformed matrix per FCS file and the
        phenotype lists hold the matching 0/1 labels.

    Notes
    -----
    An empty index list means "use the whole class", so a caller cannot request
    zero samples for a class through this function.
    """
    # Seeds NumPy's global RNG. Nothing in this function draws random numbers,
    # so this only affects random calls made afterwards (presumably the
    # subsampling done by the generate_* helpers -- TODO confirm).
    np.random.seed(12345)

    # cofactor for arcsinh transformation
    cofactor = 5

    # Default to the actual sample indices of each class. (Using
    # range(len(group)) here would select positions 0..n-1 regardless of their
    # label and load the same files under both phenotypes.)
    if train_idx1 is None or len(train_idx1) == 0:
        train_idx1 = np.where(sample_labels == 0)[0].tolist()
    if train_idx2 is None or len(train_idx2) == 0:
        train_idx2 = np.where(sample_labels == 1)[0].tolist()

    # load the samples of both classes
    group1_list = _load_transformed_samples(train_idx1, cofactor)
    group2_list = _load_transformed_samples(train_idx2, cofactor)

    # Number of samples per class that go to the validation set. The count is
    # derived from class 0 and applied to both classes.
    # NOTE(review): if class 1 has no more samples than `cut`, it ends up with
    # no training samples -- confirm this cannot happen for the intended inputs.
    cut = int(.2 * len(group1_list)) if valid else 0

    train1, train2 = group1_list[cut:], group2_list[cut:]
    valid1, valid2 = group1_list[:cut], group2_list[:cut]

    train_samples = train1 + train2
    train_phenotypes = [0] * len(train1) + [1] * len(train2)
    valid_samples = valid1 + valid2
    valid_phenotypes = [0] * len(valid1) + [1] * len(valid2)

    return train_samples, train_phenotypes, valid_samples, valid_phenotypes

### Generate original data (not normalized)

In [11]:
# Load all samples, then write the multi-cell inputs built from the
# non-normalized data to 'original/'.
(train_samples, train_phenotypes,
 valid_samples, valid_phenotypes) = train_valid_split()

generate_data(
    train_samples,
    train_phenotypes,
    'original/',
    valid_samples=valid_samples,
    valid_phenotypes=valid_phenotypes,
    ncell=200,
    nsubset=1000,
    verbose=0,
)

Generating multi-cell inputs...
2000 ; 37 ; 200
2000
Done.


### Generate normalized data

In [13]:
# Load all samples again, so this cell does not depend on the previous one,
# and write the normalized multi-cell inputs to 'normalized/'.
(train_samples, train_phenotypes,
 valid_samples, valid_phenotypes) = train_valid_split()

generate_normalized_data(
    train_samples,
    train_phenotypes,
    'normalized/',
    valid_samples=valid_samples,
    valid_phenotypes=valid_phenotypes,
    ncell=200,
    nsubset=1000,
    verbose=0,
)

Generating multi-cell inputs...
Done.


### Generate normalized data split between 5 parties

In [14]:
# Number of parties (hosts) the samples are distributed over.
nhosts = 5

# Sample indices of each class (0: CMV-, 1: CMV+).
group1 = np.where(sample_labels == 0)[0]
group2 = np.where(sample_labels == 1)[0]
print(len(group1), len(group2))

# np.array_split accepts group sizes that are not a multiple of nhosts; the
# leading chunks are then one element longer than the trailing ones.
# Reversing the chunk order of group 1 pairs its longer chunks with the
# shorter chunks of group 2.
#
# The reversal is done with list slicing instead of np.flip: np.flip first
# converts the list of chunks into an array, which raises for unequal chunk
# lengths on NumPy >= 1.24, and which for equal chunk lengths yields a 2-D
# array whose flip also reverses the indices inside every chunk.
group1_list = np.array_split(group1, nhosts)[::-1]
group2_list = np.array_split(group2, nhosts)

# One plain list of sample indices per host and class.
split_idx_1 = [chunk.tolist() for chunk in group1_list]
split_idx_2 = [chunk.tolist() for chunk in group2_list]

print(split_idx_1)
print(split_idx_2)

11 9
[[14, 15], [12, 13], [9, 11], [7, 8], [2, 3, 5]]
[[0, 1], [4, 6], [10, 16], [17, 18], [19]]


In [12]:
# Write one normalized data set per host, built only from the samples
# assigned to that host.
for i in range(nhosts):
    host_idx_1, host_idx_2 = split_idx_1[i], split_idx_2[i]
    folder_path = 'split-normalized/host{}/'.format(i)

    print("\nHost no.", i, ":")
    print("host_idx_1:", host_idx_1, "- host_idx_2:", host_idx_2)

    # Only the training part of the split is used here; the validation
    # lists returned by train_valid_split are discarded.
    train_samples, train_phenotypes, _, _ = train_valid_split(host_idx_1, host_idx_2)
    generate_normalized_data(
        train_samples,
        train_phenotypes,
        folder_path,
        ncell=200,
        nsubset=1000,
        verbose=0,
    )


Host no. 0 :
host_idx_1: [14, 15] - host_idx_2: [0, 1]
Generating multi-cell inputs...
Done.

Host no. 1 :
host_idx_1: [12, 13] - host_idx_2: [4, 6]
Generating multi-cell inputs...
Done.

Host no. 2 :
host_idx_1: [9, 11] - host_idx_2: [10, 16]
Generating multi-cell inputs...
Done.

Host no. 3 :
host_idx_1: [7, 8] - host_idx_2: [17, 18]
Generating multi-cell inputs...
Done.

Host no. 4 :
host_idx_1: [2, 3, 5] - host_idx_2: [19]
Generating multi-cell inputs...
Done.
