# CellCNN data generation

source: https://github.com/eiriniar/CellCnn

In this example, we use CellCnn to analyze a mass cytometry dataset acquired to characterize human natural killer (NK) cell diversity and associate NK cell subsets with genetic and environmental factors, namely prior Cytomegalovirus (CMV) infection [1]. This dataset comprises mass cytometry measurements of 36 markers, including 28 NK cell receptors, for PBMC samples of 20 donors with varying serology for CMV. 

We will train CellCnn to identify CMV seropositivity-associated cell populations within the **manually gated NK cell compartment**. To run this example, please download the [NK cell dataset](http://www.imsb.ethz.ch/research/claassen/Software/cellcnn.html) and place the decompressed folder in the cellCnn/examples directory.

[1] Horowitz, A. et al. Genetic and environmental determinants of human NK cell diversity revealed by mass cytometry. Sci. Transl. Med. 5 (2013).


In [None]:
#!pip install FlowIO

In [1]:
import os, sys, errno, glob
import numpy as np
import pandas as pd
import cellCNN_utils  
from cellCNN_utils import loadFCS, ftrans, mkdir_p, get_items, generate_data, generate_normalized_data
sys.path.append('/Users/sav/Desktop/POSEIDON/CellCnn-python3')
import cellCnn
%pylab inline


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Locate the input data: the dataset is looked up inside the `examples`
# folder of the imported cellCnn package.
WDIR = os.path.join(cellCnn.__path__[0], 'examples')
# Folder holding the gated NK-cell FCS files; sample file names are later
# joined onto this path.
FCS_DATA_PATH = os.path.join(WDIR, 'NK_cell_dataset', 'gated_NK')


In [3]:
# look at the measured markers
# Sort the matches so the inspected file does not depend on filesystem order.
fcs_files = sorted(glob.glob(FCS_DATA_PATH + '/*.fcs'))
if not fcs_files:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No .fcs files found in {FCS_DATA_PATH!r}; "
        "download the NK cell dataset and place it in the examples directory."
    )
data_fcs = loadFCS(fcs_files[0], transform=None, auto_comp=False)
print(data_fcs.channels)
# np.shape is used explicitly so the cell does not rely on the %pylab star import.
print(np.shape(data_fcs.channels))

['Time', 'Cell_length', 'CD3', 'Dead', '(La139)Dd', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57', '2DL1-S1', 'TRAIL', '2DL2-L3-S2', 'CD16', 'CD10', '3DL1-S1', 'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D', 'NKG2C', '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1', 'NKp44', 'CD127', '2DL1', 'CD94', 'CD34', 'CCR7', '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5', 'CD25', 'DNA1', 'DNA2']
(43,)


In [4]:
# Select the markers kept for further analysis. Compared with the channel
# list printed above, this drops 'Time', 'Cell_length', 'Dead', '(La139)Dd',
# 'DNA1' and 'DNA2'.
markers = ['CD3', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57', '2DL1-S1', 'TRAIL', '2DL2-L3-S2',
           'CD16', 'CD10', '3DL1-S1', 'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D',
           'NKG2C', '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1', 'NKp44', 'CD127', '2DL1',
           'CD94', 'CD34', 'CCR7', '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5', 'CD25']
# Column position of each selected marker in the FCS channel list;
# list.index raises ValueError if a marker name is not among the channels.
marker_idx = [data_fcs.channels.index(label) for label in markers]
# Number of selected markers.
nmark = len(markers)

In [5]:
# Read the sample table from a CSV file: column 0 holds the FCS file names,
# column 1 the CMV labels (0: CMV-, 1: CMV+).
# Prior CMV infection status is taken from the original study (Horowitz et al. 2013).
csv_file = 'NK_fcs_samples_with_labels.csv'
fcs_info = np.array(pd.read_csv(csv_file, sep=','))
sample_ids, sample_labels = fcs_info[:, 0], fcs_info[:, 1].astype(int)
print(sample_ids)
print(sample_labels)

['a_001_NK.fcs' 'a_002_NK.fcs' 'a_003_NK.fcs' 'a_004_NK.fcs'
 'a_005_NK.fcs' 'a_006_NK.fcs' 'a_007_NK.fcs' 'a_009_NK.fcs'
 'a_010_NK.fcs' 'a_011_NK.fcs' 'a_012_NK.fcs' 'a_1a_NK.fcs' 'a_2a_NK.fcs'
 'a_2b_NK.fcs' 'a_3a_NK.fcs' 'a_3b_NK.fcs' 'a_4a_NK.fcs' 'a_4b_NK.fcs'
 'a_5a_NK.fcs' 'a_5b_NK.fcs']
[1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 1 1 1]


In [6]:
# Here we randomly split the samples into training and test sets.

def train_test_split(train_idx1=None, train_idx2=None, test=True):
    """Randomly split the FCS samples into a class-balanced training set and a test set.

    Seven samples per class (label 0 and label 1 in ``sample_labels``) are
    drawn without replacement for training; all remaining samples form the
    test set. Each selected file is loaded, restricted to the columns in
    ``marker_idx`` and passed through ``ftrans`` with cofactor 5. The chosen
    indices are printed (test indices first, then training indices).

    The NumPy global random seed is reset to 12345 on every call, so repeated
    calls return the same split.

    Args:
        train_idx1: Accepted for backward compatibility; ignored, the class-0
            training indices are always drawn randomly.
        train_idx2: Accepted for backward compatibility; ignored, the class-1
            training indices are always drawn randomly.
        test: Accepted for backward compatibility; ignored.

    Returns:
        Tuple ``(train_samples, train_phenotypes, test_samples,
        test_phenotypes)``. The sample lists hold one transformed array per
        FCS file, class 0 first and class 1 second; the phenotype lists hold
        the matching 0/1 labels in the same order.
    """
    # set random seed for reproducible results
    np.random.seed(12345)

    # cofactor for arcsinh transformation
    cofactor = 5
    ntrain_per_class = 7

    def load_samples(indices):
        # Load each FCS file, keep only the selected marker columns and transform them.
        samples = []
        for idx in indices:
            fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
            x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
            samples.append(ftrans(x_full[:, marker_idx], cofactor))
        return samples

    # sample indices of each class
    group1 = np.where(sample_labels == 0)[0]
    group2 = np.where(sample_labels == 1)[0]

    # Draw the training indices. The draw order (class 0, then class 1) is
    # kept as-is because changing it would change the resulting split.
    train_idx1 = list(np.random.choice(group1, size=ntrain_per_class, replace=False))
    test_idx1 = [i for i in group1 if i not in train_idx1]
    train_idx2 = list(np.random.choice(group2, size=ntrain_per_class, replace=False))
    test_idx2 = [i for i in group2 if i not in train_idx2]

    print(test_idx1)
    print(test_idx2)
    print(train_idx1)
    print(train_idx2)

    # load the training samples
    group1_list = load_samples(train_idx1)
    group2_list = load_samples(train_idx2)

    # load the test samples
    t_group1_list = load_samples(test_idx1)
    t_group2_list = load_samples(test_idx2)

    # All drawn training samples are used for training; no validation subset
    # is held back here.
    train_samples = group1_list + group2_list
    train_phenotypes = [0] * len(group1_list) + [1] * len(group2_list)
    test_samples = t_group1_list + t_group2_list
    test_phenotypes = [0] * len(t_group1_list) + [1] * len(t_group2_list)

    return train_samples, train_phenotypes, test_samples, test_phenotypes

### Generate original data (not normalized)

In [7]:
# `shuffle` is used by generate_for_pheno_prediction below.
from sklearn.utils import shuffle
# Build the train/test split (the indices are printed by train_test_split).
train_samples, train_phenotypes, test_samples, test_phenotypes = train_test_split()
# generate_data comes from cellCNN_utils; the held-out test samples are passed
# as its validation set.
# NOTE(review): presumably writes the multi-cell inputs under 'original/' and
# returns the scaler fitted on the training data -- confirm in cellCNN_utils.
scaler = generate_data(train_samples, train_phenotypes, 'original/', valid_samples=test_samples, valid_phenotypes=test_phenotypes, ncell=200, nsubset=1000, verbose=0)

#generate also the test set on full ncell per sample:
def generate_for_pheno_prediction(new_samples, phenotypes, scaler, outdir='original/'):
    """Write one fixed-size multi-cell input per sample, plus the sample labels.

    Every sample is shuffled and truncated to the cell count of the smallest
    sample so that all samples share the same shape. Sample ``i`` is saved
    transposed (markers x cells) to ``<outdir>X_test_all/<i>.txt`` and the
    labels are saved to ``<outdir>y_test_all.txt``.

    Args:
        new_samples: Non-empty list of 2-D arrays, one per sample, with one
            row per cell and one column per marker.
        phenotypes: Labels, one per sample, written as-is with ``np.savetxt``.
        scaler: Object whose ``transform(x)`` is applied to every sample
            before subsampling, or ``None`` to skip that step.
        outdir: Output directory. The default keeps the original location.

    Returns:
        Array of shape ``(len(new_samples), ncell_per_sample, nmark)``.

    Raises:
        ValueError: If ``new_samples`` is empty.
    """
    if len(new_samples) == 0:
        raise ValueError("new_samples must contain at least one sample")

    ncell_per_sample = np.min([x.shape[0] for x in new_samples])
    print(f"Predictions based on multi-cell inputs containing {ncell_per_sample} cells.")
    # Marker count = number of columns. Reading it from the shape (instead of
    # the length of row 1) also works for a sample with a single cell.
    nmark = new_samples[0].shape[1]
    # z-transform the new samples if we did that for the training samples
    if scaler is not None:
        new_samples = [scaler.transform(x) for x in new_samples]
    new_samples = [shuffle(x)[:ncell_per_sample].reshape(1, ncell_per_sample, nmark)
                   for x in new_samples]
    data_test = np.vstack(new_samples)

    # Trailing '' keeps the trailing separator the original path string had.
    sample_dir = os.path.join(outdir, 'X_test_all', '')
    mkdir_p(sample_dir)
    for i, sample in enumerate(data_test):
        np.savetxt(os.path.join(sample_dir, str(i) + '.txt'), np.transpose(sample))
    np.savetxt(os.path.join(outdir, 'y_test_all.txt'), phenotypes)
    return data_test

# Write the full-size test inputs and print the shape of the returned array:
# (number of test samples, cells per sample, markers).
data_test=generate_for_pheno_prediction(test_samples,test_phenotypes,scaler)
print(shape(data_test))

[3, 5, 8, 9]
[1, 6]
[12, 2, 7, 11, 15, 14, 13]
[16, 0, 4, 10, 18, 19, 17]
scale
Generating multi-cell inputs...
Done.
Predictions based on multi-cell inputs containing 5652 cells.
not none
(6, 5652, 37)


### Generate normalized data

In [None]:
# `normalize` and `shuffle` are used by generate_for_pheno_prediction below.
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
# train_test_split reseeds NumPy's RNG on every call, so this is the same
# split as in the non-normalized section.
train_samples, train_phenotypes, test_samples, test_phenotypes = train_test_split()
# generate_normalized_data comes from cellCNN_utils; the held-out test samples
# are passed as its validation set.
# NOTE(review): presumably writes the normalized multi-cell inputs under
# 'normalized/' -- confirm in cellCNN_utils.
generate_normalized_data(train_samples, train_phenotypes, 'normalized/', valid_samples=test_samples, valid_phenotypes=test_phenotypes, ncell=200, nsubset=1000)

#generate also the test set on full ncell per sample:
def generate_for_pheno_prediction(new_samples, phenotypes, outdir='normalized/'):
    """Write one fixed-size, normalized multi-cell input per sample, plus the labels.

    Every sample is shuffled and truncated to the cell count of the smallest
    sample so that all samples share the same shape. Sample ``i`` is
    transposed (markers x cells), passed through
    ``sklearn.preprocessing.normalize`` with its default settings and saved
    to ``<outdir>X_test_all/<i>.txt``; the labels are saved to
    ``<outdir>y_test_all.txt``.

    Note that only the written files are normalized; the returned array holds
    the subsampled but un-normalized values.

    Args:
        new_samples: Non-empty list of 2-D arrays, one per sample, with one
            row per cell and one column per marker.
        phenotypes: Labels, one per sample, written as-is with ``np.savetxt``.
        outdir: Output directory. The default keeps the original location.

    Returns:
        Array of shape ``(len(new_samples), ncell_per_sample, nmark)``.

    Raises:
        ValueError: If ``new_samples`` is empty.
    """
    if len(new_samples) == 0:
        raise ValueError("new_samples must contain at least one sample")

    ncell_per_sample = np.min([x.shape[0] for x in new_samples])
    print(f"Predictions based on multi-cell inputs containing {ncell_per_sample} cells.")
    # Marker count = number of columns. Reading it from the shape (instead of
    # the length of row 1) also works for a sample with a single cell.
    nmark = new_samples[0].shape[1]

    new_samples = [shuffle(x)[:ncell_per_sample].reshape(1, ncell_per_sample, nmark)
                   for x in new_samples]
    data_test = np.vstack(new_samples)

    # Trailing '' keeps the trailing separator the original path string had.
    sample_dir = os.path.join(outdir, 'X_test_all', '')
    mkdir_p(sample_dir)
    for i, sample in enumerate(data_test):
        np.savetxt(os.path.join(sample_dir, str(i) + '.txt'), normalize(np.transpose(sample)))
    np.savetxt(os.path.join(outdir, 'y_test_all.txt'), phenotypes)
    return data_test

# Write the normalized full-size test inputs and print the shape of the
# returned array: (number of test samples, cells per sample, markers).
data_test=generate_for_pheno_prediction(test_samples,test_phenotypes)
print(shape(data_test))

### Generate data split between 5 parties

In [9]:
# Randomly split the samples into a global training set and a test set, then
# distribute the training samples among `nhosts` parties.
# NOTE: no seed is set in this cell, so the split depends on the state of
# NumPy's global RNG at the time the cell runs.
nhosts = 5
cofactor = 5  # cofactor passed to ftrans (arcsinh transformation)

# first separate the test set
group1 = np.where(sample_labels == 0)[0]
group2 = np.where(sample_labels == 1)[0]
ntrain_per_class = 7

# get the sample indices
train_idx1 = list(np.random.choice(group1, size=ntrain_per_class, replace=False))
test_idx1 = [i for i in group1 if i not in train_idx1]
train_idx2 = list(np.random.choice(group2, size=ntrain_per_class, replace=False))
test_idx2 = [i for i in group2 if i not in train_idx2]

print("Test set indices:")
print(test_idx1)
print(test_idx2)
print("Global train set indices:")
print(train_idx1)
print(train_idx2)

# Distribute the train indices among the hosts. np.array_split puts the extra
# elements into the leading chunks, so the class-0 chunks are reversed: hosts
# that get a larger class-1 chunk get a smaller class-0 chunk and vice versa.
# The reversal uses slicing rather than np.flip because the chunks have
# unequal lengths, and NumPy >= 1.24 refuses to build an array from such a
# ragged list.
group1_list = np.array_split(np.array(train_idx1), nhosts)[::-1]
group2_list = np.array_split(np.array(train_idx2), nhosts)
split_idx_1 = [chunk.tolist() for chunk in group1_list]
split_idx_2 = [chunk.tolist() for chunk in group2_list]

print("Global train splitted among hosts - indices:")
print(split_idx_1)
print(split_idx_2)


def _load_host_samples(indices):
    """Load the FCS files at `indices`, keep the selected markers and apply ftrans."""
    samples = []
    for idx in indices:
        fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
        x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
        samples.append(ftrans(x_full[:, marker_idx], cofactor))
    return samples


for i, (host_idx_1, host_idx_2) in enumerate(zip(split_idx_1, split_idx_2)):
    print("\nHost no.", i, ":")
    folder_path = 'split/host' + str(i) + '/'
    print("host_idx_1:", host_idx_1, "- host_idx_2:", host_idx_2)

    # load the training samples of this host
    host_group1_list = _load_host_samples(host_idx_1)
    host_group2_list = _load_host_samples(host_idx_2)

    # Use every sample assigned to this host. The previous slice length was
    # taken from group1_list (whose length is nhosts), which silently dropped
    # samples whenever a host held more than nhosts samples of one class.
    train_samples = host_group1_list + host_group2_list
    train_phenotypes = [0] * len(host_group1_list) + [1] * len(host_group2_list)
    print(train_phenotypes)
    generate_data(train_samples, train_phenotypes, folder_path, ncell=200, nsubset=200, verbose=0, generate_valid_set=False)
     

Test set indices:
[3, 7, 11, 14]
[6, 19]
Global train set indices:
[13, 15, 8, 9, 12, 5, 2]
[17, 10, 0, 18, 4, 16, 1]
Global train splitted among hosts - indices:
[[2], [5], [12], [8, 9], [13, 15]]
[[17, 10], [0, 18], [4], [16], [1]]

Host no. 0 :
host_idx_1: [2] - host_idx_2: [17, 10]


  return array(a, dtype, copy=False, order=order)


[0, 1, 1]
scale
Generating multi-cell inputs...
Done.

Host no. 1 :
host_idx_1: [5] - host_idx_2: [0, 18]
[0, 1, 1]
scale
Generating multi-cell inputs...
Done.

Host no. 2 :
host_idx_1: [12] - host_idx_2: [4]
[0, 1]
scale
Generating multi-cell inputs...
Done.

Host no. 3 :
host_idx_1: [8, 9] - host_idx_2: [16]
[0, 0, 1]
scale
Generating multi-cell inputs...
Done.

Host no. 4 :
host_idx_1: [13, 15] - host_idx_2: [1]
[0, 0, 1]
scale
Generating multi-cell inputs...
Done.
