# CellCnn [1] data generation

source: https://github.com/eiriniar/CellCnn

""" Copyright 2016-2017 ETH Zurich, Eirini Arvaniti and Manfred Claassen.

This module contains data preprocessing/distribution functions.

"""
The code is slightly changed depending on original implementation to make it compatible with decentralized settings



In this example, we preprocess and distribute a mass cytometry dataset to characterize Cytomegalovirus (CMV) infection [2].

The dataset comprises mass cytometry measurements of 36 markers, including 28 NK cell receptors, for PBMC samples of 20 donors with varying serology for CMV. 

The example depends on **manually gated NK cell compartment**. To run this example, 

    - download the [NK cell dataset] at https://imsb.ethz.ch/research/claassen/Software/cellcnn.html,
    - uncompress and place it in the data/cellCNN/ folder

Data distribution: We fix the test set for all experimental settings, the training dataset is then generated by distribution different donors for each institution depending on number of institutions.

[1] E. Arvaniti and M. Claassen. Sensitive detection of rare disease-associated cell subsets via representation learning.Nat Commun, 8:1–10, 2017
[2] Horowitz, A. et al. Genetic and environmental determinants of human NK cell diversity revealed by mass cytometry. Sci. Transl. Med. 5 (2013).

In [50]:
#!pip install FlowIO

In [51]:
import os, sys, errno, glob
import tensorflow as tf
import numpy as np
import pandas as pd
import cellCNN_utils  
from cellCNN_utils import loadFCS, ftrans, mkdir_p, get_items, generate_data, generate_normalized_data

from pathlib import Path
d = Path().resolve()
sys.path.append(d)
%pylab inline


Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [52]:
# define input and output directories
FCS_DATA_PATH = os.path.join(d, 'NK_cell_dataset', 'gated_NK')


In [53]:
# list of measured markers in the dataset:
data_fcs = loadFCS(glob.glob(FCS_DATA_PATH + '/*.fcs')[0], transform=None, auto_comp=False)
print(data_fcs.channels)
print(shape(data_fcs.channels))

['Time', 'Cell_length', 'CD3', 'Dead', '(La139)Dd', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57', '2DL1-S1', 'TRAIL', '2DL2-L3-S2', 'CD16', 'CD10', '3DL1-S1', 'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D', 'NKG2C', '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1', 'NKp44', 'CD127', '2DL1', 'CD94', 'CD34', 'CCR7', '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5', 'CD25', 'DNA1', 'DNA2']
(43,)


In [54]:
# select the relevant markers for data generation, same markers originally used in cellCnn analysis
markers = ['CD3', 'CD27', 'CD19', 'CD4', 'CD8', 'CD57', '2DL1-S1', 'TRAIL', '2DL2-L3-S2',
           'CD16', 'CD10', '3DL1-S1', 'CD117', '2DS4', 'ILT2-CD85j', 'NKp46', 'NKG2D',
           'NKG2C', '2B4', 'CD33', 'CD11b', 'NKp30', 'CD122', '3DL1', 'NKp44', 'CD127', '2DL1',
           'CD94', 'CD34', 'CCR7', '2DL3', 'NKG2A', 'HLA-DR', '2DL4', 'CD56', '2DL5', 'CD25']
marker_idx = [data_fcs.channels.index(label) for label in markers]
print(marker_idx)
nmark = len(markers)

[2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]


In [55]:
# load the sample names and corresponding labels (0: CMV-, 1: CMV+), here from a CSV file
# prior CMV infection status is obtained from the original study (Horowitz et al. 2013)
csv_file = 'NK_fcs_samples_with_labels.csv'
fcs_info = np.array(pd.read_csv(csv_file, sep=','))
sample_ids = fcs_info[:, 0]
sample_labels = fcs_info[:, 1].astype(int)
print(sample_ids)

['a_001_NK.fcs' 'a_002_NK.fcs' 'a_003_NK.fcs' 'a_004_NK.fcs'
 'a_005_NK.fcs' 'a_006_NK.fcs' 'a_007_NK.fcs' 'a_009_NK.fcs'
 'a_010_NK.fcs' 'a_011_NK.fcs' 'a_012_NK.fcs' 'a_1a_NK.fcs' 'a_2a_NK.fcs'
 'a_2b_NK.fcs' 'a_3a_NK.fcs' 'a_3b_NK.fcs' 'a_4a_NK.fcs' 'a_4b_NK.fcs'
 'a_5a_NK.fcs' 'a_5b_NK.fcs']


In [123]:
# Here we randomly split the samples in training/test sets.

def train_test_split(train_idx1=[], train_idx2=[], test=True):
    # set random seed for reproducible results
    np.random.seed(12345)

    # cofactor for arcsinh transformation
    cofactor = 5
    
    # split the fcs files into training, validation and test set (note that secure-protocols do not use validation sets)
    group1 = np.where(sample_labels == 0)[0]
    group2 = np.where(sample_labels == 1)[0]
    l1, l2 = len(group1), len(group2)
    ntrain_per_class = 7
    ntest_group1 = l1 - ntrain_per_class
    ntest_group2 = l2 - ntrain_per_class

    # get the sample indices
    train_idx1 = list(np.random.choice(group1, size=ntrain_per_class, replace=False))
    test_idx1 = [i for i in group1 if i not in train_idx1]
    train_idx2 = list(np.random.choice(group2, size=ntrain_per_class, replace=False))
    test_idx2 = [i for i in group2 if i not in train_idx2]
    
    print("test indices")
    test_indices = [test_idx1,test_idx2]
    print(test_indices)

    print("train indices")

    
    print(train_idx1)
    print(train_idx2)
    train_indices = [train_idx1,train_idx2]

    # load the training samples
    group1_list, group2_list = [], []
    for idx in train_idx1:
        fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
        x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
        x = ftrans(x_full[:,marker_idx], cofactor)
        group1_list.append(x)

    for idx in train_idx2:
        fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
        x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
        x = ftrans(x_full[:,marker_idx], cofactor)
        group2_list.append(x)

    # load the test samples
    t_group1_list, t_group2_list = [], []
    test_phenotypes = []
    for idx in test_idx1:
        fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
        x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
        x = ftrans(x_full[:,marker_idx], cofactor)
        t_group1_list.append(x)
        test_phenotypes.append(0)

    for idx in test_idx2:
        fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
        x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
        x = ftrans(x_full[:,marker_idx], cofactor)
        t_group2_list.append(x)
        test_phenotypes.append(1)

    # finally prepare training data
    cut = int(1 * len(group1_list))
    train_samples = group1_list[:cut] + group2_list[:cut]
    train_phenotypes = [0] * len(group1_list[:cut]) + [1] * len(group2_list[:cut])
    valid_samples = group1_list[cut:] + group2_list[cut:]
    valid_phenotypes = [0] * len(group1_list[cut:]) + [1] * len(group2_list[cut:])
    test_samples = t_group1_list + t_group2_list

    return train_samples,train_phenotypes,test_samples,test_phenotypes, test_indices,train_indices

### Generate original data (with transform)
In the following, 
- We generate training data with $ncell=200$ cells per sample and $nsubset=1000$ samples per class
- We generate the test data for $ncell=200$ cells per sample from test indices, called X_test
- We generate another test set 'per-individual' in test indices using maximum number of cells to use for phenotype prediction, called X_test_all

Processed data is placed under originalNK/ folder

The script prints the max number of cells for the current example (i.e., 5652 for this dataset) which then will be used as a parameter in the golang protocol.

In [124]:
from sklearn.utils import shuffle
train_samples, train_phenotypes, test_samples, test_phenotypes, test_indices, train_indices = train_test_split()

scaler,x_tr,y_tr,x_test,y_test = generate_data(train_samples, train_phenotypes, 'originalNK/', valid_samples=test_samples, valid_phenotypes=test_phenotypes, 
                                               ncell=200, nsubset=1000, per_sample=False, verbose=0, saveFile=False)

#generate also the test set on full max-ncell per sample:
def generate_for_pheno_prediction(new_samples,phenotypes,scaler):
        ncell_per_sample = np.min([x.shape[0] for x in new_samples])
        print(f"Predictions based on multi-cell inputs containing {ncell_per_sample} cells.")
        nmark = len(new_samples[0][1])
        # z-transform the new samples if we did that for the training samples
        if scaler is not None:
            new_samples = [scaler.transform(x) for x in new_samples]
        new_samples = [shuffle(x)[:ncell_per_sample].reshape(1, ncell_per_sample, nmark)
                           for x in new_samples]
        data_test = np.vstack(new_samples)
#         mkdir_p('originalNK/X_test_all/')
#         for i in range(len(data_test)):
#             np.savetxt('originalNK/' + 'X_test_all/' + str(i) +'.txt', (transpose(data_test[i])))
#         np.savetxt('originalNK/' + 'y_test_all.txt', phenotypes)
        return data_test,phenotypes

data_test,phenotypes=generate_for_pheno_prediction(test_samples,test_phenotypes,scaler)
print(shape(data_test))

test indices
[[3, 5, 8, 9], [1, 6]]
train indices
[12, 2, 7, 11, 15, 14, 13]
[16, 0, 4, 10, 18, 19, 17]
new scale
Generating multi-cell inputs...
Predictions based on multi-cell inputs containing 5652 cells.
(6, 5652, 37)


### Generate normalized data
In the following, 
- We generate normalized training data with $ncell=200$ cells per sample and $nsubset=1000$ samples per class
- We generate the test data for $ncell=200$ cells per sample from test indices, called X_test
- We generate another test set 'per-individual' in test indices using maximum number of cells to use for phenotype prediction, called X_test_all

Processed data is placed under normalizedNK/ folder

The script prints the max number of cells for the current example (i.e., 5652 for this dataset) which then will be used as a parameter in the golang protocol.

In [None]:
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
train_samples, train_phenotypes, test_samples, test_phenotypes, test_indices, train_indices = train_test_split()
generate_normalized_data(train_samples, train_phenotypes, 'normalizedNK/', valid_samples=test_samples, valid_phenotypes=test_phenotypes, ncell=200, nsubset=1000)

#generate also the test set on full ncell per sample:
def generate_for_pheno_prediction(new_samples,phenotypes):
        ncell_per_sample = np.min([x.shape[0] for x in new_samples])
        print(f"Predictions based on multi-cell inputs containing {ncell_per_sample} cells.")
        nmark = len(new_samples[0][1])

        new_samples = [shuffle(x)[:ncell_per_sample].reshape(1, ncell_per_sample, nmark)
                           for x in new_samples]
        data_test = np.vstack(new_samples)
        mkdir_p('normalizedNK/X_test_all/')
        for i in range(len(data_test)):
            np.savetxt('normalizedNK/' + 'X_test_all/' + str(i) +'.txt', normalize(transpose(data_test[i])))
        np.savetxt('normalizedNK/' + 'y_test_all.txt', phenotypes)
        return data_test

data_test=generate_for_pheno_prediction(test_samples,test_phenotypes)
print(shape(data_test))
print(shape(test_samples))

### Generate  data split between $nhosts$ parties
In the following, 
- We generate training data with $ncell=200$ cells per sample and $nsubset=1000$ samples per class, per party
- Example below distributes the train indices per donor for 3 parties

Processed data is placed under splitNK/host_i for party-i


In [125]:
# Here we randomly split the samples in training/test sets.
nhosts=2
cofactor = 5
test_idx1 = test_indices[0]
test_idx2 = test_indices[1]

train_idx1 = train_indices[0]
train_idx2 = train_indices[1]


print("Test set indices:")
print(test_idx1)
print(test_idx2)
print("Global train set indices:")
print(train_idx1)
print(train_idx2)

#distribute train indices balanced among n hosts:
split_idx_1 = []
split_idx_2 = []
group1_list = np.flip(np.array_split(numpy.array(train_idx1), nhosts))
group2_list = numpy.array_split(numpy.array(train_idx2), nhosts)

for i in range(nhosts):
    split_idx_1.append(group1_list[i].tolist())
    split_idx_2.append(group2_list[i].tolist())

print("Global train splitted among hosts - indices:")
print(split_idx_1)
print(split_idx_2)


for i in range(nhosts):
    print("\nHost no.", i, ":")
    folder_path = 'splitNK2/host' + str(i) + '/'
    host_idx_1 = split_idx_1[i]
    host_idx_2 = split_idx_2[i]
    print("host_idx_1:", host_idx_1, "- host_idx_2:", host_idx_2)
     # load the training samples
    host_group1_list, host_group2_list = [], []
    train_samples,train_phenotypes = [],[]
    for idx in host_idx_1:
        fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
        x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
        x = ftrans(x_full[:,marker_idx], cofactor)
        host_group1_list.append(x)

    for idx in host_idx_2:
        fname = os.path.join(FCS_DATA_PATH, sample_ids[idx])
        x_full = np.asarray(loadFCS(fname, transform=None, auto_comp=False))
        x = ftrans(x_full[:,marker_idx], cofactor)
        host_group2_list.append(x)
    # finally prepare training and vallidation data
    cut = int(1 * len(group1_list))
    train_samples = host_group1_list[:cut] + host_group2_list[:cut]
    train_phenotypes = [0] * len(host_group1_list[:cut]) + [1] * len(host_group2_list[:cut])
    print(train_phenotypes)
    #balance the generation per party
    if(len(train_phenotypes)<ceil(14/nhosts)):
        #increase nsubset gradually to balance the local distribution 
        generate_data(train_samples, train_phenotypes, folder_path, ncell=200, nsubset=500,per_sample=False, verbose=0,generate_valid_set=False)
    else:
        generate_data(train_samples, train_phenotypes, folder_path, ncell=200, nsubset=500,per_sample=False, verbose=0,generate_valid_set=False)


     

Test set indices:
[3, 5, 8, 9]
[1, 6]
Global train set indices:
[12, 2, 7, 11, 15, 14, 13]
[16, 0, 4, 10, 18, 19, 17]
Global train splitted among hosts - indices:
[[15, 14, 13], [12, 2, 7, 11]]
[[16, 0, 4, 10], [18, 19, 17]]

Host no. 0 :
host_idx_1: [15, 14, 13] - host_idx_2: [16, 0, 4, 10]


  return array(a, dtype, copy=False, order=order)


[0, 0, 1, 1]
new scale
Generating multi-cell inputs...
Done.

Host no. 1 :
host_idx_1: [12, 2, 7, 11] - host_idx_2: [18, 19, 17]
[0, 0, 1, 1]
new scale
Generating multi-cell inputs...
Done.


The code below is to reproduce the CellCnn accuracies with the current generation of data (local, centralized etc) to reproduce the classification metrics with original architecture

In [144]:
#The reproduction of original CellCnn model training without validation test and further analysis part
#This part is used for the comparison of accuracy/precision/recall/f-score of CellCnn with secure distributed version
#test on 200-cell multi-instances and full test set phenotype prediction
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import tensorflow as tf
from tensorflow import keras
from keras.utils import to_categorical
from tensorflow.keras import layers, initializers, regularizers, optimizers, callbacks
from keras import backend as K

#repeat cellcnn original training on full train data

k = 200
ncell = 200
nfilter = 8
y_tr_n = to_categorical(y_tr)

def pool_top_k(x, k):
    return tf.reduce_mean(tf.sort(x, axis=1, direction='DESCENDING')[:, :k, :], axis=1)
def create_model (k,ncell,nfilter):
        
        data_input = keras.Input(shape=(ncell, 37))
        coeff_l1=0
        coeff_l2=1e-4
        n_classes=2
        # the filters
        conv = layers.Conv1D(filters=nfilter,
                             kernel_size=1,
                             activation='relu',
                             kernel_initializer=initializers.RandomUniform(),
                             kernel_regularizer=regularizers.l1_l2(l1=coeff_l1, l2=coeff_l2),
                             name='conv1')(data_input)

        # the cell grouping part (top-k pooling)
        pooled = layers.Lambda(pool_top_k, output_shape=(nfilter,), arguments={'k': k})(conv)
        output = layers.Dense(units=n_classes,
                                  activation='softmax',
                                  kernel_initializer=initializers.RandomUniform(),
                                  kernel_regularizer=regularizers.l1_l2(l1=coeff_l1, l2=coeff_l2),
                                  name='output')(pooled)
        model = keras.Model(inputs=data_input, outputs=output)

        model.compile(optimizer=optimizers.Adam(learning_rate=0.04),
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])
        return model
model = create_model(k,ncell,nfilter)
#generate data

x_tr_n = x_tr.transpose(0,2,1)
# Fit data to model
print(type(x_tr_n))
print(len(x_tr_n))
print(len(x_tr_n[0]))
print(len(x_tr_n[1][0]))


history = model.fit(x_tr_n, y_tr_n,
            batch_size=200,
            epochs=20,
            verbose=2,
            validation_split=0)

<class 'numpy.ndarray'>
1988
200
37
Epoch 1/20
10/10 - 1s - loss: 0.5150 - accuracy: 0.8023
Epoch 2/20
10/10 - 0s - loss: 0.0852 - accuracy: 0.9960
Epoch 3/20
10/10 - 0s - loss: 0.0140 - accuracy: 1.0000
Epoch 4/20
10/10 - 0s - loss: 0.0124 - accuracy: 1.0000
Epoch 5/20
10/10 - 0s - loss: 0.0126 - accuracy: 1.0000
Epoch 6/20
10/10 - 0s - loss: 0.0121 - accuracy: 1.0000
Epoch 7/20
10/10 - 0s - loss: 0.0114 - accuracy: 1.0000
Epoch 8/20
10/10 - 0s - loss: 0.0106 - accuracy: 1.0000
Epoch 9/20
10/10 - 0s - loss: 0.0099 - accuracy: 1.0000
Epoch 10/20
10/10 - 0s - loss: 0.0093 - accuracy: 1.0000
Epoch 11/20
10/10 - 0s - loss: 0.0087 - accuracy: 1.0000
Epoch 12/20
10/10 - 0s - loss: 0.0082 - accuracy: 1.0000
Epoch 13/20
10/10 - 0s - loss: 0.0079 - accuracy: 1.0000
Epoch 14/20
10/10 - 0s - loss: 0.0076 - accuracy: 1.0000
Epoch 15/20
10/10 - 0s - loss: 0.0073 - accuracy: 1.0000
Epoch 16/20
10/10 - 0s - loss: 0.0071 - accuracy: 1.0000
Epoch 17/20
10/10 - 0s - loss: 0.0069 - accuracy: 1.0000
Epoc

In [145]:
#For 200-cell predictions on test set
def model_pred(prob):
    pred = []
    for p in prob:
        if p[0]>p[1]:
            pred.append(0)
        else:
            pred.append(1)
    return pred

x_test_n = x_test.transpose(0,2,1)
y_test_n = to_categorical(y_test)

loss, accuracy = model.evaluate(x_test_n, y_test_n, verbose=0)
#score = model.evaluate(x_test_n, y_test_n, verbose=0)
print("For 200-cell predictions on test set with size",x_test_n.shape)
print(f'Test loss: {loss}, Test accuracy: {accuracy}')

y_pred = model.predict(x_test_n)
y_pred = model_pred(y_pred)


from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F-score:",f1_score(y_test, y_pred))
print("precision:",precision_score(y_test, y_pred))
print("recall:",recall_score(y_test, y_pred)) 



For 200-cell predictions on test set with size (2000, 200, 37)
Test loss: 0.8170528411865234, Test accuracy: 0.8500000238418579
Accuracy: 0.85
F-score: 0.8678414096916299
precision: 0.7755905511811023
recall: 0.985


In [146]:
#For phenotype predictions on test set using all cells 

model2 = create_model(5652, 5652,8)
weights = model.get_weights()
model2.set_weights(weights)
data_test_n = data_test.transpose(0,2,1)
phenotypes_n = to_categorical(phenotypes)

y_pred = model2.predict(data_test)

y_pred = model_pred(y_pred)

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print("Accuracy:", accuracy_score(phenotypes, y_pred))
print("F-score:",f1_score(phenotypes, y_pred))
print("precision:",precision_score(phenotypes, y_pred))
print("recall:",recall_score(phenotypes, y_pred)) 


Accuracy: 0.8333333333333334
F-score: 0.8
precision: 0.6666666666666666
recall: 1.0
