In [1]:
import os, sys, errno, glob
import tensorflow as tf
import numpy as np
import pandas as pd
import cellCNN_utils  
from cellCNN_utils import loadFCS, ftrans, mkdir_p, get_items, generate_data, generate_normalized_data
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path
d = Path().resolve()
sys.path.append(d)
%pylab inline
# define input and output directories
FCS_DATA_PATH = os.path.join(d, 'FlowRepository')
# select the relevant markers for further analysis
data_fcs = loadFCS(glob.glob(FCS_DATA_PATH + '/discovery_cohort.fcs')[0], transform=None, auto_comp=False)
print(data_fcs.channels)
#-------------------------------SET PARAMS HERE -----------------------#

cg = 'RRMS' #Control group, set to NIND or RRMS depending on experimental setting
ncells = 100 #num cells per multi-cell input or multi-cell test data
ncell_test = 5000 #num cells for phenotype prediction
ntrain_all = 30000 # number of multi-cell inputs for the train data (in total)
ntest_all = 10000 # number of multi-cell inputs for the test data
nfilter = 8 #number of filters for the training

#-----------------------------------------------------------------------#

markers=['CCR2', 'CCR4', 'CCR6', 'CCR7', 'CXCR4', 'CXCR5', 'CD103', 'CD14', 'CD20', 
        'CD25', 'CD27', 'CD28', 'CD3', 'CD4', 'CD45RA', 'CD45RO', 'CD56', 'CD57', 'CD69', 'CD8', 
        'TCRgd', 'PD.1', 'GM.CSF', 'IFN.g', 'IL.10', 'IL.13', 'IL.17A', 'IL.2', 'IL.21', 'IL.22', 'IL.3',
        'IL.4', 'IL.6', 'IL.9', 'TNF.a']
len(markers)



Populating the interactive namespace from numpy and matplotlib
['CCR2', 'CCR4', 'CCR6', 'CCR7', 'CXCR4', 'CXCR5', 'CD103', 'CD14', 'CD20', 'CD25', 'CD27', 'CD28', 'CD3', 'CD4', 'CD45RA', 'CD45RO', 'CD56', 'CD57', 'CD69', 'CD8', 'TCRgd', 'PD.1', 'GM.CSF', 'IFN.g', 'IL.10', 'IL.13', 'IL.17A', 'IL.2', 'IL.21', 'IL.22', 'IL.3', 'IL.4', 'IL.6', 'IL.9', 'TNF.a', 'gate_source', 'manual_labels', 'labels', 'cell_id', 'cd4_labels', 'cd8_labels', 'run_0', 'run_1', 'run_2', 'intersection_3_runs']


35

In [2]:
# Build one (n_cells, 35) marker matrix per patient plus an aligned label vector.
# - Column 35 of the FCS event matrix is 'gate_source' (the per-patient id).
# - In the metadata sheet, column 1 holds gate_source and column 4 holds the diagnosis.
# - Patients labelled 'HD' become class 0, patients labelled `cg` become class 1;
#   every other diagnosis is skipped.
metadata = pd.read_excel(FCS_DATA_PATH+'/meta_data_discovery_cohort.xlsx')
metadata = metadata.to_numpy()
gate_source = metadata[:,1]
labelsTemp = metadata[:,4]

max_rows = 99  # only the first 99 metadata rows are considered, as before
data = []
sample_labels = []
# Clamp to the number of rows actually present so a shorter sheet does not raise IndexError.
for i in range(min(max_rows, len(gate_source))):
    cur_lab = labelsTemp[i]
    if cur_lab == 'HD':
        label = 0
    elif cur_lab == cg:
        label = 1
    else:
        continue

    # Select all events of this patient in one indexing operation instead of row by row.
    rows = np.where(data_fcs.events[:,35] == gate_source[i])[0]
    if len(rows) == 0:
        # No events for this patient: skip the label as well, so that
        # sample_labels[n] always describes data[n].
        continue
    data.append(np.asarray(data_fcs.events[rows, 0:35]))
    sample_labels.append(label)
sample_labels = np.asarray(sample_labels)


In [3]:
# Here we randomly split the samples in training/test sets.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import tensorflow as tf
from tensorflow import keras
from keras.utils import to_categorical
from tensorflow.keras import layers, initializers, regularizers, optimizers, callbacks
from keras import backend as K


def train_test_split(train_idx1=None, train_idx2=None, test=True, ntrain_per_class=24):
    """Split the per-patient samples into a training set and a held-out test set.

    Reads the module-level ``sample_labels`` (class per sample) and ``data``
    (one cell-by-marker array per sample).

    Parameters
    ----------
    train_idx1, train_idx2 : sequence of int, optional
        Sample indices to use for training for class 0 and class 1. When
        omitted or empty, ``ntrain_per_class`` indices are drawn at random
        (without replacement) from the corresponding class.
    test : bool
        Unused; kept so existing calls remain valid.
    ntrain_per_class : int
        Number of training samples drawn per class when indices are not given.

    Returns
    -------
    tuple
        ``(train_samples, train_phenotypes, test_samples, test_phenotypes,
        test_indices, train_indices)`` where the ``*_indices`` entries are
        two-element lists ``[class0_indices, class1_indices]``.

    Raises
    ------
    ValueError
        From ``np.random.choice`` if a class has fewer than
        ``ntrain_per_class`` samples.
    """
    # note that secure-protocols do not use validation sets, so none is produced here
    group1 = np.where(sample_labels == 0)[0]
    group2 = np.where(sample_labels == 1)[0]

    def _training_indices(group, requested):
        # Honour explicitly requested indices; otherwise sample without replacement.
        if requested is not None and len(requested) > 0:
            return list(requested)
        return list(np.random.choice(group, size=ntrain_per_class, replace=False))

    # Class 0 is drawn before class 1 so the random stream is consumed in the same order as before.
    train_idx1 = _training_indices(group1, train_idx1)
    chosen1 = set(train_idx1)
    test_idx1 = [i for i in group1 if i not in chosen1]
    train_idx2 = _training_indices(group2, train_idx2)
    chosen2 = set(train_idx2)
    test_idx2 = [i for i in group2 if i not in chosen2]

    print("test indices")
    test_indices = [test_idx1,test_idx2]
    print(test_indices)
    print("train indices")
    print(train_idx1)
    print(train_idx2)

    train_indices = [train_idx1,train_idx2]

    # Training samples: all class-0 samples first, then all class-1 samples.
    group1_list = [data[idx] for idx in train_idx1]
    group2_list = [data[idx] for idx in train_idx2]
    train_samples = group1_list + group2_list
    train_phenotypes = [0] * len(group1_list) + [1] * len(group2_list)

    # Test samples, in the same class order.
    test_samples = [data[idx] for idx in test_idx1] + [data[idx] for idx in test_idx2]
    test_phenotypes = [0] * len(test_idx1) + [1] * len(test_idx2)
    print(test_phenotypes)
    return train_samples,train_phenotypes,test_samples,test_phenotypes, test_indices,train_indices

#generate also the test set on full min-ncell per sample:
def generate_for_pheno_prediction(new_samples,phenotypes,scaler):
    """Build one multi-cell input per sample for phenotype-level prediction.

    Every sample is randomly subsampled (without replacement) to the size of
    the smallest sample so that all inputs can be stacked into one array.

    Parameters
    ----------
    new_samples : list of 2-D arrays
        One (cells x markers) array per sample.
    phenotypes : sequence
        Labels of the samples; returned unchanged.
    scaler : object or None
        If not None, ``scaler.transform`` is applied to every sample first.

    Returns
    -------
    tuple
        ``(data_test, phenotypes, ncell_per_sample)`` where ``data_test`` has
        shape ``(n_samples, ncell_per_sample, n_markers)``.

    Raises
    ------
    ValueError
        If ``new_samples`` is empty.
    """
    if len(new_samples) == 0:
        raise ValueError("new_samples must contain at least one sample")

    ncell_per_sample = np.min([x.shape[0] for x in new_samples])
    print(f"Predictions based on multi-cell inputs containing {ncell_per_sample} cells.")
    # Take the marker count from the array shape; indexing row 1 breaks on one-cell samples.
    nmark = new_samples[0].shape[1]
    # scale the new samples if we did that for the training samples
    if scaler is not None:
        new_samples = [scaler.transform(x) for x in new_samples]
    print(len(new_samples))
    print(type(new_samples))

    # Use np.random.permutation rather than a bare `shuffle` name, which in this
    # notebook is only bound to sklearn's version after a later cell has run.
    subsampled = []
    for x in new_samples:
        x = np.asarray(x)
        keep = np.random.permutation(x.shape[0])[:ncell_per_sample]
        subsampled.append(x[keep].reshape(1, ncell_per_sample, nmark))
    data_test = np.vstack(subsampled)
    return data_test,phenotypes,ncell_per_sample
def pool_top_k(x, k):
    """Average the k largest values along axis 1, separately for each entry of the last axis.

    The input is sorted in descending order along axis 1, the first ``k``
    entries are kept, and their mean over axis 1 is returned.
    """
    ranked = tf.sort(x, axis=1, direction='DESCENDING')
    strongest = ranked[:, :k, :]
    return tf.reduce_mean(strongest, axis=1)
def create_model(k, ncell, nfilter, nmark=35, n_classes=2, coeff_l1=0, coeff_l2=1e-4, learning_rate=0.01):
    """Build and compile the classifier: Conv1D filters -> top-k mean pooling -> softmax.

    Parameters
    ----------
    k : int
        Number of top-responding cells averaged per filter in the pooling step.
    ncell : int
        Number of cells in each multi-cell input.
    nfilter : int
        Number of convolutional filters.
    nmark : int
        Number of features (markers) per cell. Defaults to 35, the value that
        was previously hard-coded.
    n_classes : int
        Number of output classes.
    coeff_l1, coeff_l2 : float
        L1 / L2 regularisation strengths applied to both the filter and the
        output-layer kernels.
    learning_rate : float
        Learning rate of the Adam optimiser.

    Returns
    -------
    keras.Model
        Model compiled with categorical cross-entropy loss and an accuracy metric.
    """
    data_input = keras.Input(shape=(ncell, nmark))

    # the filters: kernel_size=1 means each filter looks at one cell at a time
    conv = layers.Conv1D(filters=nfilter,
                         kernel_size=1,
                         activation='relu',
                         kernel_initializer=initializers.RandomUniform(),
                         kernel_regularizer=regularizers.l1_l2(l1=coeff_l1, l2=coeff_l2),
                         name='conv1')(data_input)

    # the cell grouping part (top-k pooling)
    pooled = layers.Lambda(pool_top_k, output_shape=(nfilter,), arguments={'k': k})(conv)
    output = layers.Dense(units=n_classes,
                          activation='softmax',
                          kernel_initializer=initializers.RandomUniform(),
                          kernel_regularizer=regularizers.l1_l2(l1=coeff_l1, l2=coeff_l2),
                          name='output')(pooled)
    model = keras.Model(inputs=data_input, outputs=output)

    model.compile(optimizer=optimizers.Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
def model_pred(prob):
    """Turn two-class score pairs into hard labels.

    For each pair the label is 0 when the first score is strictly greater
    than the second, and 1 otherwise (ties therefore map to 1).
    """
    return [0 if scores[0] > scores[1] else 1 for scores in prob]
def splitForLocal(nhosts,train_indices,test_indices):
    """Distribute the global training samples among ``nhosts`` hosts and generate per-host data.

    Reads the module-level ``data``, ``ncells`` and ``ntrain_all`` and calls
    ``generate_data`` once per host.

    Parameters
    ----------
    nhosts : int
        Number of hosts to split the training samples across (must be >= 1).
    train_indices : [class0_indices, class1_indices]
        Global training sample indices per class.
    test_indices : [class0_indices, class1_indices]
        Test sample indices per class; only printed here.

    Returns
    -------
    tuple
        ``(xtr, ytr, scaler)``: lists with one ``generate_data`` result per
        host, plus the scaler returned by the call for the LAST host.

    Raises
    ------
    ValueError
        If ``nhosts`` is smaller than 1.
    """
    if nhosts < 1:
        raise ValueError("nhosts must be at least 1")

    test_idx1, test_idx2 = test_indices[0], test_indices[1]
    train_idx1, train_idx2 = train_indices[0], train_indices[1]

    print("Test set indices:")
    print(test_idx1)
    print(test_idx2)
    print("Global train set indices:")

    #to take the runs on 10 different distributions (for box plot)

    print(train_idx1)
    print(train_idx2)

    #distribute train indices balanced among n hosts:
    # Class-0 chunks are handed out in reverse order (and reversed internally).
    # This matches what np.flip did for equal-sized chunks, but also works when the
    # chunks have unequal length. Presumably the reversal is there so a host that
    # gets a larger class-1 chunk gets the smaller class-0 chunk -- TODO confirm.
    group1_list = [chunk[::-1] for chunk in reversed(np.array_split(np.array(train_idx1), nhosts))]
    group2_list = np.array_split(np.array(train_idx2), nhosts)

    split_idx_1 = [chunk.tolist() for chunk in group1_list]
    split_idx_2 = [chunk.tolist() for chunk in group2_list]

    print("Global train splitted among hosts - indices:")
    print(split_idx_1)
    print(split_idx_2)

    xtr = []
    ytr = []
    for i in range(nhosts):
        print("\nHost no.", i, ":")
        folder_path = 'splitFlow' + str(nhosts) + '/host' + str(i) + '/'
        host_idx_1 = split_idx_1[i]
        host_idx_2 = split_idx_2[i]
        print("host_idx_1:", host_idx_1, "- host_idx_2:", host_idx_2)

        # load the training samples of this host
        host_group1_list = [data[idx] for idx in host_idx_1]
        host_group2_list = [data[idx] for idx in host_idx_2]

        # Use every sample assigned to the host; class-1 samples are no longer
        # truncated to the class-0 count when the two chunks differ in size.
        train_samples = host_group1_list + host_group2_list
        train_phenotypes = [0] * len(host_group1_list) + [1] * len(host_group2_list)
        print(train_phenotypes)
        scaler,x_tr,y_tr = generate_data(train_samples, train_phenotypes, folder_path, scale=True, ncell=ncells,
                                         nsubset=int(ntrain_all/2), per_sample=False, verbose=0,
                                         generate_valid_set=False, saveFile=False, scaler=None,
                                         subset_selection='random', oneFile=None)
        xtr.append(x_tr)
        ytr.append(y_tr)
    # NOTE(review): only the last host's scaler is returned -- confirm this is intended.
    return xtr,ytr,scaler

Using TensorFlow backend.


In [None]:
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Centralised baseline: repeat a random train/test split 10 times, train on all
# training samples and record metrics for (a) multi-cell test inputs and
# (b) one phenotype prediction per test sample.
accurMulti = []
precMulti = []
recallMulti = []
fscoreMulti = []
accurPheno = []
precPheno = []
recallPheno = []
fscorePheno = []
for run in range(10):
    train_samples, train_phenotypes, test_samples, test_phenotypes, test_indices, train_indices = train_test_split()
    # generate ntrain_all (ntrain_all/2 per phenotype) training samples for centralized test!

    scaler,x_tr,y_tr = generate_data(train_samples, train_phenotypes, 'Flow/', generate_valid_set=False,
                                     ncell=ncells, nsubset=int(ntrain_all/2), scale=True,
                                     per_sample=False, verbose=0, saveFile=False, subset_selection='random')

    # Pass the training scaler on so the test inputs are normalised like the training
    # inputs instead of with a scaler refitted on the test samples (the saved output
    # printed "new scale" twice). Assumes generate_data reuses a scaler given via
    # `scaler=` -- TODO confirm against cellCNN_utils.
    scaler,x_test,y_test = generate_data(test_samples, test_phenotypes, 'Flow/', generate_valid_set=False,
                                         ncell=ncells, nsubset=5000, scale=True,
                                         per_sample=False, verbose=0, saveFile=False,
                                         subset_selection='random', generateAsTest=True, scaler=scaler)
    print(len(test_samples[0]))
    data_test,phenotypes,ncell_per_sample = generate_for_pheno_prediction(test_samples,test_phenotypes,scaler)

    y_tr_n = to_categorical(y_tr)

    model = create_model(ncells,ncells,nfilter)

    # generate_data output is transposed to (inputs, cells, markers) for the model
    x_tr_n = x_tr.transpose(0,2,1)
    # Fit data to model
    history = model.fit(x_tr_n, y_tr_n,
                        batch_size=64,
                        epochs=30,
                        verbose=2,
                        validation_split=0)

    #test for multi-cell (100-cell predictions on test set)
    x_test_n = x_test.transpose(0,2,1)[0:ntest_all,:]
    y_test_n = to_categorical(y_test)[0:ntest_all,:]
    loss, accuracy = model.evaluate(x_test_n, y_test_n, verbose=0)
    print("For 100-cell predictions on test set with size",x_test_n.shape)
    print(f'Test loss: {loss}, Test accuracy: {accuracy}')

    y_pred = model_pred(model.predict(x_test_n))
    y_test = y_test[0:ntest_all]

    # Compute each metric once and reuse it for printing and bookkeeping.
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    print("Accuracy:", acc)
    print("F-score:", f1)
    print("precision:", prec)
    print("recall:", rec)

    #to write to excell sheet...
    accurMulti.append(acc)
    precMulti.append(prec)
    recallMulti.append(rec)
    fscoreMulti.append(f1)

    #For phenotype predictions on test set using all cells
    # Same weights, but the input layer is rebuilt for ncell_per_sample cells per input.
    model2 = create_model(ncell_per_sample, ncell_per_sample,nfilter)
    model2.set_weights(model.get_weights())

    y_pred = model_pred(model2.predict(data_test))
    acc = accuracy_score(phenotypes, y_pred)
    f1 = f1_score(phenotypes, y_pred)
    prec = precision_score(phenotypes, y_pred)
    rec = recall_score(phenotypes, y_pred)
    print("Accuracy:", acc)
    print("F-score:", f1)
    print("precision:", prec)
    print("recall:", rec)

    accurPheno.append(acc)
    precPheno.append(prec)
    recallPheno.append(rec)
    fscorePheno.append(f1)


test indices
[[0, 21, 31, 44, 57], [2, 4, 13, 24, 38, 42, 47]]
train indices
[53, 55, 1, 35, 48, 50, 19, 20, 36, 10, 54, 49, 51, 43, 18, 58, 52, 30, 26, 45, 56, 17, 27, 15]
[46, 11, 3, 16, 14, 28, 8, 33, 29, 22, 9, 32, 41, 23, 39, 6, 40, 7, 37, 59, 12, 34, 25, 5]
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
new scale
Generating multi-cell inputs...
new scale
Generating multi-cell inputs...
7172
Predictions based on multi-cell inputs containing 3247 cells.
12
<class 'list'>
Epoch 1/30
469/469 - 2s - loss: 0.4017 - accuracy: 0.8156
Epoch 2/30
469/469 - 2s - loss: 0.2879 - accuracy: 0.8815
Epoch 3/30
469/469 - 2s - loss: 0.2497 - accuracy: 0.9032
Epoch 4/30
469/469 - 2s - loss: 0.2282 - accuracy: 0.9134
Epoch 5/30
469/469 - 2s - loss: 0.2197 - accuracy: 0.9195
Epoch 6/30
469/469 - 2s - loss: 0.2098 - accuracy: 0.9255
Epoch 7/30
469/469 - 2s - loss: 0.2057 - accuracy: 0.9273
Epoch 8/30
469/469 - 2s - loss: 0.1976 - accuracy: 0.9306
Epoch 9/30
469/469 - 3s - loss: 0.1935 - accuracy: 0.9324
Epoch 10

Epoch 1/30
469/469 - 3s - loss: 0.4385 - accuracy: 0.7911
Epoch 2/30
469/469 - 2s - loss: 0.3243 - accuracy: 0.8628
Epoch 3/30
469/469 - 2s - loss: 0.2915 - accuracy: 0.8813
Epoch 4/30
469/469 - 2s - loss: 0.2695 - accuracy: 0.8920
Epoch 5/30
469/469 - 2s - loss: 0.2597 - accuracy: 0.8996
Epoch 6/30
469/469 - 2s - loss: 0.2519 - accuracy: 0.9035
Epoch 7/30
469/469 - 2s - loss: 0.2473 - accuracy: 0.9063
Epoch 8/30
469/469 - 2s - loss: 0.2417 - accuracy: 0.9091
Epoch 9/30
469/469 - 2s - loss: 0.2365 - accuracy: 0.9119
Epoch 10/30
469/469 - 2s - loss: 0.2288 - accuracy: 0.9176
Epoch 11/30
469/469 - 2s - loss: 0.2250 - accuracy: 0.9185
Epoch 12/30
469/469 - 2s - loss: 0.2229 - accuracy: 0.9188
Epoch 13/30
469/469 - 2s - loss: 0.2211 - accuracy: 0.9211
Epoch 14/30
469/469 - 2s - loss: 0.2203 - accuracy: 0.9204
Epoch 15/30
469/469 - 2s - loss: 0.2200 - accuracy: 0.9201
Epoch 16/30
469/469 - 2s - loss: 0.2167 - accuracy: 0.9231
Epoch 17/30
469/469 - 2s - loss: 0.2158 - accuracy: 0.9238
Epoch 

Epoch 9/30
469/469 - 3s - loss: 0.2360 - accuracy: 0.9129
Epoch 10/30
469/469 - 3s - loss: 0.2333 - accuracy: 0.9133
Epoch 11/30
469/469 - 3s - loss: 0.2303 - accuracy: 0.9162
Epoch 12/30
469/469 - 3s - loss: 0.2322 - accuracy: 0.9150
Epoch 13/30
469/469 - 2s - loss: 0.2241 - accuracy: 0.9185
Epoch 14/30
469/469 - 2s - loss: 0.2242 - accuracy: 0.9198
Epoch 15/30
469/469 - 2s - loss: 0.2234 - accuracy: 0.9197
Epoch 16/30
469/469 - 2s - loss: 0.2218 - accuracy: 0.9207
Epoch 17/30
469/469 - 2s - loss: 0.2201 - accuracy: 0.9217
Epoch 18/30
469/469 - 2s - loss: 0.2201 - accuracy: 0.9206
Epoch 19/30
469/469 - 2s - loss: 0.2179 - accuracy: 0.9225
Epoch 20/30
469/469 - 2s - loss: 0.2160 - accuracy: 0.9228
Epoch 21/30
469/469 - 2s - loss: 0.2162 - accuracy: 0.9237
Epoch 22/30
469/469 - 2s - loss: 0.2152 - accuracy: 0.9237
Epoch 23/30
469/469 - 2s - loss: 0.2151 - accuracy: 0.9237
Epoch 24/30
469/469 - 2s - loss: 0.2118 - accuracy: 0.9254
Epoch 25/30
469/469 - 2s - loss: 0.2120 - accuracy: 0.925

In [None]:
#To average LOCAL computations metrics
# Each run draws a new train/test split, distributes the training samples among
# `nhosts` hosts, trains one model per host on its local data only, and evaluates
# every host model on the same test set. Metric arrays have shape (nhosts, totalRun).
nhosts = 2
totalRun = 2
accurMulti = np.empty([nhosts, totalRun])
precMulti = np.empty([nhosts, totalRun])
recallMulti = np.empty([nhosts, totalRun])
fscoreMulti = np.empty([nhosts, totalRun])
accurPheno = np.empty([nhosts, totalRun])
precPheno = np.empty([nhosts, totalRun])
recallPheno = np.empty([nhosts, totalRun])
fscorePheno = np.empty([nhosts, totalRun])

for run in range(totalRun):
    train_samples, train_phenotypes, test_samples, test_phenotypes, test_indices, train_indices = train_test_split()

    #split between n host
    # NOTE(review): the scaler returned here comes from the last host's generate_data
    # call, so every host model is evaluated on test data scaled with that host's
    # scaler -- confirm this is intended.
    xtr,ytr,scaler = splitForLocal(nhosts,train_indices, test_indices)

    scaler,x_test,y_test = generate_data(test_samples, test_phenotypes, 'Flow/', generate_valid_set=False,
                                         ncell=ncells, nsubset=5000, scale=True,
                                         per_sample=False, verbose=0, saveFile=False,
                                         subset_selection='random', generateAsTest=True, scaler=scaler)

    data_test,phenotypes,ncell_per_sample = generate_for_pheno_prediction(test_samples,test_phenotypes,scaler)

    # The test inputs are identical for every host, so prepare them once per run.
    x_test_n = x_test.transpose(0,2,1)[0:ntest_all,:]
    y_test_n = to_categorical(y_test)[0:ntest_all,:]
    y_test = y_test[0:ntest_all]

    for i in range(nhosts):
        y_tr_n = to_categorical(ytr[i])
        model = create_model(ncells,ncells,nfilter)

        x_tr_n = xtr[i].transpose(0,2,1)
        # Fit data to model
        history = model.fit(x_tr_n, y_tr_n,
                            batch_size=64,
                            epochs=30,
                            verbose=2,
                            validation_split=0)

        #test for multi-cell
        loss, accuracy = model.evaluate(x_test_n, y_test_n, verbose=0)
        print("For 100-cell predictions on test set with size",x_test_n.shape)
        print(f'Test loss: {loss}, Test accuracy: {accuracy}')

        y_pred = model_pred(model.predict(x_test_n))

        # Compute each metric once and reuse it for printing and bookkeeping.
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        print("Accuracy:", acc)
        print("F-score:", f1)
        print("precision:", prec)
        print("recall:", rec)

        #to write to excell sheet...
        accurMulti[i,run] = acc
        precMulti[i,run] = prec
        recallMulti[i,run] = rec
        fscoreMulti[i,run] = f1

        #For phenotype predictions on test set using all cells
        # Same weights, but the input layer is rebuilt for ncell_per_sample cells per input.
        model2 = create_model(ncell_per_sample, ncell_per_sample,nfilter)
        model2.set_weights(model.get_weights())

        y_pred = model_pred(model2.predict(data_test))
        acc = accuracy_score(phenotypes, y_pred)
        f1 = f1_score(phenotypes, y_pred)
        prec = precision_score(phenotypes, y_pred)
        rec = recall_score(phenotypes, y_pred)
        print("Accuracy:", acc)
        print("F-score:", f1)
        print("precision:", prec)
        print("recall:", rec)

        accurPheno[i,run] = acc
        precPheno[i,run] = prec
        recallPheno[i,run] = rec
        fscorePheno[i,run] = f1



In [None]:
import csv
import numpy

# Write the per-run recall averaged over hosts to 'myfile.csv', one value per row:
# first the phenotype-level recalls, then the multi-cell recalls.
# NOTE(review): other cells in this notebook write to the same 'myfile.csv' and overwrite it.
with open('myfile.csv', 'w', newline='') as file:
    mywriter = csv.writer(file, delimiter=',')
    phen = recallPheno[:, :].sum(axis=0) / nhosts
    multi = recallMulti[:, :].sum(axis=0) / nhosts
    mywriter.writerows([value] for value in np.asarray(phen))
    mywriter.writerows([value] for value in np.asarray(multi))

In [None]:
import csv
import numpy

# F-score per run, computed from the host-averaged precision and recall, written to
# 'myfile.csv' one value per row: phenotype-level values first, then multi-cell values.
# NOTE(review): other cells in this notebook write to the same 'myfile.csv' and overwrite it.

# Compute everything before opening the file so a failure cannot leave a truncated CSV.
phenR = np.sum(recallPheno[:,:],axis=0)/nhosts
multiR = np.sum(recallMulti[:,:],axis=0)/nhosts

phenP = np.sum(precPheno[:,:],axis=0)/nhosts
multiP = np.sum(precMulti[:,:],axis=0)/nhosts

# Harmonic mean of precision and recall; defined as 0 where both are 0
# (avoids a division by zero that would otherwise yield NaN).
denomPhen = phenP + phenR
denomMulti = multiP + multiR
fscoreAvPhen = np.divide(2*phenR*phenP, denomPhen,
                         out=np.zeros_like(denomPhen, dtype=float), where=denomPhen != 0)
fscoreAvMulti = np.divide(2*multiR*multiP, denomMulti,
                          out=np.zeros_like(denomMulti, dtype=float), where=denomMulti != 0)

with open('myfile.csv', 'w', newline='') as file:
    mywriter = csv.writer(file, delimiter=',')
    mywriter.writerows(map(lambda x: [x], np.asarray(fscoreAvPhen)))
    mywriter.writerows(map(lambda x: [x], np.asarray(fscoreAvMulti)))

In [None]:
import csv
import numpy

# Write the per-run recall summed over hosts (not averaged) to 'myfile.csv', one value
# per row: phenotype-level sums first, then multi-cell sums.
# NOTE(review): other cells in this notebook write to the same 'myfile.csv' and overwrite it.
with open('myfile.csv', 'w', newline='') as file:
    mywriter = csv.writer(file, delimiter=',')
    phen = recallPheno[:, :].sum(axis=0)
    multi = recallMulti[:, :].sum(axis=0)
    for totals in (phen, multi):
        mywriter.writerows([total] for total in np.asarray(totals))

In [None]:
# Display the collected phenotype-level accuracies (indexed [host, run] when filled by the local-training cell).
accurPheno

In [None]:
# Average phenotype-level accuracy over hosts for each run. Uses the actual number of
# rows instead of a hard-coded 4, which did not match nhosts = 2.
np.mean(accurPheno[:,:], axis=0)