### (1) Load Required Packages

In [1]:
import sys
sys.path.insert(0, '/home/hajar/Dropbox/phd1/CyMET/scripts')
from prediction import predict_cells
from data_merging import *

In [2]:
from tensorflow.keras.models import model_from_json
import numpy as np
import pandas as pd
import glob

### (2) Load Data

In [3]:
# Studies to merge; each name is passed to combined_frame together with the
# directory holding the per-study CSV files.
names = ['arthur','Z2RC']
data_dir = '/home/hajar/Dropbox/phd1/CyMET/data/'
frame = combined_frame(data_dir, names)

(1) importing files ...
['/home/hajar/Dropbox/phd1/CyMET/data/Z2RC.csv', '/home/hajar/Dropbox/phd1/CyMET/data/arthur.csv', '/home/hajar/Dropbox/phd1/CyMET/data/data_corrected.csv']
files imported!


(2) combining files ...
   processing: arthur
   processing: Z2RC
      CD66b is missing - replaced with 0
files combined!


### (3) Load Immunopred Model

In [4]:
from tensorflow.keras.models import model_from_json #load required packages

# Read the serialised model configuration. The `with` block guarantees the
# file handle is closed even if read() raises.
with open('/home/hajar/Dropbox/phd1/CyMET/immunopred/model.json', 'r') as json_file:
    model_tmp = json_file.read()

# Rebuild the network from its JSON configuration, then restore the trained
# weights from the HDF5 file (model_from_json alone does not load weights).
model = model_from_json(model_tmp)
model.load_weights('/home/hajar/Dropbox/phd1/CyMET/immunopred/model_weights.h5')

print("Immunopred Successfully Loaded!")

Immunopred Successfully Loaded!


### (4) Run Cell Prediction

In [5]:
# NOTE: samp_df is the same object as `frame` (no copy is made), so the
# columns added below also appear on `frame`. Uncomment the trailing
# .sample(...) to try the pipeline on a small subset first.
samp_df = frame#.sample(100).reset_index(drop=True)

# Prediction confidence for each cell type, one row per cell.
# assumes predict_cells returns a 2-D NumPy array whose rows are in the same
# order as the rows of samp_df -- TODO confirm against scripts/prediction.py
y_pred = predict_cells(samp_df,model)

# Index of the cell type with the highest confidence for each cell.
preds = y_pred.argmax(axis=1)
samp_df['cell_type'] = preds # TODO: map the cell type number to name!

# Confidence score of the final prediction. The row-wise maximum is, by
# definition, the value at the argmax position. Computing it this way is
# vectorised and purely positional, so it does not rely on samp_df.index
# being a 0..n-1 range the way a label-based y_pred[x][y] lookup would.
scores = y_pred.max(axis=1)
samp_df['score'] = scores
samp_df

['arthur', 'Z2RC']
Predicting:  arthur | 1  out of  2
number of cells in training: 1508676
      done.


Predicting:  Z2RC | 2  out of  2
number of cells in training: 3907569
      done.


Cells successfully predicted!


Unnamed: 0,CD3,CD19,HLA-DR,CD14,CD56,CD16,CD4,CD8,CD66b,CD27,study,file,cell_type,score
0,0.000000,0.000000,0.382163,2.298796,0.295033,3.731697,0.000000,0.000000,2.901617,0.192882,arthur,AP-05_CyTOF_CD45,7,0.930966
1,0.000000,0.078171,0.000000,0.544585,2.909250,3.501502,0.000000,0.049034,0.000000,0.740760,arthur,AP-05_CyTOF_CD45,10,0.991657
2,0.000000,0.684283,0.000000,1.557305,0.310810,3.959321,0.000000,0.000000,2.165657,0.027645,arthur,AP-05_CyTOF_CD45,7,0.994675
3,4.994442,0.000000,0.000000,0.360960,0.869215,0.291934,3.035438,0.543323,0.000000,3.297353,arthur,AP-05_CyTOF_CD45,1,0.999944
4,3.275756,0.000000,0.000000,0.950321,0.000000,0.000000,0.116122,2.381737,0.222947,2.018567,arthur,AP-05_CyTOF_CD45,2,0.977643
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5416240,0.000000,4.449420,4.143681,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.856897,Z2RC,r/200828_Barcode_15_CV54,3,0.729500
5416241,1.001607,0.000000,0.000000,0.447850,0.732332,2.729694,0.000000,0.000000,0.000000,0.000000,Z2RC,r/200828_Barcode_15_CV54,12,0.753993
5416242,0.092931,0.000000,0.000000,0.457852,0.335129,2.635436,0.000000,0.000000,0.000000,0.000000,Z2RC,r/200828_Barcode_15_CV54,12,0.999557
5416243,3.632246,0.000000,0.091371,0.000000,0.000000,0.557104,3.598219,0.003107,0.000000,3.889644,Z2RC,r/200828_Barcode_15_CV54,1,0.999920


### (5) Summarise predictions at the sample level

In [6]:
### count the number of healthy and disease samples
hea = []; cov = []; 
for F in samp_df.file.unique():
    if ('COV' in F) or ('_CV' in F) or ('AP' in F) or ('Covid' in F):
        cov.append(F)
    else:
        hea.append(F)
        
print('number of healthy samples: ',len(hea))
print('number of disease samples: ',len(cov))

### append the condition names to each row in the dataframe
nam = []
for F in samp_df.file:
    if F in cov:
        nam.append('covid')
    else:
        nam.append('healthy')

samp_df['state'] = nam
samp_df.head(5)

number of healthy samples:  44
number of disease samples:  27


Unnamed: 0,CD3,CD19,HLA-DR,CD14,CD56,CD16,CD4,CD8,CD66b,CD27,study,file,cell_type,score,state
0,0.0,0.0,0.382163,2.298796,0.295033,3.731697,0.0,0.0,2.901617,0.192882,arthur,AP-05_CyTOF_CD45,7,0.930966,covid
1,0.0,0.078171,0.0,0.544585,2.90925,3.501502,0.0,0.049034,0.0,0.74076,arthur,AP-05_CyTOF_CD45,10,0.991657,covid
2,0.0,0.684283,0.0,1.557305,0.31081,3.959321,0.0,0.0,2.165657,0.027645,arthur,AP-05_CyTOF_CD45,7,0.994675,covid
3,4.994442,0.0,0.0,0.36096,0.869215,0.291934,3.035438,0.543323,0.0,3.297353,arthur,AP-05_CyTOF_CD45,1,0.999944,covid
4,3.275756,0.0,0.0,0.950321,0.0,0.0,0.116122,2.381737,0.222947,2.018567,arthur,AP-05_CyTOF_CD45,2,0.977643,covid


In [9]:
### convert data from wide to long format ready for statistical analysis (in R)
# No score filter (e.g. score >= 0.8) is applied here: per the original note,
# one filtering threshold can't be set because it is individual for cells.
filt = samp_df
tokeep = ['cell_type', 'state', 'file']

# Number of cells per (file, cell_type), then divided by the per-file total
# so that each file's proportions sum to 1.
cell_counts = filt[tokeep].groupby(['file', 'cell_type']).agg(['count'])
file_totals = cell_counts.groupby(level=0).sum()
temp = cell_counts / file_totals

## sort out the data in the right format
prop = temp[('state', 'count')].tolist()
file = temp.index.get_level_values(0).tolist()
pred = temp.index.get_level_values(1).tolist()
# Condition label per row, derived from the list of COVID file names (`cov`).
nam2 = ['covid' if fname in cov else 'healthy' for fname in file]
state = nam2

proportion_df = pd.DataFrame({'len': prop, 'file': file, 'cluster': pred, 'supp': state})
proportion_df.sample(10)

Unnamed: 0,len,file,cluster,supp
348,0.034384,r/200522_Barcode_2_A04,0,healthy
542,0.080056,r/200624_Barcode_6_CV16,9,covid
168,0.093619,HC-04_CyTOF_CD45,12,healthy
530,0.012234,r/200624_Barcode_6_C12,6,healthy
282,0.337211,r/200514_Barcode_1_D01,1,healthy
491,0.024003,r/200605_Barcode_4_E08,0,healthy
568,0.037313,r/200714_Barcode_9_CV29,0,covid
129,0.199484,HC-01_CyTOF_CD45,12,healthy
790,0.04569,r/200828_Barcode_15_CV55,2,covid
353,0.033864,r/200522_Barcode_2_A04,5,healthy
