# Testing/Benchmarking CellTypist Models
### List of Models (made in Making New Models.ipynb)
1. Remove the feature selection from CellTypist (so it only trains the model once)
2. Train the model with L1 regularization instead of L2
3. Train the model only once with only Cytopus genes
4. At the feature selection step, make sure the Cytopus genes are included in the list of top genes

In [7]:
import scanpy as sc
import pandas as pd
import anndata as ad
from anndata import AnnData
import numpy as np
from scipy.sparse import spmatrix
from datetime import datetime
import itertools
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

import celltypist as ct
from celltypist import models

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [18]:
# Fetch the stock CellTypist immune model into the local CellTypist model store.
models.download_models(model='Immune_All_Low.pkl')

# Load the four custom CT_45 models (see the model list at the top of the notebook).
model_1, model_2, model_3, model_4 = (
    models.Model.load(pkl_path)
    for pkl_path in (
        'New Models/CT_45 Models/ct_model_1.pkl',
        'New Models/CT_45 Models/ct_model_2.pkl',
        'New Models/CT_45 Models/ct_model_3.pkl',
        'New Models/CT_45 Models/ct_model_4.pkl',
    )
)

📂 Storing models in /Users/labuser/.celltypist/data/models
💾 Total models to download: 1
⏩ Skipping [1/1]: Immune_All_Low.pkl (file exists)


In [102]:
# Fetch the stock CellTypist COVID-19 PBMC model into the local CellTypist model store.
models.download_models(
    model='Healthy_COVID19_PBMC.pkl',
)

📂 Storing models in /Users/labuser/.celltypist/data/models
💾 Total models to download: 1
💾 Downloading model [1/1]: Healthy_COVID19_PBMC.pkl


## Get celltype predictions from each model

### Using CT_45 Models

In [19]:
# Load the CT_45 test set (a subset of the CellTypist data) from disk.
# `ad.read` is a deprecated alias of `ad.read_h5ad`; call the explicit .h5ad reader.
test = ad.read_h5ad('../../Data/Celltypist_test.h5ad')

In [141]:
# Annotate the CT_45 test set with model 0 and refine the per-cell calls by majority voting.
predictions_ct = ct.annotate(
    test,
    model='New Models/CT_45 Models/ct_model_0.pkl',
    majority_voting=True,
)
# Last expression of the cell: display the per-cell label table.
predictions_ct.predicted_labels

🔬 Input data has 263810 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 4759 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 30
🗳️ Majority voting the predictions
✅ Majority voting done!


Unnamed: 0,predicted_labels,over_clustering,majority_voting
CZINY-0105_CACCAAAAGTCAACAA,Tem/emra_CD8,71,Tem/emra_CD8
CZINY-0109_TGCGATACACATAACC,Tem/emra_CD8,101,Tem/emra_CD8
CZINY-0058_GCATCGGCAAGTCATC,Tnaive/CM_CD4,7,Tnaive/CM_CD4
CZINY-0057_AACAACCTCATGCTAG,Memory B cells,18,Memory B cells
CZINY-0106_GTGTTCCAGTGTTGTC,Alveolar macrophages,56,Alveolar macrophages
...,...,...,...
CZINY-0104_TCCTCTTTCCGTTTCG,Tfh,206,Teffector/EM_CD4
Pan_T7980364_CGTGTAACATGCTGGC,Memory B cells,380,Memory B cells
CZINY-0104_CGGCAGTTCCATGCAA,Tem/emra_CD8,369,Trm/em_CD8
CZINY-0102_CGAAGTTAGCACTAGG,Tfh,3,Tfh


In [142]:
# Convert the model-0 annotation result with to_adata() (its .obs is read by the
# benchmarking cells below) and save it as an .h5ad file for later reuse.
pred_adatact = predictions_ct.to_adata()
pred_adatact.write_h5ad('../../pred_modelct.h5ad')

Model 1

In [None]:
# Annotate the CT_45 test set with model 1 (no feature selection), with majority voting.
# The benchmarking section notes that this run stalls at the scaling step.
predictions_1 = ct.annotate(
    test,
    model='New Models/CT_45 Models/ct_model_1.pkl',
    majority_voting=True,
)
# Last expression of the cell: display the per-cell label table.
predictions_1.predicted_labels

🔬 Input data has 263810 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 30123 features used for prediction
⚖️ Scaling input data


In [None]:
# Convert the model-1 annotation result with to_adata() and save it as an .h5ad
# file for later reuse. Requires the model-1 annotation cell to have completed.
pred_adata1 = predictions_1.to_adata()
pred_adata1.write_h5ad('../../pred_model1.h5ad')

Model 2

In [4]:
# Annotate the CT_45 test set with model 2 (L1 regularization), with majority voting.
predictions_2 = ct.annotate(
    test,
    model='New Models/CT_45 Models/ct_model_2.pkl',
    majority_voting=True,
)
# Last expression of the cell: display the per-cell label table.
predictions_2.predicted_labels

🔬 Input data has 263810 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 6749 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 30
🗳️ Majority voting the predictions
✅ Majority voting done!


Unnamed: 0,predicted_labels,over_clustering,majority_voting
CZINY-0105_CACCAAAAGTCAACAA,Tem/emra_CD8,71,Tem/emra_CD8
CZINY-0109_TGCGATACACATAACC,Tem/emra_CD8,101,Tem/emra_CD8
CZINY-0058_GCATCGGCAAGTCATC,Tnaive/CM_CD4,7,Tnaive/CM_CD4
CZINY-0057_AACAACCTCATGCTAG,Memory B cells,18,Memory B cells
CZINY-0106_GTGTTCCAGTGTTGTC,Alveolar macrophages,56,Alveolar macrophages
...,...,...,...
CZINY-0104_TCCTCTTTCCGTTTCG,Tfh,206,Teffector/EM_CD4
Pan_T7980364_CGTGTAACATGCTGGC,Memory B cells,380,Memory B cells
CZINY-0104_CGGCAGTTCCATGCAA,Tem/emra_CD8,369,Trm/em_CD8
CZINY-0102_CGAAGTTAGCACTAGG,Tfh,3,Tfh


In [5]:
# Convert the model-2 annotation result with to_adata(); the benchmarking cells
# read the true and predicted labels from its .obs.
pred_adata2 = predictions_2.to_adata()

In [3]:
# Reload the saved model-2 predictions from disk instead of re-running annotation.
# The one-off save is kept commented out; re-enable it to refresh the file:
#pred_adata2.write_h5ad('../../pred_model2.h5ad')
# `ad.read` is a deprecated alias of `ad.read_h5ad`; call the explicit .h5ad reader.
pred_adata2 = ad.read_h5ad('../../pred_model2.h5ad')

Model 3

In [6]:
# Annotate the CT_45 test set with model 3 (Cytopus genes only), with majority voting.
predictions_3 = ct.annotate(
    test,
    model='New Models/CT_45 Models/ct_model_3.pkl',
    majority_voting=True,
)
# Last expression of the cell: display the per-cell label table.
predictions_3.predicted_labels

🔬 Input data has 263810 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 304 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 30
🗳️ Majority voting the predictions
✅ Majority voting done!


Unnamed: 0,predicted_labels,over_clustering,majority_voting
CZINY-0105_CACCAAAAGTCAACAA,Tem/emra_CD8,71,Tem/emra_CD8
CZINY-0109_TGCGATACACATAACC,Tem/emra_CD8,101,Tem/emra_CD8
CZINY-0058_GCATCGGCAAGTCATC,Tnaive/CM_CD4,7,Tnaive/CM_CD4
CZINY-0057_AACAACCTCATGCTAG,Memory B cells,18,Memory B cells
CZINY-0106_GTGTTCCAGTGTTGTC,Alveolar macrophages,56,Alveolar macrophages
...,...,...,...
CZINY-0104_TCCTCTTTCCGTTTCG,Tfh,206,Teffector/EM_CD4
Pan_T7980364_CGTGTAACATGCTGGC,Memory B cells,380,Memory B cells
CZINY-0104_CGGCAGTTCCATGCAA,Trm/em_CD8,369,Trm/em_CD8
CZINY-0102_CGAAGTTAGCACTAGG,Tfh,3,Tfh


In [7]:
# Convert the model-3 annotation result with to_adata(); the benchmarking cells
# read the true and predicted labels from its .obs.
pred_adata3 = predictions_3.to_adata()

In [4]:
# Reload the saved model-3 predictions from disk instead of re-running annotation.
# The one-off save is kept commented out; re-enable it to refresh the file:
#pred_adata3.write_h5ad('../../pred_model3.h5ad')
# `ad.read` is a deprecated alias of `ad.read_h5ad`; call the explicit .h5ad reader.
pred_adata3 = ad.read_h5ad('../../pred_model3.h5ad')

Model 4

In [152]:
# Annotate the CT_45 test set with model 4 (feature selection that keeps the Cytopus
# genes), with majority voting.
predictions_4 = ct.annotate(
    test,
    model='New Models/CT_45 Models/ct_model_4.pkl',
    majority_voting=True,
)
# Last expression of the cell: display the per-cell label table.
predictions_4.predicted_labels

🔬 Input data has 263810 cells and 36601 genes
🔗 Matching reference genes in the model
🧬 4804 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 30
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
🗳️ Majority voting the predictions
✅ Majority voting done!


Unnamed: 0,predicted_labels,over_clustering,majority_voting
CZINY-0105_CACCAAAAGTCAACAA,Tem/emra_CD8,71,Tem/emra_CD8
CZINY-0109_TGCGATACACATAACC,Tem/emra_CD8,101,Tem/emra_CD8
CZINY-0058_GCATCGGCAAGTCATC,Tnaive/CM_CD4,7,Tnaive/CM_CD4
CZINY-0057_AACAACCTCATGCTAG,Memory B cells,18,Memory B cells
CZINY-0106_GTGTTCCAGTGTTGTC,Alveolar macrophages,56,Alveolar macrophages
...,...,...,...
CZINY-0104_TCCTCTTTCCGTTTCG,Tfh,206,Teffector/EM_CD4
Pan_T7980364_CGTGTAACATGCTGGC,Memory B cells,380,Memory B cells
CZINY-0104_CGGCAGTTCCATGCAA,Tem/emra_CD8,369,Trm/em_CD8
CZINY-0102_CGAAGTTAGCACTAGG,Tfh,3,Tfh


In [153]:
# Convert the model-4 annotation result with to_adata(); the benchmarking cells
# read the true and predicted labels from its .obs.
pred_adata4 = predictions_4.to_adata()

In [154]:
# Reload the saved model-4 predictions from disk instead of re-running annotation.
# The one-off save is kept commented out; re-enable it to refresh the file:
#pred_adata4.write_h5ad('../../pred_model4.h5ad')
# `ad.read` is a deprecated alias of `ad.read_h5ad`; call the explicit .h5ad reader.
pred_adata4 = ad.read_h5ad('../../pred_model4.h5ad')

### Using CT_98 Models

In [112]:
# Load the CT_98 test set (a subset of the CT_98 data) from disk.
# `ad.read` is a deprecated alias of `ad.read_h5ad`; call the explicit .h5ad reader.
test_98 = ad.read_h5ad('../../Data/CT_98_Test.h5ad')

Model 0

In [None]:
# Annotate the CT_98 test set with CT_98 model 0, with majority voting.
predictions_98_0 = ct.annotate(
    test_98,
    model='New Models/CT_98 Models/98_model_0.pkl',
    majority_voting=True,
)
# To inspect the per-cell labels, evaluate: predictions_98_0.predicted_labels

# Convert the result with to_adata() and save it for the benchmarking section.
pred_adata98_0 = predictions_98_0.to_adata()
pred_adata98_0.write_h5ad('../../predictions/pred_98_model0.h5ad')

🔬 Input data has 540487 cells and 38995 genes
🔗 Matching reference genes in the model
🧬 7805 features used for prediction
⚖️ Scaling input data


Model 2

In [None]:
# Annotate the CT_98 test set with CT_98 model 2, with majority voting.
predictions_98_2 = ct.annotate(
    test_98,
    model='New Models/CT_98 Models/98_model_2.pkl',
    majority_voting=True,
)
# To inspect the per-cell labels, evaluate: predictions_98_2.predicted_labels

# Convert the result with to_adata() and save it for the benchmarking section.
pred_adata98_2 = predictions_98_2.to_adata()
pred_adata98_2.write_h5ad('../../predictions/pred_98_model2.h5ad')

Model 3

In [None]:
# Annotate the CT_98 test set with CT_98 model 3, with majority voting.
predictions_98_3 = ct.annotate(
    test_98,
    model='New Models/CT_98 Models/98_model_3.pkl',
    majority_voting=True,
)
# To inspect the per-cell labels, evaluate: predictions_98_3.predicted_labels

# Convert the result with to_adata() and save it for the benchmarking section.
pred_adata98_3 = predictions_98_3.to_adata()
pred_adata98_3.write_h5ad('../../predictions/pred_98_model3.h5ad')

Model 4

In [None]:
# Annotate the CT_98 test set with CT_98 model 4, with majority voting.
predictions_98_4 = ct.annotate(
    test_98,
    model='New Models/CT_98 Models/98_model_4.pkl',
    majority_voting=True,
)
# To inspect the per-cell labels, evaluate: predictions_98_4.predicted_labels

# Convert the result with to_adata() and save it for the benchmarking section.
pred_adata98_4 = predictions_98_4.to_adata()
pred_adata98_4.write_h5ad('../../predictions/pred_98_model4.h5ad')

### Using COV_PBMC Models

In [103]:
# Load the COV_PBMC test set from disk.
# `ad.read` is a deprecated alias of `ad.read_h5ad`; call the explicit .h5ad reader.
test_COV = ad.read_h5ad('../../Data/test_COV.h5ad')
# Optional variant of the test set used for the model-3 sanity check further down
# (the annotation log there reports 300 genes); left disabled:
#test_COV_cp = ad.read('../../Data/test_COV_cp.h5ad')

Model 2

In [109]:
# Annotate the COV_PBMC test set with COV model 2, with majority voting.
predictions_COV_2 = ct.annotate(
    test_COV,
    model='New Models/COV_PBMC Models/COV_model_2.pkl',
    majority_voting=True,
)
# To inspect the per-cell labels, evaluate: predictions_COV_2.predicted_labels

# Convert the result with to_adata() and save it for the benchmarking section.
pred_adataCOV_2 = predictions_COV_2.to_adata()
pred_adataCOV_2.write_h5ad('../../predictions/pred_COV_model2.h5ad')

🔬 Input data has 517893 cells and 24737 genes
🔗 Matching reference genes in the model
🧬 5968 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Detected a neighborhood graph in the input object, will run over-clustering on the basis of it
⛓️ Over-clustering input data with resolution set to 30
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
🗳️ Majority voting the predictions
✅ Majority voting done!


Model 3

In [105]:
# Annotate the COV_PBMC test set with COV model 3, with majority voting.
predictions_COV_3 = ct.annotate(
    test_COV,
    model='New Models/COV_PBMC Models/COV_model_3.pkl',
    majority_voting=True,
)
# To inspect the per-cell labels, evaluate: predictions_COV_3.predicted_labels

# Convert the result with to_adata() and save it for the benchmarking section.
pred_adataCOV_3 = predictions_COV_3.to_adata()
pred_adataCOV_3.write_h5ad('../../predictions/pred_COV_model3.h5ad')

In [107]:
#making sure the f1 score is the same if we use dataset with all genes vs just cytopus genes 
#predictions_COV_cp_3 = ct.annotate(test_COV_cp, model = 'New Models/COV_PBMC Models/COV_model_3.pkl', majority_voting = True)
#predictions_COV_3.predicted_labels
#pred_adataCOV_cp_3 = predictions_COV_cp_3.to_adata()
#pred_adataCOV_cp_3.write_h5ad('../../predictions/pred_COV_cp_model3.h5ad')

🔬 Input data has 517893 cells and 300 genes
🔗 Matching reference genes in the model
🧬 298 features used for prediction
⚖️ Scaling input data
🖋️ Predicting labels
✅ Prediction done!
👀 Can not detect a neighborhood graph, will construct one before the over-clustering
⛓️ Over-clustering input data with resolution set to 30
IOStream.flush timed out
IOStream.flush timed out
🗳️ Majority voting the predictions
✅ Majority voting done!


Model 4

In [None]:
# Annotate the COV_PBMC test set with COV model 4, with majority voting.
predictions_COV_4 = ct.annotate(
    test_COV,
    model='New Models/COV_PBMC Models/COV_model_4.pkl',
    majority_voting=True,
)
# To inspect the per-cell labels, evaluate: predictions_COV_4.predicted_labels

# Convert the result with to_adata() and save it for the benchmarking section.
pred_adataCOV_4 = predictions_COV_4.to_adata()
pred_adataCOV_4.write_h5ad('../../predictions/pred_COV_model4.h5ad')

## Benchmarking
### F1 scores
Abdelaal et al. used median F1 scores as their primary statistic

#### Train & Test on CT_45

In [41]:
# Display the per-cell predicted labels stored in the model-0 prediction object.
pred_adatact.obs["predicted_labels"]

CZINY-0105_CACCAAAAGTCAACAA      Tem/Temra cytotoxic T cells
CZINY-0109_TGCGATACACATAACC      Tem/Temra cytotoxic T cells
CZINY-0058_GCATCGGCAAGTCATC         Tcm/Naive helper T cells
CZINY-0057_AACAACCTCATGCTAG                   Memory B cells
CZINY-0106_GTGTTCCAGTGTTGTC             Alveolar macrophages
                                            ...             
CZINY-0104_TCCTCTTTCCGTTTCG        Follicular helper T cells
Pan_T7980364_CGTGTAACATGCTGGC                 Memory B cells
CZINY-0104_CGGCAGTTCCATGCAA      Tem/Temra cytotoxic T cells
CZINY-0102_CGAAGTTAGCACTAGG        Follicular helper T cells
CZINY-0061_CGGGACTAGAGGCGGA                       MAIT cells
Name: predicted_labels, Length: 263810, dtype: category
Categories (83, object): ['Age-associated B cells', 'Alveolar macrophages', 'B cells', 'CD16+ NK cells', ..., 'Type 1 helper T cells', 'Type 17 helper T cells', 'gamma-delta T cells', 'pDC']

In [143]:
# Original CellTypist model on CT_45: median of the per-class F1 scores = 0.892
# average=None makes f1_score return one score per class; the median is taken over those.
np.median(
    f1_score(
        y_true=pred_adatact.obs["Manually_curated_celltype"],
        y_pred=pred_adatact.obs["predicted_labels"],
        average=None,
    )
)

0.8924290052746551

In [None]:
#model 1 - can't get it to run; annotation gets stuck on the scaling step for too long
#np.median(f1_score(pred_adata1.obs["Manually_curated_celltype"], pred_adata1.obs["predicted_labels"], average=None))

In [39]:
# Model 2 on CT_45: median of the per-class F1 scores = 0.74
np.median(
    f1_score(
        y_true=pred_adata2.obs["Manually_curated_celltype"],
        y_pred=pred_adata2.obs["predicted_labels"],
        average=None,
    )
)

0.7401985360473279

In [38]:
# Model 3 on CT_45: median of the per-class F1 scores = 0.79
np.median(
    f1_score(
        y_true=pred_adata3.obs["Manually_curated_celltype"],
        y_pred=pred_adata3.obs["predicted_labels"],
        average=None,
    )
)

0.7864401520440618

In [158]:
# Model 4 on CT_45: median of the per-class F1 scores = 0.887
np.median(
    f1_score(
        y_true=pred_adata4.obs["Manually_curated_celltype"],
        y_pred=pred_adata4.obs["predicted_labels"],
        average=None,
    )
)

0.8866913027637537

#### Train & Test on CT_98

In [None]:
# Original CellTypist model on CT_98: median of the per-class F1 scores (not yet recorded)
np.median(
    f1_score(
        y_true=pred_adata98_0.obs["Harmonised_detailed_type"],
        y_pred=pred_adata98_0.obs["predicted_labels"],
        average=None,
    )
)

In [None]:
# Model 2 on CT_98: median of the per-class F1 scores (not yet recorded)
np.median(
    f1_score(
        y_true=pred_adata98_2.obs["Harmonised_detailed_type"],
        y_pred=pred_adata98_2.obs["predicted_labels"],
        average=None,
    )
)

In [None]:
# Model 3 on CT_98: median of the per-class F1 scores (not yet recorded)
np.median(
    f1_score(
        y_true=pred_adata98_3.obs["Harmonised_detailed_type"],
        y_pred=pred_adata98_3.obs["predicted_labels"],
        average=None,
    )
)

In [None]:
# Model 4 on CT_98: median of the per-class F1 scores (not yet recorded)
np.median(
    f1_score(
        y_true=pred_adata98_4.obs["Harmonised_detailed_type"],
        y_pred=pred_adata98_4.obs["predicted_labels"],
        average=None,
    )
)

#### Train & Test on COV_PBMC

In [None]:
# Original CellTypist model on the COV test set: median of the per-class F1 scores
# (not yet recorded).
# Truth and predictions are both read from pred_adataCOV_0 so the two label vectors
# describe the same cells; scoring against pred_adatact would mix in CT_45 predictions.
# NOTE(review): pred_adataCOV_0 is not created in any visible cell — presumably the
# baseline-model annotation of test_COV; confirm it exists before running.
np.median(f1_score(pred_adataCOV_0.obs["full_clustering"], pred_adataCOV_0.obs["predicted_labels"], average=None))

In [110]:
# Model 2 on the COV test set: median of the per-class F1 scores = 0.556
np.median(
    f1_score(
        y_true=pred_adataCOV_2.obs["full_clustering"],
        y_pred=pred_adataCOV_2.obs["predicted_labels"],
        average=None,
    )
)

0.5556809631301732

In [106]:
# Model 3 on the COV test set: median of the per-class F1 scores = 0.507
np.median(
    f1_score(
        y_true=pred_adataCOV_3.obs["full_clustering"],
        y_pred=pred_adataCOV_3.obs["predicted_labels"],
        average=None,
    )
)

0.5069204152249135

In [108]:
# Model 3 on the Cytopus-genes-only COV dataset, to confirm the score matches the
# full-gene run: median of the per-class F1 scores = 0.507
# pred_adataCOV_cp_3 is only produced by the annotation code that is currently
# commented out earlier in this notebook; re-enable it before running this cell.
np.median(
    f1_score(
        y_true=pred_adataCOV_cp_3.obs["full_clustering"],
        y_pred=pred_adataCOV_cp_3.obs["predicted_labels"],
        average=None,
    )
)

0.5069204152249135

In [None]:
# Model 4 on the COV test set: median of the per-class F1 scores (not yet recorded)
np.median(
    f1_score(
        y_true=pred_adataCOV_4.obs["full_clustering"],
        y_pred=pred_adataCOV_4.obs["predicted_labels"],
        average=None,
    )
)

In [70]:
# For each of the first 100 rows of adata.X, apply expm1 to the row, sum it, and
# print the total whenever it exceeds 10000.
# NOTE(review): presumably a check that the log1p-transformed data was normalized to
# 10,000 counts per cell — confirm. `adata` is not defined in any visible cell.
for row_idx in range(100):
    row_values = np.expm1(adata.X[row_idx]).toarray().flatten()
    row_total = np.sum(row_values)
    if row_total > 10000:
        print(row_total)

10004.135
10010.176
10009.035
10000.411
10002.444
10007.5
10002.46
10006.559
10003.9795
10000.707
10001.528
10012.495
10008.619
10004.193
10002.339
10004.573
10007.215
10001.952
10005.627
10009.004
10002.221
10173.28
10004.584
10005.443
10000.713
10003.491
10004.548
10006.23
10004.139
10003.272
10010.927
10011.751
10005.096
10002.011
