# Zero Shot Evaluation Tutorial

## Step 1: Setup

In [None]:
# Download the example dataset (pbmc20k) yourself from:
# https://drive.google.com/file/d/1YID1c2ls8oT29gaf6GKctWutSo4tytsY/view?usp=sharing
# Environment setup: run the three download helpers (scGPT model, Geneformer model, GenePT data).
from embedding_extractors.setup import (
    download_scgpt_model,
    download_geneformer_model,
    download_genept_data,
)

# Same order as before: scGPT, then Geneformer, then GenePT.
for download_assets in (download_scgpt_model, download_geneformer_model, download_genept_data):
    download_assets()

## Step 2: Extract cell embeddings

#### a) Pre-process raw expression matrices

In [17]:
# Pre-processing settings for the filtered cell-expression matrices; the input needs no other preparation.
# Further example configurations can be found in `embedding_extractors/config.py`.
preprocessor_configs = {
    # Directory holding the raw input files. Only AnnData files whose gene identifiers are stored in
    # `adata.var` are accepted at present; the supported naming systems are "gene_symbol",
    # "ensembl_id", "entrez_id" and "refseq_id".
    "raw_data_directory": 'pbmc20k/raw_data',
    # Directory where the pre-processed files are stored.
    "preprocessed_data_directory": 'pbmc20k/pre_processed',
    # Column of `adata.var` that holds the gene ID; pass `index` when the IDs are the index itself.
    "gene_id_col_name": 'gene_ids',
    # Naming system used by that column: one of "gene_symbol", "ensembl_id", "entrez_id", "refseq_id".
    "gene_id_type": 'ensembl_id',
    # Input file format; AnnData (`h5ad`) is the only one supported for now.
    "file_format": 'h5ad',
    # When True, the name of the input file's directory is used as the batch key and saved under
    # `adata.obs.batch_key`.
    "keep_batch_key": False,
    # `obs` cell-attribute labels to carry over, as {name in the original file: name in the
    # pre-processed file}. Leave it as an empty map when there is nothing to keep.
    "custom_cell_attr_names": {},
}

In [15]:
# Run the pre-processing step with the settings collected in `preprocessor_configs`.
from embedding_extractors import pre_processor
# In the recorded run below, this reads the single `.h5ad` file under the raw-data directory, writes the
# pre-processed `.h5ad` to the pre-processed directory, and reports how many files succeeded.
processor = pre_processor.PreProcessor()
processor.pre_process(preprocessor_configs)

Pre-processing pbmc20k\raw_data\pbmc20k.h5ad
Pre-process completed: pbmc20k/pre_processed\pbmc20k.h5ad. Shape:
AnnData object with n_obs × n_vars = 23837 × 36601
    obs: 'n_counts'
    var: 'key_0', 'gene_ids', 'feature_types', 'genome', 'ensembl_id', 'gene_symbol', 'entrez_id', 'refseq_id'
Successfully pre-processed 1 out of 1 files.


#### b) Tokenize and extract cell embeddings

In [2]:
# Generate cell embeddings with Geneformer.
"""
Geneformer configs
"""
geneformer_configs = {
    # Directory holding the pre-processed data.
    "preprocessed_data_directory": 'pbmc20k/pre_processed',
    # Directory that receives the tokenized output.
    "tokenized_file_directory": "pbmc20k/geneformer/tokenized",
    # Filename prefix for the tokenized output.
    "tokenized_file_prefix": 'tokenized',
    # Cell attribute labels to keep; leave as an empty list when there are none.
    "custom_cell_attr_names": [],
    # Directory that receives the embedding file.
    "embedding_output_directory": "pbmc20k",
    # Name of the embedding file.
    "embedding_output_filename": "geneformer_cell_embeddings",
}

from embedding_extractors import EmbeddingExtractor

# Two stages for Geneformer: tokenize the pre-processed data, then extract embeddings.
# NOTE: in the recorded run below the extraction stage was estimated at close to an hour and was
# interrupted, so no finished Geneformer embedding output is shown.
emb_extractor = EmbeddingExtractor(
    "Geneformer",
    output_file_type='h5ad',
    configs=geneformer_configs,
)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()

Tokenizing pbmc20k\pre_processed\pbmc20k.h5ad


100%|██████████| 47/47 [00:48<00:00,  1.03s/it]


pbmc20k\pre_processed\pbmc20k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map: 100%|██████████| 23837/23837 [00:14<00:00, 1625.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 23837/23837 [00:00<00:00, 324573.34 examples/s]


Tokenization completed for Geneformer.
Extracting Geneformer embeddings


  2%|▏         | 50/2384 [01:12<56:40,  1.46s/it] 


KeyboardInterrupt: 

In [None]:
# Generate cell embeddings with scGPT.
"""
scGPT configs
"""
scgpt_configs = {
    # Directory holding the pre-processed data.
    "preprocessed_data_directory": 'pbmc20k/pre_processed',
    # Directory that receives the embedding file.
    "embedding_output_directory": "pbmc20k",
    # Name of the embedding file.
    "embedding_output_filename": "scgpt_cell_embeddings",
    # Cell attribute labels to keep; leave as an empty list when there are none.
    "custom_cell_attr_names": [],
}

from embedding_extractors import EmbeddingExtractor

# Same call sequence as the other extractors. The recorded output below shows no tokenization
# message for scGPT, so `tokenize()` is presumably a no-op here -- confirm against the library.
emb_extractor = EmbeddingExtractor(
    "scGPT",
    output_file_type='h5ad',
    configs=scgpt_configs,
)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()

Extracting scGPT embeddings
Embedding pbmc20k/pre_processed\pbmc20k.h5ad


Embedding cells:  18%|█▊        | 66/373 [00:10<00:48,  6.33it/s]

In [1]:
# Generate cell embeddings with GenePT-w.

"""
genePT configs
"""
genept_configs = {
    # Directory holding the pre-processed data.
    "preprocessed_data_directory": 'pbmc20k/pre_processed',
    # Directory that receives the genePT-w embedding file.
    "genept_w_embedding_output_directory": "pbmc20k",
    # Name of the embedding file.
    "embedding_output_filename": "genept_w_cell_embeddings",
    # Cell attribute labels to keep; leave as an empty list when there are none.
    "custom_cell_attr_names": [],
}

from embedding_extractors import EmbeddingExtractor

# Same call sequence as the other extractors. In the recorded run below, a large share of the
# genes could not be matched, and the embedding was written as an `.h5ad` file named after
# `embedding_output_filename` in the configured output directory.
emb_extractor = EmbeddingExtractor(
    "genePT-w",
    output_file_type='h5ad',
    configs=genept_configs,
)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


Extracting genePT-W embeddings
Embedding pbmc20k/pre_processed\pbmc20k.h5ad
Unable to match 15384 out of 36601 genes in pbmc20k/pre_processed\pbmc20k.h5ad
Output embedding in pbmc20k\genept_w_cell_embeddings.h5ad



In [None]:
# Generate cell embeddings with GenePT-s.

"""
genePT configs
"""
import os

# NOTE: this rebinds `genept_configs`, replacing the GenePT-w settings defined in the previous cell.
genept_configs = dict(
    # Directory holding the pre-processed data.
    preprocessed_data_directory='pbmc20k/pre_processed',
    # Directory that receives the genePT-s embedding file.
    genept_s_embedding_output_directory="pbmc20k",
    # Name of the OpenAI model to use.
    genept_s_openai_model_name='text-embedding-ada-002',
    # Name of the embedding file.
    embedding_output_filename="pbmc20k_s_cell_embeddings",
    # OpenAI API key, read from the OPENAI_API_KEY environment variable so the secret is never
    # saved in the notebook. Set that variable before starting the kernel; when it is unset the
    # value falls back to '' (the previous placeholder).
    openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
    # Cell attribute labels to keep; leave as an empty list when there are none.
    custom_cell_attr_names=[],
)
from embedding_extractors import EmbeddingExtractor

# Same call sequence as the other extractors, this time with the GenePT-s settings, which
# include an OpenAI model name and API key.
emb_extractor = EmbeddingExtractor(
    "genePT-s",
    output_file_type='h5ad',
    configs=genept_configs,
)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()