# Extract Cell Embeddings with Different Foundation Models

In [1]:
# Run this cell first. It puts the parent of the current working directory (taken to be the
# project root) at the front of `sys.path` so that project packages can be imported.
# Note: this only affects imports; the working directory itself is not changed, so relative
# file paths are still resolved against the directory the notebook is run from.
import os
import sys

project_root = os.path.dirname(os.path.abspath(''))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

## Step 1: Setup

In [1]:
# Set up the environment: fetch the scGPT model, the Geneformer model and the genePT data,
# in that order, using the download helpers exported by `embedding_extractors`.
from embedding_extractors import (
    download_scgpt_model,
    download_geneformer_model,
    download_genept_data,
)

for download_assets in (download_scgpt_model, download_geneformer_model, download_genept_data):
    download_assets()

✅ File downloaded to: embedding_extractors/models/scGPT/model/
✅ File downloaded to: embedding_extractors/models/geneformer/model/gf-20L-95M-i4096
✅ File downloaded to: embedding_extractors/models/genePT/model/


## Step 2: Extract cell embeddings

#### a) Pre-process raw expression metrics

In [19]:
# Pre-process filtered cell expression metrics (no further processing is needed afterwards).

# Settings handed to the pre-processor. Example configs can be found in
# `embedding_extractors/config.py`.
preprocessor_configs = {
    # Directory holding the raw data.
    # Note: currently only AnnData files with gene names in `var` are accepted. Supported gene
    # naming systems: "gene_symbol", "ensembl_id", "entrez_id", "refseq_id".
    'raw_data_directory': '../pbmc_12k',
    # Directory the pre-processed data is written to.
    'preprocessed_data_directory': '../pbmc_masked_80/pre_processed',
    # Gene info table, which stores the mapping between the different gene naming systems.
    # Default is 'expanded_gene_info_table.csv'.
    'gene_info_table': '../embedding_extractors/data/expanded_gene_info_table.csv',
    # Column of `adata.var` that holds the gene ID; use `index` when the gene ID is the index.
    'gene_id_col_name': 'index',
    # Naming system of the gene ID: one of {"gene_symbol", "ensembl_id", "entrez_id", "refseq_id"}.
    'gene_id_type': 'ensembl_id',
    # Input file format. Currently only AnnData is supported.
    'file_format': 'h5ad',
    # Whether to keep a batch key. If true, the input file directory name is used as the batch
    # key and stored under `adata.obs.batch_key`.
    'keep_batch_key': True,
    # Subsample ratio of raw gene expressions, in the range (0, 1]. A value of 1 keeps all
    # gene expressions.
    'gene_expression_subsample_ratio': 0.2,
    # Cell attribute labels in `obs` to keep, as {name in original file: name in pre-processed
    # file}. Use an empty map {} to keep none.
    'custom_cell_attr_names': {'str_labels': 'cell_type'},
}

In [20]:
# Run the pre-processing step using the `preprocessor_configs` dictionary defined in the
# configuration cell above. The run reads from `raw_data_directory` and writes to
# `preprocessed_data_directory` as named in that dictionary.
from embedding_extractors import pre_processor

processor = pre_processor.PreProcessor()
processor.pre_process(preprocessor_configs)

Pre-processing ..\pbmc_12k\pbmc_12k.h5ad
Pre-process completed: ../pbmc_masked_80/pre_processed\pbmc_12k.h5ad. Shape:
AnnData object with n_obs × n_vars = 11990 × 3346
    obs: 'n_counts', 'batch', 'labels', 'str_labels', 'cell_type', 'batch_key', 'original_n_counts'
    var: 'key_0', 'gene_symbols', 'n_counts-0', 'n_counts-1', 'n_counts', 'ensembl_id', 'gene_symbol', 'entrez_id', 'refseq_id'
    uns: 'cell_types'
    obsm: 'design', 'normalized_qc', 'qc_pc', 'raw_qc'
Successfully pre-processed 1 out of 1 file(s).


#### b) Tokenize and extract cell embeddings

In [19]:
# Generate cell embeddings with Geneformer.
from embedding_extractors import EmbeddingExtractor

# Geneformer configs.
# All directories use the same `../` prefix as the pre-processing, scGPT and genePT-w cells, so
# that this cell reads the files the pre-processing step wrote to
# `../pbmc_masked_80/pre_processed` when the notebook is run top to bottom.
geneformer_configs = dict(
    # The directory that stores the preprocessed data.
    preprocessed_data_directory='../pbmc_masked_80/pre_processed',
    # The output tokenized file directory.
    tokenized_file_directory="../pbmc_masked_80/tokenized",
    # The output tokenized filename prefix.
    tokenized_file_prefix='geneformer',
    # The output embedding file directory.
    embedding_output_directory="../pbmc_masked_80",
    # The output embedding file name.
    embedding_output_filename="geneformer_cell_embeddings",
    # List of cell attribute labels to keep, i.e. `cell_type` and `batch_key`. If none, use empty list [].
    custom_cell_attr_names=['cell_type', 'batch_key'],
)

# Tokenize the pre-processed files, then extract the embeddings and write them as an h5ad file.
emb_extractor = EmbeddingExtractor("Geneformer", output_file_type='h5ad', configs=geneformer_configs)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()

Tokenizing pbmc\pre_processed\pbmc_12k.h5ad


100%|██████████| 24/24 [00:00<00:00, 51.50it/s]


pbmc\pre_processed\pbmc_12k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Tokenizing pbmc\pre_processed\pbmc_59k.h5ad


100%|██████████| 117/117 [00:22<00:00,  5.12it/s]


pbmc\pre_processed\pbmc_59k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Tokenizing pbmc\pre_processed\pbmc_67k.h5ad


100%|██████████| 131/131 [00:09<00:00, 14.36it/s]


pbmc\pre_processed\pbmc_67k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map: 100%|██████████| 138481/138481 [00:47<00:00, 2939.04 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 138481/138481 [00:00<00:00, 668459.81 examples/s]


Tokenization completed for Geneformer.
Extracting Geneformer embeddings


100%|██████████| 13849/13849 [1:15:04<00:00,  3.07it/s]


Output embedding in pbmc\geneformer_cell_embeddings.h5ad



In [11]:
# Generate cell embeddings with scGPT.
from embedding_extractors import EmbeddingExtractor

# scGPT configs.
scgpt_configs = {
    # Directory that stores the preprocessed data.
    'preprocessed_data_directory': '../pbmc_masked_80/pre_processed',
    # Directory the output embedding file is written to.
    'embedding_output_directory': '../pbmc_masked_80',
    # Name of the output embedding file.
    'embedding_output_filename': 'scgpt_cell_embeddings',
    # Directory of the scGPT pre-trained model.
    'load_model_dir': '../embedding_extractors/models/scGPT/model/',
    # Cell attribute labels to keep, i.e. `cell_type` and `batch_key`. Use an empty list [] for none.
    'custom_cell_attr_names': ['cell_type', 'batch_key'],
}

# Same two-step driver as the other models: tokenize, then extract embeddings to an h5ad file.
emb_extractor = EmbeddingExtractor("scGPT", output_file_type='h5ad', configs=scgpt_configs)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()

Extracting scGPT embeddings
Embedding ../pbmc_masked_80/pre_processed\pbmc_12k.h5ad


Embedding cells: 100%|██████████| 188/188 [00:23<00:00,  8.04it/s]


Embedding ../pbmc_masked_80/pre_processed\pbmc_59k.h5ad


Embedding cells: 100%|██████████| 930/930 [02:22<00:00,  6.55it/s]


Embedding ../pbmc_masked_80/pre_processed\pbmc_67k.h5ad


Embedding cells: 100%|██████████| 1047/1047 [02:39<00:00,  6.56it/s]


Extract embeddings from 3 files. Output embedding in ../pbmc_masked_80\scgpt_cell_embeddings.h5ad


In [12]:
# Generate cell embeddings with GenePT-w.
from embedding_extractors import EmbeddingExtractor

# genePT configs (genePT-w variant).
genept_configs = {
    # Directory that stores the preprocessed data.
    'preprocessed_data_directory': '../pbmc_masked_80/pre_processed',
    # Directory the genePT-w output embedding file is written to.
    'genept_w_embedding_output_directory': '../pbmc_masked_80',
    # Directory of the genePT pre-trained gene embedding file.
    'load_model_dir': '../embedding_extractors/models/genePT/model/',
    # Name of the output embedding file.
    'embedding_output_filename': 'genept_w_cell_embeddings',
    # Cell attribute labels to keep, i.e. `cell_type` and `batch_key`. Use an empty list [] for none.
    'custom_cell_attr_names': ['cell_type', 'batch_key'],
}

# Same two-step driver as the other models: tokenize, then extract embeddings to an h5ad file.
emb_extractor = EmbeddingExtractor("genePT-w", output_file_type='h5ad', configs=genept_configs)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()

Extracting genePT-W embeddings
Embedding ../pbmc_masked_80/pre_processed\pbmc_12k.h5ad
Unable to match 24 out of 3346 genes in ../pbmc_masked_80/pre_processed\pbmc_12k.h5ad
Embedding ../pbmc_masked_80/pre_processed\pbmc_59k.h5ad
Unable to match 6313 out of 23948 genes in ../pbmc_masked_80/pre_processed\pbmc_59k.h5ad
Embedding ../pbmc_masked_80/pre_processed\pbmc_67k.h5ad
Unable to match 15196 out of 36263 genes in ../pbmc_masked_80/pre_processed\pbmc_67k.h5ad
Extract embeddings from 3 files. Output embedding in ../pbmc_masked_80\genept_w_cell_embeddings.h5ad



In [None]:
# Generate cell embeddings with GenePT-s.
import os

from embedding_extractors import EmbeddingExtractor

# genePT configs (genePT-s variant).
# Note: this rebinds `genept_configs`, the same name the genePT-w cell uses.
# Directories use the same `../` prefix as the pre-processing, scGPT and genePT-w cells, so that
# this cell reads the files the pre-processing step wrote to `../pbmc_masked_80/pre_processed`.
genept_configs = dict(
    # The directory that stores the preprocessed data.
    preprocessed_data_directory='../pbmc_masked_80/pre_processed',
    # The output embedding file directory for genePT-s.
    genept_s_embedding_output_directory="../pbmc_masked_80",
    # The OpenAI model name to use.
    genept_s_openai_model_name='text-embedding-ada-002',
    # The output embedding file name.
    embedding_output_filename="genept_s_cell_embeddings",
    # OpenAI API key, read from the `OPENAI_API_KEY` environment variable so that the secret is
    # never typed into (and saved with) the notebook. Falls back to '' when the variable is unset.
    openai_api_key=os.environ.get('OPENAI_API_KEY', ''),
    openai_api_max_threads=10,
    # List of cell attribute labels to keep, i.e. `cell_type` and `batch_key`. If none, use empty list [].
    custom_cell_attr_names=['cell_type', 'batch_key'],
)

# Same two-step driver as the other models: tokenize, then extract embeddings to an h5ad file.
emb_extractor = EmbeddingExtractor("genePT-s", output_file_type='h5ad', configs=genept_configs)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


Extracting genePT-s embeddings
Embedding pbmc/pre_processed\pbmc_12k.h5ad
Processed 0 out of 11990 cells...
Processed 1000 out of 11990 cells...
Processed 2000 out of 11990 cells...
Processed 3000 out of 11990 cells...
Processed 4000 out of 11990 cells...
Processed 5000 out of 11990 cells...
Processed 6000 out of 11990 cells...
Processed 7000 out of 11990 cells...
Processed 8000 out of 11990 cells...
Processed 9000 out of 11990 cells...
Processed 10000 out of 11990 cells...
Processed 11000 out of 11990 cells...
Failed to fetch embeddings from OpenAi. Attempt 1: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Failed to fetch embeddings from OpenAi. Attempt 1: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Failed to fetch embeddings from OpenAi. Attempt 1: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)
Embedd