# Truncate Cell Embeddings Using Only the Top N Genes

In [2]:
import scanpy as sc

# Load the pre-processed 12k PBMC dataset and print a summary of its contents.
pbmc_12k_path = '../pbmc/pre_processed/pbmc_12k.h5ad'
adata_12k = sc.read_h5ad(pbmc_12k_path)
print(adata_12k)

AnnData object with n_obs × n_vars = 11990 × 3346
    obs: 'n_counts', 'batch', 'labels', 'str_labels', 'cell_type', 'batch_key', 'original_n_counts'
    var: 'key_0', 'gene_symbols', 'n_counts-0', 'n_counts-1', 'n_counts', 'ensembl_id', 'gene_symbol', 'entrez_id', 'refseq_id'
    uns: 'cell_types'
    obsm: 'design', 'normalized_qc', 'qc_pc', 'raw_qc'




In [3]:
# Load the pre-processed 59k PBMC dataset and print a summary of its contents.
pbmc_59k_path = '../pbmc/pre_processed/pbmc_59k.h5ad'
adata_59k = sc.read_h5ad(pbmc_59k_path)
print(adata_59k)

AnnData object with n_obs × n_vars = 59506 × 23948
    obs: 'cluster', 'n_features', 'mito_pct', 'Annotation', 'rank', 'donor_id', 'time_point', 'age', 'who_max', 'who_d0', 'who_d3', 'who_d7', 'who_d28', 'cardiacevent_72h', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'preexisting_heartdisease', 'preexisting_lungdisease', 'preexisting_kidneydisease', 'preexisting_diabetes', 'preexisting_hypertension', 'preexisting_immunocompromisedcondition', 'respiratory_symptoms', 'fever_symptoms', 'gastrointestinal_symptoms', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'batch_key', 'n_counts'
    var: 'key_0', 'author_feature_name', 'feature_is_filtered', 'feature



In [4]:
# Load the pre-processed 67k PBMC dataset and print a summary of its contents.
pbmc_67k_path = '../pbmc/pre_processed/pbmc_67k.h5ad'
adata_67k = sc.read_h5ad(pbmc_67k_path)
print(adata_67k)

AnnData object with n_obs × n_vars = 66985 × 36263
    obs: 'nCount_RNA', 'nFeature_RNA', 'nCount_HTO', 'nFeature_HTO', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification.global', 'sample', 'donor_id', 'CHIP', 'LANE', 'ProjectID', 'MUTATION', 'MUTATION.GROUP', 'sex_ontology_term_id', 'HTOID', 'percent.mt', 'nCount_SCT', 'nFeature_SCT', 'scType_celltype', 'pANN', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'is_primary_data', 'tissue_type', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'disease_ontology_term_id', 'Clone', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'batch_key', 'n_counts'
    var: 'key_0', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type', 'ensembl_id', 'gene_symbol', 'entrez_id', 'refseq_id



In [25]:
import scanpy as sc
import os


def truncate_top_genes(adata, n):
    """Keep only the ``n`` genes with the largest total expression.

    Genes (the columns of ``adata.X``) are ranked by their expression summed
    along axis 0, and the object is column-subset to the top ``n`` of them in
    descending order of that sum.

    Args:
        adata: AnnData-like object exposing ``X`` (dense array or scipy sparse
            matrix) and supporting ``adata[:, idx]`` column indexing.
        n: Number of top genes to keep. Values larger than the number of
            genes keep every gene; ``0`` keeps none.

    Returns:
        The result of ``adata[:, idx]``. For an AnnData object this is a view
        of the input, not an independent copy.

    Raises:
        ValueError: If ``n`` is negative.
    """
    import numpy as np  # local import: the surrounding cell only imports scanpy and os

    if n < 0:
        raise ValueError(f'n must be non-negative, got {n}')

    # Sparse sums come back as a 1 x n_genes np.matrix while dense sums are a
    # plain 1-D ndarray; np.asarray(...).ravel() flattens both, whereas `.A1`
    # only exists on np.matrix.
    gene_expression_sum = np.asarray(adata.X.sum(axis=0)).ravel()

    # Reverse first, then take the head: slicing with `[-n:]` would return
    # *every* gene when n == 0 because `[-0:]` is the whole array.
    top_genes_idx = gene_expression_sum.argsort()[::-1][:n]
    return adata[:, top_genes_idx]


# Write a truncated copy of every pre-processed dataset, keeping only the
# top `n_top_genes` genes (by total expression) of each file.
directory = '../pbmc/pre_processed'
# NOTE: the directory suffix must match `n_top_genes`; the embedding cells
# further down read from pre_processed_500 / pre_processed_1k / pre_processed_3k.
n_top_genes = 200
output_directory = '../pbmc_truncated_genes/pre_processed_200'
os.makedirs(output_directory, exist_ok=True)
# os.listdir returns entries in arbitrary order and includes any stray file;
# sort for a reproducible log and skip anything that is not an .h5ad dataset.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.h5ad'):
        continue
    print(f'Processing {filename}')
    file_path = os.path.join(directory, filename)
    adata = sc.read_h5ad(file_path)
    adata = truncate_top_genes(adata, n_top_genes)
    adata.write_h5ad(os.path.join(output_directory, filename))
    print(adata)

Processing pbmc_12k.h5ad
View of AnnData object with n_obs × n_vars = 11990 × 200
    obs: 'n_counts', 'batch', 'labels', 'str_labels', 'cell_type', 'batch_key', 'original_n_counts'
    var: 'key_0', 'gene_symbols', 'n_counts-0', 'n_counts-1', 'n_counts', 'ensembl_id', 'gene_symbol', 'entrez_id', 'refseq_id'
    uns: 'cell_types'
    obsm: 'design', 'normalized_qc', 'qc_pc', 'raw_qc'
Processing pbmc_59k.h5ad
View of AnnData object with n_obs × n_vars = 59506 × 200
    obs: 'cluster', 'n_features', 'mito_pct', 'Annotation', 'rank', 'donor_id', 'time_point', 'age', 'who_max', 'who_d0', 'who_d3', 'who_d7', 'who_d28', 'cardiacevent_72h', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'preexisting_heartdisease', 'preexisting_lungdisease', 'preexisting_kidneydisease', 'preexisting_diabetes', 'preexisting_hypertension', 'preexisting_immunocompromisedcondition', 'respiratory_symptoms', 'fever_symptoms', 'gastrointestina

In [29]:
# Generate cell embeddings with Geneformer from the top-500-gene datasets.
from embedding_extractors import EmbeddingExtractor

# Single source of truth for this run: the label is part of the input directory,
# the tokenized directory and the output file name, which must stay in sync.
truncation_label = '500'
truncated_root = '../pbmc_truncated_genes'

# Geneformer configs
geneformer_configs = dict(
    # Directory holding the preprocessed (truncated) .h5ad files to tokenize.
    preprocessed_data_directory=f'{truncated_root}/pre_processed_{truncation_label}',
    # The output tokenized file directory.
    tokenized_file_directory=f"{truncated_root}/tokenized_{truncation_label}",
    # The output tokenized filename prefix.
    tokenized_file_prefix='geneformer',
    # The output embedding file directory.
    embedding_output_directory=f"{truncated_root}/",
    # The output embedding file name (the extension comes from `output_file_type`).
    embedding_output_filename=f"geneformer_cell_embeddings_truncated_top_{truncation_label}_genes",
    # Directory of the Geneformer pre-trained model.
    load_model_dir="../embedding_extractors/models/geneformer/model/",
    # List of cell attribute labels to keep, e.g. `cell_type` and `batch_key`. If none, use empty list [].
    custom_cell_attr_names=['cell_type', 'batch_key', 'n_counts'],
)

emb_extractor = EmbeddingExtractor("Geneformer", output_file_type='h5ad', configs=geneformer_configs)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()

Tokenizing ..\pbmc_truncated_genes\pre_processed_500\pbmc_12k.h5ad
..\pbmc_truncated_genes\pre_processed_500\pbmc_12k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Tokenizing ..\pbmc_truncated_genes\pre_processed_500\pbmc_59k.h5ad
..\pbmc_truncated_genes\pre_processed_500\pbmc_59k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Tokenizing ..\pbmc_truncated_genes\pre_processed_500\pbmc_67k.h5ad
..\pbmc_truncated_genes\pre_processed_500\pbmc_67k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map: 100%|██████████| 138481/138481 [00:19<00:00, 7261.80 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 138481/138481 [00:00<00:00, 1798451.25 examples/s]


Tokenization completed for Geneformer.
Extracting Geneformer embeddings


100%|██████████| 13849/13849 [32:22<00:00,  7.13it/s] 


Output embedding in ../pbmc_truncated_genes/geneformer_cell_embeddings_truncated_top_500_genes.h5ad



In [30]:
# Generate cell embeddings with Geneformer from the top-1k-gene datasets.
from embedding_extractors import EmbeddingExtractor

# Single source of truth for this run: the label is part of the input directory,
# the tokenized directory and the output file name, which must stay in sync.
truncation_label = '1k'
truncated_root = '../pbmc_truncated_genes'

# Geneformer configs
geneformer_configs = dict(
    # Directory holding the preprocessed (truncated) .h5ad files to tokenize.
    preprocessed_data_directory=f'{truncated_root}/pre_processed_{truncation_label}',
    # The output tokenized file directory.
    tokenized_file_directory=f"{truncated_root}/tokenized_{truncation_label}",
    # The output tokenized filename prefix.
    tokenized_file_prefix='geneformer',
    # The output embedding file directory.
    embedding_output_directory=f"{truncated_root}/",
    # The output embedding file name (the extension comes from `output_file_type`).
    embedding_output_filename=f"geneformer_cell_embeddings_truncated_top_{truncation_label}_genes",
    # Directory of the Geneformer pre-trained model.
    load_model_dir="../embedding_extractors/models/geneformer/model/",
    # List of cell attribute labels to keep, e.g. `cell_type` and `batch_key`. If none, use empty list [].
    custom_cell_attr_names=['cell_type', 'batch_key', 'n_counts'],
)

emb_extractor = EmbeddingExtractor("Geneformer", output_file_type='h5ad', configs=geneformer_configs)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()

Tokenizing ..\pbmc_truncated_genes\pre_processed_1k\pbmc_12k.h5ad
..\pbmc_truncated_genes\pre_processed_1k\pbmc_12k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Tokenizing ..\pbmc_truncated_genes\pre_processed_1k\pbmc_59k.h5ad
..\pbmc_truncated_genes\pre_processed_1k\pbmc_59k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Tokenizing ..\pbmc_truncated_genes\pre_processed_1k\pbmc_67k.h5ad
..\pbmc_truncated_genes\pre_processed_1k\pbmc_67k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map: 100%|██████████| 138481/138481 [00:36<00:00, 3791.86 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 138481/138481 [00:00<00:00, 1064392.27 examples/s]


Tokenization completed for Geneformer.
Extracting Geneformer embeddings


100%|██████████| 13849/13849 [56:15<00:00,  4.10it/s] 


Output embedding in ../pbmc_truncated_genes/geneformer_cell_embeddings_truncated_top_1k_genes.h5ad



In [31]:
# Generate cell embeddings with Geneformer from the top-3k-gene datasets.
from embedding_extractors import EmbeddingExtractor

# Single source of truth for this run: the label is part of the input directory,
# the tokenized directory and the output file name, which must stay in sync.
truncation_label = '3k'
truncated_root = '../pbmc_truncated_genes'

# Geneformer configs
geneformer_configs = dict(
    # Directory holding the preprocessed (truncated) .h5ad files to tokenize.
    preprocessed_data_directory=f'{truncated_root}/pre_processed_{truncation_label}',
    # The output tokenized file directory.
    tokenized_file_directory=f"{truncated_root}/tokenized_{truncation_label}",
    # The output tokenized filename prefix.
    tokenized_file_prefix='geneformer',
    # The output embedding file directory.
    embedding_output_directory=f"{truncated_root}/",
    # The output embedding file name (the extension comes from `output_file_type`).
    embedding_output_filename=f"geneformer_cell_embeddings_truncated_top_{truncation_label}_genes",
    # Directory of the Geneformer pre-trained model.
    load_model_dir="../embedding_extractors/models/geneformer/model/",
    # List of cell attribute labels to keep, e.g. `cell_type` and `batch_key`. If none, use empty list [].
    custom_cell_attr_names=['cell_type', 'batch_key', 'n_counts'],
)

emb_extractor = EmbeddingExtractor("Geneformer", output_file_type='h5ad', configs=geneformer_configs)
emb_extractor.tokenize()
emb_extractor.extract_embeddings()

Tokenizing ..\pbmc_truncated_genes\pre_processed_3k\pbmc_12k.h5ad


100%|██████████| 24/24 [00:00<00:00, 33.00it/s]


..\pbmc_truncated_genes\pre_processed_3k\pbmc_12k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Tokenizing ..\pbmc_truncated_genes\pre_processed_3k\pbmc_59k.h5ad
..\pbmc_truncated_genes\pre_processed_3k\pbmc_59k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Tokenizing ..\pbmc_truncated_genes\pre_processed_3k\pbmc_67k.h5ad
..\pbmc_truncated_genes\pre_processed_3k\pbmc_67k.h5ad has no column attribute 'filter_pass'; tokenizing all cells.
Creating dataset.


Map: 100%|██████████| 138481/138481 [00:53<00:00, 2607.52 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 138481/138481 [00:00<00:00, 723904.65 examples/s]


Tokenization completed for Geneformer.
Extracting Geneformer embeddings


100%|██████████| 13849/13849 [51:18<00:00,  4.50it/s]  


Output embedding in ../pbmc_truncated_genes/geneformer_cell_embeddings_truncated_top_3k_genes.h5ad



In [32]:
import pandas as pd
import glob
from pathlib import Path

# Export every embedding .h5ad under ../pbmc_truncated_genes to a CSV written
# next to it, holding `batch_key`, `n_counts` and the Geneformer embedding columns.
directory = '../pbmc_truncated_genes/*.h5ad'
for file_path in glob.glob(directory):
    print(f'Processing {file_path}')
    cell_emb = sc.read_h5ad(file_path)
    print(cell_emb.obsm['X_Geneformer'].shape)
    # Each piece gets a fresh 0..n-1 index so the column-wise concat lines the
    # rows up by position rather than by the original obs index labels.
    column_blocks = [
        pd.DataFrame(cell_emb.obs[obs_key]).reset_index(drop=True)
        for obs_key in ('batch_key', 'n_counts')
    ]
    column_blocks.append(
        pd.DataFrame(cell_emb.obsm['X_Geneformer']).reset_index(drop=True)
    )
    output_df = pd.concat(column_blocks, axis=1)
    csv_path = Path(file_path).with_suffix('.csv')
    output_df.to_csv(csv_path)

Processing ../pbmc_truncated_genes\geneformer_cell_embeddings_truncated_top_1k_genes.h5ad
(138481, 896)
Processing ../pbmc_truncated_genes\geneformer_cell_embeddings_truncated_top_200_genes.h5ad
(138481, 896)
Processing ../pbmc_truncated_genes\geneformer_cell_embeddings_truncated_top_3k_genes.h5ad
(138481, 896)
Processing ../pbmc_truncated_genes\geneformer_cell_embeddings_truncated_top_500_genes.h5ad
(138481, 896)
