In [1]:
# Imports

import sys
from pathlib import Path
import pandas as pd 
import numpy as np
import scanpy as sc
import anndata as ad


import os

from datasets import load_from_disk

import matplotlib.pyplot as plt
from matplotlib import rcParams
import h5py
from tqdm import tqdm
import pickle
import torch

#change to path to your Geneformer directory
sys.path.append('/work/magroup/kaileyhu/Geneformer')
from geneformer import EmbExtractor
from geneformer import TranscriptomeTokenizer

print("imports done")

2025-10-25 14:09:43.004213: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-25 14:09:43.296753: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-25 14:09:43.418436: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-25 14:09:43.452427: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-25 14:09:43.667050: I tensorflow/core/platform/cpu_feature_guar

imports done


In [2]:
# Pick the compute device: the first CUDA GPU when torch reports one, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f"Found device {device}")

Found device cuda:0


In [3]:
# Load the two DepMap tables and index each one by its identifier column:
# metadata by 'ModelID', the expression table by the column pandas labels "Unnamed: 0".
metadata = pd.read_csv(
    "/work/magroup/kaileyhu/datasets/depmap/metadata.csv"
).set_index('ModelID')

omics_expr = pd.read_csv(
    "/work/magroup/kaileyhu/datasets/depmap/OmicsExpressionProteinCodingGenesTPMLogp1.csv"
).set_index("Unnamed: 0")

In [4]:
# Location of the pickled Geneformer mapping dictionary (the "gc95M" file, per its name).
# NOTE(review): a later cell looks up expression-matrix column labels as KEYS of this
# dictionary and stores the VALUES as 'ensembl_id' -- confirm that key/value orientation.

ensembl_path = "/work/magroup/kaileyhu/Geneformer/geneformer/ensembl_mapping_dict_gc95M.pkl"

def invert_dict(dict_obj):
    """Return a new dict mapping each value of ``dict_obj`` back to its key.

    When several keys share one value, the key that comes last in iteration
    order is the one retained. Values must be hashable.
    """
    inverted = {}
    for key, value in dict_obj.items():
        inverted[value] = key
    return inverted

# Read the pickled mapping, then derive the reverse lookup from it.
# NOTE: pickle.load can execute arbitrary code -- only open files from a trusted source.
with open(ensembl_path, "rb") as mapping_file:
    id_gene_dict = pickle.load(mapping_file)

gene_id_dict = invert_dict(id_gene_dict)

In [5]:
# Wrap the expression table in an AnnData object, then shorten every row and
# column label to the text that precedes its first space.
adata = ad.AnnData(omics_expr)
adata.obs_names = [str(row_label).partition(" ")[0] for row_label in omics_expr.index]
adata.var_names = [str(col_label).partition(" ")[0] for col_label in omics_expr.columns]

In [6]:
# First space-delimited token of every variable name in adata.
symbols = [name.split(" ")[0] for name in adata.var_names]

# genes: the tokens that are keys of id_gene_dict, in their original order.
genes = [symbol for symbol in symbols if symbol in id_gene_dict]

# lst: one entry per variable -- the mapped value, or None when the token has no mapping.
lst = [id_gene_dict[symbol] if symbol in id_gene_dict else None for symbol in symbols]

# filtered_results: lst with the None placeholders dropped. It lines up
# index-for-index with genes as long as no stored dictionary value is itself None.
filtered_results = [mapped for mapped in lst if mapped is not None]

res = []

In [7]:
# Keep only the variables that have a mapping and relabel them with the mapped IDs.
# adata[:, genes] is a view of adata; assigning to a view makes AnnData convert it
# to a standalone object implicitly (and it may warn about doing so). Taking an
# explicit .copy() up front keeps adata untouched and makes the intent clear.
adata2 = adata[:, genes].copy()
adata2.var_names = filtered_results
adata2.var['ensembl_id'] = filtered_results

# Per-observation total over all retained variables.
# NOTE(review): the input file name indicates log1p(TPM) values, so this is a sum of
# log-transformed values rather than raw counts -- confirm that is what downstream
# consumers of 'n_counts' expect.
adata2.obs['n_counts'] = adata2.X.sum(axis=1)

In [8]:
# Save the relabelled AnnData object as a gzip-compressed .h5ad file.
no_meta_h5ad_path = "/work/magroup/kaileyhu/datasets/depmap/processed/no_meta/omics_expr_no_metadata.h5ad"
adata2.write_h5ad(no_meta_h5ad_path, compression='gzip')

### Select highly variable genes (HVGs) (DON'T RUN UNLESS YOU WANT A NEW DATASET)

In [38]:
# Flag the 500 most variable genes; with inplace=True the flags are stored on
# adata2 (the next cell reads them from adata2.var["highly_variable"]).
sc.pp.highly_variable_genes(
    adata2,
    n_top_genes=500,
    inplace=True,
)

In [39]:
# Keep only the variables whose "highly_variable" flag is set.
hvg_mask = adata2.var["highly_variable"]
adata_hvg = adata2[:, hvg_mask]

In [40]:
# Save the highly-variable-gene subset as a gzip-compressed .h5ad file.
hvg_h5ad_path = "/work/magroup/kaileyhu/datasets/depmap/processed/no_meta/hvg/omics_expr_hvg_500_no_metadata.h5ad"
adata_hvg.write_h5ad(hvg_h5ad_path, compression='gzip')

In [9]:
# Build the Geneformer tokenizer: empty dict as first argument, nproc=16,
# special tokens disabled, model input size 2048.
# NOTE(review): the output recorded for this cell warns that <cls> and <eos> are in
# gene_token_dict while special_token=False, and the embedding cell further down loads
# the gf-12L-30M-i2048 checkpoint -- confirm the token dictionary in use matches that model.
tk = TranscriptomeTokenizer(
    {},
    nproc=16,
    special_token=False,  # true for 95M
    model_input_size=2048,
)

<cls> and <eos> are in gene_token_dict but special_token = False. Please note that for 95M model series, special_token should be True.


In [10]:
# Tokenize the .h5ad input and save the result under /work/magroup/kaileyhu/res/
# using the prefix given as the third argument.
# NOTE(review): this reads from processed/hvg/, whereas the HVG cell in this notebook
# writes its file to processed/no_meta/hvg/ -- confirm this is the intended input directory.
tk.tokenize_data(
    '/work/magroup/kaileyhu/datasets/depmap/processed/hvg/',
    "/work/magroup/kaileyhu/res/",
    "hvg_500_tokenized_2048_no_metadata",
    file_format="h5ad",
)

Tokenizing /work/magroup/kaileyhu/datasets/depmap/processed/hvg/omics_expr_hvg_500.h5ad
/work/magroup/kaileyhu/datasets/depmap/processed/hvg/omics_expr_hvg_500.h5ad has no column attribute 'filter_pass'; tokenizing all cells.


  for i in adata.var["ensembl_id_collapsed"][coding_miRNA_loc]
  coding_miRNA_ids = adata.var["ensembl_id_collapsed"][coding_miRNA_loc]


Creating dataset.


Map (num_proc=16):   0%|          | 0/1479 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1479 [00:00<?, ? examples/s]

### Extract gene embeddings (only this section really needs to be run)

In [3]:
# Configure the embedding extractor for gene-level embeddings from a pretrained model.
# NOTE(review): the progress bar recorded for the extraction cell shows 5 batches at this
# batch size (i.e. at most 1000 cells). If the tokenized dataset holds more cells than
# that, check whether EmbExtractor's cell cap (max_ncells) is subsampling it.
embex = EmbExtractor(
    model_type="Pretrained",
    emb_mode="gene",
    num_classes=0,
    emb_layer=-1,
    forward_batch_size=200,
    nproc=16,
)

In [4]:
# Run the extractor with the gf-12L-30M-i2048 checkpoint on the tokenized dataset;
# the last two arguments are the output directory and output prefix.
# Alternative inputs kept from the original cell:
#   dataset: "/work/magroup/kaileyhu/res/hvg_500_tokenized_2048_no_metadata.dataset"
#   prefix:  "hvg_500_tokenized_2048_GENES"
embs = embex.extract_embs(
    "/work/magroup/kaileyhu/Geneformer/gf-12L-30M-i2048/",
    "/work/magroup/kaileyhu/res/hvg_500_w_SL_tokenized_2048.dataset",
    "/work/magroup/kaileyhu/res/",
    "hvg_500_w_SL_tokenized_GENES",
)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


  0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
# Persist the embeddings table in HDF5 format under the key "table".
# `key` is passed by keyword: pandas has deprecated positional arguments to to_hdf
# other than the path, and they become keyword-only in pandas 3.0.
# NOTE(review): the target file name ends in ".csv" but the content written is HDF5.
# The path is left unchanged so existing readers keep working; consider renaming it
# to ".h5" (or switching to to_csv) together with whatever code reads this file.
embs.to_hdf("/work/magroup/kaileyhu/res/gene_embeddings/geneformer.csv", key="table")