This notebook combines `./ko_to_adata.ipynb` and `./dict_to_df.ipynb` into a single file that analyzes the outputs of the in silico perturber run with the CLCancer pre-trained model. It populates the following fields of the AnnData object:
- `.obsm["orig_embedding"]`: original embeddings for each cell
- `.uns["knockout_embeddings"]`: dictionary of genes to the embedding corresponding to their knockout
- `.uns["embedding_differences"]`: difference in embedding for original and per gene knockout

In [1]:
import sys
import os

sys.path.append('/work/magroup/kaileyhu/Geneformer')

import torch
import pandas as pd 
import numpy as np
import scanpy as sc
import anndata as ad
from tqdm import tqdm
import pickle

# regression / optimizer imports
import torch.nn as nn
import torch.optim as optim

from geneformer import EmbExtractor

from anndata.experimental.pytorch import AnnLoader

pd.options.mode.chained_assignment = None # suppress copy warnings for df

2024-11-02 11:44:20.742561: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-02 11:44:20.867008: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-02 11:44:20.888892: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-02 11:44:21.121294: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the expression AnnData object; its obs_names define the patient order
# that every embedding table below is aligned to.
adata = sc.read_h5ad("/work/magroup/kaileyhu/datasets/depmap/processed/hvg/omics_expr_hvg_500.h5ad")

In [5]:
# Dataset dimensions, taken from the AnnData shape rather than len(adata.X):
# len() raises TypeError when X is stored as a scipy sparse matrix, whereas
# n_obs / n_vars work for both dense and sparse storage.
n_cells = adata.n_obs
n_genes = adata.n_vars # = 500 HVGs
embedding_dim = 512 # CLCancer has embedding 512
n_classes = 71 # passed to EmbExtractor as num_classes

In [4]:
# make df match adata
def match_adata(df, adata, is_sub):
    """Align an embedding table to the patients (rows) of ``adata``.

    The returned frame has exactly one row per entry of ``adata.obs_names``,
    in that order. Rows of ``df`` whose label is not in ``adata.obs_names``
    are discarded, and duplicated labels keep only their first occurrence.
    The caller's ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table indexed by patient id.
    adata : object exposing ``obs_names`` and ``obsm``
        Reference object. ``adata.obsm["orig_embedding"]`` is only read when
        ``is_sub`` is False and at least one patient is missing from ``df``.
    is_sub : bool
        True for "subtracted" (difference) tables: patients missing from
        ``df`` and any NaN cells are filled with 0.
        False for embedding tables: patients missing from ``df`` receive
        their row from ``adata.obsm["orig_embedding"]`` (matched on column
        labels); NaN cells already present in ``df`` are left untouched.

    Returns
    -------
    pandas.DataFrame
        ``df`` re-indexed to ``adata.obs_names``.

    Raises
    ------
    KeyError
        If ``is_sub`` is False, patients are missing from ``df`` and
        ``adata.obsm`` has no ``"orig_embedding"`` entry (or that entry lacks
        one of the missing patients).
    """
    obs_names = adata.obs_names

    # reindex() refuses duplicate labels, so keep the first row per patient.
    df = df[~df.index.duplicated(keep='first')]

    # Patients present in adata but absent from df (order of obs_names kept).
    missing = obs_names[~obs_names.isin(df.index)]

    # A single reindex both drops patients unknown to adata and adds the
    # missing ones as all-NaN rows, in adata's order.
    df = df.reindex(obs_names)

    if is_sub:
        # Difference tables: no data means "no change", i.e. 0.
        df = df.fillna(0)
    elif len(missing) > 0:
        # Embedding tables: fall back to the unperturbed embedding, aligned
        # on column labels exactly like a row-wise Series assignment would.
        fallback = adata.obsm["orig_embedding"]
        df.loc[missing] = (
            fallback.loc[missing].reindex(columns=df.columns).to_numpy()
        )

    return df

In [11]:
# Step 1: compute the original (unperturbed) embedding of every cell.

# Relative location of the Geneformer checkout that holds the checkpoint.
path_to_Geneformer = "../../../../Geneformer"

# Build the extractor. Embeddings are labelled by "patient_id" so they can be
# joined back onto adata later.
# NOTE(review): emb_layer=-1 follows Geneformer's own layer convention
# (believed to be the 2nd-to-last layer) — confirm against the installed version.
embex = EmbExtractor(
    model_type="CellClassifier",
    num_classes=n_classes,
    emb_mode="cls",
    emb_layer=-1,
    emb_label=["patient_id"],
    forward_batch_size=100,
    max_ncells=n_cells,
    nproc=16,
)

print("Embedding extractor initialized...")

# Run the extraction on the tokenized dataset; positional arguments are kept
# in the original order.
embs = embex.extract_embs(
    f"{path_to_Geneformer}/gf-12L-95M-i4096_CLcancer",
    "/work/magroup/kaileyhu/res/hvg_500_tokenized.dataset",
    "/work/magroup/kaileyhu/res/via_classifier/",
    "orig_embedding_CLcancer",
)

Embedding extractor initialized...


Loading cached sorted indices for dataset at /tmp/tmpotjdmp2l/work/magroup/kaileyhu/res/hvg_500_tokenized.dataset/cache-dc0a747f6c6857af.arrow
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../../../../Geneformer/gf-12L-95M-i4096_CLcancer and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/15 [00:00<?, ?it/s]

In [14]:
# Persist the extracted embeddings. The frame's index is written as well,
# which is why an "Unnamed: 0" column appears when the file is read back.
embs.to_csv("/work/magroup/kaileyhu/res/via_classifier/orig_embedding_CLcancer.csv")

In [15]:
# Reload the saved embeddings, index them by patient and discard the
# "Unnamed: 0" column that holds the index written by to_csv.
orig_embedding = (
    pd.read_csv("/work/magroup/kaileyhu/res/via_classifier/orig_embedding_CLcancer.csv")
    .set_index("patient_id")
    .drop(columns=["Unnamed: 0"])
)

# Bring the rows into the same order / membership as adata.obs_names.
orig_embedding = match_adata(orig_embedding, adata, False)

# Sanity check: report whether any cell of the aligned table is missing.
print(
    "Original embedding has NAN values?",
    orig_embedding.isna().any().any(),
)

Original embedding has NAN values? False


In [16]:
# Attach the aligned original embeddings to adata; match_adata reads this
# entry back as the fallback for patients missing from a knockout table.
adata.obsm["orig_embedding"] = orig_embedding

### Add knockout and subtracted embedding files

In [25]:
# Input / output locations.
# The two input directories end with "/" because file names are appended to
# them by plain string concatenation; output_dir is joined with an explicit "/".
ko_file_dir = "/work/magroup/kaileyhu/res/cancer/perturbed_embs/"
diff_file_dir = "/work/magroup/kaileyhu/res/cancer/subtracted_embs/"
output_dir = "/work/magroup/kaileyhu/res/via_classifier/CLcancer"

In [18]:
# Get all perturbed (knockout) embedding files.
# os.listdir returns entries in arbitrary order and also lists sub-directories
# and hidden entries (e.g. ".ipynb_checkpoints"), which pd.read_csv cannot
# open. Keep only regular, non-hidden files and sort them so the resulting
# dictionaries are built in a reproducible order.
ko_files = sorted(
    name
    for name in os.listdir(ko_file_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(ko_file_dir, name))
)
print("Total files:", len(ko_files))

Total files: 443


In [20]:
# Load every knockout embedding table, aligned to adata's patients, keyed by
# "gene_" + the file name up to its first dot.
ko_embs = {}

for file in tqdm(ko_files):
    ad_name = "gene_" + file.split('.')[0]

    # Patients missing from the file fall back to their original embedding.
    df_file = match_adata(
        pd.read_csv(ko_file_dir + file).set_index("patient_id"),
        adata,
        False,
    )

    # Stop at the first table that still contains missing values.
    if df_file.isna().any().any():
        print("Error: dataframe for", ad_name, "contains NAN values")
        break

    ko_embs[ad_name] = df_file

adata.uns["knockout_embeddings"] = ko_embs

100%|██████████| 443/443 [01:31<00:00,  4.86it/s]


In [22]:
# Get all subtracted (embedding-difference) files.
# os.listdir returns entries in arbitrary order and also lists sub-directories
# and hidden entries (e.g. ".ipynb_checkpoints"), which pd.read_csv cannot
# open. Keep only regular, non-hidden files and sort them so the resulting
# dictionaries are built in a reproducible order.
diff_files = sorted(
    name
    for name in os.listdir(diff_file_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(diff_file_dir, name))
)
print("Total files:", len(diff_files))

Total files: 443


In [26]:
# Load every embedding-difference table and collect the viability scores.
#   sub_embs : "gene_<name>" -> difference table aligned to adata.obs_names
#   via_dict : (patient, "gene_<name>") -> viability score (non-NaN only)
sub_embs = {}
via_dict = {}

for file in tqdm(diff_files):
    ad_name = "gene_" + file.split('.')[0]
    df_file = pd.read_csv(diff_file_dir + file).set_index("patient_id")

    # One viability score per patient; the first occurrence wins.
    via_scores = df_file['viability']
    via_scores = via_scores[~via_scores.index.duplicated(keep='first')]

    # Remember which patients the file really contained before alignment
    # adds zero-filled rows for the missing ones.
    original_idx = df_file.index
    df_file = match_adata(df_file.drop(columns='viability'), adata, True)

    # Record scores only for patients that are in adata AND in the file.
    for patient in adata.obs_names:
        if patient not in original_idx:
            continue
        score = via_scores.loc[patient]
        if not np.isnan(score):
            via_dict[(patient, ad_name)] = score

    # Stop at the first table that still contains missing values.
    if df_file.isna().any().any():
        print("Error: dataframe for", ad_name, "contains NAN values")
        print(df_file)
        break

    sub_embs[ad_name] = df_file

adata.uns["embedding_differences"] = sub_embs

100%|██████████| 443/443 [02:47<00:00,  2.65it/s]


In [27]:
# Store the (patient, gene) -> viability score mapping alongside the embeddings.
adata.uns["viability_dict"] = via_dict

In [28]:
# Strip any leftover "viability" column from the stored tables so they hold
# embedding dimensions only. The tables are modified in place.
for uns_key in ("embedding_differences", "knockout_embeddings"):
    tables = adata.uns[uns_key]
    for key in tqdm(tables):
        df = tables[key]
        if "viability" in df.columns:
            df.drop(columns=["viability"], inplace=True)

100%|██████████| 443/443 [00:00<00:00, 39273.67it/s]
100%|██████████| 443/443 [00:00<00:00, 38553.31it/s]


In [30]:
# Preview the path the full AnnData object would be written to.
f"{output_dir}/full_emb_obj_CLcancer.h5ad"

'/work/magroup/kaileyhu/res/via_classifier/CLcancer/full_emb_obj_CLcancer.h5ad'

In [None]:
# The full object is not needed downstream, so it is NOT written by default.
# Previously this cell wrote the file unconditionally, so "Run All" saved it
# despite the note saying otherwise. Flip the flag to persist it.
# NOTE(review): adata.uns["viability_dict"] uses tuple keys, which the h5ad
# writer may reject — confirm before enabling.
SAVE_FULL_OBJECT = False

if SAVE_FULL_OBJECT:
    adata.write_h5ad(f"{output_dir}/full_emb_obj_CLcancer.h5ad", compression='gzip')

### Convert the adata object into a dataframe

In [36]:
# Pull the two structures needed for the final table back out of adata.
sub_embs = adata.uns['embedding_differences']
viability = adata.uns['viability_dict']

# Pickle destinations for the two structures above.
emb_file = f"{output_dir}/emb_diffs_CLcancer.pkl"
via_file = f"{output_dir}/viability_dict_CLcancer.pkl"

In [34]:
# Pickle the embedding differences and the viability dictionary.
# Context managers close each file handle even if pickle.dump raises,
# which the previous open()/close() pairs did not guarantee.
with open(emb_file, "wb") as emb_handle:
    pickle.dump(sub_embs, emb_handle)

with open(via_file, "wb") as via_handle:
    pickle.dump(viability, via_handle)

### Set up final dataframe

In [37]:
# Flatten to one record per (patient, gene) pair that has a viability score:
# the embedding-difference row followed by the score as the last element.
diff_dict = {}

for gene, df_diff in tqdm(sub_embs.items()):
    for patient in df_diff.index:
        pair = (patient, gene)
        if pair not in viability:
            continue
        diff_dict[pair] = df_diff.loc[patient].tolist() + [viability[pair]]

100%|██████████| 443/443 [00:13<00:00, 32.60it/s]


In [38]:
# One row per (patient, gene) key of diff_dict; columns are positional
# (0, 1, ...) with the appended viability score in the last one.
df = pd.DataFrame.from_dict(diff_dict, orient='index')

In [40]:
# The viability score is always the last element appended to each record, so
# rename the last column instead of hard-coding position 512: a hard-coded
# label silently does nothing if the embedding width ever differs.
# Re-running this cell is harmless (the column is renamed to itself).
if len(df.columns) > 0:
    df = df.rename(columns={df.columns[-1]: "viability score"})

In [None]:
# Display the assembled (patient, gene) table for a visual check.
df

In [42]:
# Write the final table; the (patient, gene) index is written along with the data.
df.to_csv(f"{output_dir}/gene_patient_emb_mat_CLcancer.csv")