The combination of `./ko_to_adata.ipynb` and `./dict_to_df.ipynb` into one file to analyze the outputs of the in silico perturber using the regular pre-trained model
- `.obsm["orig_embedding"]`: original embeddings for each cell
- `.uns["knockout_embeddings"]`: dictionary of genes to the embedding corresponding to their knockout
- `.uns["embedding_differences"]`: dictionary of genes to the difference between the original embedding and the embedding after that gene's knockout

In [None]:
import sys
import os

sys.path.append('/work/magroup/kaileyhu/Geneformer')

import torch
import pandas as pd 
import numpy as np
import scanpy as sc
import anndata as ad
from tqdm import tqdm
import pickle

# regression / optimizer imports
import torch.nn as nn
import torch.optim as optim

from geneformer import EmbExtractor

pd.options.mode.chained_assignment = None # suppress copy warnings for df

In [None]:
# Load the expression AnnData object; its obs_names define the set and order
# of cells/patients that every embedding table below is aligned to.
adata = sc.read_h5ad("/work/magroup/kaileyhu/datasets/depmap/processed/omics_expr.h5ad")

In [None]:
# Notebook display: show the AnnData summary.
adata

In [None]:
# Dataset dimensions. AnnData.shape is (n_obs, n_vars); reading it avoids
# len(adata.X) / len(adata.X[0]), which raise TypeError when X is stored as a
# scipy sparse matrix.
n_cells, n_genes = adata.shape
embedding_dim = 512 # pretrained has embedding 512
n_classes = 0  # forwarded to EmbExtractor as num_classes

In [None]:
# Notebook display: number of genes (second dimension of adata.X).
n_genes

In [None]:
# make df match adata
def match_adata(df, adata, is_sub):
    """Align an embedding table to the cells of ``adata``.

    The returned frame has exactly one row per entry of ``adata.obs_names``,
    in that order. The input ``df`` is not modified.

    Args:
        df: DataFrame indexed by patient id.
        adata: Object exposing ``obs_names`` and, when ``is_sub`` is false
            and rows have to be filled in, ``obsm["orig_embedding"]`` (a
            DataFrame indexed by patient id).
        is_sub: If true, ``df`` holds embedding differences: patients absent
            from ``df`` get all-zero rows and remaining NaN cells are set to
            0. If false, patients absent from ``df`` receive their row from
            ``adata.obsm["orig_embedding"]`` (matched on column labels).

    Returns:
        A new DataFrame indexed by ``adata.obs_names``.

    Raises:
        KeyError: If ``is_sub`` is false, some patient is absent from ``df``
            and ``adata.obsm["orig_embedding"]`` is missing or lacks that
            patient.
    """
    obs_names = pd.Index(adata.obs_names)

    # Report and remove rows for patients that adata does not contain. (The
    # previous ``df.drop(patient, axis = 1)`` targeted columns and discarded
    # its result, so it raised KeyError instead of dropping the row.)
    unknown = ~df.index.isin(obs_names)
    for patient in df.index[unknown]:
        print(f"Patient {patient} is missing")
    df = df.loc[~unknown]

    # Keep only the first row for any patient listed more than once.
    df = df[~df.index.duplicated(keep='first')]

    if is_sub:
        # Missing patients become NaN rows on reindex; a zero difference is
        # the fill value for both those rows and any pre-existing NaN cells.
        return df.reindex(obs_names).fillna(0)

    absent = obs_names[~obs_names.isin(df.index)].unique()
    if len(absent) > 0:
        # Only touch obsm when a fallback is actually needed, so fully
        # populated tables work before "orig_embedding" has been stored.
        fallback = adata.obsm["orig_embedding"].loc[absent]
        df = pd.concat([df, fallback.reindex(columns=df.columns)])

    return df.reindex(obs_names)

In [None]:
# Step 1: compute the unperturbed embedding of every cell.

# Extractor settings, gathered in one mapping before construction.
extractor_options = dict(
    model_type="Pretrained",
    num_classes=n_classes,
    emb_mode="cell",
    emb_layer=-1,
    emb_label=["patient_id"],
    forward_batch_size=100,
    max_ncells=n_cells,
    nproc=16,
)
embex = EmbExtractor(**extractor_options)

print("Embedding extractor initialized...")

# Positional arguments of extract_embs, in call order.
model_directory = "/work/magroup/kaileyhu/Geneformer/gf-12L-30M-i2048/"
input_data_file = "/work/magroup/kaileyhu/res/all_genes_tokenized_2048.dataset"
output_directory = "/work/magroup/kaileyhu/res/via_classifier/"
output_prefix = "orig_embedding_pretrained"

# Run the extraction; the result is kept in `embs` for the following cell.
embs = embex.extract_embs(
    model_directory,
    input_data_file,
    output_directory,
    output_prefix,
)

In [None]:
# Save the extracted embeddings as CSV; the next cell reloads this same file.
embs.to_csv("/work/magroup/kaileyhu/res/via_classifier/orig_embedding_2048.csv")

In [None]:
# Reload the saved embeddings, index them by patient, and discard the
# "Unnamed: 0" column (presumably the row index written out by to_csv).
orig_embedding = (
    pd.read_csv("/work/magroup/kaileyhu/res/via_classifier/orig_embedding_2048.csv")
    .set_index("patient_id")
    .drop(columns=["Unnamed: 0"])
)

# Put the rows in adata.obs_names order.
# NOTE: with is_sub=False, match_adata fills absent patients from
# adata.obsm["orig_embedding"], which is only assigned in a later cell, so
# this call relies on every cell in adata already having an embedding row.
orig_embedding = match_adata(orig_embedding, adata, False)

has_nan = orig_embedding.isna().any().any()
print("Original embedding has NAN values?", has_nan)

In [None]:
# Attach the aligned original embeddings to adata; match_adata reads this
# entry as the fallback for patients missing from a knockout table.
adata.obsm["orig_embedding"] = orig_embedding

### Add knockout and subtracted embedding files

In [None]:
# Every input and output of this notebook section lives under one results root.
results_root = "/work/magroup/kaileyhu/res/gf_12L_30M_i2048/"

# Input directories keep their trailing slash because file names are appended
# to them directly; output_dir has none because it is joined with "/" later.
ko_file_dir = results_root + "perturbed_embs/"
diff_file_dir = results_root + "subtracted_embs/"
output_dir = results_root + "pretrained"

In [None]:
# Collect the per-gene knockout CSVs. os.listdir returns every directory entry
# in arbitrary order, so keep only *.csv files (a stray folder or hidden file
# would otherwise be handed to pd.read_csv) and sort them so the processing
# order is reproducible between runs.
ko_files = sorted(name for name in os.listdir(ko_file_dir) if name.endswith(".csv"))
print("Total files:", len(ko_files))

In [None]:
# Build {"gene_<file stem>": aligned knockout embedding table}.
ko_embs = {}
for file in tqdm(ko_files):
    # The key is the part of the file name before the first '.'.
    ad_name = "gene_" + file.split('.')[0]

    # Index by patient, then align to adata; with is_sub=False, patients
    # absent from the file receive their original embedding row.
    df_file = match_adata(
        pd.read_csv(ko_file_dir + file).set_index("patient_id"),
        adata,
        False,
    )

    has_nan = df_file.isna().any().any()
    if has_nan:
        # Stop at the first bad table; tables collected so far are still stored below.
        print("Error: dataframe for", ad_name, "contains NAN values")
        break

    ko_embs[ad_name] = df_file

adata.uns["knockout_embeddings"] = ko_embs

In [None]:
# Collect the per-gene subtracted-embedding CSVs. os.listdir returns every
# directory entry in arbitrary order, so keep only *.csv files (a stray folder
# or hidden file would otherwise be handed to pd.read_csv) and sort them so
# the processing order is reproducible between runs.
diff_files = sorted(name for name in os.listdir(diff_file_dir) if name.endswith(".csv"))
print("Total files:", len(diff_files))

In [None]:
# sub_embs: {"gene_<file stem>": aligned embedding-difference table}
# via_dict: {(patient, "gene_<file stem>"): viability score}
sub_embs = {}
via_dict = {}

for file in tqdm(diff_files):
    ad_name = "gene_" + file.split('.')[0]
    df_file = pd.read_csv(diff_file_dir + file).set_index("patient_id")

    # One viability score per patient; the first occurrence wins on duplicates.
    via_scores = df_file['viability']
    via_scores = via_scores[~via_scores.index.duplicated(keep='first')]

    # Remember which patients the file really contained, because match_adata
    # (is_sub=True) adds zero rows for every patient that is absent.
    original_idx = df_file.index
    df_file = match_adata(df_file.drop('viability', axis=1), adata, True)

    # Record a score only for patients that were in the file and whose
    # viability value is not NaN.
    for patient in adata.obs_names:
        if patient not in original_idx:
            continue
        score = via_scores.loc[patient]
        if np.isnan(score):
            continue
        via_dict[(patient, ad_name)] = score

    has_nan = df_file.isna().any().any()
    if has_nan:
        # Stop at the first bad table; via_dict already holds this gene's scores.
        print("Error: dataframe for", ad_name, "contains NAN values")
        print(df_file)
        break

    sub_embs[ad_name] = df_file

adata.uns["embedding_differences"] = sub_embs

In [None]:
# Store the (patient, gene) -> viability score mapping alongside the embeddings.
adata.uns["viability_dict"] = via_dict

In [None]:
# Strip any leftover "viability" column from the stored tables.
# Both dictionaries get the same treatment, so they share one loop body;
# each dictionary still gets its own progress bar. The drop is done in place
# so the frames held in adata.uns are the ones modified.
for uns_key in ("embedding_differences", "knockout_embeddings"):
    for key in tqdm(adata.uns[uns_key]):
        df = adata.uns[uns_key][key]
        if "viability" not in df.columns:
            continue
        df.drop(columns=["viability"], inplace=True)

In [None]:
# NOTE(review): this cell only evaluates (and displays) the intended .h5ad
# path; nothing is written to disk here. Confirm whether a write was meant.
f"{output_dir}/full_emb_obj_pretrained.h5ad"

### Convert the adata object into a dataframe

In [None]:
# Pull the two dictionaries that get exported back out of adata.
sub_embs = adata.uns["embedding_differences"]
viability = adata.uns["viability_dict"]

# Pickle destinations inside the output directory.
emb_file = output_dir + "/emb_diffs_pretrained.pkl"
via_file = output_dir + "/viability_dict_pretrained.pkl"

In [None]:
# Pickle both dictionaries. The context managers guarantee that each file
# handle is flushed and closed even if pickle.dump raises part-way through.
with open(emb_file, "wb") as filehandler:
    pickle.dump(sub_embs, filehandler)

with open(via_file, "wb") as filehandler:
    pickle.dump(viability, filehandler)

### Set up final dataframe

In [None]:
# Flatten to {(patient, gene): [embedding difference values..., viability]}.
# Only pairs that have a recorded viability score are kept.
diff_dict = {}

for gene in tqdm(sub_embs):
    df_diff = sub_embs[gene]
    for patient in df_diff.index:
        pair = (patient, gene)
        if pair not in viability:
            continue
        # The viability score goes last, after the embedding values.
        diff_dict[pair] = [*df_diff.loc[patient].tolist(), viability[pair]]

In [None]:
# One row per (patient, gene) key of diff_dict; each list position becomes a column.
df = pd.DataFrame.from_dict(diff_dict, orient='index')

In [None]:
# Every row is [embedding values..., viability], so the viability score is
# always the last column. Renaming by position rather than by the hard-coded
# label 512 keeps this correct if the embedding width is ever not 512 (the
# old rename silently did nothing in that case). Skipped for an empty frame.
if len(df.columns) > 0:
    df.rename(columns={df.columns[-1]: "viability score"}, inplace=True)

In [None]:
# Notebook display: preview the final (patient, gene) table.
df

In [None]:
# Write the final table (embedding differences plus viability score) to CSV.
df.to_csv(f"{output_dir}/gene_patient_emb_mat_pretrained.csv")