### Anndata object for regression classifier from ISP gene knockouts
Things I'm adding to the original HVG-related adata:
- `.obsm["orig_embedding"]`: original embeddings for each cell
- `.uns["knockout_embedddings"]`: dictionary of genes to the embedding corresponding to their knockout
- `.uns["embedding_differences"]`: difference in embedding for original and per gene knockout

In [1]:
import sys
import os

sys.path.append('/work/magroup/kaileyhu/Geneformer')

import torch
import pandas as pd 
import numpy as np
import scanpy as sc
import anndata as ad
from tqdm import tqdm
import pickle

# regression / optimizer imports
import torch.nn as nn
import torch.optim as optim

from geneformer import EmbExtractor

from anndata.experimental.pytorch import AnnLoader

pd.options.mode.chained_assignment = None # suppress copy warnings for df

In [2]:
adata = sc.read_h5ad("/work/magroup/kaileyhu/datasets/depmap/processed/hvg/omics_expr_hvg_500.h5ad")

In [3]:
n_cells = len(adata.X)
n_genes = len(adata.X[0]) # = 500 HVGs
embedding_dim = 256

In [4]:
# make df match adata
def match_adata(df, adata, is_sub):
    for patient in df.index:
        if patient not in adata.obs_names:
            df.drop(patient, axis = 1)
            
    df = df[~df.index.duplicated(keep='first')]
    
    for patient in adata.obs_names:
        if patient not in df.index:
            if is_sub:  
                df.loc[patient] = 0
                
            else:
                df.loc[patient] = adata.obsm["orig_embedding"].loc[patient]# pd.Series(dtype='float64')
        
        if (is_sub):
            df.loc[patient] = df.loc[patient].fillna(0)
    
    df = df.reindex(adata.obs_names)
    return df

In [None]:
# obtain the original embeddings first

# initiate EmbExtractor
embex = EmbExtractor(model_type="Pretrained",
                     num_classes=0,
                     emb_mode="cell",
                     emb_layer=-1,
                     emb_label=["patient_id"],
                     forward_batch_size=100, # 200
                     max_ncells=n_cells,
                     nproc=16)

# extracts embedding from input data
embs = embex.extract_embs("/work/magroup/kaileyhu/Geneformer/geneformer-12L-30M/",
                          "/work/magroup/kaileyhu/res/hvg_500_tokenized.dataset",
                          "/work/magroup/kaileyhu/res/via_classifier/",
                          "orig_embedding")

In [None]:
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.__version__)         # Check the CUDA version PyTorch is using
# print(torch.cuda.get_device_name(0)) 

In [None]:
embs.to_csv("/work/magroup/kaileyhu/res/via_classifier/orig_embedding.csv")

In [5]:
orig_embedding = pd.read_csv("/work/magroup/kaileyhu/res/via_classifier/orig_embedding.csv")
orig_embedding.set_index("patient_id", inplace=True)
orig_embedding.drop(columns = ["Unnamed: 0"], inplace=True)
orig_embedding = match_adata(orig_embedding, adata, False)

print("Original embedding has NAN values?", orig_embedding.isna().any().any())

Original embedding has NAN values? False


In [6]:
adata.obsm["orig_embedding"] = orig_embedding

In [None]:
adata

### Add knockout files

In [9]:
ko_files = os.listdir("/work/magroup/kaileyhu/res/perturbed_embs/") # get all perturbed csv's
print("Total files:", len(ko_files))

Total files: 436


In [10]:
ko_embs = {}
for file in tqdm(ko_files):
    df_file = pd.read_csv("/work/magroup/kaileyhu/res/perturbed_embs/"+file)
    df_file.set_index("patient_id", inplace=True)
    df_file = match_adata(df_file, adata, False)
    ad_name = "gene_"+file.split('.')[0]

    if (df_file.isna().any().any()):
        print("Error: dataframe for", ad_name, "contains NAN values")
        break
        
    ko_embs[ad_name] = df_file
    
adata.uns["knockout_embeddings"] = ko_embs

100%|██████████| 436/436 [03:15<00:00,  2.22it/s]


In [7]:
diff_files = os.listdir("/work/magroup/kaileyhu/res/perturbed_embs/") # get all perturbed csv's
print("Total files:", len(diff_files))

Total files: 436


In [8]:
sub_embs = {}
via_dict = {}

for file in tqdm(diff_files):
    df_file = pd.read_csv("/work/magroup/kaileyhu/res/subtracted_embs/"+file)
    df_file.set_index("patient_id", inplace=True)

    via_scores = df_file['viability']
    via_scores = via_scores[~via_scores.index.duplicated(keep='first')]

    df_file = df_file.drop('viability', axis=1)

    original_idx = df_file.index
    
    df_file = match_adata(df_file, adata, True)
    ad_name = "gene_"+file.split('.')[0]

    
    for patient in adata.obs_names:
        if (patient in original_idx):
            if (not np.isnan(via_scores.loc[patient])):
                via_dict[(patient, ad_name)] = via_scores.loc[patient]
    
    if (df_file.isna().any().any()):
        print("Error: dataframe for", ad_name, "contains NAN values")
        print(df_file)
        break
        
    sub_embs[ad_name] = df_file
    
adata.uns["embedding_differences"] = sub_embs

100%|██████████| 436/436 [05:30<00:00,  1.32it/s]


In [9]:
adata.uns["viability_dict"] = via_dict

In [10]:
# remove viability column
for key in tqdm(adata.uns["embedding_differences"]):
    df = adata.uns["embedding_differences"][key]
    if ("viability" in df.columns):
        df.drop(columns = ["viability"], inplace=True)

for key in tqdm(adata.uns["knockout_embeddings"]):
    df = adata.uns["knockout_embeddings"][key]
    if ("viability" in df.columns):
        df.drop(columns = ["viability"], inplace=True)

100%|██████████| 436/436 [00:00<00:00, 34584.35it/s]


KeyError: 'knockout_embeddings'

In [15]:
adata

AnnData object with n_obs × n_vars = 1479 × 500
    obs: 'patient_id', 'cell_line', 'disease', 'disease_state', 'n_counts', 'cell_type', 'batch'
    var: 'ensembl_id', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'knockout_embeddings', 'embedding_differences', 'viability_dict'
    obsm: 'orig_embedding'

In [None]:
adata.write_h5ad("/work/magroup/kaileyhu/res/via_classifier/full_emb_obj.h5ad") #,compression='gzip'

### Convert to dataframe

In [11]:
emb_diffs = adata.uns['embedding_differences']
via_scores = adata.uns['viability_dict']

In [13]:
# pickle dump it
filehandler = open("/work/magroup/kaileyhu/res/via_classifier/emb_diffs.pkl","wb")
pickle.dump(emb_diffs, filehandler)
filehandler.close()

filehandler = open("/work/magroup/kaileyhu/res/via_classifier/viability_dict.pkl","wb")
pickle.dump(via_scores, filehandler)
filehandler.close()

In [None]:
diff_dict = {}

for gene in tqdm(emb_diffs):
    df_diff = emb_diffs[gene]
    for patient in df_diff.index:
        diff_dict[(patient, gene)] = df_diff.loc[patient].tolist()

 78%|███████▊  | 342/436 [00:20<00:05, 18.34it/s]

### Looking at properties of adata object (# of nan's)

In [None]:
adata = sc.read_h5ad("/work/magroup/kaileyhu/res/via_classifier/full_emb_obj.h5ad")

In [None]:
emb_diffs = adata.uns["embedding_differences"]
patients = adata.obs_names.to_list()

for g in bad_genes:
    emb_diffs.remove(g)

for key in tqdm(emb_diffs):
    if (len(patients) == 0):
        break
    df = emb_diffs[key]
    for patient in patients:
        row = df.loc[patient]
        is_nan = row.isna().any()
        if (is_nan):
            patients.remove(patient)
            if key not in bad_genes:
                bad_genes.append(key)

In [None]:
len(bad_genes)

### Transform data (start here!)

In [None]:
adata.uns['knockout_embeddings']

In [None]:
# reshape matrices

for key in tqdm(adata.uns["embedding_differences"]):
    df = adata.uns["embedding_differences"][key]
    flattened_embedding_differences = df.to_numpy().reshape(n_cells, n_genes * embedding_dim)
    adata.uns["embedding_differences"][key] = flattened_embedding_differences

In [None]:
adata.obsm["orig_embedding"]