In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import sys

import numpy as np
import pandas as pd

sys.path.append("..")
from src.Spectra.Spectra_Pert import vectorize_perts
from utils import (
    filter_noisy_genes,
    generate_k_fold,
    inhouse_preprocess,
    read_aws_h5ad,
)

### Get train/test splits consistent with other models

In [None]:
# Use the AnnData generated by ..data_processing/inhouse_prior_graph_preprocessing.ipynb
unfiltered_adata = read_aws_h5ad("path to preprocessed h5ad here")
# Drop noisy genes first, then apply the shared in-house preprocessing.
adata = inhouse_preprocess(filter_noisy_genes(unfiltered_adata))
# Stash a copy of X under the "logcounts" layer before X is densified.
adata.layers["logcounts"] = adata.X.copy()
adata.X = adata.X.todense()
# Dense copy of the gene network stored in adata.uns by the preprocessing notebook.
gene_network = adata.uns["sparse_gene_network"].todense()

In [4]:
# Powered perturbations: keep only (condition, Treatment) groups with >= 50 cells.
for col in ("condition", "Treatment"):
    adata.obs[col] = adata.obs[col].astype(str)
# Combined key, e.g. "<condition>+<Treatment>".
adata.obs["pert_treat"] = adata.obs["condition"] + "+" + adata.obs["Treatment"]
group_sizes = adata.obs["pert_treat"].value_counts()
powered_groups = group_sizes.index[group_sizes >= 50]
adata = adata[adata.obs["pert_treat"].isin(powered_groups)]

In [5]:
# Fold 0 of the k-fold scheme from utils.generate_k_fold. The "condition" column is
# passed as the third argument (presumably the labels the folds are built on —
# confirm against utils.generate_k_fold).
fold_labels = adata.obs["condition"]
train_idx, val_idx, test_idx = generate_k_fold(adata, adata.X, fold_labels, fold_idx=0)

### Process GSFA-specific input

In [None]:
# Use the in-house dataset from s3://pert-spectra.
# NOTE: this rebinds `adata`; the object used to compute the k-fold indices above
# is no longer reachable under that name.
adata = inhouse_preprocess(
    read_aws_h5ad("s3://pert-spectra/rnaseq565.filtered.actionet.guide_corrected.h5ad")
)

In [7]:
# Filter adata to (perturbation, treatment) groups with at least 50 samples.
adata.obs["condition"] = adata.obs["condition"].astype(str)
adata.obs["Treatment"] = adata.obs["Treatment"].astype(str)
adata.obs["pert_treat"] = adata.obs["condition"] + "+" + adata.obs["Treatment"]
# Size of each group, broadcast back to one value per cell.
cells_per_group = adata.obs["pert_treat"].value_counts()
is_powered = adata.obs["pert_treat"].map(cells_per_group) >= 50
adata = adata[is_powered]

In [None]:
# create binary perturbation matrix
D, pert_labels = vectorize_perts(adata, "condition", ["ctrl", "nan"])

# For every perturbation label, look up the position of its target gene in
# adata.var_names; the gene name is the second "_"-separated token of the label.
# Labels whose gene is not in var_names get the sentinel -1.
pert_genes = [label.split("_")[1] for label in pert_labels]
pert_idx = np.array(
    [
        adata.var_names.get_loc(gene) if gene in adata.var_names else -1
        for gene in pert_genes
    ]
)

# add ctrl one-hot-encoding as one extra trailing column of D
ctrl_vector = np.array([1.0 if i == "ctrl" else 0.0 for i in adata.obs["condition"]])
D = np.concatenate([D, ctrl_vector.reshape(len(ctrl_vector), 1)], axis=1).astype(
    np.float32
)
# Exactly one column ("ctrl") was appended above, so append exactly one -1
# sentinel. Appending two left pert_idx one entry longer than pert_labels.
pert_idx = np.append(pert_idx, [-1])
pert_labels = pert_labels + ["ctrl"]
assert len(pert_idx) == len(pert_labels), "pert_idx and pert_labels must stay aligned"
print(D.shape)

In [9]:
# Subset to the k-fold training rows, then to TNFA+ treated cells.
# NOTE(review): train_idx was produced by generate_k_fold on the previously loaded
# `adata`; applying it here assumes the reloaded, re-filtered `adata` has the same
# cells in the same order — confirm.
adata_fold = adata[train_idx]
is_tnfa = (adata_fold.obs["Treatment"] == "TNFA+").to_numpy()
D_train = D[train_idx][is_tnfa]
adata_train = adata_fold[is_tnfa]

In [13]:
# Subset further so GSFA can run without OOM issues.
from sklearn.model_selection import train_test_split

# train_test_split returns [X_train, X_test, y_train, y_test]; only the two
# "train" parts (80% of the rows, stratified on the rows of D_train) are kept,
# hence the [::2] slice.
Y, G = train_test_split(
    adata_train.layers["counts"],
    D_train,
    test_size=0.2,
    random_state=42,
    stratify=D_train,
)[::2]

In [14]:
# Save inputs for GSFA: array1 holds the densified counts (Y), array2 the
# perturbation matrix (G).
np.savez(
    "rna565_GSFA_inputs.npz",
    array1=Y.toarray(),
    array2=G,
)

In [12]:
# Save the perturbation labels (column names of G) for downstream analysis.
# A positional array is stored by np.savez under the key "arr_0"; spell it out.
np.savez("rna565_G_labels.npz", arr_0=pert_labels)