In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
import sys

import numpy as np
import pandas as pd

sys.path.append("..")
from src.Spectra.Spectra_Pert import vectorize_perts
from utils import (
    filter_noisy_genes,
    generate_k_fold,
    read_aws_h5ad,
)

### Get train/test split consistent with other models

In [None]:
# Use the AnnData generated by ..data_processing/norman_prior_graph_preprocessing.ipynb
# (replace the placeholder string with the actual path to the preprocessed h5ad).
unfiltered_adata = read_aws_h5ad("path to preprocessed h5ad here")
# filter_noisy_genes is a project helper from utils; presumably it drops genes
# rather than cells — TODO confirm, since the split below relies on the cell set.
adata = filter_noisy_genes(unfiltered_adata)
# Keep a copy of X in a layer before X is replaced by its dense form below.
adata.layers["logcounts"] = adata.X.copy()
# Assumes X is a scipy sparse matrix (it must expose .todense()).
adata.X = adata.X.todense()
# Dense version of the gene network stored in .uns by the preprocessing step.
gene_network = adata.uns["sparse_gene_network"].todense()

In [4]:
# Restrict the data to "powered" perturbations, i.e. perturbations that are
# observed in at least 50 cells.
obs_df = adata.obs["perturbation_name"].to_frame()
# Number of cells per perturbation name.
category_counts = obs_df.loc[:, "perturbation_name"].value_counts()
# Names of the perturbations that reach the 50-cell threshold.
filtered_categories = category_counts.loc[category_counts >= 50].index
# Keep only the cells whose perturbation passed the threshold.
adata = adata[adata.obs["perturbation_name"].isin(filtered_categories)]

In [5]:
# Retrieve the same data splits as the other models for consistency.
# generate_k_fold is a project helper from utils; only fold 0 is requested here.
# It is given the filtered AnnData, its expression matrix and the per-cell
# perturbation names, and returns train / validation / test indices.
train_idx, val_idx, test_idx = generate_k_fold(
    adata,
    adata.X,
    adata.obs["perturbation_name"],
    fold_idx=0,
    perturbation_key="perturbation_name",
)

### Process GSFA-specific inputs

In [6]:
# Use the Norman dataset from https://github.com/theislab/sc-pert
# (replace the placeholder string with the actual path to the h5ad).
# NOTE: this rebinds `adata`; the preprocessed object used to compute
# train_idx / val_idx / test_idx is no longer reachable under this name.
adata = read_aws_h5ad("path to h5ad here")

In [7]:
# Restrict the data to "powered" perturbations, i.e. perturbations that are
# observed in at least 50 cells.
obs_df = adata.obs["perturbation_name"].to_frame()
# Number of cells per perturbation name.
category_counts = obs_df.loc[:, "perturbation_name"].value_counts()
# Names of the perturbations that reach the 50-cell threshold.
filtered_categories = category_counts.loc[category_counts >= 50].index
# Keep only the cells whose perturbation passed the threshold.
adata = adata[adata.obs["perturbation_name"].isin(filtered_categories)]

In [None]:
# Create the binary perturbation (design) matrix.
# vectorize_perts is a project helper; "control" and "nan" are passed as labels
# to leave out. D is assumed to have one column per entry of pert_labels and
# one row per cell of adata — TODO confirm against src.Spectra.Spectra_Pert.
D, pert_labels = vectorize_perts(adata, "perturbation_name", ["control", "nan"])

# Map each perturbation label to the index of its target gene in adata.var_names.
# The gene name is taken as the second "_"-separated token of the label; labels
# whose gene is not among the measured genes get the placeholder -1.
target_genes = [label.split("_")[1] for label in pert_labels]
pert_idx = np.array(
    [
        adata.var_names.get_loc(gene) if gene in adata.var_names else -1
        for gene in target_genes
    ]
)

# Add a one-hot column that marks control cells.
ctrl_vector = (adata.obs["perturbation_name"] == "control").to_numpy(dtype=np.float64)
D = np.concatenate([D, ctrl_vector[:, None]], axis=1).astype(np.float32)

# Exactly one column ("ctrl") was appended to D, so append exactly one
# placeholder index and one label; appending two indices would leave pert_idx
# one element longer than pert_labels and the columns of D.
pert_idx = np.append(pert_idx, [-1])
pert_labels = pert_labels + ["ctrl"]
print(D.shape)

In [10]:
# Subset to the training cells of the k-fold split.
# NOTE(review): train_idx was produced by generate_k_fold on the preprocessed
# AnnData loaded earlier, whereas `adata` is now the sc-pert AnnData. This
# assumes both objects hold the same cells in the same order after the
# >= 50-cell perturbation filter — TODO confirm.
adata_train = adata[train_idx]

In [11]:
# Subset further so that GSFA can run without OOM issues.
from sklearn.model_selection import train_test_split

# Only the "train" halves of the split are kept: with test_size=0.70 that is
# 30% of the training cells. Y is the expression input and G the matching rows
# of the perturbation matrix D; the 70% "test" halves are discarded.
# The split is stratified on the rows of D[train_idx] and uses a fixed
# random_state so the subsample is reproducible.
Y, _, G, _ = train_test_split(
    adata_train.layers["counts"],
    D[train_idx],
    test_size=0.70,
    random_state=42,
    stratify=D[train_idx],
)

In [12]:
# Save the inputs for GSFA.
# The archive stores the densified expression matrix under the key "array1"
# and the perturbation matrix under the key "array2".
# Y must expose .todense(), i.e. it is assumed to be a scipy sparse matrix.
np.savez("norman_GSFA_inputs.npz", array1=Y.todense(), array2=G)

In [13]:
# Save the perturbation labels for downstream analysis.
# The labels are passed positionally, so np.savez stores them under the
# default key "arr_0".
np.savez("norman_G_labels.npz", pert_labels)