# Example state embeddings

In [1]:
from helical.models.state import stateEmbeddingsModel
from helical.models.state import stateConfig

# Placeholder input path: point this at a real .h5ad file before running the cell.
ANN_DATA_PATH = "path/to/data.h5ad"

# Config built with no overrides; later cells reuse it through `state_config`.
state_config = stateConfig()
state_embed = stateEmbeddingsModel(configurer=state_config)

# Pre-process the input file, then embed it (embeddings before perturbation).
processed_data = state_embed.process_data(ann_data_path=ANN_DATA_PATH)
embeddings = state_embed.get_embeddings(processed_data)

INFO:datasets:PyTorch version 2.6.0 available.
INFO:datasets:Polars version 1.33.0 available.
Fetching 3 files: 100%|██████████| 3/3 [00:00<00:00, 5336.26it/s]
INFO:helical.models.state.state_embeddings:Using model checkpoint: /home/rasched/.cache/helical/models/state/se600m_epoch16.ckpt
INFO:helical.models.state.state_embeddings:Created output directory: /home/rasched/.cache/helical/models/state/SE-600M_MODEL


FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'path/to/data.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

# Example state transition model inference

In [None]:
from helical.models.state import stateTransitionModel
import scanpy as sc

# Reuses `state_config` created in the embeddings cell above.
state_transition = stateTransitionModel(configurer=state_config)

# Each stage gets its own name instead of rebinding `adata` three times, so it
# is always clear which object (raw, processed, model output) a name refers to.
adata_raw = sc.read_h5ad("example_data.h5ad")
adata_processed = state_transition.process_data(adata_raw)
# embeddings after perturbation
transition_embeddings = state_transition.get_embeddings(adata_processed)

# Example: fine-tuning a head on the ST model

In [None]:
from helical.models.state import stateFineTuningModel

# Fine-tuning wrapper with a classification head of output size 2.
# Named after the State model it wraps (the previous `scgpt_*` name was misleading).
state_fine_tune = stateFineTuningModel(
    configurer=state_config,
    fine_tuning_head="classification",
    output_size=2,
)
# NOTE(review): "input_dict" is a literal placeholder string, and `data` is never
# passed to train() - confirm the expected process_data()/train() arguments
# against the helical API before running this cell.
data = state_fine_tune.process_data("input_dict")
state_fine_tune.train()

# Creating a Virtual Cell Challenge Submission using Helical

In [None]:
'''
Download the dataset

(taken from Colab Notebook by Adduri et al.
https://colab.research.google.com/drive/1QKOtYP7bMpdgDJEipDxaJqOchv7oQ-_l#scrollTo=h0aSjKX7Rtyw)
'''

import requests
from tqdm.auto import tqdm  # picks the best bar for the environment
from zipfile import ZipFile
from tqdm.auto import tqdm
import os

# Download the Replogle-Nadig training dataset.
url = "https://storage.googleapis.com/vcc_data_prod/datasets/state/competition_support_set.zip"
output_path = "competition_support_set.zip"

# Stream the download so we can track progress. The response is used as a
# context manager so the connection is released even if writing to disk fails.
# timeout=(connect, read) in seconds: fail instead of hanging on a stalled server.
with requests.get(url, stream=True, timeout=(10, 60)) as response:
    # Fail loudly on 4xx/5xx instead of saving an error page as the "zip" and
    # only finding out when ZipFile rejects the file further down.
    response.raise_for_status()
    # Falls back to 0 when the server does not send a content-length header.
    total = int(response.headers.get("content-length", 0))

    with open(output_path, "wb") as f, tqdm(
        total=total, unit='B', unit_scale=True, desc="Downloading"
    ) as bar:
        for chunk in response.iter_content(chunk_size=8192):
            if not chunk:
                # Skip an empty chunk; stopping here would truncate the archive.
                continue
            f.write(chunk)
            bar.update(len(chunk))

# Unpack the archive next to the notebook, one member at a time so tqdm can
# report per-file progress.
out_dir = "competition_support_set"
os.makedirs(out_dir, exist_ok=True)
with ZipFile(output_path, 'r') as z:
    for member in tqdm(z.infolist(), desc="Unzipping", unit="file"):
        z.extract(member, out_dir)

In [None]:
# train the model on the training data
from helical.models.state import stateTransitionTrainModel
from helical.models.state.train_configs import trainingConfig

# Training options collected in one mapping so they are easy to scan and tweak;
# they are passed to trainingConfig as keyword arguments.
train_options = {
    "toml_config_path": "competition_support_set/starter.toml",
    "num_workers": 4,
    "batch_col": "batch_var",
    "pert_col": "target_gene",
    "cell_type_key": "cell_type",
    "control_pert": "non-targeting",
    "perturbation_features_file": "competition_support_set/ESM2_pert_features.pt",
    "max_steps": 40000,
    "ckpt_every_n_steps": 20000,
    "model": "state",
}
train_config = trainingConfig(**train_options)

# Build the trainer from the config, run training, then call predict().
state_train = stateTransitionTrainModel(configurer=train_config)
state_train.train()
state_train.predict()

Once the model is trained, we can run inference on a new dataset with `stateTransitionModel`, just as in the state transition inference example above.

In [None]:
from helical.models.state import stateConfig
from helical.models.state import stateTransitionModel
import scanpy as sc

# Inference config for the competition run. It gets its own name so it does not
# overwrite the `state_config` that the earlier example cells read; otherwise
# re-running those cells after this one would silently use this checkpoint.
vcc_inference_config = stateConfig(
    output="competition/prediction.h5ad",
    model_dir="competition/first_run",
    checkpoint="competition/first_run/checkpoints/final.ckpt",
    pert_col="target_gene",
    # The remaining options are passed as None explicitly.
    embed_key=None,
    celltype_col=None,
    celltypes=None,
    batch_col=None,
    control_pert=None,
    seed=42,
    max_set_len=None,
    tsv=None,
)

# Validation template shipped in the downloaded support set.
val_adata = sc.read_h5ad("competition_support_set/competition_val_template.h5ad")

state_transition = stateTransitionModel(configurer=vcc_inference_config)
# Separate names per stage keep the raw template and the processed data apart.
val_adata_processed = state_transition.process_data(val_adata)
embeds = state_transition.get_embeddings(val_adata_processed)

Now we can evaluate the model's predictions and create the submission file.

In [None]:
# evaluate the model - underlying function uses cell-eval package 
# (https://github.com/ArcInstitute/cell-eval)
from helical.models.state import vcc_eval

# Defaults for the competition dataset.
EXPECTED_GENE_DIM = 18_080
MAX_CELL_DIM = 100_000
DEFAULT_PERT_COL = "target_gene"
DEFAULT_CTRL = "non-targeting"
DEFAULT_COUNTS_COL = "n_cells"
DEFAULT_CELLTYPE_COL = "celltype"
DEFAULT_NTC_NAME = "non-targeting"

# Input and output locations.
io_settings = {
    "input": "competition/prediction.h5ad",  # prediction file
    "genes": "competition_support_set/gene_names.csv",  # gene names file
    "output": None,  # None -> output file is created with default naming
}

# Column and label names for the input and the generated output.
column_settings = {
    "pert_col": DEFAULT_PERT_COL,
    "celltype_col": None,
    "ntc_name": DEFAULT_NTC_NAME,
    "output_pert_col": DEFAULT_PERT_COL,
    "output_celltype_col": DEFAULT_CELLTYPE_COL,
}

# Encoding and dimension settings.
format_settings = {
    "encoding": 32,
    "allow_discrete": False,
    "expected_gene_dim": EXPECTED_GENE_DIM,
    "max_cell_dim": MAX_CELL_DIM,
}

# Merged in the same key order as before: io, then columns, then format.
configs = {**io_settings, **column_settings, **format_settings}

# This creates a submission file in the output directory, which can be
# uploaded to the challenge leaderboard.
vcc_eval(configs)