In [None]:
# Core libraries
import os
import time
import scanpy as sc
import anndata as ad
import lamindb as ln
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
import torch

import scvi

# Tracking: register this notebook's work under a LaminDB project.
# Create the project record and save it.
project = ln.Project(name="Modlyn-LSCVI-Benchmark")
project.save()

# Start tracking, attaching the run to the project by name.
ln.track(project="Modlyn-LSCVI-Benchmark")

# NOTE(review): this is a second ln.track() call, this time without the
# project argument. It looks redundant with the call above, and `run` only
# holds whatever ln.track() returns (possibly None) — confirm against the
# installed lamindb version and consider dropping one of the two calls.
run = ln.track()


In [None]:
# Show disk usage in human-readable units (shell command run via IPython's `!`).
!df -h


In [None]:
from modlyn.io.loading import read_lazy

# Location of the data chunk. The previously hard-coded path stays the default;
# set the TAHOE_CHUNK_PATH environment variable to point elsewhere without
# editing the notebook.
store_path = Path(os.environ.get("TAHOE_CHUNK_PATH", "/home/ubuntu/tahoe100M_chunk_1"))
adata = read_lazy(store_path)
adata.var = pd.read_parquet("var_subset_tahoe100M.parquet")

# Encode labels: convert `cell_line` to a categorical once, then derive the
# integer class codes `y` from that same categorical so both columns agree.
cell_line_cat = adata.obs["cell_line"].astype("category")
adata.obs["y"] = cell_line_cat.cat.codes.astype("int")
adata.obs["cell_line"] = cell_line_cat

# Subset: the first N_TRAIN rows are the training split and the following
# N_VAL rows are the validation split (contiguous, non-overlapping slices).
N_TRAIN = 80_000
N_VAL = 20_000
adata_train = adata[:N_TRAIN].copy()
adata_val = adata[N_TRAIN:N_TRAIN + N_VAL].copy()


In [None]:
# Log-transform exactly once. scanpy records the transform under
# `uns["log1p"]`, so checking for that key keeps this cell idempotent:
# re-running it no longer applies log1p a second time.
if "log1p" not in adata_train.uns:
    sc.pp.log1p(adata_train)

# Materialise X as a dense in-memory NumPy array.
X_train = adata_train.X
if hasattr(X_train, "compute"):
    # Lazy (dask-like) array: evaluate it. Skipped on re-runs, when X is
    # already an ndarray and has no `.compute()`.
    X_train = X_train.compute()
if hasattr(X_train, "toarray"):
    # Sparse matrix: densify explicitly. Calling np.array() on a SciPy sparse
    # matrix would produce a 0-d object array instead of the dense data.
    X_train = X_train.toarray()
adata_train.X = np.asarray(X_train)


In [None]:
# Display the AnnData summary of the training split.
adata_train

## Train LinearSCVI & benchmark

In [None]:
# Tunables for the benchmark subset.
N_TOP_CELL_LINES = 50   # keep only the most frequent cell lines
N_SUBSAMPLE = 2000      # upper bound on the number of cells drawn for training
SUBSAMPLE_SEED = 0      # fixed seed so the same cells are drawn on every run

# Restrict the training split to the most frequent cell lines.
top_cell_lines = adata_train.obs["cell_line"].value_counts().index[:N_TOP_CELL_LINES]
adata_filtered = adata_train[adata_train.obs["cell_line"].isin(top_cell_lines)].copy()

# Draw a reproducible random subsample without replacement. The size is capped
# at the number of available cells so the draw cannot fail when fewer than
# N_SUBSAMPLE cells survive the filter.
subsample_rng = np.random.default_rng(SUBSAMPLE_SEED)
n_subsample = min(N_SUBSAMPLE, adata_filtered.n_obs)
subsample_idx = subsample_rng.choice(adata_filtered.n_obs, size=n_subsample, replace=False)
adata_sub = adata_filtered[subsample_idx].copy()

# Register the subsample with scvi-tools, using `cell_line` as the label column.
scvi.model.LinearSCVI.setup_anndata(adata_sub, labels_key="cell_line")



In [None]:
# Build a LinearSCVI model on the registered subsample.
# NOTE(review): "gaussian" may not be an accepted `gene_likelihood` value in
# scvi-tools (documented choices appear to be "zinb", "nb", "poisson" and, in
# newer releases, "normal") — confirm against the installed version.
model = scvi.model.LinearSCVI(adata_sub, gene_likelihood="gaussian")
# Print the AnnData setup summary that scvi-tools recorded for this model.
model.view_anndata_setup()


In [None]:
# from scvi.dataloaders import DataSplitter

# splitter = DataSplitter(adata_sub, train_size=1.0, validation_size=0.0, batch_size=64)
# splitter.setup()
# dl = splitter.train_dataloader()

# batches = list(dl)
# print(f"{len(batches)=}")
# print(f"Batch keys: {list(batches[0].keys()) if batches else 'EMPTY'}")


In [None]:
# Fit the model; no arguments are passed, so the library's default training settings apply.
model.train()


In [None]:
# Inspect the fitted model: the loadings table returned by get_loadings()
# (used further down as the genes × latent matrix) and the model's summary string.
print(model.get_loadings())
print(model.summary_string)


In [None]:
# Cell-line label of every cell in the subsample; used later to group the rows of Z.
# (Presumably Z is returned in the same row order as adata_sub — verify if the
# model is ever queried with a different AnnData.)
labels = adata_sub.obs["cell_line"].values

# Compute the latent representation in mini-batches of 128 cells and report
# the wall-clock time. `time` is already imported in the first cell.
start = time.time()
Z = model.get_latent_representation(batch_size=128)
print(f"Elapsed: {time.time() - start:.2f} seconds")


In [None]:
# Distinct cell lines present in the subsample (np.unique returns them sorted).
labels_unique = np.unique(labels)

# Mean latent vector per cell line, stacked row-wise in `labels_unique` order.
per_line_means = []
for cell_line in labels_unique:
    per_line_means.append(Z[labels == cell_line].mean(axis=0))
Z_mean = np.stack(per_line_means)

# Map the per-cell-line latent means back into gene space through the linear
# decoder loadings.
W = model.get_loadings().values  # shape: genes × latent
weights = np.matmul(Z_mean, W.T)  # shape: cell_lines × genes

# Label rows by cell line and columns by gene, then display the table.
weights_df = pd.DataFrame(data=weights, index=labels_unique, columns=model.adata.var_names)
weights_df

In [None]:
# de = model.differential_expression(groupby="cell_line")


In [None]:
# start = time.time()
# model = scvi.model.LinearSCVI(adata_train, gene_likelihood="gaussian")
# model.train(max_epochs=50, early_stopping=False, plan_kwargs=dict(optimizer="Adam"))
# scvi_runtime = time.time() - start
# print(f"LinearSCVI training time: {scvi_runtime:.2f} seconds")


In [None]:
# model.history["elbo_train"].plot()


## Extract weights

In [None]:
from sklearn.preprocessing import minmax_scale

# Normalize weights (for plotting).
# Clip to +/- the 99th percentile of |weight| so a few extreme values do not
# dominate the colour scale. The bound is computed once and reused for both
# sides of the clip.
clip_bound = np.percentile(np.abs(weights_df.to_numpy()), 99)
w_scaled = weights_df.clip(-clip_bound, clip_bound)

# Rescale by the 99th percentile of the clipped magnitudes. Skip the division
# when that percentile is zero (e.g. all weights ~0) so the frame is not
# filled with NaN/inf.
scale = np.percentile(np.abs(w_scaled.to_numpy()), 99)
if scale > 0:
    w_scaled = w_scaled / scale

# Certainty estimate → use abs(weight) as proxy (LinearSCVI doesn't output SE directly).
# Min-max scaling is applied along axis=1, i.e. independently within each row (cell line).
certainty = weights_df.abs()
certainty_scaled = pd.DataFrame(minmax_scale(certainty, axis=1),
                                index=certainty.index,
                                columns=certainty.columns)


In [None]:
# Display the unscaled certainty proxy (absolute weights).
certainty

In [None]:
# Package the per-cell-line gene scores as an AnnData so scanpy can plot them:
#   X                        -> row-wise min-max scaled |weight| ("certainty")
#   layers["weights_scaled"] -> signed, percentile-normalised weights
cell_line_frame = pd.DataFrame(index=certainty_scaled.index)
gene_frame = pd.DataFrame(index=certainty_scaled.columns)
adata_dot_lscvi = ad.AnnData(X=certainty_scaled.values, obs=cell_line_frame, var=gene_frame)

# One observation per cell line; expose the name as a column so it can serve as `groupby`.
adata_dot_lscvi.obs["cell_line"] = adata_dot_lscvi.obs.index

# Select rows/columns of the scaled weights by label so the layer follows the
# AnnData's own observation and variable order.
aligned_weights = w_scaled.loc[adata_dot_lscvi.obs_names, adata_dot_lscvi.var_names]
adata_dot_lscvi.layers["weights_scaled"] = aligned_weights.values


In [None]:
# First dot plot: colour comes from the signed `weights_scaled` layer for the
# first 30 genes, one row per cell line, on a diverging colour map centred at 0.
# NOTE(review): no explicit size matrix is passed here, so scanpy derives the
# dot size itself (presumably the fraction of observations per group above its
# expression cutoff, computed on the same layer) rather than from the
# certainty values stored in X — confirm this is the intended encoding.
sc.pl.dotplot(
    adata_dot_lscvi,
    var_names=adata_dot_lscvi.var_names[:30],
    groupby="cell_line",
    layer="weights_scaled",
    cmap="RdBu_r",
    vcenter=0,
    dot_min=0.2,
    dot_max=1.0,
    smallest_dot=0.1,
    show=True
)

In [None]:
# Sanity check: the genes in `certainty` must match the AnnData's variables, in order.
genes_aligned = bool((certainty.columns == adata_dot_lscvi.var_names).all())
print(genes_aligned)  # Should be True
assert genes_aligned, "certainty columns and adata_dot_lscvi.var_names are out of sync"
adata_dot_lscvi.var_names = certainty.columns

# Shape of the colour matrix. It is read straight from the layer because the
# `dot_color` frame is only created in the next cell; referencing it here
# raised NameError on a fresh kernel.
print(adata_dot_lscvi.layers["weights_scaled"].shape)
# adata_dot_lscvi.var_names
# print(dot_color[certainty.columns].describe())
# print(lscvi_size.describe())


In [None]:
# Dot sizes: |weight| min-max scaled along axis=1 (independently per cell line).
lscvi_size = pd.DataFrame(
    data=minmax_scale(certainty, axis=1),
    index=certainty.index,
    columns=certainty.columns,
)

# Dot colours: the signed, normalised weights stored in the AnnData layer,
# labelled with the AnnData's own observation and variable names.
dot_color = pd.DataFrame(
    data=adata_dot_lscvi.layers["weights_scaled"],
    index=adata_dot_lscvi.obs_names,
    columns=adata_dot_lscvi.var_names,
)

top_genes = certainty.columns[:30]  # or some handpicked list

# Appearance settings, kept separate from the data arguments.
dotplot_style = dict(
    cmap="RdBu_r",
    vcenter=0,
    dot_min=0.2,
    dot_max=1.0,
    smallest_dot=0.1,
    show=True,
)

# Second dot plot: colour and size matrices are supplied explicitly for the selected genes.
sc.pl.dotplot(
    adata_dot_lscvi,
    top_genes,
    "cell_line",
    dot_color_df=dot_color[top_genes],
    dot_size_df=lscvi_size[top_genes],
    **dotplot_style,
)
# sc.pl.dotplot(
#     adata_dot_lscvi,
#     var_names=certainty.columns,
#     groupby="cell_line",
#     dot_color_df=dot_color[certainty.columns],
#     dot_size_df=lscvi_size,
#     cmap="RdBu_r",
#     vcenter=0,
#     dot_min=0.2,
#     dot_max=1.0,
#     smallest_dot=0.1,
#     use_raw=False,  # ensure correct data source
#     show=True
# )


In [None]:
import psutil


def log_resource():
    """Print the resident set size (RSS) of the current process in GB (1e9 bytes)."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    print(f"Memory usage: {rss_bytes / 1e9:.2f} GB")


log_resource()