In [None]:
import os
import warnings
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import anndata as ad
import lightning as L
from os.path import join
from modlyn.io.loading import read_lazy

import lamindb as ln

from modlyn.io.datamodules import ClassificationDataModule
from modlyn.models.linear import Linear
from modlyn.io.loading import read_lazy

In [None]:
# Location of the Tahoe-100M chunk store. Set the TAHOE_STORE_PATH environment
# variable to run the notebook on another machine; when it is unset the original
# hard-coded location is used, so existing setups keep working.
store_path = Path(os.environ.get("TAHOE_STORE_PATH", "/home/ubuntu/tahoe100M_chunk_1"))


In [None]:
# Open the store with read_lazy, then attach the gene annotations kept in a
# separate parquet file.
adata = read_lazy(store_path)
var = pd.read_parquet("var_new.parquet")
print(var.head())  # preview only; printing the full table floods the cell output

# reindex() fills every gene that is absent from the parquet file with an
# all-NaN row, so report a mismatch instead of letting it pass unnoticed.
genes_without_annotation = pd.Index(adata.var.index).difference(var.index)
if len(genes_without_annotation) > 0:
    warnings.warn(
        f"{len(genes_without_annotation)} of {adata.n_vars} genes have no entry in "
        "var_new.parquet; their annotation rows will be NaN."
    )

# Align the annotation table to the gene order of the store.
adata.var = var.reindex(adata.var.index)

In [None]:
# Encode each cell line as an int64 class label stored in obs["y"].
cell_line_codes = adata.obs["cell_line"].astype("category").cat.codes.to_numpy().astype("i8")

# pandas encodes missing categorical values as -1, which is not a valid class
# index; fail here with a clear message rather than deep inside training.
n_unlabelled = int((cell_line_codes < 0).sum())
if n_unlabelled > 0:
    raise ValueError(
        f"{n_unlabelled} cells have no 'cell_line' value; "
        "drop or impute them before building class labels."
    )

adata.obs["y"] = cell_line_codes

In [None]:
# Positional train/validation split: the first N_TRAIN_CELLS rows are used for
# training and the remaining rows are held out for validation.
# NOTE(review): the split follows storage order, not a random shuffle — confirm
# the rows are not sorted by label or batch.
N_TRAIN_CELLS = 800_000
BATCH_SIZE = 2048

# Without this guard a smaller store would yield an empty validation slice.
if adata.n_obs <= N_TRAIN_CELLS:
    raise ValueError(
        f"adata has {adata.n_obs} cells but N_TRAIN_CELLS is {N_TRAIN_CELLS}; "
        "the validation split would be empty."
    )

adata_train = adata[:N_TRAIN_CELLS]
adata_val = adata[N_TRAIN_CELLS:]

datamodule = ClassificationDataModule(
    adata_train=adata_train,
    adata_val=adata_val,
    label_column="y",
    train_dataloader_kwargs={
        "batch_size": BATCH_SIZE,
        "drop_last": True,  # discard the final partial training batch
    },
    val_dataloader_kwargs={
        "batch_size": BATCH_SIZE,
        "drop_last": False,  # keep every validation cell
    },
)

In [None]:
# Size the model from the data: n_genes is the number of variables in adata and
# n_covariates is the number of distinct label values in obs["y"].
n_label_values = adata.obs["y"].nunique()
n_input_genes = adata.n_vars

linear = Linear(n_genes=n_input_genes, n_covariates=n_label_values, learning_rate=1e-2)

In [None]:
# Training budget. Both an epoch limit and a step limit are set.
# NOTE(review): presumably training stops at whichever limit is hit first —
# confirm against the Lightning Trainer documentation.
trainer_limits = {
    "max_epochs": 3,
    "max_steps": 3000,  # only fit a few steps for the sake of this tutorial
}
trainer = L.Trainer(log_every_n_steps=100, **trainer_limits)

In [None]:
# Train the linear model, passing the datamodule that was built from the
# training and validation slices.
trainer.fit(model=linear, datamodule=datamodule)

## Quick analysis

In [None]:
import importlib

import LinearModuleAnalyzer

# Re-execute the module so edits made since the first import take effect
# without restarting the kernel.
importlib.reload(LinearModuleAnalyzer)

# Bind the entry points after the reload so they come from the fresh module code.
quick_analysis_with_scanpy_dotplot = LinearModuleAnalyzer.quick_analysis_with_scanpy_dotplot
full_analysis = LinearModuleAnalyzer.full_analysis

# Alternative entry point (currently disabled):
# analyzer, weight_adata, df = quick_analysis_with_scanpy_dotplot(linear, adata, datamodule)
results = full_analysis(linear, adata, datamodule)

# Uncertainty scores

In [None]:
import UncertaintyEstimation

# Re-execute the module so edits made since the first import take effect.
# `importlib` itself is imported in an earlier cell.
importlib.reload(UncertaintyEstimation)
get_proper_uncertainty = UncertaintyEstimation.get_proper_uncertainty

# NOTE(review): this rebinds `results`, replacing the value returned by
# full_analysis() in the earlier analysis cell.
results = get_proper_uncertainty(linear, adata, datamodule)


In [None]:
import Figures

# Re-execute the module so edits made since the first import take effect.
# `importlib` itself is imported in an earlier cell.
importlib.reload(Figures)
create_publication_figures = Figures.create_publication_figures

# The call returns two values, kept here as `nf` and `legends`.
nf, legends = create_publication_figures(linear, adata)

MODLYN: LINEAR MODELS FOR MASSIVE SINGLE-CELL PERTURBATION ANALYSIS
================================================================

ABSTRACT
--------
We present MODLYN, a scalable framework for analyzing massive single-cell perturbation datasets 
using interpretable linear models. Applied to the Tahoe-100M dataset (eventually 100M cells, 
19,177 genes, 50 perturbations), our approach enables rapid 
identification of perturbation-specific gene signatures, mechanism clustering, and biomarker 
discovery at unprecedented scale.

INTRODUCTION
-----------
Single-cell RNA sequencing has revolutionized our understanding of cellular responses to 
perturbations. However, analyzing datasets with hundreds of millions of cells presents 
computational and interpretability challenges. Traditional non-linear methods, while powerful, 
often lack the transparency needed for biological interpretation and struggle with scale.

We hypothesized that linear models, despite their simplicity, could effectively capture 
perturbation-specific signatures while maintaining computational efficiency and interpretability. 
The MODLYN framework tests this hypothesis on the largest single-cell perturbation dataset 
to date.

RESULTS
-------

Dataset Scale and Computational Performance (numbers to be updated)
Our analysis of the Tahoe-100M dataset represents an XYZ% increase in scale 
compared to typical single-cell studies. The linear model achieved:
- Training time: 25.3 minutes
- Peak memory usage: 8.5 GB  
- Model parameters: 958,850 weights
- Inference speed: ~1ms per cell

Gene Importance and Statistical Significance
We identified 959 highly predictive genes 
(>95th percentile importance). Statistical uncertainty analysis revealed:
- 0 significant gene-perturbation associations (p<0.05)
- 0 highly significant associations (p<0.001)
- Mean standard error: 0.0000

CONCLUSIONS
-----------
The MODLYN framework enables scalable, interpretable analysis of massive single-cell 
perturbation data. Linear models prove surprisingly effective at this scale, offering 
a compelling alternative to complex non-linear approaches for many biological questions.



In [None]:
import OverviewFig

# Re-execute the module so edits made since the first import take effect.
# `importlib` itself is imported in an earlier cell.
importlib.reload(OverviewFig)
create_modlyn_figure = OverviewFig.create_modlyn_figure

# The call takes no arguments and returns two values, kept as `fig` and `caption`.
fig, caption = create_modlyn_figure()

# Dataset / Biological analysis

Figure 1: Expression Overview & Quality Control

Figure 2: Differential Expression Analysis

Figure 3: Cell Clustering Analysis

Figure 4: Drug Response Analysis

Figure 5: Scanpy Expression Analysis

**Note:** some of the functions used below are mock implementations.

In [None]:
import importlib

import gene_level_analysis

# Re-execute the module so edits made since the first import take effect
# without restarting the kernel.
importlib.reload(gene_level_analysis)

# Take the analyzer class from the freshly reloaded module.
GeneExpressionAnalyzer = gene_level_analysis.GeneExpressionAnalyzer

# Build the analyzer on the full dataset and draw the first figure.
analyzer = GeneExpressionAnalyzer(adata)
analyzer.figure_1_expression_overview()

# To run every step in one call instead:
# analyzer.run_complete_gene_analysis()

In [None]:
# analyzer.figure_2_differential_expression() 
# analyzer.figure_3_cell_clustering_analysis()


In [None]:
# Figure 4 (drug response analysis), using the analyzer built in the cell above.
# NOTE(review): the preceding markdown says some functions are mocks — confirm
# whether this one is.
analyzer.figure_4_drug_response_analysis()


In [None]:
# Figure 5 (scanpy expression analysis), using the analyzer built earlier.
# NOTE(review): the preceding markdown says some functions are mocks — confirm
# whether this one is.
analyzer.figure_5_scanpy_expression_analysis()


In [None]:
# Final step of the gene-level analysis, called on the analyzer built earlier.
# NOTE(review): presumably produces a text summary of the figures — confirm in
# gene_level_analysis.
analyzer.generate_biological_narrative()