In [4]:
import os
import sys

# Add the root of the project to sys.path so the imports below can be resolved.
# The root is taken to be the parent of the current working directory, so this
# relies on the kernel being started from the notebook's own folder.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from meta_pert_dataset import (
    MetaPertDataset,
    load_metabolic_model,
    print_dataset_metabolic_info,
)
from models.metabolic_model_transmet import print_subsystems_stats

from utils.filesystem import get_git_root

### Load metabolic model

The metabolic model is loaded once here, and the same object is reused for every dataset below.

In [3]:
# Load the RECON2 metabolic model once; the resulting object is handed to the
# MetaPertDataset constructors in the dataset cells of this notebook.
recon2_mat_model = load_metabolic_model("RECON2_mat")

# Show the model summary (name and counts of subsystems, reactions and genes).
print(recon2_mat_model)

# Print the subsystem statistics report for the loaded model.
print_subsystems_stats(recon2_mat_model)

Initializing metabolic model 'RECON2_mat' with the following parameters:
  species: homo_sapiens
  media: default-media
  isoform_summing: remove-summing
  exchange_limit: 1.0
Metabolic model initialized successfully.
Removing empty gene associations...
Empty gene associations removed.
Converting gene symbols to Ensembl IDs...
Some genes are not found in the cache. Requesting missing genes from the internet.
Missing genes length: 43
Requesting Ensembl IDs for batch 1 from the internet.
Gene symbols converted to Ensembl IDs.
Metabolic model loading complete.
MetabolicModelTransmet object
    name: RECON2_mat
    Total number of subsystems: 100
    Total number of reactions: 10211
    Total number of associated genes: 1404
Subsystems Statistics Report
Total number of subsystems: 100
----------------------------------------
Average number of reactions per subsystem: 74.40
Subsystem with the maximum reactions: 'Transport, extracellular' with 1550 reactions
Subsystem with the minimum reacti

## Metabolic analyses of the datasets

### Norman dataset

In [6]:
# Load the preprocessed Norman dataset from <git root>/datasets and attach the
# already-loaded RECON2 metabolic model to it.
norman_meta_ds = MetaPertDataset(
    name="norman",
    variant="preprocessed",
    dir_path=os.path.join(get_git_root(), "datasets"),
    metabolic_model=recon2_mat_model,
)
# Alternatively, the class can load the model itself if you pass
# model_name="RECON2_mat" instead of a metabolic_model object.

# Normalize the dataset using the "CPM" scheme.
# NOTE(review): presumably counts-per-million applied in place (trailing
# underscore) — confirm against MetaPertDataset.normalize_.
norman_meta_ds.normalize_(type="CPM")
# Adds information about the 5000 most variable genes to the dataset.
norman_meta_ds.most_variable_genes(n_top_genes=5000)

# Print the dataset metabolic information, listing the top 10 subsystems.
print_dataset_metabolic_info(norman_meta_ds, top_n_subsystems=10)

Loading: /mnt/md0/data/gdufort/transmet/datasets/norman/preprocessed/adata.h5ad
Using provided metabolic model for dataset 'norman' with variant 'preprocessed'.
Metabolic Model Information Report
Number of metabolic genes in the perturbation dataset: 1357
Number of metabolic genes among the most variable genes: 253
Percentage of variance captured by highly variable genes among all genes: 22.08%
Percentage of variance captured by metabolic genes among highly variable genes: 5.06%
Number of subsystems with metabolic genes in the perturbation dataset: 96/100
Number of subsystems with metabolic genes in the most variable genes: 76/100

Top 10 Subsystems by Variance Captured:
----------------------------------------
Subsystem: Transport, extracellular, Variance: 725.17, Percentage of metabolic variance: 12.65%, Genes in Most Variable: 47/196
Subsystem: Nucleotide interconversion, Variance: 364.85, Percentage of metabolic variance: 6.36%, Genes in Most Variable: 14/109
Subsystem: Oxidative p

### Dixit dataset

In [5]:
# Load the preprocessed Dixit dataset from <git root>/datasets and attach the
# already-loaded RECON2 metabolic model to it.
dixit_meta_ds = MetaPertDataset(
    name="dixit",
    variant="preprocessed",
    dir_path=os.path.join(get_git_root(), "datasets"),
    metabolic_model=recon2_mat_model,
)

# Normalize the dataset using the "CPM" scheme.
# NOTE(review): presumably counts-per-million applied in place (trailing
# underscore) — confirm against MetaPertDataset.normalize_.
dixit_meta_ds.normalize_(type="CPM")
# Adds information about the 5000 most variable genes to the dataset.
dixit_meta_ds.most_variable_genes(n_top_genes=5000)
# Print the dataset metabolic information, listing the top 20 subsystems.
print_dataset_metabolic_info(dixit_meta_ds, top_n_subsystems=20)

Loading: /mnt/md0/data/gdufort/transmet/datasets/dixit/preprocessed/adata.h5ad
Metabolic Model Information Report
Number of metabolic genes in the perturbation dataset: 1237
Number of metabolic genes among the most variable genes: 337
Percentage of variance captured by highly variable genes among all genes: 22.93%
Percentage of variance captured by metabolic genes among highly variable genes: 6.75%
Number of subsystems with metabolic genes in the perturbation dataset: 95
Number of subsystems with metabolic genes in the most variable genes: 86

Top 20 Subsystems by Variance Captured:
----------------------------------------
Subsystem: Transport, extracellular, Variance: 660.06, Percentage of metabolic variance: 12.34%, Genes in Most Variable: 62/196
Subsystem: Nucleotide interconversion, Variance: 336.42, Percentage of metabolic variance: 6.29%, Genes in Most Variable: 14/109
Subsystem: Oxidative phosphorylation, Variance: 267.24, Percentage of metabolic variance: 5.00%, Genes in Most V

### Adamson dataset

In [6]:
# Load the preprocessed Adamson dataset from <git root>/datasets and attach the
# already-loaded RECON2 metabolic model to it.
adamson_meta_ds = MetaPertDataset(
    name="adamson",
    variant="preprocessed",
    dir_path=os.path.join(get_git_root(), "datasets"),
    metabolic_model=recon2_mat_model,
)

# Normalize the dataset using the "CPM" scheme.
# NOTE(review): presumably counts-per-million applied in place (trailing
# underscore) — confirm against MetaPertDataset.normalize_.
adamson_meta_ds.normalize_(type="CPM")
# Adds information about the 5000 most variable genes to the dataset.
adamson_meta_ds.most_variable_genes(n_top_genes=5000)
# Print the dataset metabolic information, listing the top 20 subsystems.
print_dataset_metabolic_info(adamson_meta_ds, top_n_subsystems=20)

Loading: /mnt/md0/data/gdufort/transmet/datasets/adamson/preprocessed/adata.h5ad


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Metabolic Model Information Report
Number of metabolic genes in the perturbation dataset: 1349
Number of metabolic genes among the most variable genes: 236
Percentage of variance captured by highly variable genes among all genes: 23.12%
Percentage of variance captured by metabolic genes among highly variable genes: 4.72%
Number of subsystems with metabolic genes in the perturbation dataset: 96
Number of subsystems with metabolic genes in the most variable genes: 81

Top 20 Subsystems by Variance Captured:
----------------------------------------
Subsystem: Transport, extracellular, Variance: 655.44, Percentage of metabolic variance: 12.31%, Genes in Most Variable: 42/196
Subsystem: Nucleotide interconversion, Variance: 365.19, Percentage of metabolic variance: 6.86%, Genes in Most Variable: 22/109
Subsystem: Oxidative phosphorylation, Variance: 278.20, Percentage of metabolic variance: 5.22%, Genes in Most Variable: 2/90
Subsystem: Glycerophospholipid metabolism, Variance: 200.33, Perc

### Replogle RPE1 dataset

In [9]:
# Load the preprocessed Replogle RPE1 (essential) dataset from
# <git root>/datasets and attach the already-loaded RECON2 metabolic model.
replogle_rpe1_meta_ds = MetaPertDataset(
    name="replogle_rpe1_essential",
    variant="preprocessed",
    dir_path=os.path.join(get_git_root(), "datasets"),
    metabolic_model=recon2_mat_model,
)

# Normalize the dataset using the "CPM" scheme.
# NOTE(review): presumably counts-per-million applied in place (trailing
# underscore) — confirm against MetaPertDataset.normalize_.
replogle_rpe1_meta_ds.normalize_(type="CPM")
# Adds information about the 5000 most variable genes to the dataset.
replogle_rpe1_meta_ds.most_variable_genes(n_top_genes=5000)
# Print the dataset metabolic information, listing the top 20 subsystems.
print_dataset_metabolic_info(replogle_rpe1_meta_ds, top_n_subsystems=20)


Loading: /mnt/md0/data/gdufort/transmet/datasets/replogle_rpe1_essential/preprocessed/adata.h5ad
Metabolic Model Information Report
Number of metabolic genes in the perturbation dataset: 709
Number of metabolic genes among the most variable genes: 345
Percentage of variance captured by highly variable genes among all genes: 57.89%
Percentage of variance captured by metabolic genes among highly variable genes: 6.89%
Number of subsystems with metabolic genes in the perturbation dataset: 88
Number of subsystems with metabolic genes in the most variable genes: 84

Top 20 Subsystems by Variance Captured:
----------------------------------------
Subsystem: Oxidative phosphorylation, Variance: 261.85, Percentage of metabolic variance: 7.91%, Genes in Most Variable: 3/90
Subsystem: Nucleotide interconversion, Variance: 217.08, Percentage of metabolic variance: 6.56%, Genes in Most Variable: 29/109
Subsystem: Transport, extracellular, Variance: 199.55, Percentage of metabolic variance: 6.03%, G

### Replogle K562 dataset

In [10]:
# Load the preprocessed Replogle K562 (essential) dataset from
# <git root>/datasets and attach the already-loaded RECON2 metabolic model.
replogle_k562_meta_ds = MetaPertDataset(
    name="replogle_k562_essential",
    variant="preprocessed",
    dir_path=os.path.join(get_git_root(), "datasets"),
    metabolic_model=recon2_mat_model,
)

# Normalize the dataset using the "CPM" scheme.
# NOTE(review): presumably counts-per-million applied in place (trailing
# underscore) — confirm against MetaPertDataset.normalize_.
replogle_k562_meta_ds.normalize_(type="CPM")
# Adds information about the 5000 most variable genes to the dataset.
replogle_k562_meta_ds.most_variable_genes(n_top_genes=5000)
# Print the dataset metabolic information, listing the top 20 subsystems.
print_dataset_metabolic_info(replogle_k562_meta_ds, top_n_subsystems=20)

Loading: /mnt/md0/data/gdufort/transmet/datasets/replogle_k562_essential/preprocessed/adata.h5ad
Metabolic Model Information Report
Number of metabolic genes in the perturbation dataset: 680
Number of metabolic genes among the most variable genes: 348
Percentage of variance captured by highly variable genes among all genes: 59.07%
Percentage of variance captured by metabolic genes among highly variable genes: 6.95%
Number of subsystems with metabolic genes in the perturbation dataset: 90
Number of subsystems with metabolic genes in the most variable genes: 86

Top 20 Subsystems by Variance Captured:
----------------------------------------
Subsystem: Oxidative phosphorylation, Variance: 264.29, Percentage of metabolic variance: 8.41%, Genes in Most Variable: 20/90
Subsystem: Nucleotide interconversion, Variance: 186.56, Percentage of metabolic variance: 5.94%, Genes in Most Variable: 23/109
Subsystem: Transport, extracellular, Variance: 182.84, Percentage of metabolic variance: 5.82%, 