In [30]:
import json

import pertdata

In [31]:
def filter_genes(dataset, column_name, genes):
    """Return the distinct values of an ``.obs`` column that appear in ``genes``.

    Args:
        dataset: Object exposing a pandas-like ``.obs`` table (e.g., an AnnData object).
        column_name (str): Name of the ``.obs`` column that holds the gene identifiers.
        genes (list): Gene identifiers to keep.

    Returns:
        list: The unique entries of ``dataset.obs[column_name]`` that are also
        contained in ``genes``.
    """
    gene_column = dataset.obs[column_name]
    is_requested = gene_column.isin(genes)
    return gene_column[is_requested].unique().tolist()


In [32]:
# Catalogue of the datasets offered by pertdata; its keys are the dataset names
# used to look up details in the sections below.
datasets_dict = pertdata.datasets()

print("Available datasets:")
for dataset_name in datasets_dict:
    print(f"  {dataset_name}")


Available datasets:
  DixitRegev2016
  NormanWeissman2019_filtered
  ReplogleWeissman2022_K562_essential
  ReplogleWeissman2022_rpe1
  adamson
  dixit
  jHepG2_essential
  jurkat_essential
  norman
  replogle_k562_essential
  replogle_rpe1_essential
  wessel_dataset


In [33]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from metavis import init_model_metavis

# Initialise the "RECON2_mat" metabolic model and fetch its genes as Ensembl IDs
# (per the method name and the log output — confirm the exact return type).
# NOTE(review): `meta_genes` is not referenced by any later cell shown here.
recon2_mat_model = init_model_metavis("RECON2_mat")
meta_genes = recon2_mat_model.get_genes_ensembl_ids()

Initializing metabolic model 'RECON2_mat' with the following parameters:
  species: homo_sapiens
  media: default-media
  isoform_summing: remove-summing
  exchange_limit: 1.0
Loaded 6941 reaction-subsystem pairs.
Metabolic model initialized successfully.
Removing empty gene associations...
Empty gene associations removed.
Converting gene symbols to Ensembl IDs...
Gene symbols converted to Ensembl IDs.
Metabolic model loading complete.


In [34]:
# Gene symbols of interest.
carlos_genes = ["DHFR", "PSAT1", "RRM1", "RRM2"]

# Ensembl gene IDs that are matched against the `gene_id` column of each dataset.
# NOTE(review): presumably listed in the same order as `carlos_genes` — confirm.
carlos_genes_ensmbl_ids = [
    "ENSG00000228716", "ENSG00000135069",
    "ENSG00000167325", "ENSG00000171848",
]

In [35]:
# Notebook-level pseudobulk settings. `percentage` and `amount` are forwarded to
# every pseudobulk_by_condition call; all three values are embedded in the names
# of the exported TSV files.
# NOTE(review): the executed pseudobulk calls pass min_cell_count=0 explicitly,
# so `min_cell_count` only reaches the file names there — confirm this is intended.
min_cell_count, percentage, amount = 100, 0.5, 30

# jHepG2 essential

In [36]:
# Show the pertdata catalogue entry for the dataset handled in this section.
selected_dataset = "jHepG2_essential"
print(f"Details for '{selected_dataset}':")
dataset_details = datasets_dict[selected_dataset]
print(json.dumps(dataset_details, indent=2))

Details for 'jHepG2_essential':
{
  "name": "jHepG2-essential",
  "info": "",
  "publication": {
    "doi": "https://www.biorxiv.org/content/10.1101/2024.07.03.601903v1.full.pdf"
  },
  "repository": "GEO",
  "url": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE264667&format=file&file=GSE264667%5Fhepg2%5Fraw%5Fsinglecell%5F01%2Eh5ad"
}


In [37]:
# Load the jHepG2_essential dataset through pertdata and print a summary of the
# resulting PertDataset object.
jHepG2_essential = pertdata.PertDataset(name="jHepG2_essential")
print(jHepG2_essential)


Dataset already cached: /home/gdufort/.cache/pertdata/jHepG2_essential
Loading: /home/gdufort/.cache/pertdata/jHepG2_essential/adata.h5ad
PertDataset object
    name: jHepG2_essential
    cache_dir_path: /home/gdufort/.cache/pertdata
    path: /home/gdufort/.cache/pertdata/jHepG2_essential
    adata: AnnData object with n_obs ✕ n_vars = 145473 ✕ 9624


In [38]:
# Keep only the target Ensembl IDs that occur in this dataset's `gene_id` column.
column_name = "gene_id"
jHepG2_unique_filtered_genes = filter_genes(
    dataset=jHepG2_essential.adata,
    column_name=column_name,
    genes=carlos_genes_ensmbl_ids,
)

# Number of target genes found in the dataset.
print(len(jHepG2_unique_filtered_genes))


4


In [39]:
# Add the control group ("non-targeting") to the values kept by the pseudobulk step.
# Guarded so that re-running this cell does not append the label a second time.
if "non-targeting" not in jHepG2_unique_filtered_genes:
    jHepG2_unique_filtered_genes.append("non-targeting")


In [40]:
# Pseudobulk the cells grouped by `gene_id`, restricted to the selected genes
# plus the control group.
# NOTE(review): min_cell_count is hard-coded to 0 here, while the notebook-level
# `min_cell_count` (100) is what the export cell writes into the file name
# ("_m100") — confirm which value is intended.
jHepG2_essential.pseudobulk_by_condition(
    "gene_id",
    replace=True,
    filter_values=jHepG2_unique_filtered_genes,
    min_cell_count=0,
    percentage=percentage,
    amount=amount,
)


Processing group non-targeting with 4976 cells.
Processing group ENSG00000171848 with 54 cells.
Processing group ENSG00000228716 with 21 cells.
Processing group ENSG00000167325 with 12 cells.
Processing group ENSG00000135069 with 69 cells.


In [41]:
# jHepG2_essential.normalize(type="CPM")


In [42]:
# Export the pseudobulk samples (normalisation is commented out above, hence the
# "Not-norm" prefix) to a TSV whose name records the notebook-level parameters.
# NOTE(review): the name embeds `min_cell_count` (100), but the pseudobulk call
# for this dataset passes min_cell_count=0 — confirm the label is intended.
jHepG2_essential.export_tsv(
    "Not-norm-jHepG2_essential_Carlos_genes_pseudobulk"
    f"_m{min_cell_count}_p{percentage}_a{amount}.tsv"
)


Exporting all 150 samples to: Not-norm-jHepG2_essential_Carlos_genes_pseudobulk_m100_p0.5_a30.tsv


# Jurkat essential

In [43]:
# Show the pertdata catalogue entry for the dataset handled in this section.
selected_dataset = "jurkat_essential"
print(f"Details for '{selected_dataset}':")
dataset_details = datasets_dict[selected_dataset]
print(json.dumps(dataset_details, indent=2))


Details for 'jurkat_essential':
{
  "name": "jurkat-essential",
  "info": "",
  "publication": {
    "doi": "https://www.biorxiv.org/content/10.1101/2024.07.03.601903v1.full.pdf"
  },
  "repository": "GEO",
  "url": "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE264667&format=file&file=GSE264667%5Fjurkat%5Fraw%5Fsinglecell%5F01%2Eh5ad"
}


In [44]:
# Load the jurkat_essential dataset through pertdata and print a summary of the
# resulting PertDataset object.
jurkat_essential = pertdata.PertDataset(name="jurkat_essential")
print(jurkat_essential)

Dataset already cached: /home/gdufort/.cache/pertdata/jurkat_essential
Loading: /home/gdufort/.cache/pertdata/jurkat_essential/adata.h5ad
PertDataset object
    name: jurkat_essential
    cache_dir_path: /home/gdufort/.cache/pertdata
    path: /home/gdufort/.cache/pertdata/jurkat_essential
    adata: AnnData object with n_obs ✕ n_vars = 262956 ✕ 8882


In [45]:
# Keep only the target Ensembl IDs that occur in this dataset's `gene_id` column.
column_name = "gene_id"
jurkat_unique_filtered_genes = filter_genes(
    dataset=jurkat_essential.adata,
    column_name=column_name,
    genes=carlos_genes_ensmbl_ids,
)

# Number of target genes found in the dataset.
print(len(jurkat_unique_filtered_genes))


4


In [46]:
# Add the control group ("non-targeting") to the values kept by the pseudobulk step.
# Guarded so that re-running this cell does not append the label a second time.
if "non-targeting" not in jurkat_unique_filtered_genes:
    jurkat_unique_filtered_genes.append("non-targeting")


In [47]:
# Pseudobulk the cells grouped by `gene_id`, restricted to the genes found in the
# Jurkat dataset plus the control group.
# The filter list is the Jurkat one; the previous version passed the jHepG2 list,
# a copy-paste slip that would silently use another dataset's gene selection.
# NOTE(review): min_cell_count is hard-coded to 0 here, while the notebook-level
# `min_cell_count` (100) is what the export cell writes into the file name
# ("_m100") — confirm which value is intended.
jurkat_essential.pseudobulk_by_condition(
    "gene_id",
    replace=True,
    filter_values=jurkat_unique_filtered_genes,
    min_cell_count=0,
    percentage=percentage,
    amount=amount,
)


Processing group non-targeting with 12013 cells.
Processing group ENSG00000135069 with 122 cells.
Processing group ENSG00000171848 with 40 cells.
Processing group ENSG00000228716 with 63 cells.
Processing group ENSG00000167325 with 3 cells.


In [48]:
# jurkat_essential.normalize(type="CPM")


In [49]:
# Export the pseudobulk samples (normalisation is commented out above, hence the
# "Not-norm" prefix) to a TSV whose name records the notebook-level parameters.
# NOTE(review): the name embeds `min_cell_count` (100), but the pseudobulk call
# for this dataset passes min_cell_count=0 — confirm the label is intended.
jurkat_essential.export_tsv(
    "Not-norm-jurkat_essential_Carlos_genes_pseudobulk"
    f"_m{min_cell_count}_p{percentage}_a{amount}.tsv"
)


Exporting all 150 samples to: Not-norm-jurkat_essential_Carlos_genes_pseudobulk_m100_p0.5_a30.tsv


# ReplogleRPE1 essential

In [50]:
# Show the pertdata catalogue entry for the dataset handled in this section.
selected_dataset = "ReplogleWeissman2022_rpe1"
print(f"Details for '{selected_dataset}':")
dataset_details = datasets_dict[selected_dataset]
print(json.dumps(dataset_details, indent=2))


Details for 'ReplogleWeissman2022_rpe1':
{
  "name": "ReplogleWeissman2022_rpe1",
  "info": "",
  "publication": {
    "doi": "10.1016/j.cell.2022.05.013"
  },
  "repository": "scPerturb",
  "url": "https://zenodo.org/record/7041849/files/ReplogleWeissman2022_rpe1.h5ad"
}


In [51]:
# Load the ReplogleWeissman2022_rpe1 dataset through pertdata and print a summary
# of the resulting PertDataset object.
ReplogleWeissman2022_rpe1 = pertdata.PertDataset(name="ReplogleWeissman2022_rpe1")
print(ReplogleWeissman2022_rpe1)


Dataset already cached: /home/gdufort/.cache/pertdata/ReplogleWeissman2022_rpe1
Loading: /home/gdufort/.cache/pertdata/ReplogleWeissman2022_rpe1/adata.h5ad
PertDataset object
    name: ReplogleWeissman2022_rpe1
    cache_dir_path: /home/gdufort/.cache/pertdata
    path: /home/gdufort/.cache/pertdata/ReplogleWeissman2022_rpe1
    adata: AnnData object with n_obs ✕ n_vars = 247914 ✕ 8749


In [52]:
# Keep only the target Ensembl IDs that occur in this dataset's `gene_id` column.
column_name = "gene_id"
rpe1_unique_filtered_genes = filter_genes(
    dataset=ReplogleWeissman2022_rpe1.adata,
    column_name=column_name,
    genes=carlos_genes_ensmbl_ids,
)

# Number of target genes found in the dataset.
print(len(rpe1_unique_filtered_genes))


4


In [53]:
# Add the control group ("non-targeting") to the values kept by the pseudobulk step.
# Guarded so that re-running this cell does not append the label a second time.
if "non-targeting" not in rpe1_unique_filtered_genes:
    rpe1_unique_filtered_genes.append("non-targeting")


In [54]:
# Pseudobulk the cells grouped by `gene_id`, restricted to the selected genes
# plus the control group.
# NOTE(review): min_cell_count is hard-coded to 0 here, while the notebook-level
# `min_cell_count` (100) is what the export cell writes into the file name
# ("_m100") — confirm which value is intended.
ReplogleWeissman2022_rpe1.pseudobulk_by_condition(
    "gene_id",
    replace=True,
    filter_values=rpe1_unique_filtered_genes,
    min_cell_count=0,
    percentage=percentage,
    amount=amount,
)


Processing group non-targeting with 11485 cells.
Processing group ENSG00000167325 with 67 cells.
Processing group ENSG00000171848 with 229 cells.
Processing group ENSG00000135069 with 160 cells.
Processing group ENSG00000228716 with 63 cells.


In [55]:
# ReplogleWeissman2022_rpe1.normalize(type="CPM")


In [56]:
# Export the pseudobulk samples (normalisation is commented out above, hence the
# "Not-norm" prefix) to a TSV whose name records the notebook-level parameters.
# NOTE(review): the name embeds `min_cell_count` (100), but the pseudobulk call
# for this dataset passes min_cell_count=0 — confirm the label is intended.
ReplogleWeissman2022_rpe1.export_tsv(
    "Not-norm-ReplogleWeissman2022_rpe1_essential_Carlos_genes_pseudobulk"
    f"_m{min_cell_count}_p{percentage}_a{amount}.tsv"
)


Exporting all 150 samples to: Not-norm-ReplogleWeissman2022_rpe1_essential_Carlos_genes_pseudobulk_m100_p0.5_a30.tsv


# ReplogleK562 essential

In [57]:
# Show the pertdata catalogue entry for the dataset handled in this section.
selected_dataset = "ReplogleWeissman2022_K562_essential"
print(f"Details for '{selected_dataset}':")
dataset_details = datasets_dict[selected_dataset]
print(json.dumps(dataset_details, indent=2))


Details for 'ReplogleWeissman2022_K562_essential':
{
  "name": "ReplogleWeissman2022_K562_essential",
  "info": "",
  "publication": {
    "doi": "10.1016/j.cell.2022.05.013"
  },
  "repository": "scPerturb",
  "url": "https://zenodo.org/record/7041849/files/ReplogleWeissman2022_K562_essential.h5ad"
}


In [58]:
# Load the ReplogleWeissman2022_K562_essential dataset through pertdata and print
# a summary of the resulting PertDataset object.
ReplogleWeissman2022_K562_essential = pertdata.PertDataset(
    name="ReplogleWeissman2022_K562_essential"
)
print(ReplogleWeissman2022_K562_essential)


Dataset already cached: /home/gdufort/.cache/pertdata/ReplogleWeissman2022_K562_essential
Loading: /home/gdufort/.cache/pertdata/ReplogleWeissman2022_K562_essential/adata.h5ad
PertDataset object
    name: ReplogleWeissman2022_K562_essential
    cache_dir_path: /home/gdufort/.cache/pertdata
    path: /home/gdufort/.cache/pertdata/ReplogleWeissman2022_K562_essential
    adata: AnnData object with n_obs ✕ n_vars = 310385 ✕ 8563


In [59]:
# Keep only the target Ensembl IDs that occur in this dataset's `gene_id` column.
column_name = "gene_id"
replogle_unique_filtered_genes = filter_genes(
    dataset=ReplogleWeissman2022_K562_essential.adata,
    column_name=column_name,
    genes=carlos_genes_ensmbl_ids,
)

# Number of target genes found in the dataset.
print(len(replogle_unique_filtered_genes))

4


In [60]:
# Add the control group ("non-targeting") to the values kept by the pseudobulk step.
# Guarded so that re-running this cell does not append the label a second time.
if "non-targeting" not in replogle_unique_filtered_genes:
    replogle_unique_filtered_genes.append("non-targeting")


In [61]:
# Pseudobulk the cells grouped by `gene_id`, restricted to the selected genes
# plus the control group.
# NOTE(review): min_cell_count is hard-coded to 0 here, while the notebook-level
# `min_cell_count` (100) is what the export cell writes into the file name
# ("_m100") — confirm which value is intended.
ReplogleWeissman2022_K562_essential.pseudobulk_by_condition(
    "gene_id",
    replace=True,
    filter_values=replogle_unique_filtered_genes,
    min_cell_count=0,
    percentage=percentage,
    amount=amount,
)


Processing group ENSG00000171848 with 577 cells.
Processing group non-targeting with 10691 cells.
Processing group ENSG00000228716 with 125 cells.
Processing group ENSG00000135069 with 72 cells.
Processing group ENSG00000167325 with 103 cells.


In [62]:
# ReplogleWeissman2022_K562_essential.normalize(type="CPM")


In [63]:
# Export the pseudobulk samples (normalisation is commented out above, hence the
# "Not-norm" prefix) to a TSV whose name records the notebook-level parameters.
# NOTE(review): the name embeds `min_cell_count` (100), but the pseudobulk call
# for this dataset passes min_cell_count=0 — confirm the label is intended.
ReplogleWeissman2022_K562_essential.export_tsv(
    "Not-norm-ReplogleWeissman2022_K562_essential_Carlos_genes_pseudobulk"
    f"_m{min_cell_count}_p{percentage}_a{amount}.tsv"
)


Exporting all 150 samples to: Not-norm-ReplogleWeissman2022_K562_essential_Carlos_genes_pseudobulk_m100_p0.5_a30.tsv


In [None]:
# Show the pertdata catalogue entry for the RPE1 dataset.
selected_dataset = "ReplogleWeissman2022_rpe1"
print(f"Details for '{selected_dataset}':")
dataset_details = datasets_dict[selected_dataset]
print(json.dumps(dataset_details, indent=2))


Details for 'ReplogleWeissman2022_rpe1':
{
  "name": "ReplogleWeissman2022_rpe1",
  "info": "",
  "publication": {
    "doi": "10.1016/j.cell.2022.05.013"
  },
  "repository": "scPerturb",
  "url": "https://zenodo.org/record/7041849/files/ReplogleWeissman2022_rpe1.h5ad"
}


In [None]:
# Identifiers present in all four per-dataset filtered lists
# (jHepG2, Jurkat, K562 and RPE1).
# NOTE(review): each of these lists had "non-targeting" appended in an earlier
# cell, so that label is part of the intersection as well — confirm this is wanted.
common_genes = (
    set(jHepG2_unique_filtered_genes)
    & set(jurkat_unique_filtered_genes)
    & set(replogle_unique_filtered_genes)
    & set(rpe1_unique_filtered_genes)
)

In [None]:
# Number of identifiers shared by all four datasets.
print(len(common_genes))

In [None]:
# Show the shared identifiers; `common_genes` is a set, so the printed order is
# not guaranteed to be stable between runs.
print(common_genes)

In [None]:
# Look up, in the K562 dataset, the values of the `gene` column for every cell
# whose `gene_id` is one of the common identifiers, and print the distinct names.
column_name = "gene_id"
k562_obs = ReplogleWeissman2022_K562_essential.adata.obs
is_common = k562_obs[column_name].isin(common_genes)
common_genes_names = k562_obs["gene"][is_common].unique().tolist()
print(common_genes_names)

In [None]:
# Add the control group ("non-targeting") to the list of values to keep.
# NOTE(review): `unique_filtered_genes` is not assigned in any cell visible in
# this notebook, so this would raise NameError on a fresh kernel unless it is
# defined elsewhere — confirm which list is meant.
unique_filtered_genes.append("non-targeting")

In [None]:
# Pseudobulk settings for the K562 run below; the same three values are embedded
# in the name of the exported TSV file.
min_cell_count, percentage, amount = 100, 0.5, 30

In [None]:
# Pseudobulk the K562 cells grouped by `gene_id`, this time passing the
# notebook-level `min_cell_count` instead of 0.
# NOTE(review): `unique_filtered_genes` is not assigned in any cell visible in
# this notebook — confirm where it is supposed to come from.
ReplogleWeissman2022_K562_essential.pseudobulk_by_condition(
    "gene_id",
    replace=True,
    filter_values=unique_filtered_genes,
    min_cell_count=min_cell_count,
    percentage=percentage,
    amount=amount,
)


In [None]:
# Display the `.obs` table of the K562 dataset to inspect the result of the
# pseudobulk step above.
ReplogleWeissman2022_K562_essential.adata.obs


In [None]:
# ReplogleWeissman2022_K562_essential.normalize(type="CPM")


In [None]:
# Export the K562 pseudobulk samples to a TSV whose name records the
# notebook-level parameters.
# NOTE(review): this name starts with "Non-norm", whereas the earlier exports use
# "Not-norm" — confirm whether the difference is intentional.
ReplogleWeissman2022_K562_essential.export_tsv(
    "Non-norm-ReplogleWeissman2022_K562_essential_pseudobulk"
    f"_m{min_cell_count}_p{percentage}_a{amount}.tsv"
)