# Curating a perturbation dataset

In [None]:
# One-time environment setup (uncomment to install; %pip targets the running kernel's environment):
# %pip install 'lamindb[jupyter,aws,bionty]'
# Initialise a LaminDB instance with local storage in ./test-perturbation and the bionty and wetlab schemas.
!lamin init --storage ./test-perturbation --schema bionty,wetlab

In [None]:
# Download the McFarlandTsherniak2020 h5ad file from Zenodo; -nc skips the download if the file already exists.
!wget -nc https://zenodo.org/record/7041849/files/McFarlandTsherniak2020.h5ad

In [None]:
import lamindb as ln
import bionty as bt
import wetlab as wl
import anndata as ad
import numpy as np
import pandas as pd

In [None]:
# Read the downloaded h5ad file into an AnnData object.
adata = ad.read_h5ad("McFarlandTsherniak2020.h5ad")

In [None]:
# Subsample half of the cells (without replacement) to keep the example light-weight.
# A fixed seed makes the selection, and therefore every downstream validation
# result, reproducible under Restart & Run All.
# Note: re-running this cell halves the current `adata` again.
rng = np.random.default_rng(seed=0)
subsample_idx = rng.choice(adata.n_obs, size=int(0.5 * adata.n_obs), replace=False)
adata = adata[subsample_idx, :].copy()

In [None]:
# Show every column when displaying wide data frames (pandas is imported in the import cell).
pd.set_option("display.max_columns", None)

# Cell-level metadata of the dataset.
adata.obs

## Perturbations overview

In [None]:
# Number of cells per perturbation.
adata.obs["perturbation"].value_counts()

In [None]:
# Number of cells per perturbation type.
adata.obs["perturbation_type"].value_counts()

In [None]:
# Number of cells per time point.
adata.obs["time"].value_counts()

## Setup for curation

In [None]:
# Drug perturbations are curated against the ChEBI chemistry/drug ontology.
# Fetch the single Source record with entity "Drug" and name "chebi"; `.one()` is
# expected to fail unless exactly one record matches (confirm against the lamindb docs).
chebi_source = bt.Source.filter(entity="Drug", name="chebi").one()
# Make this source available to the wetlab Compound registry.
wl.Compound.add_source(chebi_source)
# Public ontology object for compounds; preview its first three rows.
compounds = wl.Compound.public()
compounds.df().head(3)

## Curation

In [None]:
# Map each categorical obs column to the registry field it is validated against.
categoricals = {
    # Cell line identifiers and names
    "DepMap_ID": bt.CellLine.ontology_id,
    "cell_line": bt.CellLine.name,
    # Ontology-backed biological annotations
    "disease": bt.Disease.name,
    "organism": bt.Organism.name,
    # Free-form labels
    "perturbation_type": ln.ULabel.name,
    "sex": bt.Phenotype.name,
    "time": ln.ULabel.name,
    "tissue_type": ln.ULabel.name,
}

# Both cell line columns are validated against the DepMap source.
depmap_source = bt.Source.filter(name="depmap").one()
sources = dict.fromkeys(("DepMap_ID", "cell_line"), depmap_source)


In [None]:
# Build a curator for the AnnData object: the var index is validated as Ensembl gene IDs,
# the obs columns listed in `categoricals` against their registry fields, and the
# columns listed in `sources` against the DepMap source.
curate = ln.Curate.from_anndata(
    adata, 
    var_index=bt.Gene.ensembl_gene_id,
    categoricals=categoricals, 
    organism="human",
    sources=sources
)

In [None]:
# Register the column names that are new to the instance
# (presumably as feature records; confirm against the lamindb Curate docs).
curate.add_new_from_columns()

In [None]:
# Validate the var index and the categorical columns; values that fail are
# read from `curate.non_validated` in the cells that follow.
curate.validate()

In [None]:
# We found a mix of ensembl IDs and gene symbols in the var_index -> get all gene symbols to ensembl IDs
gene_mapper = bt.Gene.standardize(curate.non_validated["var_index"], field="symbol", return_field="ensembl_gene_id", return_mapper=True, organism="human")
adata.var.index = adata.var.index.map(lambda x: gene_mapper.get(x, x))

In [None]:
# We search for the disease
for disease in curate.non_validated["disease"]:
    print(bt.Disease.public().search(disease))

In [None]:
# Rename the disease labels that failed validation to their replacement names
# (presumably picked from the search results above; verify against the ontology).
disease_renames = {
    "colon/colorectal cancer": "colorectal cancer",
    "rhabdoid": "rhabdoid tumor",
    "bladder cancer": "urinary bladder carcinoma",
    "endometrial/uterine cancer": "uterine corpus cancer",
}
adata.obs["disease"] = adata.obs["disease"].cat.rename_categories(disease_renames)

In [None]:
# Public cell line ontology restricted to the DepMap source.
depmap_cell_lines = bt.CellLine.public(source=bt.Source.filter(name="depmap").one())
# Standardize the cell line names, then inspect the standardized column.
adata.obs["cell_line"] = depmap_cell_lines.standardize(adata.obs["cell_line"], field="name")
depmap_cell_lines.inspect(adata.obs["cell_line"], field="name")

In [None]:
# Register validated genes and DepMap IDs.
curate.add_validated_from_var_index()
curate.add_validated_from("DepMap_ID")
# These columns are added as new records.
for new_label_column in ("perturbation_type", "sex", "time", "tissue_type"):
    curate.add_new_from(new_label_column)
# Diseases are added from validated values; cell lines are added as new records.
curate.add_validated_from("disease")
curate.add_new_from("cell_line")

In [None]:
# Re-create the curator on the updated `adata` (same configuration as before)
# and validate again to see what is still unresolved.
curate = ln.Curate.from_anndata(
    adata, 
    var_index=bt.Gene.ensembl_gene_id,
    categoricals=categoricals, 
    organism="human",
    sources=sources
)
curate.validate()

In [None]:
# Keep only the genes whose identifier passed validation.
is_validated_gene = ~adata.var.index.isin(curate.non_validated["var_index"])
adata = adata[:, is_validated_gene].copy()

In [None]:
# Final curator on the gene-filtered `adata`; this is the instance later used to save the artifact.
# `sources` is passed here as in the earlier curator constructions so that DepMap_ID and
# cell_line are validated against the DepMap source rather than the default source.
curate = ln.Curate.from_anndata(
    adata,
    var_index=bt.Gene.ensembl_gene_id,
    categoricals=categoricals,
    organism="human",
    sources=sources,
)
curate.validate()

## Creating and associating the Perturbations

The list below gives the direct targets of each perturbation.
Many of these perturbations likely also affect entire pathways downstream of their direct targets.
However, for simplicity, we only curate the direct targets here.

1. **AZD5591**: Unknown
2. **Afatinib**: **Proteins** - EGFR (Epidermal Growth Factor Receptor), HER2 (Human Epidermal growth factor Receptor 2)
3. **BRD3379**: Unknown
4. **Bortezomib**: **Protein complex** - Proteasome (specifically the 26S proteasome subunit)
5. **Dabrafenib**: **Gene/Protein** - BRAF (V600E mutation in the BRAF gene, which codes for a protein kinase)
6. **Everolimus**: **Protein** - mTOR (Mammalian Target of Rapamycin)
7. **Gemcitabine**: **Pathway/Process** - DNA synthesis (inhibition of ribonucleotide reductase and incorporation into DNA)
8. **Idasanutlin**: **Protein** - MDM2 (Mouse Double Minute 2 homolog)
9. **JQ1**: **Protein** - BRD4 (Bromodomain-containing protein 4)
10. **Navitoclax**: **Proteins** - BCL-2, BCL-XL (B-cell lymphoma 2 and B-cell lymphoma-extra large)
11. **Prexasertib**: **Protein** - CHK1 (Checkpoint kinase 1)
12. **Taselisib**: **Protein/Pathway** - PI3K (Phosphoinositide 3-kinase)
13. **Trametinib**: **Proteins** - MEK1/2 (Mitogen-Activated Protein Kinase Kinase 1 and 2)
14. **control**: Not applicable
15. **sgGPX4-1**: **Gene/Protein** - GPX4 (Glutathione Peroxidase 4)
16. **sgGPX4-2**: **Gene/Protein** - GPX4 (Glutathione Peroxidase 4)
17. **sgLACZ**: **Gene/Protein** - LACZ (β-galactosidase; a bacterial *E. coli* gene, so this guide serves as a control in human cells)
18. **sgOR2J2**: **Gene/Protein** - OR2J2 (Olfactory receptor family 2 subfamily J member 2)

The dataset has two types of perturbations: CRISPR and Compounds.
We will create their records and associated targets separately.

In [None]:
# Split the cell metadata by perturbation type.
perturbation_type = adata.obs["perturbation_type"]
crispr_metadata = adata.obs.loc[perturbation_type == "CRISPR"]
drug_metadata = adata.obs.loc[perturbation_type == "drug"]

## GeneticTreatments

In [None]:
# Metadata of the cells whose perturbation_type is "CRISPR".
crispr_metadata

In [None]:
# Distinct CRISPR perturbations present in the data.
crispr_metadata["perturbation"].unique().tolist()

In [None]:
# Genetic treatment record for the sgGPX4-1 guide.
sgGPX4_1_treatment = wl.GeneticTreatment(
    system="CRISPR Cas9",
    name="sgGPX4-1 knockdown",
).save()
# Create the GPX4 protein record and a treatment target that links to it.
gpx4_prot = bt.Protein.from_source(gene_symbol="GPX4", organism="human").save()
gpx4_target = wl.TreatmentTarget(name="Glutathione Peroxidase 4").save()
gpx4_target.proteins.add(gpx4_prot)
# Attach the target to the treatment; `gpx4_target` is reused for sgGPX4-2 below.
sgGPX4_1_treatment.targets.add(gpx4_target)

In [None]:
# Genetic treatment record for the second GPX4 guide.
sgGPX4_2_treatment = wl.GeneticTreatment(
    system="CRISPR Cas9",
    name="sgGPX4-2 knockdown",
).save()
# Reuses `gpx4_target`, which is created in the sgGPX4-1 cell.
sgGPX4_2_treatment.targets.add(gpx4_target)

In [None]:
# Genetic treatment record for the sgLACZ guide.
sglacz_treatment = wl.GeneticTreatment(
    system="CRISPR Cas9",
    name="sgLACZ knockdown",
).save()
# NOTE(review): lacZ is an E. coli gene; looking up "beta-galactosidase" with
# organism="human" may resolve to the human enzyme instead. Confirm this is intended.
lacz_prot = bt.Protein.from_source(name="beta-galactosidase", organism="human").save()
lacz_target = wl.TreatmentTarget(name="beta-galactosidase").save()
lacz_target.proteins.add(lacz_prot)
sglacz_treatment.targets.add(lacz_target)

In [None]:
# Genetic treatment record for the sgOR2J2 guide. The name follows the
# "<guide> knockdown" pattern of the other treatments and matches the
# perturbation label used in the dataset.
sgor2j2_treatment = wl.GeneticTreatment(
    system="CRISPR Cas9",
    name="sgOR2J2 knockdown",
).save()
# Create the OR2J2 protein record and a treatment target that links to it.
or2j2_prot = bt.Protein.from_source(name="Olfactory receptor 2J2", organism="human").save()
or2j2_target = wl.TreatmentTarget(name="Olfactory receptor family 2 subfamily J member 2").save()
or2j2_target.proteins.add(or2j2_prot)
sgor2j2_treatment.targets.add(or2j2_target)

## CompoundTreatments

In [None]:
# Metadata of the cells whose perturbation_type is "drug".
drug_metadata

In [None]:
# Build Compound records from the drug perturbation names using the ChEBI source.
# This rebinds `compounds`, which previously held the public ontology object.
# NOTE(review): no save call for these records is visible in this notebook; confirm whether they must be saved.
compounds = wl.Compound.from_values(drug_metadata["perturbation"], field="name", source=chebi_source)

In [None]:
# List the bionty Source records as a data frame.
bt.Source.df()

In [None]:
# TODO: fields still to curate for the compound treatments:
# - dose_unit
# - dose_value
# - perturbation
# - chembl-ID

In [None]:
# Save the curated AnnData as an artifact with the given description.
artifact = curate.save_artifact(description="McFarland AnnData")
# TODO: associate the genetic treatments and compounds with the artifact