### Now we can also find other Breast Cancer Datasets from popular Atlases
---
Atlases provide centralized access to data with standardized metadata.

In [2]:
!lamin load laminlabs/cellxgene

[92m→[0m connected lamindb: laminlabs/cellxgene


In [1]:
import lamindb as ln
import bionty as bt

[92m→[0m connected lamindb: laminlabs/lamindata


In [3]:
import scanpy as sc
import anndata as ad

In [None]:
# Show the lamin CLI's info for the current setup, e.g. to check which instance is active.
!lamin info

In [None]:
# Build a lookup object over the Disease registry so entries can be picked by attribute.
disease = bt.Disease.lookup()
# NOTE(review): a previous run of this cell raised OperationalError because the
# connection to the instance's database server (port 5432) timed out; the lookup
# needs that server to be reachable.

In [None]:
# Display the `breast_cancer` entry of the lookup object.
disease.breast_cancer

In [None]:
# Disease entries an artifact may be annotated with to count as a match.
target_diseases = [disease.breast_cancer, disease.breast_carcinoma]

# Restrict to `.h5ad` artifacts linked to any of the target diseases.
breast_h5ad_query = ln.Artifact.filter(suffix=".h5ad", diseases__in=target_diseases)

# Order by creation time and preview the first rows as a DataFrame.
breast_h5ad_query.order_by("created_at").df().head()

In [None]:
# Fetch a single artifact by its hard-coded identifier.
# NOTE(review): the recorded `describe()` output warns that this is not the latest
# version of the artifact — confirm that pinning this version is intended.
breast_artifact = ln.Artifact.get("F9rOMRMmMBhxmtbNkf8o")

In [21]:
# Print a description of the fetched artifact.
breast_artifact.describe()

[93m![0m This is not the latest version of the Artifact.
[93m![0m This is not the latest version of the Artifact.


In [None]:
# Cache the artifact's file locally (presumably downloads it from remote storage — confirm).
breast_artifact.cache()

In [23]:
# Load the artifact into memory; the next cell shows that the result is an AnnData object.
adata = breast_artifact.load()

In [24]:
# Display the AnnData summary (dimensions and annotation columns).
adata

AnnData object with n_obs × n_vars = 12510 × 33234
    obs: 'mapped_reference_assembly', 'mapped_reference_annotation', 'alignment_software', 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'donor_living_at_sample_collection', 'donor_menopausal_status', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'sample_derivation_process', 'sample_source', 'donor_BMI_at_collection', 'suspension_depleted_cell_types', 'suspension_derivation_process', 'suspension_dissociation_reagent', 'suspension_dissociation_time', 'suspension_percent_cell_viability', 'suspension_uuid', 'suspension_type', 'library_uuid', 'assay_ontology_term_id', 'sequencing_platform', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'sex_ontology_term_id', 'n_count_rna', 'n_feature_rna', 'percent_mito', 'percent_rb', 'tissue_location', 'bmi_group', 'procedure_group', 'age_group', 'samp

In [26]:
# Write the loaded AnnData to a local .h5ad file so it can be read back after the kernel restart.
adata.write_h5ad("./adata.h5ad")

### Now we can restart the kernel session and connect to our local instance
---
Once we are connected to our local instance we can register the newly downloaded adata file.

In [None]:
# Switch the lamin CLI to the instance in which the downloaded file will be registered.
# NOTE(review): the recorded output reports `anonymous/postgres`, not
# `laminlabs/lamindata` — confirm which instance slug is intended here.
!lamin connect laminlabs/lamindata

[92m→[0m connected lamindb: anonymous/postgres


In [8]:
import lamindb as ln
import bionty as bt
import scanpy as sc

In [None]:
# List the projects in the connected instance as a DataFrame.
ln.Project.df()

In [None]:
# Start tracking this notebook run, passing the target project by its hard-coded identifier.
ln.track(project="9M7iVOgQ8VAQ")

In [None]:
# List the artifacts in the connected instance as a DataFrame.
ln.Artifact.df()

In [9]:
# Read back the .h5ad file written before the kernel restart and display its summary.
adata = sc.read_h5ad("./adata.h5ad")
adata

AnnData object with n_obs × n_vars = 12510 × 33234
    obs: 'mapped_reference_assembly', 'mapped_reference_annotation', 'alignment_software', 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'donor_living_at_sample_collection', 'donor_menopausal_status', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'sample_derivation_process', 'sample_source', 'donor_BMI_at_collection', 'suspension_depleted_cell_types', 'suspension_derivation_process', 'suspension_dissociation_reagent', 'suspension_dissociation_time', 'suspension_percent_cell_viability', 'suspension_uuid', 'suspension_type', 'library_uuid', 'assay_ontology_term_id', 'sequencing_platform', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'sex_ontology_term_id', 'n_count_rna', 'n_feature_rna', 'percent_mito', 'percent_rb', 'tissue_location', 'bmi_group', 'procedure_group', 'age_group', 'samp

#### We can add the columns `site` and `patient` to `adata.obs`

In [10]:
def _site_from_disease(disease_label):
    """Return "primary" for the "breast cancer" label and "normal" for any other label."""
    if disease_label == "breast cancer":
        return "primary"
    return "normal"


# Derive the `site` annotation from the existing `disease` column.
# NOTE(review): every disease label other than "breast cancer" is mapped to
# "normal" — confirm this matches the dataset's disease vocabulary.
adata.obs["site"] = adata.obs["disease"].apply(_site_from_disease)
# Expose the donor identifier under the column name `patient`.
adata.obs["patient"] = adata.obs["donor_id"]

In [12]:
# Overwrite the local file so it includes the new `site` and `patient` columns.
adata.write_h5ad("./adata.h5ad")

#### We can create an artifact now

In [13]:
file_path = "./adata.h5ad"
# Register the local file as an artifact and save it immediately: the cells that
# follow attach labels to it before their own `save()` call, and labels can only
# be linked to a record that already exists in the database.
artifact = ln.Artifact(file_path, description="Breast Cancer data download from public CellxGene Atlas")
artifact.save()

In [None]:
# Print a description of the newly created artifact.
artifact.describe()

#### We can add the same kind of labels to our newly added dataset as we did for the GEO GSE180286 breast cancer dataset

In [None]:
## Add Filtered as ULabel
# Build a lookup object over the ULabel registry; the `filtered` label is picked from it below.
ulable_lookup = ln.ULabel.lookup()

In [None]:
# Display the `filtered` entry of the ULabel lookup.
ulable_lookup.filtered

In [None]:
# We can now add this filtered Label to the artifact
# (`ulabels.add` links the looked-up `filtered` entry to `artifact`).
artifact.ulabels.add(ulable_lookup.filtered)

In [None]:
# Print the artifact's description again to check the label that was just added.
artifact.describe()

In [None]:
# Save the artifact to the connected instance.
artifact.save()

### Similarly, we can add features:

In [20]:
# Build a lookup object over the Disease registry for use in the annotation below.
disease_lookup = bt.Disease.lookup()

In [None]:
# Display the `breast_cancer` entry of the Disease lookup.
disease_lookup.breast_cancer

In [None]:
# Feature values to attach to the artifact: two free-text values plus the
# looked-up breast cancer disease entry.
annotation_values = {
    "experiment": "Breast Cancer scRNA-seq Experiment 2 from Cell X Gene Atlas",
    "disease": disease_lookup.breast_cancer,
    "author": "Sunny Sun",
}

# Annotate the artifact with all three values in a single call.
artifact.features.add_values(annotation_values)

In [None]:
# Print the artifact's description again to check the feature values that were just added.
artifact.describe()

#### Now that both of our artifacts are ingested, we can create our collection

In [None]:
# create new collection
# Identifiers of the two artifacts to group. They must be filled in before this
# cell is run: an empty string cannot identify a specific artifact, so the cell
# stops with a clear message instead of querying with it.
FIRST_ARTIFACT_UID = ""  # TODO: uid of the first breast cancer artifact
SECOND_ARTIFACT_UID = ""  # TODO: uid of the second breast cancer artifact
if not (FIRST_ARTIFACT_UID and SECOND_ARTIFACT_UID):
    raise ValueError(
        "Set FIRST_ARTIFACT_UID and SECOND_ARTIFACT_UID to the uids of the "
        "artifacts that should form the collection."
    )

artifact1 = ln.Artifact.get(FIRST_ARTIFACT_UID)
artifact2 = ln.Artifact.get(SECOND_ARTIFACT_UID)
# Group both artifacts under one collection key and save the collection.
collection = ln.Collection(artifacts=[artifact1, artifact2], key="Breast Cancer Collection")
collection.save()

In [26]:
# Load the first artifact of the collection into memory.
adata1 = artifact1.load()

In [None]:
# Preview the first rows of the first dataset's `obs` table.
adata1.obs.head()

In [None]:
# Load the second artifact as well and preview the first rows of its `obs` table.
adata2 = artifact2.load()
adata2.obs.head()

#### Using this collection we can create a mapped collection which virtually joins the two `adata`s 

In [None]:
# create mapped collection with specific observation keys
# Only the `site` column of `obs` is requested as an observation key.
# NOTE(review): join="outer" presumably keeps the union of variables across the
# artifacts — confirm against the lamindb documentation.
mapped_collection = collection.mapped(obs_keys=["site"], join="outer")

In [30]:
# Display the shape of the mapped collection.
mapped_collection.shape

(np.int64(54559), 35234)

#### Now we can integrate it with PyTorch DataLoader:
---
The `MappedCollection` can be used directly with PyTorch's `DataLoader` for efficient data loading during model training.

In [None]:
# NOTE(review): WeightedRandomSampler is imported but not used in this cell.
from torch.utils.data import DataLoader, WeightedRandomSampler

# No conversion takes place here: the mapped collection itself is handed to the
# DataLoader as its dataset.
dataset = mapped_collection

# Shuffled mini-batches with a batch size of 10000.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=10000)

# Walk over every batch once and print it; each batch holds data drawn from
# the mapped collection.
for batch in dataloader:
    print(batch)

In [None]:
# Finish the run that was started with `ln.track(...)` earlier in this notebook.
ln.finish()