# Create the dataloader

In [1]:
# Load the lamindb instance this notebook works against (shell escape; the CLI
# prints which instance was loaded).
# NOTE(review): newer lamindb CLI versions may expect `lamin connect` for this step —
# confirm against the installed version.
! lamin load scdataloader

💡 found cached instance metadata: /home/ml4ig1/.lamin/instance--jkobject--scdataloader.env
💡 loaded instance: jkobject/scdataloader
💡 loaded instance: jkobject/scdataloader


In [2]:
import tqdm

from scdataloader import DataModule

%load_ext autoreload
%autoreload 2

💡 lamindb instance: jkobject/scdataloader


In [None]:
# Minimal configuration: only the arguments below are set explicitly.
datamodule = DataModule(
    # -- which data to use --
    collection_name="preprocessed dataset",
    organisms=["NCBITaxon:9606"],  # organism that we will work on
    # -- train / validation / test split --
    validation_split=0.1,
    test_split=0,
    # -- collator --
    how="most expr",  # only the most expressed genes will be selected
    max_len=1000,  # keep only the 1000 most expressed
    # -- batching / loading --
    batch_size=64,
    num_workers=1,
)

## Or it can be a much more complex dataloader!

In [3]:
# Labels handled as hierarchical.
# Currently disabled candidates:
#   "tissue_ontology_term_id", "development_stage_ontology_term_id"
hierarchical_labels = [
    "cell_type_ontology_term_id",
    "disease_ontology_term_id",
    "assay_ontology_term_id",
    "self_reported_ethnicity_ontology_term_id",
]

# Labels to predict: the hierarchical ones followed by two more.
labels_to_pred = [
    *hierarchical_labels,
    "sex_ontology_term_id",
    "organism_ontology_term_id",
]

# Full label list: the labels to predict followed by four extra fields.
# Currently disabled candidates: "dataset_id", "cell_culture"
all_labels = [
    *labels_to_pred,
    "heat_diff",
    "total_counts",
    "nnz",
    "dpt_group",
]

name = "preprocessed dataset"

## data loader

To create the dataloader we need a lamindb dataset. Here we use the one created in the previous notebook, but it can be any other dataset, such as Lamin's cellxgene dataset.

example:
```python
dataset = ln.Collection.using("laminlabs/cellxgene").one()
```

In [5]:
# Fuller configuration: same collection as before, now with explicit labels.
datamodule = DataModule(
    # -- which data to use --
    collection_name="preprocessed dataset",
    organisms=["NCBITaxon:9606"],  # organism that we will work on
    # -- labels --
    all_labels=all_labels,  # all the labels to query in the obs field
    hierarchical_labels=hierarchical_labels,  # labels that can benefit from ontological hierarchies
    label_to_weight=labels_to_pred,  # used for weighted random sampling
    label_to_pred=labels_to_pred,
    # -- train / validation / test split --
    validation_split=0.2,
    test_split=0,
    # -- collator --
    how="most expr",  # only the most expressed genes will be selected
    max_len=1000,  # keep only the 1000 most expressed
    add_zero_genes=100,  # some additional zeros will be given
    # -- batching / loading --
    batch_size=64,
    num_workers=1,
)

# Set up the datamodule (as exemplified in Lightning's good practices; there may
# still be things to improve here).
testfiles = datamodule.setup()

won't do any check but we recommend to have your dataset coming from local storage


100.0% are aligned
total dataset size is 0.917606818 Gb
---
dataset contains:
     23349 cells
     70116 genes
     10 labels
     1 organisms
dataset contains 40 classes to predict

downloading gene names from biomart
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id', 'start_position', 'chromosome_name']
['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id', 'start_position', 'chromosome_name']
reduced the size to 0.6722574020195106


In [7]:
# Iterate over the training dataloader behind a progress bar. Here we only print
# the first batch and stop; replace the loop body with `pass` to run through
# every batch instead.
for batch in tqdm.tqdm(datamodule.train_dataloader()):
    print(batch)
    break

  0%|          | 0/292 [00:00<?, ?it/s]

{'x': tensor([[ 78.,   6.,   6.,  ...,   0.,   0.,   0.],
        [141.,  75.,  58.,  ...,   0.,   0.,   0.],
        [309.,  50.,  31.,  ...,   0.,   0.,   0.],
        ...,
        [157., 108.,  79.,  ...,   0.,   0.,   1.],
        [303., 123.,  70.,  ...,   0.,   0.,   0.],
        [136.,  29.,  22.,  ...,   0.,   0.,   0.]]), 'genes': tensor([[41514,   725,  9560,  ..., 23989, 20098, 39181],
        [41514, 15694,  9164,  ..., 47038, 10040, 54239],
        [41514, 16072, 12461,  ..., 59205, 16411, 67531],
        ...,
        [41514,  1583,  8960,  ..., 62974, 57751, 14310],
        [41514, 13107,  9164,  ..., 20352, 32101,  9779],
        [41514, 15694,   409,  ..., 50807, 36053, 38710]], dtype=torch.int32), 'class': tensor([[ 2,  0,  0,  0,  0,  0],
        [ 7,  0,  0,  0,  0,  0],
        [ 3,  0,  0,  0,  0,  0],
        [12,  0,  0,  0,  0,  0],
        [ 4,  0,  0,  0,  0,  0],
        [ 9,  0,  0,  0,  0,  0],
        [ 4,  0,  0,  0,  0,  0],
        [ 7,  0,  0,  0,  0, 

  0%|          | 0/292 [00:03<?, ?it/s]



In [None]:
# .. 
# with lightning, the datamodule is passed to the trainer's fit call:
# Trainer().fit(model, datamodule=datamodule)

In [None]:
# (WIP) build a set of different collators that can be used to preprocess the minibatches before feeding them to the model 