# Create the dataloader

In [None]:
import pandas as pd

from scdataloader import Dataset
from scdataloader import dataLoader

%load_ext autoreload
%autoreload 2

## var definition

In [None]:
# Look up the organism record whose ontology id is NCBITaxon:9606 (Homo sapiens).
# NOTE(review): `lb` is not imported in the visible cells — confirm where it is bound.
organism = lb.Organism.filter(ontology_id="NCBITaxon:9606").one()
# Load the genes registered for that organism as a dataframe (`genedf`),
# which the following cells enrich and pass to the mapped dataset.
genedf = lb.Gene.filter(organism_id=organism.id).df()

### Optional: Var location 

Here we add another layer of information to the dataframe: a rough estimate of each gene's genomic location (its chromosome and start position).


In [None]:
from scdataloader.utils import getBiomartTable

# Biomart table carrying each gene's start position and chromosome, keyed by
# Ensembl gene id; when an id appears several times only its first row is kept.
biomart = getBiomartTable(attributes=['start_position', 'chromosome_name'])
biomart = biomart.set_index('ensembl_gene_id')
biomart = biomart[~biomart.index.duplicated()]

# Same keying and de-duplication for the gene dataframe.
# NOTE: set_index moves 'ensembl_gene_id' out of the columns, so this cell
# cannot be re-run on the already-indexed `genedf`.
genedf = genedf.set_index('ensembl_gene_id')
genedf = genedf[~genedf.index.duplicated()]

# Inner join keeps only the genes present in both tables; the result is then
# sorted by chromosome name and, within a chromosome, by start position.
location = biomart[['start_position', 'chromosome_name']]
genedf = genedf.join(location, how='inner')
genedf = genedf.sort_values(by=['chromosome_name', 'start_position'])

downloading gene names from biomart

['ensembl_gene_id', 'hgnc_symbol', 'gene_biotype', 'entrezgene_id', 'start_position', 'chromosome_name']


In [None]:
# Assign each gene a `group` id so that neighbouring genes share a group.
# `genedf` is expected to be sorted by chromosome_name, then start_position
# (done when the Biomart location columns were joined): every row is compared
# with the row directly above it.

# Maximum distance (in start_position units) to the previous gene on the same
# chromosome for a gene to stay in the previous gene's group.
MAX_NEIGHBOR_GAP = 10_000

chromosome = genedf['chromosome_name']
start = genedf['start_position']

# A row opens a new group when its chromosome differs from the previous row's
# (always the case for the first row, whose shifted value is missing), or when
# its start position is more than MAX_NEIGHBOR_GAP past the previous one.
starts_new_group = (chromosome != chromosome.shift()) | (start.diff() > MAX_NEIGHBOR_GAP)

# Ratio of number of groups to number of genes.
print(f'reduced the size to {int(starts_new_group.sum())/len(genedf)}')

# The running count of group openings yields consecutive ids starting at 1.
# to_numpy() makes the assignment positional rather than index-aligned.
genedf['group'] = starts_new_group.cumsum().to_numpy()

reduced the size to 0.6722574020195106



In [None]:
# NOTE(review): the two lines below look like a pasted repr of two `.h5ad` File
# records described as 'preprocessed by scprint'; nothing here is executed —
# confirm whether this is still needed for reference.
#[File(uid='AnalH1SNJ2cQ7SVtsAvg', suffix='.h5ad', accessor='AnnData', description='preprocessed by scprint', version='2', size=59079604, hash='4f0no-pjg35qG--75wu5JZ', hash_type='sha1-fl', visibility=1, key_is_virtual=True, updated_at=2023-12-12 13:16:03 UTC, storage_id=1, initial_version_id=990, created_by_id=1), 

#File(uid='qsmZFgVcwPqVN9h23x6p', suffix='.h5ad', accessor='AnnData', description='preprocessed by scprint', version='2', size=82350434, hash='lUJl8wVAqHv1WM829YtELW', hash_type='sha1-fl', visibility=1, key_is_virtual=True, updated_at=2023-12-12 13:27:33 UTC, storage_id=1, initial_version_id=1034, created_by_id=1)]

### optional: var embeddings

Many recent models, such as transformers, work on embeddings of the variables (here, the genes). These embeddings can either be learnt or be provided up front, as is done here.

In [None]:
# Compute the embeddings for the genes in `genedf` and save them as a parquet
# file so that they can be reloaded later without recomputing.
# NOTE(review): `embed` is not imported in the visible cells — confirm where it comes from.
embeddings_path = '../../data/temp/embeddings.parquet'

embeddings = embed(
    genedf=genedf,
    organism="homo_sapiens",
    cache=True,
    fasta_path="/tmp/data/fasta/",
    embedding_size=1024,
)
embeddings.to_parquet(embeddings_path)

In [None]:
# Reload the embeddings from the parquet file written by the embedding cell
# (same path), so the expensive computation can be skipped on later runs.
embeddings = pd.read_parquet('../../data/temp/embeddings.parquet')

## data loader

To create the dataloader we need a lamindb dataset. Here we take the one that we created in the previous notebook, but it can be any other dataset, such as Lamin's cellxgene dataset.

example:
```python
dataset = ln.Dataset.using("laminlabs/cellxgene").one()
```

In [None]:
# OR directly load an existing dataset by its name.
# NOTE(review): `ln` is not imported in the visible cells — confirm where it is bound.
name = "preprocessed dataset"
matching = ln.Dataset.filter(name=name)
dataset = matching.one()

# Display how many artifacts are attached to the dataset.
dataset.artifacts.count()

0

In [None]:
# The dataloader can give rare samples more weight;
# one needs to provide the labels on which to weight the samples:
# NOTE(review): `hierarchical_labels` is not defined in the visible cells — confirm where it is set.
labels_weighted_sampling = hierarchical_labels+[
    'sex_ontology_term_id',
    "cell_type_ontology_term_id",
    #"tissue_ontology_term_id",
    "disease_ontology_term_id",
    #"development_stage_ontology_term_id",
    "assay_ontology_term_id",
    'self_reported_ethnicity_ontology_term_id',
]

# The dataloader can also output some additional obs fields, on top of the
# labels used for weighted sampling.
all_labels = labels_weighted_sampling+[
    #'dataset_id',
    #'cell_culture',
    "dpt_group",
    "heat_diff",
    "nnz",
]

In [None]:
# We then create a mapped dataset. This transforms a bunch of AnnData files, from
# possibly various species, into a combined object that acts roughly as a single
# AnnData dataset (WIP to get all the features of an AnnData object).
mdataset = Dataset(
    dataset,
    genedf,
    gene_embedding=embeddings,
    # The ontology id must not carry literal double quotes inside the string:
    # it has to match the "NCBITaxon:9606" id used to select the organism above.
    organisms=["NCBITaxon:9606"],
    # obs fields that the dataloader should output
    obs=all_labels,
    # labels used for weighted sampling — presumably the ones that get encoded; confirm
    encode_obs=labels_weighted_sampling,
    map_hierarchy=hierarchical_labels,
)
mdataset

❗ no run & transform get linked, consider passing a `run` or calling ln.track()
won't do any check but we recommend to have your dataset coming from local storage
❗ no run & transform get linked, consider passing a `run` or calling ln.track()
total dataset size is 23.47712381 Gb
---
❗ no run & transform get linked, consider passing a `run` or calling ln.track()
total dataset size is 23.47712381 Gb
---
dataset contains:
     1582328 cells
     70116 genes
     8 labels
     1 organisms
dataset contains 113 classes to predict
embedding size is 1024




In [None]:
# now we make the dataloader
dataloader = BaseDataLoader(mdataset, label_to_weight=labels_weighted_sampling, batch_size=4, num_workers=1)
len(dataloader)

In [None]:
# Peek at the first minibatch only: print it, then stop iterating.
# Nothing is printed if the dataloader yields no batch.
for i in dataloader:
    print(i)
    break

[tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64), tensor([30, 65, 78,  4]), tensor([5, 3, 3, 5]), tensor([1, 1, 1, 1]), tensor([2, 5, 3, 3]), tensor([0, 0, 1, 1]), ('13_MONDO:0100320_CL:0001062_UBERON:0000178', '2_PATO:0000461_CL:0000907_UBERON:0000178', '0_PATO:0000461_CL:0000938_UBERON:0000178', '7_MONDO:0100320_CL:0000794_UBERON:0000178'), tensor([0.0027, 0.0066, 0.0040, 0.0029], dtype=torch.float64), tensor([1206, 1953, 1005,  787])]



In [None]:
# (WIP) build a set of different collators that can be used to preprocess the minibatches before feeding them to the model 