In [1]:
%load_ext lab_black
%load_ext autotime
import pandas as pd
import numpy as np
import drnb as nb
import drnb.pipeline as pl

time: 4.63 s (started: 2022-09-11 22:32:18 -07:00)


RNA-seq data, found via the [Picasso example notebook](https://github.com/pachterlab/picasso/blob/main/examplePicasso.ipynb). I think the publication reference is <https://doi.org/10.1038/s41586-021-03775-x> (it was published on [bioRxiv](https://www.biorxiv.org/content/10.1101/2020.07.02.184051v1) in 2020).

## Read count data

In [23]:
import gzip
from io import BytesIO

import requests
import scipy.io

# Download the gzip-compressed Matrix Market file and parse it entirely in
# memory (nothing is written to disk).
req = requests.get(
    "https://data.caltech.edu/tindfiles/serve/772ca768-9c54-4fd8-9aeb-e78966177453",
    timeout=10,
)
# Fail loudly on an HTTP error status: otherwise the body of an error page
# would be handed to gzip and surface as a misleading decompression error.
req.raise_for_status()
data = scipy.io.mmread(BytesIO(gzip.decompress(req.content)))
data.shape

(3850, 1999)

time: 8.82 s (started: 2022-09-11 22:51:28 -07:00)


In [24]:
# Echo the matrix as a quick sanity check of the values that were loaded.
data

array([[4.33988086, 0.        , 4.46614853, ..., 5.03771355, 0.        ,
        2.80237598],
       [2.72633524, 0.        , 5.3227869 , ..., 0.        , 0.        ,
        2.85165059],
       [0.        , 0.        , 4.36213812, ..., 5.93728805, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 5.06350072, ..., 0.49238087, 0.        ,
        0.        ],
       [0.        , 0.        , 4.17808255, ..., 5.44504377, 0.        ,
        0.        ],
       [2.34737733, 0.        , 5.16608948, ..., 0.        , 0.        ,
        0.        ]])

time: 4.1 ms (started: 2022-09-11 22:51:37 -07:00)


## Read metadata

In [22]:
# Load the per-sample metadata table. The compression is stated explicitly
# because the download URL has no file extension to infer it from, and the
# first CSV column is used as the row index.
target = pd.read_csv(
    filepath_or_buffer="https://data.caltech.edu/tindfiles/serve/e3ad6fcb-fbcc-4888-a5c9-3ae74346925b",
    index_col=0,
    compression="gzip",
)
target

Unnamed: 0,sample_name,smartseq_cluster_id,smartseq_cluster,sex_label,smartseq_cluster_color,medical_cond_label,cell_counts,n_genes,percent_mito,pass_count_filter,pass_mito_filter
SM-GE4R2_S062_E1-50,SM-GE4R2_S062_E1-50,46,Nr5a1_9|11 Rorb,M,#6C39A6,behavior - none,964371.56,9772,0.0,True,True
SM-GE4SI_S356_E1-50,SM-GE4SI_S356_E1-50,46,Nr5a1_9|11 Rorb,M,#6C39A6,behavior - none,973072.06,8253,0.0,True,True
SM-GE4SI_S172_E1-50,SM-GE4SI_S172_E1-50,46,Nr5a1_9|11 Rorb,M,#6C39A6,behavior - none,977132.60,9394,0.0,True,True
LS-15034_S07_E1-50,LS-15034_S07_E1-50,42,Nr5a1_4|7 Glipr1,M,#763BFF,behavior - none,948605.44,10643,0.0,True,True
LS-15034_S28_E1-50,LS-15034_S28_E1-50,42,Nr5a1_4|7 Glipr1,F,#763BFF,behavior - none,951803.00,10550,0.0,True,True
...,...,...,...,...,...,...,...,...,...,...,...
SM-GE65X_S36_E1-50,SM-GE65X_S36_E1-50,20,Nr5a1_Foxp2_1 Prdm13,M,#D92AAB,behavior - none,953367.80,7354,0.0,True,True
SM-GE65X_S37_E1-50,SM-GE65X_S37_E1-50,20,Nr5a1_Foxp2_1 Prdm13,M,#D92AAB,behavior - none,960790.20,8725,0.0,True,True
SM-GE65X_S46_E1-50,SM-GE65X_S46_E1-50,20,Nr5a1_Foxp2_1 Prdm13,M,#D92AAB,behavior - none,960140.44,7735,0.0,True,True
SM-GE65X_S69_E1-50,SM-GE65X_S69_E1-50,20,Nr5a1_Foxp2_1 Prdm13,F,#D92AAB,behavior - none,960288.50,7516,0.0,True,True


time: 2.68 s (started: 2022-09-11 22:51:21 -07:00)


## Pipeline

In [28]:
# Build a label -> colour lookup from the two metadata columns, nested under
# the name of the column the labels come from. If a label occurs on several
# rows, the colour from the last such row is the one that is kept.
target_palette = {
    "smartseq_cluster": {
        cluster: color
        for cluster, color in zip(
            target["smartseq_cluster"], target["smartseq_cluster_color"]
        )
    }
}

time: 2.56 ms (started: 2022-09-11 22:54:28 -07:00)


In [30]:
from drnb.dataset import create_data_pipeline

# Describe the preprocessing pipeline; nothing is run until `.run()` is called
# on the returned object (presumably -- confirm against drnb).
data_pipe = create_data_pipeline(
    # Conversion options for the data matrix: float32 values, "c" layout.
    convert={"dtype": "float32", "layout": "c"},
    data_export=["csv", "npy"],
    # Metadata columns to keep as the target, and the formats to export them in.
    target_cols=["smartseq_cluster_id", "smartseq_cluster_color", "smartseq_cluster"],
    target_export=["csv", "pkl"],
    # Exact Euclidean nearest neighbors at three neighborhood sizes.
    neighbors={
        "n_neighbors": [15, 50, 150],
        "method": "exact",
        "metric": ["euclidean"],
        "file_types": ["csv", "npy"],
    },
    # Triplets with a fixed seed so that they are reproducible.
    triplets={
        "n_triplets_per_point": 5,
        "seed": 1337,
        "file_types": ["csv", "npy"],
    },
    verbose=True,
)

time: 10.6 ms (started: 2022-09-11 22:56:12 -07:00)


In [31]:
# Run the configured pipeline on the matrix and its metadata. The first
# argument is presumably the name the dataset is stored under -- confirm
# against drnb.
data_result = data_pipe.run(
    "lamanno2020",
    data=data,
    target=target,
    target_palette=target_palette,
    verbose=True,
)

time: 8.62 s (started: 2022-09-11 22:56:17 -07:00)
