In [1]:
%load_ext lab_black
%load_ext autotime
import pandas as pd
import numpy as np

time: 288 ms (started: 2022-09-19 08:31:05 -07:00)


RNA-seq data, found via the [Picasso example notebook](https://github.com/pachterlab/picasso/blob/main/examplePicasso.ipynb). I think the publication reference is <https://doi.org/10.1038/s41586-021-03775-x> (first posted as a preprint on [bioRxiv](https://www.biorxiv.org/content/10.1101/2020.07.02.184051v1) in 2020).

## Read count data

In [2]:
import gzip
from io import BytesIO

import requests
import scipy.io

# Download the gzip-compressed Matrix Market file containing the count data.
req = requests.get(
    "https://data.caltech.edu/tindfiles/serve/772ca768-9c54-4fd8-9aeb-e78966177453",
    timeout=10,
)
# Fail fast on an HTTP error status: otherwise the error body would be handed
# to gzip.decompress and surface as a misleading "not a gzipped file" error.
req.raise_for_status()
# Decompress in memory and parse the Matrix Market payload from a byte buffer.
data = scipy.io.mmread(BytesIO(gzip.decompress(req.content)))
data.shape

(3850, 1999)

time: 9.16 s (started: 2022-09-19 08:31:05 -07:00)


In [3]:
# Inspect the loaded matrix; NumPy abbreviates the repr of large arrays.
data

array([[4.33988086, 0.        , 4.46614853, ..., 5.03771355, 0.        ,
        2.80237598],
       [2.72633524, 0.        , 5.3227869 , ..., 0.        , 0.        ,
        2.85165059],
       [0.        , 0.        , 4.36213812, ..., 5.93728805, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 5.06350072, ..., 0.49238087, 0.        ,
        0.        ],
       [0.        , 0.        , 4.17808255, ..., 5.44504377, 0.        ,
        0.        ],
       [2.34737733, 0.        , 5.16608948, ..., 0.        , 0.        ,
        0.        ]])

time: 5.54 ms (started: 2022-09-19 08:31:14 -07:00)


## Read metadata

In [4]:
# Sample metadata: a gzip-compressed CSV whose first column is used as the
# row index.
metadata_url = (
    "https://data.caltech.edu/tindfiles/serve/e3ad6fcb-fbcc-4888-a5c9-3ae74346925b"
)
target = pd.read_csv(metadata_url, index_col=0, compression="gzip")
target

Unnamed: 0,sample_name,smartseq_cluster_id,smartseq_cluster,sex_label,smartseq_cluster_color,medical_cond_label,cell_counts,n_genes,percent_mito,pass_count_filter,pass_mito_filter
SM-GE4R2_S062_E1-50,SM-GE4R2_S062_E1-50,46,Nr5a1_9|11 Rorb,M,#6C39A6,behavior - none,964371.56,9772,0.0,True,True
SM-GE4SI_S356_E1-50,SM-GE4SI_S356_E1-50,46,Nr5a1_9|11 Rorb,M,#6C39A6,behavior - none,973072.06,8253,0.0,True,True
SM-GE4SI_S172_E1-50,SM-GE4SI_S172_E1-50,46,Nr5a1_9|11 Rorb,M,#6C39A6,behavior - none,977132.60,9394,0.0,True,True
LS-15034_S07_E1-50,LS-15034_S07_E1-50,42,Nr5a1_4|7 Glipr1,M,#763BFF,behavior - none,948605.44,10643,0.0,True,True
LS-15034_S28_E1-50,LS-15034_S28_E1-50,42,Nr5a1_4|7 Glipr1,F,#763BFF,behavior - none,951803.00,10550,0.0,True,True
...,...,...,...,...,...,...,...,...,...,...,...
SM-GE65X_S36_E1-50,SM-GE65X_S36_E1-50,20,Nr5a1_Foxp2_1 Prdm13,M,#D92AAB,behavior - none,953367.80,7354,0.0,True,True
SM-GE65X_S37_E1-50,SM-GE65X_S37_E1-50,20,Nr5a1_Foxp2_1 Prdm13,M,#D92AAB,behavior - none,960790.20,8725,0.0,True,True
SM-GE65X_S46_E1-50,SM-GE65X_S46_E1-50,20,Nr5a1_Foxp2_1 Prdm13,M,#D92AAB,behavior - none,960140.44,7735,0.0,True,True
SM-GE65X_S69_E1-50,SM-GE65X_S69_E1-50,20,Nr5a1_Foxp2_1 Prdm13,F,#D92AAB,behavior - none,960288.50,7516,0.0,True,True


time: 3.31 s (started: 2022-09-19 08:31:14 -07:00)


## Pipeline

In [5]:
# Palette keyed by target column name: for "smartseq_cluster", pair each
# cluster label with the colour stored on the same metadata row.
cluster_labels = target["smartseq_cluster"]
cluster_colors = target["smartseq_cluster_color"]
target_palette = {"smartseq_cluster": dict(zip(cluster_labels, cluster_colors))}

time: 1.99 ms (started: 2022-09-19 08:31:17 -07:00)


In [6]:
from drnb.io.pipeline import create_default_pipeline

# Run the default pipeline (duplicate checking enabled) on the full matrix,
# under the dataset name "lamanno2020".
cluster_target_cols = [
    "smartseq_cluster_id",
    "smartseq_cluster_color",
    "smartseq_cluster",
]
source_url = "https://github.com/pachterlab/picasso/blob/main/examplePicasso.ipynb"

pipeline = create_default_pipeline(check_for_duplicates=True)
data_result = pipeline.run(
    "lamanno2020",
    data=data,
    target=target,
    target_cols=cluster_target_cols,
    target_palette=target_palette,
    tags=["small", "highdim", "scRNAseq"],
    url=source_url,
    verbose=True,
)

time: 12.1 s (started: 2022-09-19 08:31:17 -07:00)


In [7]:
from drnb.io.pipeline import create_default_pipeline

# Second run of the same inputs with reduce=50, stored under a separate name.
# NOTE(review): the "-pca50" suffix suggests reduce=50 means a 50-component
# PCA -- confirm against the drnb pipeline documentation.
pca_target_cols = [
    "smartseq_cluster_id",
    "smartseq_cluster_color",
    "smartseq_cluster",
]
pca_source_url = "https://github.com/pachterlab/picasso/blob/main/examplePicasso.ipynb"

pca_pipeline = create_default_pipeline(check_for_duplicates=True, reduce=50)
data_result = pca_pipeline.run(
    "lamanno2020-pca50",
    data=data,
    target=target,
    target_cols=pca_target_cols,
    target_palette=target_palette,
    tags=["small", "scRNAseq"],
    url=pca_source_url,
    verbose=True,
)

time: 7.81 s (started: 2022-09-19 08:31:29 -07:00)
