Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
629a52d
update of scDataloader to new laminDB and WIP update the full cellxge…
jkobject Nov 19, 2024
003a123
some more improvements to the runs
jkobject Nov 25, 2024
c1ad650
merged
jkobject Nov 26, 2024
2e325af
Merge branch 'dev' of https://github.com/jkobject/scDataLoader into dev
jkobject Nov 26, 2024
18adc6b
nothing really
jkobject Nov 26, 2024
1f7d9db
Merge branch 'dev' of https://github.com/jkobject/scDataLoader into dev
jkobject Nov 26, 2024
77be57b
wip
jkobject Nov 26, 2024
2a95e0f
making it work offline, some debugs and activating no replacement tes…
jkobject Nov 27, 2024
d670360
adding metacellmode
jkobject Dec 3, 2024
60bdb65
improvement on preprocess and label weights
jkobject Dec 3, 2024
46d33bc
Merge branch 'dev' of https://github.com/jkobject/scDataLoader into dev
jkobject Dec 3, 2024
4e26fbc
updating the versions, working with age levels, debug of my additions…
jkobject Dec 5, 2024
3fc52b7
more translation and now working with depth in wrs
jkobject Dec 5, 2024
e905e66
Revert "removing the clss to pred"
jkobject Dec 5, 2024
62dcc92
removing all clss
jkobject Dec 5, 2024
4ba8dff
metacell mode, age group, dev stage, nnz scaling, improvement in prep…
jkobject Dec 11, 2024
23fe48a
some slight debugs and improvements
jkobject Dec 12, 2024
0a6a70c
Merge branch 'dev' of https://github.com/jkobject/scDataLoader into dev
jkobject Dec 12, 2024
3ad2311
adding runs
jkobject Dec 12, 2024
cb375a0
improving mapped
jkobject Dec 16, 2024
c94427f
dbug collator
jkobject Dec 17, 2024
22630f2
faster dataloader
jkobject Dec 17, 2024
9a5ce0c
nothing
jkobject Dec 19, 2024
b4c12af
Merge branch 'dev' of https://github.com/jkobject/scDataLoader into dev
jkobject Dec 19, 2024
bb78c2c
by default allowing the dataloader to output visium and slide-seq
jkobject Dec 23, 2024
5ab08fd
nothing
jkobject Dec 24, 2024
d60ed65
Merge branch 'dev' of https://github.com/jkobject/scDataLoader into dev
jkobject Dec 24, 2024
7ebccde
cleanups
jkobject Jan 4, 2025
6a3e468
list the full dataset
jkobject Jan 7, 2025
02347ea
make it work on multiple restart of pl
jkobject Jan 8, 2025
99718ca
Fix type check for organism
falexwolf Jan 9, 2025
39af532
Merge pull request #14 from jkobject/recordlist
jkobject Jan 9, 2025
5fb8821
dbug of high var
jkobject Jan 9, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,7 @@ dmypy.json
# templates
.github/templates/*
.DS_Store
figures/*/*.png
figures/*.png
figures/add_postp_clust.py
figures/age_relabel.py
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ release: ## Create a new tag for release.
@read -p "Version? (provide the next x.y.z semver) : " TAG
@echo "$${TAG}" > scdataloader/VERSION
@sed -i 's/^version = .*/version = "'$${TAG}'"/' pyproject.toml
@sed -i 's/__version__ = .*/__version__ = "'$${TAG}'"/' scdataloader/__init__.py
@$(ENV_PREFIX)gitchangelog > HISTORY.md
@git add scdataloader/VERSION HISTORY.md pyproject.toml
@git commit -m "release: version $${TAG} 🚀"
Expand Down
1,555 changes: 1,555 additions & 0 deletions figures/debug.ipynb

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions notebooks/nonprimary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
OoktqBIu8jCoGOJl
n33nFE2kXSNzNhIA
mtoOxeGG0Rg3NPH1
V0tqrgE1z1NY2eUU
4,259 changes: 4,259 additions & 0 deletions notebooks/update_lamin_or_cellxgene.ipynb

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions notebooks/work_on_dataloader_onto part 3.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -928,24 +928,24 @@
],
"source": [
"# Celltype\n",
"names = bt.CellType().df().index\n",
"names = bt.CellType().filter().df().index\n",
"records = lb.CellType.from_values(names, field=lb.CellType.ontology_id)\n",
"ln.save(records)\n",
"lb.CellType(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# Organism\n",
"# names = bt.Organism().df().index\n",
"# names = bt.Organism().filter().df().index\n",
"names = ['NCBITaxon:10090', 'NCBITaxon:9606']\n",
"records = lb.Organism.from_values(names, field=lb.Organism.ontology_id)\n",
"ln.save(records)\n",
"lb.Organism(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# Phenotype\n",
"#name = bt.Phenotype().df().index\n",
"#name = bt.Phenotype().filter().df().index\n",
"name = df['sex_ontology_term_id'].unique()\n",
"records = lb.Phenotype.from_values(name, field=lb.Phenotype.ontology_id)\n",
"ln.save(records)\n",
"lb.Phenotype(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# ethnicity\n",
"names = bt.Ethnicity().df().index\n",
"names = bt.Ethnicity().filter().df().index\n",
"records = lb.Ethnicity.from_values(names, field=lb.Ethnicity.ontology_id)\n",
"ln.save(records)\n",
"lb.Ethnicity(name=\"unknown\", ontology_id=\"unknown\").save() #multi ethnic will have to get renamed\n",
Expand All @@ -958,25 +958,25 @@
"lookup = lb.ExperimentalFactor.lookup()\n",
"lookup.smart_seq_v4.parents.add(lookup.smart_like)\n",
"# Tissue\n",
"#names = bt.Tissue().df().index\n",
"#names = bt.Tissue().filter().df().index\n",
"names= df['tissue_ontology_term_id'].unique()\n",
"records = lb.Tissue.from_values(names, field=lb.Tissue.ontology_id)\n",
"ln.save(records)\n",
"lb.Tissue(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# DevelopmentalStage\n",
"bionty_df = bt.DevelopmentalStage().df()\n",
"bionty_df = bt.DevelopmentalStage().filter().df()\n",
"records = lb.DevelopmentalStage.from_values(bionty_df.index, field=lb.DevelopmentalStage.ontology_id, organism=\"mouse\")\n",
"ln.save(records)\n",
"lb.DevelopmentalStage(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# Disease\n",
"# values = bt.Disease().df().index\n",
"# values = bt.Disease().filter().df().index\n",
"values = df['disease_ontology_term_id'].unique()\n",
"records = lb.Disease.from_values(values, field=lb.Disease.ontology_id)\n",
"ln.save(records)\n",
"lb.Disease(name=\"normal\", ontology_id=\"PATO:0000461\").save()\n",
"lb.Disease(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# genes\n",
"bionty_df = bt.Gene().df()\n",
"bionty_df = bt.Gene().filter().df()\n",
"records = lb.Gene.from_values(bionty_df.index, field=lb.Gene.ontology_id)\n",
"ln.save(records)"
]
Expand Down Expand Up @@ -1050,7 +1050,7 @@
}
],
"source": [
"lb.DevelopmentalStage(organism=\"mouse\").df()"
"lb.DevelopmentalStage(organism=\"mouse\").filter().df()"
]
},
{
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ authors = [
]
license = "MIT"
readme = "README.md"
requires-python = ">=3.10,<3.11"
requires-python = ">=3.10,<3.14"
keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
dependencies = [
"numpy>=1.26.0",
"lamindb[bionty]==0.76.12",
"numpy==1.26.0",
"lamindb[bionty]==0.77.2",
"cellxgene-census>=0.1.0",
"torch==2.2.0",
"lightning>=2.0.0",
Expand Down
2 changes: 2 additions & 0 deletions scdataloader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
from .data import Dataset, SimpleAnnDataset
from .datamodule import DataModule
from .preprocess import Preprocessor

__version__ = "1.6.5"
46 changes: 38 additions & 8 deletions scdataloader/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
)


# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" \
# --description="scPRINT-V2 datasets" --new_name="scprint v2" --n_hvg_for_postp=4000 --cache=False \
# --filter_gene_by_counts=0 --filter_cell_by_counts=300 --min_valid_genes_id=500 \
# --min_nnz_genes=120 --min_dataset_size=100 --maxdropamount=90 \
# --organisms=["NCBITaxon:9606","NCBITaxon:9544","NCBITaxon:9483","NCBITaxon:10090"] \
# --start_at=0
def main():
"""
main function to preprocess datasets in a given lamindb collection.
Expand Down Expand Up @@ -70,7 +75,7 @@ def main():
help="Determines whether to normalize the total counts of each cell to a specific value.",
)
parser.add_argument(
"--subset_hvg",
"--n_hvg_for_postp",
type=int,
default=0,
help="Determines whether to subset highly variable genes.",
Expand Down Expand Up @@ -120,7 +125,7 @@ def main():
parser.add_argument(
"--min_nnz_genes",
type=int,
default=400,
default=200,
help="Specifies the minimum non-zero genes.",
)
parser.add_argument(
Expand All @@ -139,7 +144,16 @@ def main():
help="Specifies the percentage of MT outlier.",
)
parser.add_argument(
"--batch_key", type=Optional[str], default=None, help="Specifies the batch key."
"--batch_keys",
type=list[str],
default=[
"assay_ontology_term_id",
"self_reported_ethnicity_ontology_term_id",
"sex_ontology_term_id",
"donor_id",
"suspension_type",
],
help="Specifies the batch keys.",
)
parser.add_argument(
"--skip_validate",
Expand All @@ -150,15 +164,30 @@ def main():
parser.add_argument(
"--do_postp",
type=bool,
default=False,
default=True,
help="Determines whether to do postprocessing.",
)
parser.add_argument(
"--cache",
type=bool,
default=True,
default=False,
help="Determines whether to cache the dataset.",
)
parser.add_argument(
"--organisms",
type=list,
default=[
"NCBITaxon:9606",
"NCBITaxon:10090",
],
help="Determines the organisms to keep.",
)
parser.add_argument(
"--force_preloaded",
type=bool,
default=False,
help="Determines whether the dataset is preloaded.",
)
args = parser.parse_args()

# Load the collection
Expand All @@ -182,7 +211,7 @@ def main():
filter_gene_by_counts=args.filter_gene_by_counts,
filter_cell_by_counts=args.filter_cell_by_counts,
normalize_sum=args.normalize_sum,
subset_hvg=args.subset_hvg,
n_hvg_for_postp=args.n_hvg_for_postp,
hvg_flavor=args.hvg_flavor,
cache=args.cache,
binning=args.binning,
Expand All @@ -195,12 +224,13 @@ def main():
maxdropamount=args.maxdropamount,
madoutlier=args.madoutlier,
pct_mt_outlier=args.pct_mt_outlier,
batch_key=args.batch_key,
batch_keys=args.batch_keys,
skip_validate=args.skip_validate,
do_postp=args.do_postp,
additional_preprocess=additional_preprocess,
additional_postprocess=additional_postprocess,
keep_files=False,
force_preloaded=args.force_preloaded,
)

# Preprocess the dataset
Expand Down
8 changes: 6 additions & 2 deletions scdataloader/collator.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def __call__(self, batch) -> dict[str, Tensor]:
tp = []
dataset = []
nnz_loc = []
is_meta = []
for elem in batch:
organism_id = elem[self.organism_name]
if organism_id not in self.organism_ids:
Expand Down Expand Up @@ -188,12 +189,12 @@ def __call__(self, batch) -> dict[str, Tensor]:
loc = loc[self.to_subset[organism_id]]
exprs.append(expr)
gene_locs.append(loc)

if "is_meta" in elem:
is_meta.append(elem["is_meta"])
if self.tp_name is not None:
tp.append(elem[self.tp_name])
else:
tp.append(0)

other_classes.append([elem[i] for i in self.class_names])

expr = np.array(exprs)
Expand All @@ -202,6 +203,7 @@ def __call__(self, batch) -> dict[str, Tensor]:
total_count = np.array(total_count)
other_classes = np.array(other_classes)
dataset = np.array(dataset)
is_meta = np.array(is_meta)

# normalize counts
if self.norm_to is not None:
Expand Down Expand Up @@ -229,6 +231,8 @@ def __call__(self, batch) -> dict[str, Tensor]:
"tp": Tensor(tp),
"depth": Tensor(total_count),
}
if len(is_meta) > 0:
ret.update({"is_meta": Tensor(is_meta)})
if len(dataset) > 0:
ret.update({"dataset": Tensor(dataset).to(long)})
if self.downsample is not None:
Expand Down
99 changes: 99 additions & 0 deletions scdataloader/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,102 @@
"TruDrop": "",
"Visium Spatial Gene Expression": "",
}


# Lookup table of developmental-stage ontology IDs.
#
# Each key is a stage ID (human HsapDv terms, plus the literal "unknown"); each
# value is the list of other stage IDs grouped under that key: mostly mouse
# MmusDv terms, plus a few HsapDv / UBERON terms. An empty list means no
# additional IDs are grouped under that key in this table.
#
# The trailing comments are the human-readable labels of the IDs as written by
# the author; they are not checked against the ontologies here.
#
# NOTE(review): how this table is consumed is not visible in this file section;
# presumably it is used to relabel fine-grained mouse/human stages onto the key
# stage. Confirm against the caller before relying on that.
MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
# NOTE(review): the key below carries no label of its own; the only label is on
# the mouse term it groups. Confirm the HsapDv label.
"HsapDv:0010000": [
"MmusDv:0000092", # postnatal stage
],
"HsapDv:0000258": [ # mature stage
"MmusDv:0000110", # mature stage
# NOTE(review): unlabeled human term grouped under the mature stage; add its label.
"HsapDv:0000204",
],
"HsapDv:0000227": [ # late adult stage
"MmusDv:0000091", # 20 month-old stage
"MmusDv:0000089", # 18 month-old stage
],
"HsapDv:0000272": [], # 60-79 year-old stage
"HsapDv:0000095": [], # 80 year-old and over stage
"HsapDv:0000267": [ # middle aged stage
"MmusDv:0000087", # 16 month-old stage
"UBERON:0018241", # prime adult stage
"MmusDv:0000083", # 12 month-old stage
"HsapDv:0000092", # same (NOTE(review): presumably the same stage as the key; confirm)
],
"HsapDv:0000266": [ # young adult stage
"MmusDv:0000050", # 6 weeks
"HsapDv:0000089", # same (NOTE(review): presumably the same stage as the key; confirm)
"MmusDv:0000051", # 7 weeks
"MmusDv:0000052", # 8 weeks
"MmusDv:0000053", # 9 weeks
"MmusDv:0000054", # 10 weeks
"MmusDv:0000055", # 11 weeks
"MmusDv:0000056", # 12 weeks
"MmusDv:0000057", # 13 weeks
"MmusDv:0000058", # 14 weeks
"MmusDv:0000059", # 15 weeks
"MmusDv:0000061", # early adult stage
"MmusDv:0000062", # 2 month-old stage
"MmusDv:0000063", # 3 month-old stage
"MmusDv:0000064", # 4 month-old stage
"MmusDv:0000065", # 16 weeks
"MmusDv:0000066", # 17 weeks
"MmusDv:0000067", # 18 weeks
"MmusDv:0000068", # 19 weeks
"MmusDv:0000070", # 20 weeks
"MmusDv:0000071", # 21 weeks
"MmusDv:0000072", # 22 weeks
"MmusDv:0000073", # 23 weeks
"MmusDv:0000074", # 24 weeks
"MmusDv:0000077", # 6 month-old stage
"MmusDv:0000079", # 8 month-old stage
"MmusDv:0000098", # 25 weeks
"MmusDv:0000099", # 26 weeks
"MmusDv:0000102", # 29 weeks
],
"HsapDv:0000265": [], # child stage (1-4 yo)
"HsapDv:0000271": [ # juvenile stage (5-14 yo)
"MmusDv:0000048", # 4 weeks
"MmusDv:0000049", # 5 weeks
],
"HsapDv:0000260": [ # infant stage
"MmusDv:0000046", # 2 weeks
"MmusDv:0000045", # 1 week
"MmusDv:0000047", # 3 weeks
# NOTE(review): unlabeled human term grouped under the infant stage; add its label.
"HsapDv:0000083",
],
"HsapDv:0000262": [ # newborn stage (0-28 days)
"MmusDv:0000036", # Theiler stage 27
"MmusDv:0000037", # Theiler stage 28
"MmusDv:0000113", # 4-7 days
],
# The early embryonic human stages below have no other IDs grouped under them.
"HsapDv:0000007": [], # Carnegie stage 03
"HsapDv:0000008": [], # Carnegie stage 04
"HsapDv:0000009": [], # Carnegie stage 05
"HsapDv:0000003": [], # Carnegie stage 01
"HsapDv:0000005": [], # Carnegie stage 02
"HsapDv:0000010": [], # gastrula stage
"HsapDv:0000012": [], # neurula stage
"HsapDv:0000015": [ # organogenesis stage
# NOTE(review): the labels on the next two entries run 13 then 12 while the IDs
# ascend, unlike every other Theiler entry in this table. The two labels may be
# swapped; verify against MmusDv.
"MmusDv:0000019", # Theiler stage 13
"MmusDv:0000020", # Theiler stage 12
"MmusDv:0000021", # Theiler stage 14
"MmusDv:0000022", # Theiler stage 15
"MmusDv:0000023", # Theiler stage 16
"MmusDv:0000024", # Theiler stage 17
"MmusDv:0000025", # Theiler stage 18
"MmusDv:0000026", # Theiler stage 19
"MmusDv:0000027", # Theiler stage 20
"MmusDv:0000028", # Theiler stage 21
"MmusDv:0000029", # Theiler stage 22
],
"HsapDv:0000037": [ # fetal stage
"MmusDv:0000033", # Theiler stage 24
"MmusDv:0000034", # Theiler stage 25
"MmusDv:0000035", # Theiler stage 26
"MmusDv:0000032", # Theiler stage 23
],
# Catch-all key: a literal string rather than an ontology ID.
"unknown": [
"MmusDv:0000041", # unknown
],
}
Loading