Commit 6347bc1

Merge pull request #204 from lilab-bcb/1.1.57

1.1.57

joshua-gould committed Sep 29, 2023
2 parents 27ec140 + b5e5caa

Showing 26 changed files with 1,408 additions and 1,189 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test.yml
@@ -35,6 +35,7 @@ jobs:
CI: false
- name: Test with pytest
run: |
+pip freeze
pytest
env:
CI: true
Expand Down
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -11,19 +11,19 @@ repos:
- id: docformatter
args: ["--in-place", "--wrap-summaries=100", "--wrap-descriptions=100", "--config=./pyproject.toml"]
- repo: https://github.com/psf/black
-rev: 23.7.0
+rev: 23.9.1
hooks:
- id: black
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/csachs/pyproject-flake8
-rev: v6.0.0.post1
+rev: v6.1.0
hooks:
- id: pyproject-flake8
- repo: https://github.com/pre-commit/mirrors-prettier
-rev: v3.0.2
+rev: v3.0.3
hooks:
- id: prettier
types_or: [css, javascript]
3 changes: 2 additions & 1 deletion cirrocumulus/anndata_dataset.py
@@ -4,6 +4,7 @@
import anndata
import scipy.sparse
from anndata import AnnData
+from pandas import CategoricalDtype

from cirrocumulus.abstract_dataset import AbstractDataset
from cirrocumulus.anndata_util import ADATA_LAYERS_UNS_KEY, ADATA_MODULE_UNS_KEY, dataset_schema
@@ -130,7 +131,7 @@ def read_adata(path, filesystem, backed=False, spatial_directory=None, use_raw=F
logger.info("No spatial data found in {}".format(spatial_directory))

for field in CATEGORICAL_FIELDS_CONVERT:
-if field in adata.obs and not pd.api.types.is_categorical_dtype(adata.obs[field]):
+if field in adata.obs and not isinstance(adata.obs[field].dtype, CategoricalDtype):
logger.info("Converting {} to categorical".format(field))
adata.obs[field] = adata.obs[field].astype(str).astype("category")
return adata
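A note on the change above, since the same substitution recurs in most files in this commit: `pd.api.types.is_categorical_dtype` is deprecated as of pandas 2.1, and the recommended replacement is an `isinstance` check against `CategoricalDtype`. A minimal sketch of the pattern on toy data (not from the repo):

```python
import pandas as pd
from pandas import CategoricalDtype

s = pd.Series(["a", "b", "a"])

# Deprecated since pandas 2.1:
#   pd.api.types.is_categorical_dtype(s)
# Replacement used throughout this commit:
if not isinstance(s.dtype, CategoricalDtype):
    # the same conversion read_adata applies: stringify, then categorize
    s = s.astype(str).astype("category")

assert isinstance(s.dtype, CategoricalDtype)
print(list(s.cat.categories))  # ['a', 'b']
```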
5 changes: 3 additions & 2 deletions cirrocumulus/anndata_util.py
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import anndata
+from pandas import CategoricalDtype


DATA_TYPE_MODULE = "module"
@@ -262,14 +263,14 @@ def dataset_schema(dataset, n_features=10):
val = pd.Categorical.from_codes(val[...], categories, ordered=ordered)

if (
-    pd.api.types.is_categorical_dtype(val)
+    isinstance(val.dtype, CategoricalDtype)
or pd.api.types.is_bool_dtype(val)
or pd.api.types.is_object_dtype(val)
):
obs_cat.append(key)
else:
obs.append(key)
-if pd.api.types.is_categorical_dtype(val):
+if isinstance(val.dtype, CategoricalDtype):
categories = val.cat.categories
if len(categories) < 100: # preserve order
category_to_order[key] = dataset.obs[key].cat.categories
4 changes: 2 additions & 2 deletions cirrocumulus/anndata_zarr.py
@@ -10,7 +10,7 @@
import pandas as pd
import numcodecs
from packaging import version
-from pandas.api.types import is_categorical_dtype
+from pandas import CategoricalDtype
from scipy import sparse


@@ -135,7 +135,7 @@ def write_series(group, key, series, dataset_kwargs=MappingProxyType({})):
**dataset_kwargs,
)
group[key][:] = series.values
-elif is_categorical_dtype(series):
+elif isinstance(series.dtype, CategoricalDtype):
# This should work for categorical Index and Series
categorical: pd.Categorical = series.values
categories: np.ndarray = categorical.categories.values
3 changes: 2 additions & 1 deletion cirrocumulus/data_processing.py
@@ -1,5 +1,6 @@
import pandas as pd
import scipy.sparse
+from pandas import CategoricalDtype

from cirrocumulus.anndata_util import ADATA_LAYERS_UNS_KEY, ADATA_MODULE_UNS_KEY
from cirrocumulus.dotplot_aggregator import DotPlotAggregator
@@ -315,7 +316,7 @@ def handle_data(
for key in type2measures["obs"] + dimensions:
series = adata.obs[key]
results["values"][key] = series
-if pd.api.types.is_categorical_dtype(series):
+if isinstance(series.dtype, CategoricalDtype):
results["values"][key] = dict(
values=series.values.codes, categories=series.cat.categories.values
)
4 changes: 2 additions & 2 deletions cirrocumulus/dotplot_aggregator.py
@@ -1,4 +1,4 @@
-import pandas as pd
+from pandas import CategoricalDtype


class DotPlotAggregator:
@@ -34,7 +34,7 @@ def percent_expressed(x):
else:
dimension_name = d[0]
if (
-    pd.api.types.is_categorical_dtype(df[dimension_name])
+    isinstance(df[dimension_name].dtype, CategoricalDtype)
and len(df[dimension_name].dtype.categories) <= 1
):
continue
4 changes: 2 additions & 2 deletions cirrocumulus/embedding_aggregator.py
@@ -1,5 +1,5 @@
import numpy as np
-import pandas as pd
+from pandas import CategoricalDtype


def mean_agg(x):
@@ -142,7 +142,7 @@ def execute(self, df):
indices=series.values.sp_index.indices, values=series.values.sp_values
)
else:
-if pd.api.types.is_categorical_dtype(series):
+if isinstance(series.dtype, CategoricalDtype):
result["values"][column] = dict(
values=series.values, categories=series.cat.categories.values
)
5 changes: 2 additions & 3 deletions cirrocumulus/h5ad_output.py
@@ -1,12 +1,11 @@
import logging

-import pandas._libs.json as ujson

from cirrocumulus.anndata_util import (
ADATA_MODULE_UNS_KEY,
get_pegasus_marker_keys,
get_scanpy_marker_keys,
)
+from cirrocumulus.util import dumps


logger = logging.getLogger("cirro")
@@ -36,7 +35,7 @@ def save_datasets_h5ad(datasets, schema, output_directory, filesystem, whitelist

sc_marker_keys = get_scanpy_marker_keys(adata)
uns_whitelist = set(["modules", "cirro-schema"])
-adata.uns["cirro-schema"] = ujson.dumps(schema, double_precision=2, orient="values")
+adata.uns["cirro-schema"] = dumps(schema, double_precision=2, orient="values")
for key in list(adata.uns.keys()):
if key in uns_whitelist:
continue
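The `ujson` changes here and in the files below share one motive: `pandas._libs.json` is a private pandas module, so every call site now routes through a single `dumps` helper in `cirrocumulus.util`. The diff doesn't show that helper; a plausible sketch, assuming it simply wraps the bundled ujson and passes its keyword arguments through:

```python
# Hypothetical sketch of cirrocumulus.util.dumps -- an assumption, since the
# real implementation is not part of this diff. Every call site in this
# commit uses dumps(obj, double_precision=2, orient="values").
try:
    # pandas' vendored ujson lives in a private module; importing it in
    # exactly one place keeps the breakage surface small if it moves
    import pandas._libs.json as _ujson

    def dumps(obj, **kwargs):
        return _ujson.dumps(obj, **kwargs)

except ImportError:
    import json

    def dumps(obj, double_precision=10, orient=None, **kwargs):
        # stdlib fallback: ujson-specific options are accepted but ignored
        return json.dumps(obj)
```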
10 changes: 6 additions & 4 deletions cirrocumulus/job_api.py
@@ -3,10 +3,10 @@
import logging

import pandas as pd
-import pandas._libs.json as ujson

from cirrocumulus.diff_exp import DE
from cirrocumulus.ot.transport_map_model import read_transport_map_dir
+from cirrocumulus.util import dumps

from .data_processing import get_filter_str, get_mask, get_selected_data
from .envir import (
@@ -33,7 +33,7 @@ def save_job_result_to_file(result, job_id):
new_result["content-encoding"] = "gzip"
url = os.path.join(os.environ[CIRRO_JOB_RESULTS], str(job_id) + ".json.gz")
with open_file(url, "wt", compression="gzip") as out:
-out.write(ujson.dumps(result, double_precision=2, orient="values"))
+out.write(dumps(result, double_precision=2, orient="values"))
elif new_result["content-type"] == "application/h5ad":
url = os.path.join(os.environ[CIRRO_JOB_RESULTS], str(job_id) + ".h5ad")
with get_fs(url).open(url, "wb") as out:
@@ -138,12 +138,14 @@ def get_obs(dataset_api, dataset, dataset_info, params):
if filter_names[i] is None:
filter_names[i] = "group_" + str(i + 1)
obs = pd.DataFrame(index=pd.RangeIndex(dataset_info["shape"][0]).astype(str))
obs_field = "selection"
obs_field = "selection" # order of categories needs to match filter names
# obs[obs_field] = "3"
masks, _ = get_mask(dataset_api, dataset, dataset_info, filters)
for i in range(2):
obs.loc[masks[i], obs_field] = filter_names[i]
-obs[obs_field] = obs[obs_field].astype("category")
+obs[obs_field] = (
+    obs[obs_field].astype("category").cat.as_ordered().cat.reorder_categories(filter_names)
+)
return obs, obs_field


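The `get_obs` change above fixes category order: a bare `astype("category")` sorts the categories lexically, while (per the new inline comment) their order needs to match the user-supplied filter names. `reorder_categories` pins that order and `as_ordered` marks the dtype ordered so it survives downstream. A toy illustration with hypothetical group names:

```python
import pandas as pd

filter_names = ["treated", "control"]  # hypothetical user-supplied order

s = pd.Series(["control", "treated", "control"]).astype("category")
print(list(s.cat.categories))  # ['control', 'treated'] -- lexical order

s = s.cat.as_ordered().cat.reorder_categories(filter_names)
print(list(s.cat.categories))  # ['treated', 'control'] -- matches filter_names
print(s.cat.codes.tolist())    # [1, 0, 1] -- codes follow the new order
```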
7 changes: 4 additions & 3 deletions cirrocumulus/json_io.py
@@ -3,7 +3,8 @@

import numpy as np
import scipy.sparse
-import pandas._libs.json as ujson

+from cirrocumulus.util import dumps


logger = logging.getLogger("cirro")
@@ -12,7 +13,7 @@
def write_json(d, output_dir, name):
os.makedirs(output_dir, exist_ok=True)
with open(output_dir + os.path.sep + name + ".json", "wt") as f:
-c = ujson.dumps(d, double_precision=2, orient="values")
+c = dumps(d, double_precision=2, orient="values")
f.write(c)


@@ -21,7 +22,7 @@ def save_adata_json(adata, schema, output_directory):
os.makedirs(output_directory, exist_ok=True)
with open(os.path.join(output_directory, "schema.json"), "wt") as f:
# json.dump(result, f)
-f.write(ujson.dumps(schema, double_precision=2, orient="values"))
+f.write(dumps(schema, double_precision=2, orient="values"))

save_adata_X(adata, output_directory)
save_data_obs(adata, output_directory)
10 changes: 6 additions & 4 deletions cirrocumulus/jsonl_io.py
@@ -6,7 +6,9 @@
import numpy as np
import pandas as pd
import scipy.sparse
-import pandas._libs.json as ujson
+from pandas import CategoricalDtype

+from cirrocumulus.util import dumps


logger = logging.getLogger("cirro")
@@ -17,7 +19,7 @@
def write_jsonl(d, f, name, index, compress=False):
output = {}
output[name] = d
-c = ujson.dumps(output, double_precision=2, orient="values").encode("UTF-8")
+c = dumps(output, double_precision=2, orient="values").encode("UTF-8")
if compress:
c = gzip.compress(c)
start = f.tell()
@@ -78,7 +80,7 @@ def save_dataset_jsonl(dataset, schema, output_dir, base_name, filesystem):
) as f: # save index
# json.dump(result, f)
result = dict(index=index, file=os.path.basename(jsonl_path))
-f.write(ujson.dumps(result, double_precision=2, orient="values"))
+f.write(dumps(result, double_precision=2, orient="values"))


def save_adata_X(adata, f, index, compress, layer=None):
@@ -126,7 +128,7 @@ def save_data_obs(adata, f, index, compress):
for name in adata.obs:
series = adata.obs[name]
value = series
-if pd.api.types.is_categorical_dtype(series):
+if isinstance(series.dtype, CategoricalDtype):
value = dict(values=series.values.codes, categories=series.cat.categories.values)
write_jsonl(value, f, name, index, compress)
write_jsonl(adata.obs.index.values, f, "index", index, compress)
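For orientation on the format these call sites write: `save_dataset_jsonl` appends each named value as one JSON document to a single `.jsonl` file and records where each document lives in an `index` mapping, which is then saved alongside (the `dict(index=index, file=...)` written above). The tail of `write_jsonl` is elided by the diff; a sketch of the implied bookkeeping, where the exact shape of an index entry is an assumption:

```python
import gzip

from cirrocumulus.util import dumps


def write_jsonl(d, f, name, index, compress=False):
    # opening lines verbatim from the diff; the index update is an assumption
    output = {}
    output[name] = d
    c = dumps(output, double_precision=2, orient="values").encode("UTF-8")
    if compress:
        c = gzip.compress(c)
    start = f.tell()
    f.write(c)
    index[name] = [start, f.tell() - start]  # assumed: byte offset and length
```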
5 changes: 2 additions & 3 deletions cirrocumulus/mongo_db.py
@@ -1,12 +1,11 @@
import os
import datetime

-import pandas._libs.json as ujson
from bson import ObjectId
from pymongo import MongoClient

from cirrocumulus.abstract_db import AbstractDB
-from cirrocumulus.util import get_email_domain, get_fs
+from cirrocumulus.util import dumps, get_email_domain, get_fs

from .envir import (
CIRRO_AUTH_CLIENT_ID,
@@ -349,7 +348,7 @@ def update_job(self, email, job_id, status, result):
if os.environ.get(CIRRO_JOB_RESULTS) is not None: # save to directory
result = save_job_result_to_file(result, job_id)
else:
-result = ujson.dumps(result, double_precision=2, orient="values")
+result = dumps(result, double_precision=2, orient="values")
result = str(self.get_gridfs().put(result, encoding="ascii"))

collection.update_one(
5 changes: 3 additions & 2 deletions cirrocumulus/parquet_output.py
@@ -5,7 +5,8 @@
import pyarrow as pa
import scipy.sparse
import pyarrow.parquet as pq
-import pandas._libs.json as ujson

+from cirrocumulus.util import dumps


logger = logging.getLogger("cirro")
@@ -33,7 +34,7 @@ def save_dataset_pq(dataset, schema, output_directory, filesystem, whitelist):
with filesystem.open(
os.path.join(output_directory, "index.json.gz"), "wt", compression="gzip"
) as f:
-f.write(ujson.dumps(schema, double_precision=2, orient="values"))
+f.write(dumps(schema, double_precision=2, orient="values"))
if whitelist["x"]:
save_adata_X(dataset, X_dir, filesystem, whitelist=whitelist["x_keys"])
for layer in dataset.layers.keys():
5 changes: 3 additions & 2 deletions cirrocumulus/prepare_data.py
@@ -7,6 +7,7 @@
import pandas as pd
import anndata
import scipy.sparse
+from pandas import CategoricalDtype

from cirrocumulus.anndata_dataset import read_adata
from cirrocumulus.anndata_util import dataset_schema, get_scanpy_marker_keys
@@ -127,7 +128,7 @@ def __init__(
if pd.api.types.is_object_dtype(c):
dataset.obs[name] = dataset.obs[name].astype("category")
c = dataset.obs[name]
-if not dimensions_supplied and pd.api.types.is_categorical_dtype(c):
+if not dimensions_supplied and isinstance(c.dtype, CategoricalDtype):
if 1 < len(c.cat.categories) < 2000:
self.dimensions.append(name)
if c.isna().sum() > 0:
@@ -202,7 +203,7 @@ def execute(self):
)

if field in dataset.obs:
-if not pd.api.types.is_categorical_dtype(dataset.obs[field]):
+if not isinstance(dataset.obs[field].dtype, CategoricalDtype):
dataset.obs[field] = dataset.obs[field].astype(str).astype("category")
if len(dataset.obs[field].cat.categories) > 1:
key_added = "rank_genes_" + str(field)
16 changes: 14 additions & 2 deletions cirrocumulus/sparse_dataset.py
@@ -13,6 +13,7 @@

import numpy as np
import scipy.sparse as ss
+from anndata.compat import Index1D
from scipy.sparse import _sparsetools


@@ -22,7 +23,18 @@
except ImportError:
_cs_matrix = ss.spmatrix

-from anndata._core.index import Index, _subset, unpack_index
+from anndata._core.index import Index, _subset


+def unpack_index(index: Index) -> tuple[Index1D, Index1D]:
+    if not isinstance(index, tuple):
+        return index, slice(None)
+    elif len(index) == 2:
+        return index
+    elif len(index) == 1:
+        return index[0], slice(None)
+    else:
+        raise IndexError("invalid number of indices")


class BackedFormat(NamedTuple):
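`unpack_index` was previously imported from `anndata._core.index`; vendoring a local copy presumably shields cirrocumulus from that private module changing across anndata releases (the diff itself doesn't state the reason). The helper normalizes any supported index into an explicit `(row, col)` pair; a quick behavior sketch, assuming the vendored function above is in scope:

```python
import numpy as np

# A bare index selects rows; all columns are implied.
print(unpack_index(np.array([0, 2])))  # (array([0, 2]), slice(None, None, None))
# A 2-tuple passes through unchanged.
print(unpack_index((0, slice(None))))  # (0, slice(None, None, None))
# A 1-tuple again implies all columns.
print(unpack_index((0,)))              # (0, slice(None, None, None))
# Longer tuples raise IndexError("invalid number of indices").
```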
@@ -272,7 +284,7 @@ def __setitem__(self, index: Union[Index, Tuple[()]], value):
mock_matrix[row, col] = value

def _normalize_index(self, index: Union[Index, Tuple[()]]) -> Tuple[np.ndarray, np.ndarray]:
-if index == ():
+if isinstance(index, tuple) and len(index) == 0:
index = slice(None)
row, col = unpack_index(index)
if all(isinstance(x, cabc.Iterable) for x in (row, col)):
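The `_normalize_index` guard is the other substantive fix in this file: `index` can be a NumPy array, and `index == ()` asks NumPy for an elementwise comparison, which (depending on the NumPy version) warns, returns an array, or raises, rather than yielding a usable boolean. The new check matches only a genuine empty tuple. A minimal sketch:

```python
import numpy as np


def is_empty_tuple(index) -> bool:
    # the fixed guard: only a real 0-tuple means "select everything"
    return isinstance(index, tuple) and len(index) == 0


print(is_empty_tuple(()))                # True -> normalized to slice(None)
print(is_empty_tuple(np.array([0, 2])))  # False, no ambiguous array comparison
print(is_empty_tuple((0, slice(None))))  # False
```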
