Skip to content

Commit

Permalink
Merge pull request #94 from lilab-bcb/boli
Browse files Browse the repository at this point in the history
Added transpose option and fixed a bug in read_csv
  • Loading branch information
bli25 committed Jul 4, 2022
2 parents 3160e7d + 3c9469f commit 6fa9f60
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 11 deletions.
10 changes: 8 additions & 2 deletions ext_modules/io_funcs.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import numpy as np

from libc.stdlib cimport malloc, free, atoi
from libc.stdio cimport fopen, fclose, getline, FILE, fscanf, sscanf, fprintf, fread, fseek, SEEK_CUR, SEEK_SET#, printf
from libc.string cimport strncmp, strlen, strtok, strcmp, memcpy, memset
from libc.string cimport strncmp, strlen, strtok, strchr, strcmp, memcpy, memset
from libc.math cimport pow

cimport cython
Expand Down Expand Up @@ -144,7 +144,13 @@ cpdef tuple read_csv(char* csv_file, char* delimiters):
assert getline(&line, &size, fi) >= 0
pch = strtok(line, delimiters)
assert pch != NULL
row_key = pch

if strchr(delimiters, line[0]) != NULL:
row_key = ""
N = 1
colnames.append(pch)
else:
row_key = pch

pch = strtok(NULL, delimiters)
while pch != NULL:
Expand Down
5 changes: 4 additions & 1 deletion pegasusio/readwrite.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ def read_input(
select_data: Set[str] = None,
select_genome: Set[str] = None,
select_modality: Set[str] = None,
transpose: bool = False,
) -> MultimodalData:
"""Load data into memory.
This function is used to load input data into memory. Inputs can be in 'zarr', 'h5ad', 'loom', '10x', 'mtx', 'csv', 'tsv', 'fcs' (for flow/mass cytometry data) or 'nanostring' (Nanostring GeoMx spatial data) formats.
Expand All @@ -122,6 +123,8 @@ def read_input(
Only select data with genomes in select_genome. Select_data, select_genome and select_modality are mutually exclusive.
select_modality: `Set[str]`, optional (default: None)
Only select data with modalities in select_modality. Select_data, select_genome and select_modality are mutually exclusive.
transpose: `bool`, optional (default: False)
Only applicable if input type is 'csv' or 'tsv'. Turn it on if gene names are the column names.
Returns
-------
Expand Down Expand Up @@ -172,7 +175,7 @@ def read_input(
if is_vdj_file(input_file, file_type):
data = load_10x_vdj_file(input_file, genome = genome, modality = modality)
else:
data = load_csv_file(input_file, sep = "," if file_type == "csv" else "\t", genome = genome, modality = modality)
data = load_csv_file(input_file, sep = "," if file_type == "csv" else "\t", genome = genome, modality = modality, transpose = transpose)

data.subset_data(select_data, select_genome, select_modality)
data.kick_start()
Expand Down
24 changes: 16 additions & 8 deletions pegasusio/text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@ def load_csv_file(
sep: str = ",",
genome: str = None,
modality: str = None,
transpose: bool = False,
) -> MultimodalData:
"""Load count matrix from a CSV-style file, such as CSV file or DGE style tsv file.
Expand All @@ -328,6 +329,8 @@ def load_csv_file(
The genome reference. If None, use "unknown" instead.
modality: `str`, optional (default None)
Modality. If None, use "rna" instead.
transpose: `bool`, optional (default False)
Whether to transpose the matrix. Transposition is needed if gene names are in the columns.
Returns
-------
Expand Down Expand Up @@ -382,15 +385,20 @@ def load_csv_file(
((barcode_metadata["barcodekey"].values != np.array(rownames)).sum() == 0) and ((feature_metadata["featureid"].values != np.array(colnames)).sum() == 0)
mat = csr_matrix((data, (row_ind, col_ind)), shape = shape)
else:
mat = csr_matrix((data, (col_ind, row_ind)), shape = (shape[1], shape[0]))
if barcode_metadata is None:
barcode_metadata = {"barcodekey": colnames}
if transpose:
mat = csr_matrix((data, (row_ind, col_ind)), shape = (shape[0], shape[1]))
barcode_metadata = {"barcodekey": rownames}
feature_metadata = {"featurekey": colnames}
else:
assert (barcode_metadata.shape[0] == shape[1]) and ((barcode_metadata["barcodekey"].values != np.array(colnames)).sum() == 0)
if feature_metadata is None:
feature_metadata = {"featurekey": rownames}
else:
assert (feature_metadata.shape[0] == shape[0]) and ((feature_metadata["featurekey"].values != np.array(rownames)).sum() == 0)
mat = csr_matrix((data, (col_ind, row_ind)), shape = (shape[1], shape[0]))
if barcode_metadata is None:
barcode_metadata = {"barcodekey": colnames}
else:
assert (barcode_metadata.shape[0] == shape[1]) and ((barcode_metadata["barcodekey"].values != np.array(colnames)).sum() == 0)
if feature_metadata is None:
feature_metadata = {"featurekey": rownames}
else:
assert (feature_metadata.shape[0] == shape[0]) and ((feature_metadata["featurekey"].values != np.array(rownames)).sum() == 0)

genome = genome if genome is not None else "unknown"
modality = modality if modality is not None else "rna"
Expand Down

0 comments on commit 6fa9f60

Please sign in to comment.