Skip to content

Commit

Permalink
np.issubdtype is unreliable under Windows; use .dtype.kind instead
Browse files (browse the repository at this point in the history)
  • Loading branch information
internaut committed Apr 18, 2023
1 parent 36bb689 commit de2e2db
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 33 deletions.
19 changes: 9 additions & 10 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,10 +502,9 @@ def test_doc_tokens_hypothesis(corpora_en_serial_and_parallel_module, **args):

for v in res.values():
if len(v) > 0:
assert np.issubdtype(v['token'].dtype,
np.uint64 if args['tokens_as_hashes'] else np.dtype('O'))
assert v['token'].dtype.kind == ('u' if args['tokens_as_hashes'] else 'O')
if args['sentences']:
assert np.issubdtype(v['sent'].dtype, 'int')
assert v['sent'].dtype.kind == 'i'
assert np.min(v['sent']) == 0

res_tokens = {}
Expand Down Expand Up @@ -986,7 +985,7 @@ def test_vocabulary_hypothesis(corpora_en_serial_and_parallel_module, select, by
assert all(t in corp_flat for t in res)

if not convert_uint64hashes and tokens_as_hashes and select is None:
assert all([np.issubdtype(t.dtype, 'uint64') for t in res])
assert all(t.dtype.kind == 'u' for t in res)
else:
if tokens_as_hashes:
expect_type = int
Expand Down Expand Up @@ -1028,7 +1027,7 @@ def test_vocabulary_counts(corpora_en_serial_and_parallel_module, select, by_att
assert len(res) > 0

if not convert_uint64hashes and tokens_as_hashes:
assert all([np.issubdtype(t.dtype, 'uint64') for t in res.keys()])
assert all(t.dtype.kind == 'u' for t in res.keys())
else:
if tokens_as_hashes:
expect_type = int
Expand Down Expand Up @@ -1708,7 +1707,7 @@ def test_kwic_table_hypothesis(corpora_en_serial_and_parallel_module, **args):
assert contexts == list(range(len(dkwic)))

if len(dkwic) > 0:
assert np.issubdtype(dkwic[matchattr], object)
assert dkwic[matchattr].dtype.kind == 'O'

if args['glue'] is None:
assert np.all(0 <= dkwic['position'])
Expand Down Expand Up @@ -1888,7 +1887,7 @@ def test_token_cooccurrence_hypothesis(corpora_en_serial_and_parallel_module, **
if not (args['as_table'] and len(cooc) == 0):
# pandas doesn't respect the dtype when creating an empty dataframe (it's always float64 in that
# case)
assert np.issubdtype(cooc.dtype, args['dtype'])
assert cooc.dtype.kind == args['dtype'][:1]

# shape
assert cooc.shape == (n_tok, n_tok)
Expand Down Expand Up @@ -1992,7 +1991,7 @@ def test_token_cooccurrence_matrix_example(context_size, sparse_mat, triu):
else:
assert isinstance(cooc, np.ndarray)

assert np.issubdtype(cooc.dtype, 'int32')
assert cooc.dtype.kind == 'i'

if triu:
assert np.all(cooc == np.triu(expected))
Expand Down Expand Up @@ -3672,7 +3671,7 @@ def test__token_cooccurrence_matrix(docs, context_size, tokens, tokens_oov, spar
assert isinstance(cooc, np.ndarray)

# matrix dtype
assert np.issubdtype(cooc.dtype, dtype)
assert cooc.dtype.kind == dtype[:1]

# matrix shape
if corpussize == 0:
Expand Down Expand Up @@ -3798,7 +3797,7 @@ def _check_corpus_docs(corp: c.Corpus, has_sents: bool):
assert d.bimaps is corp.bimaps
assert isinstance(d.tokenmat, np.ndarray)
assert d.tokenmat.ndim == 2
assert np.issubdtype(d.tokenmat.dtype, 'uint64')
assert d.tokenmat.dtype.kind == 'u'
assert len(d) >= 0
assert len(d) == len(d.tokenmat)
assert isinstance(d.tokenmat_attrs, list)
Expand Down
15 changes: 8 additions & 7 deletions tests/test_tokenseq.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,19 +64,20 @@ def _test_pad_sequence(s, s_type, el_type, left, right, left_symbol, right_symbo
assert isinstance(spad, check_type)

if s_type == 'nparray':
assert np.issubdtype(spad.dtype, el_type)

if el_type == 'int':
el_type_check = np.int64
el_type_check = 'i'
else:
el_type_check = str
el_type_check = 'U' # unicode char

assert spad.dtype.kind == el_type_check
assert all(t.dtype.kind == el_type_check for t in list(spad))
else:
if el_type == 'int':
el_type_check = int
else:
el_type_check = str

assert all(isinstance(t, el_type_check) for t in list(spad))
assert all(isinstance(t, el_type_check) for t in list(spad))

assert len(spad) >= len(s)

Expand Down Expand Up @@ -213,7 +214,7 @@ def test_token_hash_convert(tokens, tokens_as_hashes, tokens_as_array, special_t

if tokens_as_hashes:
if tokens_as_array:
assert np.issubdtype(res.dtype, 'str')
assert res.dtype.kind == 'U'
else:
assert all(isinstance(t, str) for t in res)

Expand Down Expand Up @@ -444,7 +445,7 @@ def test_token_collocation_matrix_hypothesis(sentences, min_count, pass_embed_to
mat = res

assert isinstance(mat, sparse.csr_matrix)
assert np.issubdtype(mat.dtype, 'uint32')
assert mat.dtype.kind == 'u'

if len(tok) < 2:
assert mat.nnz == 0
Expand Down
12 changes: 6 additions & 6 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def test_empty_chararray():
assert isinstance(res, np.ndarray)
assert len(res) == 0
assert res.ndim == 1
assert np.issubdtype(res.dtype, 'str')
assert res.dtype.kind == 'U'


@given(x=st.lists(st.integers()),
Expand All @@ -144,7 +144,7 @@ def test_as_chararray(x, as_numpy_array):
assert isinstance(res, np.ndarray)
assert len(res) == len(x)
assert res.ndim == 1
assert np.issubdtype(res.dtype, 'str')
assert res.dtype.kind == 'U'
assert res.tolist() == list(map(str, x_orig))


Expand Down Expand Up @@ -265,10 +265,10 @@ def test_pairwise_max_table(m, pass_sparse, labels, output_columns, sort, sort_a
assert np.all(res_vals > 0)

if labels:
assert np.issubdtype(res_lbls.dtype, 'str') or np.issubdtype(res_lbls.dtype, object)
assert res_lbls.dtype.kind in {'U', 'O'}
assert np.all(lbl in args['labels'] for lbl in res_lbls.flatten())
else:
assert np.issubdtype(res_lbls.dtype, 'int')
assert res_lbls.dtype.kind == 'i'
assert np.all(res_lbls >= 0)
assert np.all(res_lbls < n)

Expand Down Expand Up @@ -746,9 +746,9 @@ def test_mat_rinterop(m, to_int, pass_dimnames, return_dimnames):

if to_int:
assert isinstance(rm, robjects.vectors.IntMatrix)
assert np.issubdtype(m_.dtype, 'int32') or np.issubdtype(m_.dtype, 'int64')
assert m_.dtype.kind == 'i'
else:
assert isinstance(rm, robjects.vectors.FloatMatrix)
assert np.issubdtype(m_.dtype, 'float32') or np.issubdtype(m_.dtype, 'float64')
assert m_.dtype.kind == 'f'

assert np.allclose(m, m_)
12 changes: 6 additions & 6 deletions tmtoolkit/corpus/_corpusfuncs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1624,18 +1624,18 @@ def token_cooccurrence(docs: Corpus,
return empty_res

if token_hashes is None:
if isinstance(tokens, np.ndarray) and np.issubdtype(tokens.dtype, 'uint64'):
if isinstance(tokens, np.ndarray) and tokens.dtype.kind == 'u':
token_hashes = tokens
else:
if (isinstance(tokens, np.ndarray) and np.issubdtype(tokens.dtype, 'str')) \
if (isinstance(tokens, np.ndarray) and tokens.dtype.kind == 'U') \
or isinstance(next(iter(tokens)), str):
# list or NumPy array of strings
token_hashes = np.array(token_hash_convert(tokens, stringstore=bimap_attr.inv), dtype='uint64')
else: # list or NumPy array of int hashes
token_hashes = np.array(tokens, dtype='uint64')

assert n_tok == len(token_hashes)
assert isinstance(token_hashes, np.ndarray) and np.issubdtype(token_hashes.dtype, 'uint64')
assert isinstance(token_hashes, np.ndarray) and token_hashes.dtype.kind == 'u'

@parallelexec(collect_fn=merge_dicts if per_document else list)
def _parallel_token_cooc(chunk):
Expand Down Expand Up @@ -2767,7 +2767,7 @@ def filter_tokens_by_mask(docs: Corpus, /, mask: Dict[str, Union[List[bool], np.

if not isinstance(m, np.ndarray):
m = np.array(m, dtype=bool)
elif not np.issubdtype(m.dtype, bool):
elif m.dtype.kind != 'b':
m = m.astype(bool)

if inverse:
Expand Down Expand Up @@ -3917,7 +3917,7 @@ def _build_kwic_parallel(docs, search_tokens, context_size, by_attr, match_type,
docdata = docs[lbl]
tok_arr = docdata.pop('_matchagainst')

if not isinstance(tok_arr, np.ndarray) or not np.issubdtype(tok_arr.dtype, str):
if not isinstance(tok_arr, np.ndarray) or tok_arr.dtype.kind != 'U':
assert isinstance(tok_arr, (list, tuple, np.ndarray))
tok_arr = as_chararray(tok_arr)

Expand Down Expand Up @@ -4144,7 +4144,7 @@ def _token_cooccurrence_matrix(docs: Sequence[Union[List[StrOrInt], np.ndarray]]
if len(set(tokens)) != len(tokens):
raise ValueError('`tokens` shall not contain duplicate elements')

as_hashes = (isinstance(tokens, np.ndarray) and np.issubdtype(tokens.dtype, 'uint64')) or \
as_hashes = (isinstance(tokens, np.ndarray) and tokens.dtype.kind == 'u') or \
isinstance(next(iter(tokens)), int)

if as_hashes:
Expand Down
2 changes: 1 addition & 1 deletion tmtoolkit/corpus/_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ def uint64arr_from_strings(attr, strings):
def values_as_uint64arr(attr, val):
"""Helper function that tries to convert `val` to an array of hashes, depending on the type of `val`."""
if isinstance(val, np.ndarray):
if np.issubdtype(val.dtype, str): # this is an array of strings -> convert to hashes
if val.dtype.kind == 'U': # this is an array of strings -> convert to hashes
return uint64arr_from_strings(attr, val.tolist())
else:
return val.astype('uint64')
Expand Down
2 changes: 1 addition & 1 deletion tmtoolkit/tokenseq/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def pad_sequence(s: Union[Tuple[StrOrInt, ...], List[StrOrInt], np.ndarray], lef
if isinstance(s, tuple):
return tuple(prepend) + s + tuple(append)
elif isinstance(s, np.ndarray):
if np.issubdtype(s.dtype, 'str'):
if s.dtype.kind == 'U':
to_dtype = 'str'
else:
to_dtype = s.dtype
Expand Down
4 changes: 2 additions & 2 deletions tmtoolkit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def as_chararray(x: Union[np.ndarray, Sequence]) -> np.ndarray:
"""
if len(x) > 0:
if isinstance(x, np.ndarray):
if np.issubdtype(x.dtype, str):
if x.dtype.kind == 'U':
return x.copy()
else:
return x.astype(str)
Expand All @@ -232,7 +232,7 @@ def chararray_elem_size(x: np.ndarray) -> int:
:param x: NumPy unicode character array
:return: reserved size of each element
"""
if isinstance(x, np.ndarray) and np.issubdtype(x.dtype, 'U'):
if isinstance(x, np.ndarray) and x.dtype.kind == 'U':
return x.itemsize // numpy_unicode_bytes
else:
raise ValueError('`x` must be a NumPy unicode character array')
Expand Down

0 comments on commit de2e2db

Please sign in to comment.