np.issubdtype is unreliable under Windows; use .dtype.kind instead

internaut · Apr 18, 2023 · de2e2db
1 parent 36bb689
commit de2e2db
Show file tree

Hide file tree

Showing 7 changed files with 33 additions and 33 deletions.
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
@@ -502,10 +502,9 @@ def test_doc_tokens_hypothesis(corpora_en_serial_and_parallel_module, **args):
 
                     for v in res.values():
                         if len(v) > 0:
-                            assert np.issubdtype(v['token'].dtype,
-                                                 np.uint64 if args['tokens_as_hashes'] else np.dtype('O'))
+                            assert v['token'].dtype.kind == ('u' if args['tokens_as_hashes'] else 'O')
                             if args['sentences']:
-                                assert np.issubdtype(v['sent'].dtype, 'int')
+                                assert v['sent'].dtype.kind == 'i'
                                 assert np.min(v['sent']) == 0
 
                     res_tokens = {}
@@ -986,7 +985,7 @@ def test_vocabulary_hypothesis(corpora_en_serial_and_parallel_module, select, by
                     assert all(t in corp_flat for t in res)
 
                 if not convert_uint64hashes and tokens_as_hashes and select is None:
-                    assert all([np.issubdtype(t.dtype, 'uint64') for t in res])
+                    assert all(t.dtype.kind == 'u' for t in res)
                 else:
                     if tokens_as_hashes:
                         expect_type = int
@@ -1028,7 +1027,7 @@ def test_vocabulary_counts(corpora_en_serial_and_parallel_module, select, by_att
                         assert len(res) > 0
 
                     if not convert_uint64hashes and tokens_as_hashes:
-                        assert all([np.issubdtype(t.dtype, 'uint64') for t in res.keys()])
+                        assert all(t.dtype.kind == 'u' for t in res.keys())
                     else:
                         if tokens_as_hashes:
                             expect_type = int
@@ -1708,7 +1707,7 @@ def test_kwic_table_hypothesis(corpora_en_serial_and_parallel_module, **args):
                     assert contexts == list(range(len(dkwic)))
 
                 if len(dkwic) > 0:
-                    assert np.issubdtype(dkwic[matchattr], object)
+                    assert dkwic[matchattr].dtype.kind == 'O'
 
                     if args['glue'] is None:
                         assert np.all(0 <= dkwic['position'])
@@ -1888,7 +1887,7 @@ def test_token_cooccurrence_hypothesis(corpora_en_serial_and_parallel_module, **
                 if not (args['as_table'] and len(cooc) == 0):
                     # pandas doesn't respect the dtype when creating an empty dataframe (it's always float64 in that
                     # case)
-                    assert np.issubdtype(cooc.dtype, args['dtype'])
+                    assert cooc.dtype.kind == args['dtype'][:1]
 
                 # shape
                 assert cooc.shape == (n_tok, n_tok)
@@ -1992,7 +1991,7 @@ def test_token_cooccurrence_matrix_example(context_size, sparse_mat, triu):
         else:
             assert isinstance(cooc, np.ndarray)
 
-        assert np.issubdtype(cooc.dtype, 'int32')
+        assert cooc.dtype.kind == 'i'
 
         if triu:
             assert np.all(cooc == np.triu(expected))
@@ -3672,7 +3671,7 @@ def test__token_cooccurrence_matrix(docs, context_size, tokens, tokens_oov, spar
             assert isinstance(cooc, np.ndarray)
 
         # matrix dtype
-        assert np.issubdtype(cooc.dtype, dtype)
+        assert cooc.dtype.kind == dtype[:1]
 
         # matrix shape
         if corpussize == 0:
@@ -3798,7 +3797,7 @@ def _check_corpus_docs(corp: c.Corpus, has_sents: bool):
         assert d.bimaps is corp.bimaps
         assert isinstance(d.tokenmat, np.ndarray)
         assert d.tokenmat.ndim == 2
-        assert np.issubdtype(d.tokenmat.dtype, 'uint64')
+        assert d.tokenmat.dtype.kind == 'u'
         assert len(d) >= 0
         assert len(d) == len(d.tokenmat)
         assert isinstance(d.tokenmat_attrs, list)

diff --git a/tests/test_tokenseq.py b/tests/test_tokenseq.py
@@ -64,19 +64,20 @@ def _test_pad_sequence(s, s_type, el_type, left, right, left_symbol, right_symbo
         assert isinstance(spad, check_type)
 
         if s_type == 'nparray':
-            assert np.issubdtype(spad.dtype, el_type)
-
             if el_type == 'int':
-                el_type_check = np.int64
+                el_type_check = 'i'
             else:
-                el_type_check = str
+                el_type_check = 'U'   # unicode char
+
+            assert spad.dtype.kind == el_type_check
+            assert all(t.dtype.kind == el_type_check for t in list(spad))
         else:
             if el_type == 'int':
                 el_type_check = int
             else:
                 el_type_check = str
 
-        assert all(isinstance(t, el_type_check) for t in list(spad))
+            assert all(isinstance(t, el_type_check) for t in list(spad))
 
         assert len(spad) >= len(s)
 
@@ -213,7 +214,7 @@ def test_token_hash_convert(tokens, tokens_as_hashes, tokens_as_array, special_t
 
         if tokens_as_hashes:
             if tokens_as_array:
-                assert np.issubdtype(res.dtype, 'str')
+                assert res.dtype.kind == 'U'
             else:
                 assert all(isinstance(t, str) for t in res)
 
@@ -444,7 +445,7 @@ def test_token_collocation_matrix_hypothesis(sentences, min_count, pass_embed_to
             mat = res
 
         assert isinstance(mat, sparse.csr_matrix)
-        assert np.issubdtype(mat.dtype, 'uint32')
+        assert mat.dtype.kind == 'u'
 
         if len(tok) < 2:
             assert mat.nnz == 0

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -130,7 +130,7 @@ def test_empty_chararray():
     assert isinstance(res, np.ndarray)
     assert len(res) == 0
     assert res.ndim == 1
-    assert np.issubdtype(res.dtype, 'str')
+    assert res.dtype.kind == 'U'
 
 
 @given(x=st.lists(st.integers()),
@@ -144,7 +144,7 @@ def test_as_chararray(x, as_numpy_array):
     assert isinstance(res, np.ndarray)
     assert len(res) == len(x)
     assert res.ndim == 1
-    assert np.issubdtype(res.dtype, 'str')
+    assert res.dtype.kind == 'U'
     assert res.tolist() == list(map(str, x_orig))
 
 
@@ -265,10 +265,10 @@ def test_pairwise_max_table(m, pass_sparse, labels, output_columns, sort, sort_a
             assert np.all(res_vals > 0)
 
         if labels:
-            assert np.issubdtype(res_lbls.dtype, 'str') or np.issubdtype(res_lbls.dtype, object)
+            assert res_lbls.dtype.kind in {'U', 'O'}
             assert np.all(lbl in args['labels'] for lbl in res_lbls.flatten())
         else:
-            assert np.issubdtype(res_lbls.dtype, 'int')
+            assert res_lbls.dtype.kind == 'i'
             assert np.all(res_lbls >= 0)
             assert np.all(res_lbls < n)
 
@@ -746,9 +746,9 @@ def test_mat_rinterop(m, to_int, pass_dimnames, return_dimnames):
 
     if to_int:
         assert isinstance(rm, robjects.vectors.IntMatrix)
-        assert np.issubdtype(m_.dtype, 'int32') or np.issubdtype(m_.dtype, 'int64')
+        assert m_.dtype.kind == 'i'
     else:
         assert isinstance(rm, robjects.vectors.FloatMatrix)
-        assert np.issubdtype(m_.dtype, 'float32') or np.issubdtype(m_.dtype, 'float64')
+        assert m_.dtype.kind == 'f'
 
     assert np.allclose(m, m_)
diff --git a/tmtoolkit/corpus/_corpusfuncs.py b/tmtoolkit/corpus/_corpusfuncs.py
@@ -1624,18 +1624,18 @@ def token_cooccurrence(docs: Corpus,
             return empty_res
 
     if token_hashes is None:
-        if isinstance(tokens, np.ndarray) and np.issubdtype(tokens.dtype, 'uint64'):
+        if isinstance(tokens, np.ndarray) and tokens.dtype.kind == 'u':
             token_hashes = tokens
         else:
-            if (isinstance(tokens, np.ndarray) and np.issubdtype(tokens.dtype, 'str')) \
+            if (isinstance(tokens, np.ndarray) and tokens.dtype.kind == 'U') \
                     or isinstance(next(iter(tokens)), str):
                 # list or NumPy array of strings
                 token_hashes = np.array(token_hash_convert(tokens, stringstore=bimap_attr.inv), dtype='uint64')
             else:   # list or NumPy array of int hashes
                 token_hashes = np.array(tokens, dtype='uint64')
 
     assert n_tok == len(token_hashes)
-    assert isinstance(token_hashes, np.ndarray) and np.issubdtype(token_hashes.dtype, 'uint64')
+    assert isinstance(token_hashes, np.ndarray) and token_hashes.dtype.kind == 'u'
 
     @parallelexec(collect_fn=merge_dicts if per_document else list)
     def _parallel_token_cooc(chunk):
@@ -2767,7 +2767,7 @@ def filter_tokens_by_mask(docs: Corpus, /, mask: Dict[str, Union[List[bool], np.
 
         if not isinstance(m, np.ndarray):
             m = np.array(m, dtype=bool)
-        elif not np.issubdtype(m.dtype, bool):
+        elif m.dtype.kind != 'b':
             m = m.astype(bool)
 
         if inverse:
@@ -3917,7 +3917,7 @@ def _build_kwic_parallel(docs, search_tokens, context_size, by_attr, match_type,
             docdata = docs[lbl]
             tok_arr = docdata.pop('_matchagainst')
 
-            if not isinstance(tok_arr, np.ndarray) or not np.issubdtype(tok_arr.dtype, str):
+            if not isinstance(tok_arr, np.ndarray) or tok_arr.dtype.kind != 'U':
                 assert isinstance(tok_arr, (list, tuple, np.ndarray))
                 tok_arr = as_chararray(tok_arr)
 
@@ -4144,7 +4144,7 @@ def _token_cooccurrence_matrix(docs: Sequence[Union[List[StrOrInt], np.ndarray]]
     if len(set(tokens)) != len(tokens):
         raise ValueError('`tokens` shall not contain duplicate elements')
 
-    as_hashes = (isinstance(tokens, np.ndarray) and np.issubdtype(tokens.dtype, 'uint64')) or \
+    as_hashes = (isinstance(tokens, np.ndarray) and tokens.dtype.kind == 'u') or \
                 isinstance(next(iter(tokens)), int)
 
     if as_hashes:

diff --git a/tmtoolkit/corpus/_document.py b/tmtoolkit/corpus/_document.py
@@ -431,7 +431,7 @@ def uint64arr_from_strings(attr, strings):
     def values_as_uint64arr(attr, val):
         """Helper function that tries to convert `val` to an array of hashes, depending on the type of `val`."""
         if isinstance(val, np.ndarray):
-            if np.issubdtype(val.dtype, str):    # this is an array of strings -> convert to hashes
+            if val.dtype.kind == 'U':    # this is an array of strings -> convert to hashes
                 return uint64arr_from_strings(attr, val.tolist())
             else:
                 return val.astype('uint64')

diff --git a/tmtoolkit/tokenseq/__init__.py b/tmtoolkit/tokenseq/__init__.py
@@ -52,7 +52,7 @@ def pad_sequence(s: Union[Tuple[StrOrInt, ...], List[StrOrInt], np.ndarray], lef
     if isinstance(s, tuple):
         return tuple(prepend) + s + tuple(append)
     elif isinstance(s, np.ndarray):
-        if np.issubdtype(s.dtype, 'str'):
+        if s.dtype.kind == 'U':
             to_dtype = 'str'
         else:
             to_dtype = s.dtype

diff --git a/tmtoolkit/utils.py b/tmtoolkit/utils.py
@@ -210,7 +210,7 @@ def as_chararray(x: Union[np.ndarray, Sequence]) -> np.ndarray:
     """
     if len(x) > 0:
         if isinstance(x, np.ndarray):
-            if np.issubdtype(x.dtype, str):
+            if x.dtype.kind == 'U':
                 return x.copy()
             else:
                 return x.astype(str)
@@ -232,7 +232,7 @@ def chararray_elem_size(x: np.ndarray) -> int:
     :param x: NumPy unicode character array
     :return: reserved size of each element
     """
-    if isinstance(x, np.ndarray) and np.issubdtype(x.dtype, 'U'):
+    if isinstance(x, np.ndarray) and x.dtype.kind == 'U':
         return x.itemsize // numpy_unicode_bytes
     else:
         raise ValueError('`x` must be a NumPy unicode character array')