Merge from master 2021_w8 (IntelPython#962)

* Adds Int64Index type and updates Series and DF methods to use it (IntelPython#950) * Adds Int64Index type and updates Series and DF methods to use it Motivation: as part of the work on supporting common pandas indexes a new type (Int64IndexType) representing pandas.Int64Index is added. Boxing/unboxing of Series and DataFrames as well as common numpy-like functions are changed accordingly to handle it. * Fixing DateTime tests and PEP remarks * Fixing review comments #1 * Move to Numba 0.52 (IntelPython#939) * Taking numba from master * Moving to Numba 0.52 commit 3182540b127268ace11cf4042cd87f044875d9fa Author: Kozlov, Alexey <alexey.kozlov@intel.com> Date: Wed Oct 21 19:49:58 2020 +0300 Cleaning up before squash commit 895668116542fe3057f73fcb276c441cbde66747 Author: Kozlov, Alexey <alexey.kozlov@intel.com> Date: Tue Oct 13 17:31:34 2020 +0300 Workaround for set from str_arr problem * Fixing correct NUMBA_VERSION * Remove intel/label/beta channel from Azure CI builds * Move to pandas=1.2.0 (IntelPython#959) * Move to pandas=1.2.0 Motivation: use latest versions of dependencies. * More failed tests are fixed * Fixing doc build * Fixing bug in stability of mergesort impl for StringArray (IntelPython#961) Motivation: for StringArray type legacy implementation of stable sort computed result when sorting with ascending=False by reversing the result of argsorting with ascending=True, which produces wrong order in groups of elements with the same value. Implemented solution adds new function argument 'ascending' and uses it when calling native function impl via serial stable_sort.
kozlov-alexey · Feb 19, 2021 · 5ce3841 · 5ce3841
1 parent d32e9ec
commit 5ce3841
Show file tree

Hide file tree

Showing 41 changed files with 2,021 additions and 449 deletions.
diff --git a/README.rst b/README.rst
@@ -34,13 +34,13 @@ Distribution includes Intel® SDC for Python 3.6 and Python 3.7 for Windows and
 
 Intel® SDC conda package can be installed using the steps below::
 
-    > conda create -n sdc-env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.0.5 -c anaconda -c conda-forge
+    > conda create -n sdc-env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge
     > conda activate sdc-env
     > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels
 
 Intel® SDC wheel package can be installed using the steps below::
 
-    > conda create -n sdc-env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.0.5 -c anaconda -c conda-forge
+    > conda create -n sdc-env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge
     > conda activate sdc-env
     > pip install --index-url https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc
 
@@ -82,7 +82,7 @@ Building on Linux with setuptools
 
     export PYVER=<3.6 or 3.7>
     export NUMPYVER=<1.16 or 1.17>
-    conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.49 pandas=1.0.5 pyarrow=0.17.0 gcc_linux-64 gxx_linux-64
+    conda create -n sdc-env -q -y -c intel/label/beta -c defaults -c intel -c conda-forge python=$PYVER numpy=$NUMPYVER tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=0.17.0 gcc_linux-64 gxx_linux-64
     source activate sdc-env
     git clone https://github.com/IntelPython/sdc.git
     cd sdc
@@ -120,7 +120,7 @@ Building on Windows with setuptools
 
     set PYVER=<3.6 or 3.7>
     set NUMPYVER=<1.16 or 1.17>
-    conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.49 pandas=1.0.5 pyarrow=0.17.0
+    conda create -n sdc-env -c intel/label/beta -c defaults -c intel -c conda-forge python=%PYVER% numpy=%NUMPYVER% tbb-devel tbb4py numba=0.52 pandas=1.2.0 pyarrow=0.17.0
     conda activate sdc-env
     set INCLUDE=%INCLUDE%;%CONDA_PREFIX%\Library\include
     set LIB=%LIB%;%CONDA_PREFIX%\Library\lib

diff --git a/buildscripts/utilities.py b/buildscripts/utilities.py
@@ -52,7 +52,7 @@ def __init__(self, python, sdc_local_channel=None):
         self.line_single = '-'*80
 
         # Set channels
-        self.channel_list = ['-c', 'intel/label/beta', '-c', 'defaults', '-c', 'conda-forge']
+        self.channel_list = ['-c', 'defaults', '-c', 'conda-forge']
         if sdc_local_channel:
             sdc_local_channel = Path(sdc_local_channel).resolve().as_uri()
             self.channel_list = ['-c', sdc_local_channel] + self.channel_list

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
@@ -1,5 +1,5 @@
-{% set NUMBA_VERSION = "==0.51.2" %}
-{% set PANDAS_VERSION = "==1.0.5" %}
+{% set NUMBA_VERSION = "==0.52.0" %}
+{% set PANDAS_VERSION = "==1.2.0" %}
 {% set PYARROW_VERSION = "==0.17.0" %}
 
 package:

diff --git a/docs/source/_templates/_api_ref.pandas.window_templ.rst b/docs/source/_templates/_api_ref.pandas.window_templ.rst
@@ -51,8 +51,8 @@ Exponentially-weighted moving window functions
 ----------------------------------------------
 
 .. sdc_toctree
-   EWM.mean
-   EWM.std
-   EWM.var
-   EWM.corr
-   EWM.cov
+   ewm.ExponentialMovingWindow.mean
+   ewm.ExponentialMovingWindow.std
+   ewm.ExponentialMovingWindow.var
+   ewm.ExponentialMovingWindow.corr
+   ewm.ExponentialMovingWindow.cov
diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst
@@ -41,14 +41,14 @@ Distribution includes Intel SDC for Python 3.6 and 3.7 for Windows and Linux pla
 Intel SDC conda package can be installed using the steps below:
 ::
 
-    > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=0.25.3 -c anaconda -c conda-forge
+    > conda create -n sdc_env python=<3.7 or 3.6> pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge
     > conda activate sdc_env
     > conda install sdc -c intel/label/beta -c intel -c defaults -c conda-forge --override-channels
 
 Intel SDC wheel package can be installed using the steps below:
 ::
 
-    > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=0.25.3 -c anaconda -c conda-forge
+    > conda create -n sdc_env python=<3.7 or 3.6> pip pyarrow=0.17.0 pandas=1.2.0 -c anaconda -c conda-forge
     > conda activate sdc_env
     > pip install --index-url https://pypi.anaconda.org/intel/label/beta/simple --extra-index-url https://pypi.anaconda.org/intel/simple --extra-index-url https://pypi.org/simple sdc
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 numpy>=1.16
-pandas==0.25.3
+pandas==1.2.0
 pyarrow==0.17.0
-numba==0.51.2
+numba==0.52.0
 tbb
 tbb-devel
diff --git a/sdc/__init__.py b/sdc/__init__.py
@@ -28,7 +28,7 @@
 
 # re-export from Numba
 from numba import (typeof, prange, pndindex, gdb, gdb_breakpoint, gdb_init,
-                   stencil, threading_layer, jitclass, objmode)
+                   stencil, threading_layer, objmode)
 
 import sdc.config
 import sdc.set_ext
@@ -48,6 +48,7 @@
 import sdc.datatypes.series.init
 
 import sdc.extensions.indexes.range_index_ext
+import sdc.extensions.indexes.int64_index_ext
 
 from ._version import get_versions
 

diff --git a/sdc/_str_ext.cpp b/sdc/_str_ext.cpp
@@ -31,6 +31,7 @@
 #include <string>
 #include <vector>
 #include <cmath>
+#include <algorithm>
 
 #include "_str_decode.cpp"
 
@@ -129,6 +130,7 @@ extern "C"
     npy_intp array_size(PyArrayObject* arr);
     void* array_getptr1(PyArrayObject* arr, npy_intp ind);
     void array_setitem(PyArrayObject* arr, char* p, PyObject* s);
+    void stable_argsort(char* data_ptr, uint32_t* in_offsets, int64_t len, int8_t ascending, uint64_t* result);
 
     PyMODINIT_FUNC PyInit_hstr_ext(void)
     {
@@ -201,6 +203,7 @@ extern "C"
         PyObject_SetAttrString(m, "array_setitem", PyLong_FromVoidPtr((void*)(&array_setitem)));
         PyObject_SetAttrString(m, "decode_utf8", PyLong_FromVoidPtr((void*)(&decode_utf8)));
         PyObject_SetAttrString(m, "get_utf8_size", PyLong_FromVoidPtr((void*)(&get_utf8_size)));
+        PyObject_SetAttrString(m, "stable_argsort", PyLong_FromVoidPtr((void*)(&stable_argsort)));
         return m;
     }
 
@@ -871,4 +874,35 @@ extern "C"
         return;
     }
 
+    void stable_argsort(char* data_ptr, uint32_t* in_offsets, int64_t len, int8_t ascending, uint64_t* result)  // fills result[0..len) with the original indexes of the len strings, in sorted order
+    {
+        using str_index_pair_type = std::pair<std::string, int64_t>;  // (copy of the string, its original position)
+        std::vector<str_index_pair_type> str_arr_indexed;
+        str_arr_indexed.reserve(len);
+
+        for (int64_t i=0; i < len; ++i)
+        {
+            uint32_t start = in_offsets[i];  // string i occupies bytes [in_offsets[i], in_offsets[i + 1]) of data_ptr
+            uint32_t size = in_offsets[i + 1] - in_offsets[i];  // in_offsets is therefore read up to index len inclusive
+            str_arr_indexed.emplace_back(
+                    std::move(std::string(&data_ptr[start], size)),
+                    i
+            );
+        }
+
+        std::stable_sort(str_arr_indexed.begin(),  // stable: pairs whose strings compare equal keep their input order
+                         str_arr_indexed.end(),
+                         [=](const str_index_pair_type& left, const str_index_pair_type& right){  // only the string part is compared, never the index
+                            if (ascending)
+                                return left.first < right.first;
+                            else
+                                return left.first > right.first;  // descending uses a reversed comparison, not a reversed result, so ties stay in input order
+                         }
+        );
+
+        for (int64_t i=0; i < len; ++i)
+            result[i] = str_arr_indexed[i].second;  // emit only the original positions
+    }
+
+
 } // extern "C"
diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py
@@ -48,14 +48,17 @@
 from sdc.functions import numpy_like
 from sdc.str_arr_type import string_array_type, StringArrayType
 from sdc.datatypes.range_index_type import RangeIndexType
+from sdc.datatypes.int64_index_type import Int64IndexType
 from sdc.str_arr_ext import (num_total_chars, append_string_array_to,
                              str_arr_is_na, pre_alloc_string_array, str_arr_set_na, string_array_type,
                              cp_str_list_to_array, create_str_arr_from_list, get_utf8_size,
-                             str_arr_set_na_by_mask)
+                             str_arr_set_na_by_mask, str_arr_stable_argosort)
 from sdc.utilities.prange_utils import parallel_chunks
 from sdc.utilities.utils import sdc_overload, sdc_register_jitable
-from sdc.utilities.sdc_typing_utils import (find_common_dtype_from_numpy_dtypes,
-                                            TypeChecker)
+from sdc.utilities.sdc_typing_utils import (
+                            find_common_dtype_from_numpy_dtypes,
+                            TypeChecker)
+from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types
 
 
 class SDCLimitation(Exception):
@@ -71,18 +74,20 @@ def hpat_arrays_append(A, B):
 def hpat_arrays_append_overload(A, B):
     """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A"""
 
-    A_is_range_index = isinstance(A, RangeIndexType)
-    B_is_range_index = isinstance(B, RangeIndexType)
-    if isinstance(A, (types.Array, RangeIndexType)):
-        if isinstance(B, (types.Array, RangeIndexType)):
+    use_A_array = isinstance(A, (RangeIndexType, Int64IndexType))
+    use_B_array = isinstance(B, (RangeIndexType, Int64IndexType))
+    if isinstance(A, (types.Array, RangeIndexType, Int64IndexType)):
+        if isinstance(B, (types.Array, RangeIndexType, Int64IndexType)):
             def _append_single_numeric_impl(A, B):
-                _A = A.values if A_is_range_index == True else A  # noqa
-                _B = B.values if B_is_range_index == True else B  # noqa
+                _A = A.values if use_A_array == True else A  # noqa
+                _B = B.values if use_B_array == True else B  # noqa
                 return numpy.concatenate((_A, _B,))
 
             return _append_single_numeric_impl
-        elif isinstance(B, (types.UniTuple, types.List)) and isinstance(B.dtype, (types.Array, RangeIndexType)):
-            B_dtype_is_range_index = isinstance(B.dtype, RangeIndexType)
+
+        elif (isinstance(B, (types.UniTuple, types.List))
+              and isinstance(B.dtype, (types.Array, RangeIndexType, Int64IndexType))):
+            B_dtype_is_index = isinstance(B.dtype, (RangeIndexType, Int64IndexType))
             numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], [])
 
             # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
@@ -92,10 +97,10 @@ def _append_list_numeric_impl(A, B):
                 new_data = numpy.empty(total_length, numba_common_dtype)
 
                 stop = len(A)
-                _A = numpy.array(A) if A_is_range_index == True else A  # noqa
+                _A = numpy.array(A) if use_A_array == True else A  # noqa
                 new_data[:stop] = _A
                 for arr in B:
-                    _arr = numpy.array(arr) if B_dtype_is_range_index == True else arr  # noqa
+                    _arr = arr.values if B_dtype_is_index == True else arr  # noqa
                     start = stop
                     stop = start + len(_arr)
                     new_data[start:stop] = _arr
@@ -218,12 +223,13 @@ def sdc_join_series_indexes_overload(left, right):
     """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm"""
 
     # check that both operands are of types used for representing Pandas indexes
-    if not (isinstance(left, (types.Array, StringArrayType, RangeIndexType))
-            and isinstance(right, (types.Array, StringArrayType, RangeIndexType))):
+    if not (isinstance(left, sdc_pandas_index_types) and isinstance(right, sdc_pandas_index_types)
+            and not isinstance(left, types.NoneType)
+            and not isinstance(right, types.NoneType)):
         return None
 
-    convert_left = isinstance(left, RangeIndexType)
-    convert_right = isinstance(right, RangeIndexType)
+    convert_left = isinstance(left, (RangeIndexType, Int64IndexType))
+    convert_right = isinstance(right, (RangeIndexType, Int64IndexType))
 
     def _convert_to_arrays_impl(left, right):
         _left = left.values if convert_left == True else left  # noqa
@@ -243,10 +249,9 @@ def sdc_join_range_indexes_impl(left, right):
 
         return sdc_join_range_indexes_impl
 
-    elif isinstance(left, RangeIndexType) and isinstance(right, types.Array):
-        return _convert_to_arrays_impl
-
-    elif isinstance(left, types.Array) and isinstance(right, RangeIndexType):
+    elif (isinstance(left, (RangeIndexType, Int64IndexType, types.Array))
+          and isinstance(right, (RangeIndexType, Int64IndexType, types.Array))
+          and not (isinstance(left, types.Array) and isinstance(right, types.Array))):
         return _convert_to_arrays_impl
 
     # TODO: remove code duplication below and merge numeric and StringArray impls into one
@@ -513,41 +518,39 @@ def sdc_arrays_argsort(A, kind='quicksort'):
 
 
 @sdc_overload(sdc_arrays_argsort, jit_options={'parallel': False})
-def sdc_arrays_argsort_overload(A, kind='quicksort'):
+def sdc_arrays_argsort_overload(A, kind='quicksort', ascending=True):
     """Function providing pandas argsort implementation for different 1D array types"""
 
     # kind is not known at compile time, so get this function here and use in impl if needed
     quicksort_func = quicksort.make_jit_quicksort().run_quicksort
 
     kind_is_default = isinstance(kind, str)
     if isinstance(A, types.Array):
-        def _sdc_arrays_argsort_array_impl(A, kind='quicksort'):
+        def _sdc_arrays_argsort_array_impl(A, kind='quicksort', ascending=True):
             _kind = 'quicksort' if kind_is_default == True else kind  # noqa
-            return numpy_like.argsort(A, kind=_kind)
+            return numpy_like.argsort(A, kind=_kind, ascending=ascending)
 
         return _sdc_arrays_argsort_array_impl
 
     elif A == string_array_type:
-        def _sdc_arrays_argsort_str_arr_impl(A, kind='quicksort'):
+        def _sdc_arrays_argsort_str_arr_impl(A, kind='quicksort', ascending=True):
 
-            nan_mask = sdc.hiframes.api.get_nan_mask(A)
-            idx = numpy.arange(len(A))
-            old_nan_positions = idx[nan_mask]
-
-            data = A[~nan_mask]
-            keys = idx[~nan_mask]
             if kind == 'quicksort':
-                zipped = list(zip(list(data), list(keys)))
-                zipped = quicksort_func(zipped)
-                argsorted = [zipped[i][1] for i in numpy.arange(len(data))]
+                indexes = numpy.arange(len(A))
+                data_index_pairs = list(zip(list(A), list(indexes)))
+                zipped = quicksort_func(data_index_pairs)
+                argsorted = [zipped[i][1] for i in indexes]
+                res = numpy.array(argsorted, dtype=numpy.int64)
+                # for non-stable sort the order within groups does not matter
+                # so just reverse the result when sorting in descending order
+                if not ascending:
+                    res = res[::-1]
             elif kind == 'mergesort':
-                sdc.hiframes.sort.local_sort((data, ), (keys, ))
-                argsorted = list(keys)
+                res = str_arr_stable_argosort(A, ascending=ascending)
             else:
                 raise ValueError("Unrecognized kind of sort in sdc_arrays_argsort")
 
-            argsorted.extend(old_nan_positions)
-            return numpy.asarray(argsorted, dtype=numpy.int32)
+            return res
 
         return _sdc_arrays_argsort_str_arr_impl
 
@@ -618,13 +621,16 @@ def _sdc_take(data, indexes):
 @sdc_overload(_sdc_take)
 def _sdc_take_overload(data, indexes):
 
-    if not isinstance(data, (types.Array, StringArrayType, RangeIndexType)):
+    valid_data_types = (types.Array,) + sdc_pandas_index_types
+    if not (isinstance(data, valid_data_types) and not isinstance(data, types.NoneType)):
         return None
-    if not (isinstance(indexes, (types.Array, types.List))
+
+    if not (isinstance(indexes, (types.Array, types.List, Int64IndexType))
             and isinstance(indexes.dtype, (types.Integer, types.ListType))):
         return None
 
-    if isinstance(indexes.dtype, types.ListType) and isinstance(data, (types.Array, types.List, RangeIndexType)):
+    if (isinstance(indexes.dtype, types.ListType)
+            and isinstance(data, (types.Array, types.List, RangeIndexType, Int64IndexType))):
         arr_dtype = data.dtype
 
         def _sdc_take_list_impl(data, indexes):
@@ -677,7 +683,7 @@ def _sdc_take_list_str_impl(data, indexes):
 
         return _sdc_take_list_str_impl
 
-    elif isinstance(data, (types.Array, RangeIndexType)):
+    elif isinstance(data, (types.Array, RangeIndexType, Int64IndexType)):
         arr_dtype = data.dtype
 
         def _sdc_take_array_impl(data, indexes):
@@ -740,6 +746,7 @@ def sdc_reindex_series_overload(arr, index, name, by_index):
     """ Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """
 
     range_indexes = isinstance(index, RangeIndexType) and isinstance(by_index, RangeIndexType)
+    int64_indexes = isinstance(index, Int64IndexType) and isinstance(by_index, Int64IndexType)
     data_dtype, index_dtype = arr.dtype, index.dtype
     data_is_str_arr = isinstance(arr.dtype, types.UnicodeType)
 
@@ -748,6 +755,8 @@ def sdc_reindex_series_impl(arr, index, name, by_index):
         # no reindexing is needed if indexes are equal
         if range_indexes == True:  # noqa
             equal_indexes = numpy_like.array_equal(index, by_index)
+        elif int64_indexes == True:  # noqa
+            equal_indexes = numpy_like.array_equal(index, by_index)
         else:
             equal_indexes = False
         if (index is by_index or equal_indexes):
@@ -772,10 +781,10 @@ def sdc_reindex_series_impl(arr, index, name, by_index):
                 map_index_to_position[value] = i
 
         index_mismatch = 0
-        # FIXME: TypingError in parfor step (wrong promotion to float64?) if prange is used
-        for i in numpy.arange(len(by_index)):
-            if by_index[i] in map_index_to_position:
-                pos_in_self = map_index_to_position[by_index[i]]
+        for i in numba.prange(len(by_index)):
+            val = by_index[i]
+            if val in map_index_to_position:
+                pos_in_self = map_index_to_position[val]
                 _res_data[i] = arr[pos_in_self]
                 if data_is_str_arr == True:  # noqa
                     res_data_nan_mask[i] = isna(arr, i)

diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -50,6 +50,7 @@
                                             gen_impl_generator, find_common_dtype_from_numpy_dtypes)
 from sdc.str_arr_ext import StringArrayType
 from sdc.datatypes.range_index_type import RangeIndexType
+from sdc.datatypes.int64_index_type import Int64IndexType
 
 from sdc.hiframes.pd_dataframe_type import DataFrameType
 from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps
@@ -2257,7 +2258,7 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx):
 
     if accessor == 'at':
         num_idx = (isinstance(idx[0], types.Number)
-                   and isinstance(self.dataframe.index, (types.Array, types.NoneType, RangeIndexType)))
+                   and isinstance(self.dataframe.index, (types.NoneType, RangeIndexType, Int64IndexType)))
         str_idx = (isinstance(idx[0], (types.UnicodeType, types.StringLiteral))
                    and isinstance(self.dataframe.index, StringArrayType))
         if isinstance(idx, types.Tuple) and isinstance(idx[1], types.StringLiteral):