PERF: construct DataFrame with string array and dtype=str (pandas-dev…
topper-123 authored and Kevin D Smith committed Nov 2, 2020
1 parent f73fe1e commit f45a8ec
Showing 3 changed files with 24 additions and 15 deletions.
17 changes: 12 additions & 5 deletions asv_bench/benchmarks/strings.py
@@ -13,13 +13,20 @@ class Construction:
     param_names = ["dtype"]
 
     def setup(self, dtype):
-        self.data = tm.rands_array(nchars=10 ** 5, size=10)
+        self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
+        self.frame_arr = self.series_arr.reshape((50_000, 2)).copy()
 
-    def time_construction(self, dtype):
-        Series(self.data, dtype=dtype)
+    def time_series_construction(self, dtype):
+        Series(self.series_arr, dtype=dtype)
 
-    def peakmem_construction(self, dtype):
-        Series(self.data, dtype=dtype)
+    def peakmem_series_construction(self, dtype):
+        Series(self.series_arr, dtype=dtype)
+
+    def time_frame_construction(self, dtype):
+        DataFrame(self.frame_arr, dtype=dtype)
+
+    def peakmem_frame_construction(self, dtype):
+        DataFrame(self.frame_arr, dtype=dtype)
 
 
 class Methods:
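For readers who want to reproduce the new benchmark cases outside asv, here is a minimal standalone sketch. It is not part of the commit: it uses timeit in place of asv's harness and builds the string data with a plain list comprehension instead of pandas' internal tm.rands_array helper.

# Standalone approximation of the new benchmark cases (not part of the commit).
import timeit

import numpy as np
from pandas import DataFrame, Series

# ~10-character strings, 100_000 elements, mirroring the asv setup above.
series_arr = np.array([f"s{i:09d}" for i in range(100_000)], dtype=object)
frame_arr = series_arr.reshape((50_000, 2)).copy()

for dtype in ["str", "string"]:
    t_series = timeit.timeit(lambda: Series(series_arr, dtype=dtype), number=10)
    t_frame = timeit.timeit(lambda: DataFrame(frame_arr, dtype=dtype), number=10)
    print(f"dtype={dtype!r}: Series {t_series:.3f}s, DataFrame {t_frame:.3f}s")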
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.2.0.rst
@@ -222,7 +222,7 @@ Deprecations
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`)
+- Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
 - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
 - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
 - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
20 changes: 11 additions & 9 deletions pandas/core/internals/construction.py
@@ -13,6 +13,7 @@
 
 from pandas.core.dtypes.cast import (
     construct_1d_arraylike_from_scalar,
+    construct_1d_ndarray_preserving_na,
     maybe_cast_to_datetime,
     maybe_convert_platform,
     maybe_infer_to_datetimelike,
@@ -189,15 +190,16 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
     # the dtypes will be coerced to a single dtype
     values = _prep_ndarray(values, copy=copy)
 
-    if dtype is not None:
-        if not is_dtype_equal(values.dtype, dtype):
-            try:
-                values = values.astype(dtype)
-            except Exception as orig:
-                # e.g. ValueError when trying to cast object dtype to float64
-                raise ValueError(
-                    f"failed to cast to '{dtype}' (Exception was: {orig})"
-                ) from orig
+    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
+        try:
+            values = construct_1d_ndarray_preserving_na(
+                values.ravel(), dtype=dtype, copy=False
+            ).reshape(values.shape)
+        except Exception as orig:
+            # e.g. ValueError when trying to cast object dtype to float64
+            raise ValueError(
+                f"failed to cast to '{dtype}' (Exception was: {orig})"
+            ) from orig
 
     # _prep_ndarray ensures that values.ndim == 2 at this point
     index, columns = _get_axes(
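The core of the change above is that a 2-D block is now cast by flattening to 1-D, converting once, and restoring the original shape, rather than calling ndarray.astype on the 2-D array. The sketch below illustrates only that shape-handling pattern; np.asarray is a stand-in for pandas' internal construct_1d_ndarray_preserving_na helper, which additionally preserves NA values and provides the faster string conversion this commit targets.

# Illustration of the flatten / cast-once / reshape pattern (not pandas internals).
import numpy as np

def cast_2d_via_1d(values: np.ndarray, dtype) -> np.ndarray:
    # np.asarray is only a stand-in for construct_1d_ndarray_preserving_na here.
    flat = np.asarray(values.ravel(), dtype=dtype)  # single 1-D conversion pass
    return flat.reshape(values.shape)               # restore the 2-D block shape

arr = np.array([["a", "bb"], ["ccc", "dddd"]], dtype=object)
print(cast_2d_via_1d(arr, dtype=str).dtype)  # <U4, shape unchanged at (2, 2)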
