From 132d90dcf34fa7d2d7c5af978efc7e69238b805d Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 25 Jan 2013 08:45:00 -0500 Subject: [PATCH] BUG: various bug fixes for DataFrame/Series construction related to: 0 and 1 len ndarrays datetimes that are single objects mixed datetimes and objects (GH #2751) astype now converts correctly with a datetime64 type to object, NaT are converted to np.nan _get_numeric_data with empty mixed-type returning empty, but index was missing DOC: release notes updated, added missing_data section to docs, whatsnew 0.10.2 --- RELEASE.rst | 14 ++++++ doc/source/missing_data.rst | 17 ++++++++ doc/source/v0.10.2.txt | 54 +++++++++++++++++++++++ doc/source/whatsnew.rst | 2 + pandas/core/common.py | 14 ++++++ pandas/core/frame.py | 6 +-- pandas/core/series.py | 58 ++++++++++++++++++++----- pandas/src/inference.pyx | 11 +++++ pandas/tests/test_frame.py | 39 +++++++++++++++-- pandas/tests/test_series.py | 22 ++++++++-- pandas/tseries/tests/test_timeseries.py | 19 ++++---- pandas/tseries/tests/test_timezones.py | 2 +- pandas/tslib.pyx | 50 +++++++++++++-------- 13 files changed, 260 insertions(+), 48 deletions(-) create mode 100644 doc/source/v0.10.2.txt diff --git a/RELEASE.rst b/RELEASE.rst index 981fa5bed257d..73f55ca6c3e29 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -22,6 +22,20 @@ Where to get it * Binary installers on PyPI: http://pypi.python.org/pypi/pandas * Documentation: http://pandas.pydata.org +**API Changes** + + - Series now automatically will try to set the correct dtype based on passed datetimelike objects (datetime/Timestamp) + - timedelta64 are returned in appropriate cases (e.g. Series - Series, when both are datetime64) + - mixed datetimes and objects (GH2751_) in a constructor will be cast correctly + - astype on datetimes to object are now handled (as well as NaT conversions to np.nan) + +**Bug fixes** + + - Single element ndarrays of datetimelike objects are handled (e.g. 
np.array(datetime(2001,1,1,0,0))), w/o dtype being passed + - 0-dim ndarrays with a passed dtype are handled correctly (e.g. np.array(0.,dtype='float32')) + +.. _GH2751: https://github.com/pydata/pandas/issues/2751 + pandas 0.10.1 ============= diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index b8f3468f82098..133d83513041e 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -80,6 +80,23 @@ pandas provides the :func:`~pandas.core.common.isnull` and missing by the ``isnull`` and ``notnull`` functions. ``inf`` and ``-inf`` are no longer considered missing by default. +Datetimes +--------- + +For datetime64[ns] types, ``NaT`` represents missing values. This is a pseudo-native +sentinel value that can be represented by numpy in a singular dtype (datetime64[ns]). +Pandas objects provide intercompatibility between ``NaT`` and ``NaN``. + +.. ipython:: python + + df2 = df.copy() + df2['timestamp'] = Timestamp('20120101') + df2 + df2.ix[['a','c','h'],['one','timestamp']] = np.nan + df2 + df2.get_dtype_counts() + + Calculations with missing data ------------------------------ diff --git a/doc/source/v0.10.2.txt b/doc/source/v0.10.2.txt new file mode 100644 index 0000000000000..cad3eccddd96e --- /dev/null +++ b/doc/source/v0.10.2.txt @@ -0,0 +1,54 @@ +.. _whatsnew_0102: + +v0.10.2 (February ??, 2013) +--------------------------- + +This is a minor release from 0.10.1 and includes many new features and +enhancements along with a large number of bug fixes. There are also a number of +important API changes that long-time pandas users should pay close attention +to. + +API changes +~~~~~~~~~~~ + +Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a nan value, in addition to the traditional ``NaT``, or not-a-time. This allows convenient nan setting in a generic way. 
Furthermore datetime64 columns are created by default, when passed datetimelike objects (*this change was introduced in 0.10.1*) + +.. ipython:: python + + df = DataFrame(randn(6,2),date_range('20010102',periods=6),columns=['A','B']) + df['timestamp'] = Timestamp('20010103') + df + + # datetime64[ns] out of the box + df.get_dtype_counts() + + # use the traditional nan, which is mapped to NaT internally + df.ix[2:4,['A','timestamp']] = np.nan + df + +Astype conversion on datetime64[ns] to object, implicitly converts ``NaT`` to ``np.nan`` + + +.. ipython:: python + + import datetime + s = Series([datetime.datetime(2001, 1, 2, 0, 0) for i in range(3)]) + s.dtype + s[1] = np.nan + s + s.dtype + s = s.astype('O') + s + s.dtype + +New features +~~~~~~~~~~~~ + +**Enhancements** + +**Bug Fixes** + +See the `full release notes +<https://github.com/pydata/pandas/blob/master/RELEASE.rst>`__ or issue tracker +on GitHub for a complete list. + diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 6c125c45a2599..646610ecccd88 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -16,6 +16,8 @@ What's New These are new features and improvements of note in each release. +.. include:: v0.10.2.txt + .. include:: v0.10.1.txt .. 
include:: v0.10.0.txt diff --git a/pandas/core/common.py b/pandas/core/common.py index b3d996ffd0606..081267df86202 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -654,6 +654,20 @@ def _possibly_cast_to_datetime(value, dtype): except: pass + elif dtype is None: + # we might have a array (or single object) that is datetime like, and no dtype is passed + # don't change the value unless we find a datetime set + v = value + if not (is_list_like(v) or hasattr(v,'len')): + v = [ v ] + if len(v): + inferred_type = lib.infer_dtype(v) + if inferred_type == 'datetime': + try: + value = tslib.array_to_datetime(np.array(v)) + except: + pass + return value diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 301ea9d28d001..a863332619acb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4289,7 +4289,7 @@ def applymap(self, func): # if we have a dtype == 'M8[ns]', provide boxed values def infer(x): - if x.dtype == 'M8[ns]': + if com.is_datetime64_dtype(x): x = lib.map_infer(x, lib.Timestamp) return lib.map_infer(x, func) return self.apply(infer) @@ -4980,7 +4980,7 @@ def _get_agg_axis(self, axis_num): def _get_numeric_data(self): if self._is_mixed_type: num_data = self._data.get_numeric_data() - return DataFrame(num_data, copy=False) + return DataFrame(num_data, index=self.index, copy=False) else: if (self.values.dtype != np.object_ and not issubclass(self.values.dtype.type, np.datetime64)): @@ -4991,7 +4991,7 @@ def _get_numeric_data(self): def _get_bool_data(self): if self._is_mixed_type: bool_data = self._data.get_bool_data() - return DataFrame(bool_data, copy=False) + return DataFrame(bool_data, index=self.index, copy=False) else: # pragma: no cover if self.values.dtype == np.bool_: return self diff --git a/pandas/core/series.py b/pandas/core/series.py index 06281e288021a..f91b1464aa4a7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -72,17 +72,28 @@ def na_op(x, y): def wrapper(self, other): from pandas.core.frame 
import DataFrame + dtype = None wrap_results = lambda x: x lvalues, rvalues = self, other - if (com.is_datetime64_dtype(self) and - com.is_datetime64_dtype(other)): + if com.is_datetime64_dtype(self): + + if not isinstance(rvalues, np.ndarray): + rvalues = np.array([rvalues]) + + # rhs is either a timedelta or a series/ndarray + if lib.is_timedelta_array(rvalues): + rvalues = np.array([ np.timedelta64(v) for v in rvalues ],dtype='timedelta64[ns]') + dtype = 'M8[ns]' + elif com.is_datetime64_dtype(rvalues): + dtype = 'timedelta64[ns]' + else: + raise ValueError("cannot operate on a series with out a rhs of a series/ndarray of type datetime64[ns] or a timedelta") + lvalues = lvalues.view('i8') rvalues = rvalues.view('i8') - wrap_results = lambda rs: rs.astype('timedelta64[ns]') - if isinstance(rvalues, Series): lvalues = lvalues.values rvalues = rvalues.values @@ -91,7 +102,7 @@ def wrapper(self, other): if self.index.equals(other.index): name = _maybe_match_name(self, other) return Series(wrap_results(na_op(lvalues, rvalues)), - index=self.index, name=name) + index=self.index, name=name, dtype=dtype) join_idx, lidx, ridx = self.index.join(other.index, how='outer', return_indexers=True) @@ -105,13 +116,13 @@ def wrapper(self, other): arr = na_op(lvalues, rvalues) name = _maybe_match_name(self, other) - return Series(arr, index=join_idx, name=name) + return Series(arr, index=join_idx, name=name,dtype=dtype) elif isinstance(other, DataFrame): return NotImplemented else: # scalars return Series(na_op(lvalues.values, rvalues), - index=self.index, name=self.name) + index=self.index, name=self.name, dtype=dtype) return wrapper @@ -777,7 +788,7 @@ def astype(self, dtype): See numpy.ndarray.astype """ casted = com._astype_nansafe(self.values, dtype) - return self._constructor(casted, index=self.index, name=self.name) + return self._constructor(casted, index=self.index, name=self.name, dtype=casted.dtype) def convert_objects(self, convert_dates=True): """ @@ -1195,7 +1206,7 
@@ def tolist(self): Overrides numpy.ndarray.tolist """ if com.is_datetime64_dtype(self): - return self.astype(object).values.tolist() + return list(self) return self.values.tolist() def to_dict(self): @@ -3083,8 +3094,12 @@ def _try_cast(arr): raise TypeError('Cannot cast datetime64 to %s' % dtype) else: subarr = _try_cast(data) - elif copy: + else: + subarr = _try_cast(data) + + if copy: subarr = data.copy() + elif isinstance(data, list) and len(data) > 0: if dtype is not None: try: @@ -3094,12 +3109,15 @@ def _try_cast(arr): raise subarr = np.array(data, dtype=object, copy=copy) subarr = lib.maybe_convert_objects(subarr) + subarr = com._possibly_cast_to_datetime(subarr, dtype) else: subarr = lib.list_to_object_array(data) subarr = lib.maybe_convert_objects(subarr) + subarr = com._possibly_cast_to_datetime(subarr, dtype) else: subarr = _try_cast(data) + # scalar like if subarr.ndim == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) @@ -3115,7 +3133,14 @@ def _try_cast(arr): dtype = np.object_ if dtype is None: - value, dtype = _dtype_from_scalar(value) + + # a 1-element ndarray + if isinstance(value, np.ndarray): + dtype = value.dtype + value = value.item() + else: + value, dtype = _dtype_from_scalar(value) + subarr = np.empty(len(index), dtype=dtype) else: # need to possibly convert the value here @@ -3124,6 +3149,17 @@ def _try_cast(arr): subarr.fill(value) else: return subarr.item() + + # the result that we want + elif subarr.ndim == 1: + if index is not None: + + # a 1-element ndarray + if len(subarr) != len(index) and len(subarr) == 1: + value = subarr[0] + subarr = np.empty(len(index), dtype=subarr.dtype) + subarr.fill(value) + elif subarr.ndim > 1: if isinstance(data, np.ndarray): raise Exception('Data must be 1-dimensional') diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 41ac1b3f3480f..23e7b26afaad2 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -265,6 +265,17 @@ def 
is_datetime64_array(ndarray values): return False return True +def is_timedelta_array(ndarray values): + import datetime + cdef int i, n = len(values) + if n == 0: + return False + for i in range(n): + if not isinstance(values[i],datetime.timedelta): + return False + return True + + def is_date_array(ndarray[object] values): cdef int i, n = len(values) if n == 0: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 09747ba3f09f0..f32ba325ca3f5 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -47,7 +47,6 @@ def _skip_if_no_scipy(): JOIN_TYPES = ['inner', 'outer', 'left', 'right'] - class CheckIndexing(object): _multiprocess_can_split_ = True @@ -6484,14 +6483,18 @@ def test_get_X_columns(self): ['a', 'e'])) def test_get_numeric_data(self): - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo'}, + + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'f' : Timestamp('20010102')}, index=np.arange(10)) + result = df.get_dtype_counts() + expected = Series({'int64': 1, 'float64' : 1, 'datetime64[ns]': 1, 'object' : 1}) + assert_series_equal(result, expected) result = df._get_numeric_data() expected = df.ix[:, ['a', 'b']] assert_frame_equal(result, expected) - only_obj = df.ix[:, ['c']] + only_obj = df.ix[:, ['c','f']] result = only_obj._get_numeric_data() expected = df.ix[:, []] assert_frame_equal(result, expected) @@ -7367,6 +7370,36 @@ def test_as_matrix_numeric_cols(self): values = self.frame.as_matrix(['A', 'B', 'C', 'D']) self.assert_(values.dtype == np.float64) + + def test_constructor_with_datetimes(self): + + # single item + df = DataFrame({'A' : 1, 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime(2001,1,2,0,0) }, + index=np.arange(10)) + result = df.get_dtype_counts() + expected = Series({'int64': 1, 'datetime64[ns]': 2, 'object' : 2}) + assert_series_equal(result, expected) + + # check with ndarray construction ndim==0 (e.g. 
we are passing a ndim 0 ndarray with a dtype specified) + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'float64' : np.array(1.,dtype='float64'), + 'int64' : np.array(1,dtype='int64')}, index=np.arange(10)) + result = df.get_dtype_counts() + expected = Series({'int64': 2, 'float64' : 2, 'object' : 1}) + assert_series_equal(result, expected) + + # check with ndarray construction ndim>0 + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'float64' : np.array([1.]*10,dtype='float64'), + 'int64' : np.array([1]*10,dtype='int64')}, index=np.arange(10)) + result = df.get_dtype_counts() + expected = Series({'int64': 2, 'float64' : 2, 'object' : 1}) + assert_series_equal(result, expected) + + # GH #2751 (construction with no index specified) + df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3], 'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)] }) + result = df.get_dtype_counts() + expected = Series({'int64': 1, 'float64' : 1, 'datetime64[ns]': 1, 'object' : 1}) + assert_series_equal(result, expected) + def test_constructor_frame_copy(self): cop = DataFrame(self.frame, copy=True) cop['A'] = 5 diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 896c7dc34901f..c0f40269ab24c 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -520,6 +520,7 @@ def test_array_finalize(self): pass def test_fromValue(self): + nans = Series(np.NaN, index=self.ts.index) self.assert_(nans.dtype == np.float_) self.assertEqual(len(nans), len(self.ts)) @@ -530,7 +531,7 @@ def test_fromValue(self): d = datetime.now() dates = Series(d, index=self.ts.index) - self.assert_(dates.dtype == np.object_) + self.assert_(dates.dtype == 'M8[ns]') self.assertEqual(len(dates), len(self.ts)) def test_contains(self): @@ -2295,8 +2296,6 @@ def test_tolist(self): # datetime64 s = Series(self.ts.index) rs = s.tolist() - xp = s.astype(object).values.tolist() - assert_almost_equal(rs, xp) self.assertEqual(self.ts.index[0], rs[0]) def test_to_dict(self): @@ -2620,6 
+2619,23 @@ def test_astype_cast_object_int(self): result = arr.astype(int) self.assert_(np.array_equal(result, np.arange(1, 5))) + def test_astype_datetimes(self): + import pandas.tslib as tslib + + s = Series(tslib.iNaT, dtype='M8[ns]', index=range(5)) + s = s.astype('O') + self.assert_(s.dtype == np.object_) + + s = Series([datetime(2001, 1, 2, 0, 0)]) + s = s.astype('O') + self.assert_(s.dtype == np.object_) + + s = Series([datetime(2001, 1, 2, 0, 0) for i in range(3)]) + s[1] = np.nan + self.assert_(s.dtype == 'M8[ns]') + s = s.astype('O') + self.assert_(s.dtype == np.object_) + def test_map(self): index, data = tm.getMixedTypeDict() diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index aa12d6142d6d8..134e272acf78e 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -1254,9 +1254,6 @@ def test_append_concat(self): def test_set_dataframe_column_ns_dtype(self): x = DataFrame([datetime.now(), datetime.now()]) - # self.assert_(x[0].dtype == object) - - # x[0] = to_datetime(x[0]) self.assert_(x[0].dtype == np.dtype('M8[ns]')) def test_groupby_count_dateparseerror(self): @@ -2391,7 +2388,7 @@ def setUp(self): def test_auto_conversion(self): series = Series(list(date_range('1/1/2000', periods=10))) - self.assert_(series.dtype == object) + self.assert_(series.dtype == 'M8[ns]') def test_constructor_cant_cast_datetime64(self): self.assertRaises(TypeError, Series, @@ -2441,13 +2438,17 @@ def test_set_none_nan(self): self.assert_(self.series[6] is NaT) def test_intercept_astype_object(self): + + # this test no longer makes sense as series is by default already M8[ns] + # Work around NumPy 1.6 bugs - result = self.series.astype(object) - result2 = self.series.astype('O') - expected = Series([x for x in self.series], dtype=object) + #result = self.series.astype(object) + #result2 = self.series.astype('O') + + expected = Series(self.series, dtype=object) - 
assert_series_equal(result, expected) - assert_series_equal(result2, expected) + #assert_series_equal(result, expected) + #assert_series_equal(result2, expected) df = DataFrame({'a': self.series, 'b': np.random.randn(len(self.series))}) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index fe692350023f1..adf0d630dc8b0 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -494,7 +494,7 @@ def test_frame_no_datetime64_dtype(self): dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI') dr_tz = dr.tz_localize('US/Eastern') e = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr) - self.assert_(e['B'].dtype == object) + self.assert_(e['B'].dtype == 'M8[ns]') def test_hongkong_tz_convert(self): # #1673 diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index bbbe090225b83..1bcf4ad9ea6a5 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -54,33 +54,47 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None): if tz is not None: if _is_utc(tz): for i in range(n): - pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) - result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) + if arr[i] == iNaT: + result[i] = np.nan + else: + pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) + result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz) elif _is_tzlocal(tz) or _is_fixed_offset(tz): for i in range(n): - pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) - dt = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, tz) - result[i] = dt + tz.utcoffset(dt) + if arr[i] == iNaT: + result[i] = np.nan + else: + pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) + dt = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz) + result[i] = dt + tz.utcoffset(dt) else: trans = _get_transitions(tz) deltas = _get_deltas(tz) for i in range(n): - # 
Adjust datetime64 timestamp, recompute datetimestruct - pos = trans.searchsorted(arr[i]) - 1 - inf = tz._transition_info[pos] - pandas_datetime_to_datetimestruct(arr[i] + deltas[pos], - PANDAS_FR_ns, &dts) - result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us, - tz._tzinfos[inf]) + if arr[i] == iNaT: + result[i] = np.nan + else: + + # Adjust datetime64 timestamp, recompute datetimestruct + pos = trans.searchsorted(arr[i]) - 1 + inf = tz._transition_info[pos] + + pandas_datetime_to_datetimestruct(arr[i] + deltas[pos], + PANDAS_FR_ns, &dts) + result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, + tz._tzinfos[inf]) else: for i in range(n): - pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) - result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, - dts.min, dts.sec, dts.us) + if arr[i] == iNaT: + result[i] = np.nan + else: + pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) + result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us) return result