BUG: various bug fixes for DataFrame/Series construction related to:
        0-dim and single-element ndarrays
        datetimes that are single objects
        mixed datetimes and objects (GH pandas-dev#2751)
        astype now converts correctly from a datetime64 type to object; NaT values are converted to np.nan
        _get_numeric_data on an empty mixed-type frame returned an empty result, but the index was missing
DOC: release notes updated, added missing_data section to docs, whatsnew 0.10.2
jreback committed Jan 31, 2013
1 parent 3ba3119 commit 132d90d
Showing 13 changed files with 260 additions and 48 deletions.
14 changes: 14 additions & 0 deletions RELEASE.rst
@@ -22,6 +22,20 @@ Where to get it
* Binary installers on PyPI: http://pypi.python.org/pypi/pandas
* Documentation: http://pandas.pydata.org

**API Changes**

- Series will now automatically try to set the correct dtype based on passed datetime-like objects (datetime/Timestamp); see the sketch below
- timedelta64[ns] values are returned in appropriate cases (e.g. Series - Series, when both are datetime64)
- mixed datetimes and objects (GH2751_) in a constructor will be cast correctly
- astype on datetime64 to object is now handled (as well as NaT conversions to np.nan)
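
A minimal sketch of the dtype handling described above (illustrative only; it mirrors the
tests added for this release rather than prescribing new API)::

    from datetime import datetime
    from pandas import Series, DataFrame

    # datetime-like values now give a datetime64[ns] Series without an explicit dtype
    s = Series([datetime(2001, 1, 2), datetime(2001, 1, 3)])
    s.dtype

    # mixed datetimes and objects in a constructor are cast column-by-column (GH2751)
    df = DataFrame({'a': [1, 2], 'c': ['foo', 'bar'],
                    'd': [datetime(2000, 1, 1), datetime(2000, 1, 2)]})
    df.get_dtype_counts()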

**Bug fixes**

- Single-element ndarrays of datetime-like objects are now handled (e.g. np.array(datetime(2001,1,1,0,0))), without a dtype being passed
- 0-dim ndarrays with a passed dtype are handled correctly (e.g. np.array(0.,dtype='float32'))

.. _GH2751: https://github.com/pydata/pandas/issues/2751

pandas 0.10.1
=============

17 changes: 17 additions & 0 deletions doc/source/missing_data.rst
@@ -80,6 +80,23 @@ pandas provides the :func:`~pandas.core.common.isnull` and
missing by the ``isnull`` and ``notnull`` functions. ``inf`` and
``-inf`` are no longer considered missing by default.

Datetimes
---------

For datetime64[ns] types, ``NaT`` represents missing values. This is a pseudo-native
sentinel value that numpy can represent in a single dtype (datetime64[ns]). pandas
objects provide compatibility between ``NaT`` and ``NaN``.

.. ipython:: python

    df2 = df.copy()
    df2['timestamp'] = Timestamp('20120101')
    df2
    df2.ix[['a','c','h'],['one','timestamp']] = np.nan
    df2
    df2.get_dtype_counts()
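
For example, ``isnull`` reports the entries stored as ``NaT`` (a short sketch that
follows on from the example above)::

    df2['timestamp']
    isnull(df2['timestamp'])
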
Calculations with missing data
------------------------------

54 changes: 54 additions & 0 deletions doc/source/v0.10.2.txt
@@ -0,0 +1,54 @@
.. _whatsnew_0102:

v0.10.2 (February ??, 2013)
---------------------------

This is a minor release from 0.10.1 and includes many new features and
enhancements along with a large number of bug fixes. There are also a number of
important API changes that long-time pandas users should pay close attention
to.

API changes
~~~~~~~~~~~

Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a missing value, in addition to the traditional ``NaT``, or not-a-time. This allows missing values to be set conveniently in a generic way. Furthermore, datetime64[ns] columns are created by default when datetime-like objects are passed (*this change was introduced in 0.10.1*).

.. ipython:: python

    df = DataFrame(randn(6,2),date_range('20010102',periods=6),columns=['A','B'])
    df['timestamp'] = Timestamp('20010103')
    df

    # datetime64[ns] out of the box
    df.get_dtype_counts()

    # use the traditional nan, which is mapped to NaT internally
    df.ix[2:4,['A','timestamp']] = np.nan
    df

Astype conversion on datetime64[ns] to object implicitly converts ``NaT`` to ``np.nan``.


.. ipython:: python

    import datetime
    s = Series([datetime.datetime(2001, 1, 2, 0, 0) for i in range(3)])
    s.dtype
    s[1] = np.nan
    s
    s.dtype
    s = s.astype('O')
    s
    s.dtype

New features
~~~~~~~~~~~~

**Enhancements**

**Bug Fixes**

See the `full release notes
<https://github.com/pydata/pandas/blob/master/RELEASE.rst>`__ or issue tracker
on GitHub for a complete list.

2 changes: 2 additions & 0 deletions doc/source/whatsnew.rst
@@ -16,6 +16,8 @@ What's New

These are new features and improvements of note in each release.

.. include:: v0.10.2.txt

.. include:: v0.10.1.txt

.. include:: v0.10.0.txt
14 changes: 14 additions & 0 deletions pandas/core/common.py
@@ -654,6 +654,20 @@ def _possibly_cast_to_datetime(value, dtype):
except:
pass

    elif dtype is None:
        # we might have an array (or single object) that is datetime-like, and no dtype is passed
        # don't change the value unless we find a datetime set
        v = value
        if not (is_list_like(v) or hasattr(v,'len')):
            v = [ v ]
        if len(v):
            inferred_type = lib.infer_dtype(v)
            if inferred_type == 'datetime':
                try:
                    value = tslib.array_to_datetime(np.array(v))
                except:
                    pass

    return value
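
A hedged sketch of what the inference branch above enables through the public
constructors (illustrative; it mirrors the tests added in this commit rather than
calling _possibly_cast_to_datetime directly):

    from datetime import datetime
    import numpy as np
    from pandas import DataFrame

    # a single datetime object with no dtype passed now yields a datetime64[ns] column
    df = DataFrame({'E': datetime(2001, 1, 2, 0, 0)}, index=np.arange(10))
    df.get_dtype_counts()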


6 changes: 3 additions & 3 deletions pandas/core/frame.py
@@ -4289,7 +4289,7 @@ def applymap(self, func):

# if we have a dtype == 'M8[ns]', provide boxed values
def infer(x):
if x.dtype == 'M8[ns]':
if com.is_datetime64_dtype(x):
x = lib.map_infer(x, lib.Timestamp)
return lib.map_infer(x, func)
return self.apply(infer)
@@ -4980,7 +4980,7 @@ def _get_agg_axis(self, axis_num):
def _get_numeric_data(self):
if self._is_mixed_type:
num_data = self._data.get_numeric_data()
return DataFrame(num_data, copy=False)
return DataFrame(num_data, index=self.index, copy=False)
else:
if (self.values.dtype != np.object_ and
not issubclass(self.values.dtype.type, np.datetime64)):
@@ -4991,7 +4991,7 @@ def _get_numeric_data(self):
def _get_bool_data(self):
if self._is_mixed_type:
bool_data = self._data.get_bool_data()
return DataFrame(bool_data, copy=False)
return DataFrame(bool_data, index=self.index, copy=False)
else: # pragma: no cover
if self.values.dtype == np.bool_:
return self
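
A short sketch of the behaviour this change fixes (illustrative; compare the
test_get_numeric_data change below): the numeric subset of a mixed-type frame now keeps
the original index even when no numeric columns remain.

    import numpy as np
    from pandas import DataFrame, Timestamp

    df = DataFrame({'c': 'foo', 'f': Timestamp('20010102')}, index=np.arange(10))
    result = df._get_numeric_data()
    result.index   # previously missing; now the original 10-element index
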
58 changes: 47 additions & 11 deletions pandas/core/series.py
@@ -72,17 +72,28 @@ def na_op(x, y):

def wrapper(self, other):
from pandas.core.frame import DataFrame
dtype = None
wrap_results = lambda x: x

lvalues, rvalues = self, other

if (com.is_datetime64_dtype(self) and
com.is_datetime64_dtype(other)):
if com.is_datetime64_dtype(self):

if not isinstance(rvalues, np.ndarray):
rvalues = np.array([rvalues])

# rhs is either a timedelta or a series/ndarray
if lib.is_timedelta_array(rvalues):
rvalues = np.array([ np.timedelta64(v) for v in rvalues ],dtype='timedelta64[ns]')
dtype = 'M8[ns]'
elif com.is_datetime64_dtype(rvalues):
dtype = 'timedelta64[ns]'
else:
raise ValueError("cannot operate on a series with out a rhs of a series/ndarray of type datetime64[ns] or a timedelta")

lvalues = lvalues.view('i8')
rvalues = rvalues.view('i8')

wrap_results = lambda rs: rs.astype('timedelta64[ns]')

if isinstance(rvalues, Series):
lvalues = lvalues.values
rvalues = rvalues.values
@@ -91,7 +102,7 @@ def wrapper(self, other):
if self.index.equals(other.index):
name = _maybe_match_name(self, other)
return Series(wrap_results(na_op(lvalues, rvalues)),
index=self.index, name=name)
index=self.index, name=name, dtype=dtype)

join_idx, lidx, ridx = self.index.join(other.index, how='outer',
return_indexers=True)
@@ -105,13 +116,13 @@ def wrapper(self, other):
arr = na_op(lvalues, rvalues)

name = _maybe_match_name(self, other)
return Series(arr, index=join_idx, name=name)
return Series(arr, index=join_idx, name=name,dtype=dtype)
elif isinstance(other, DataFrame):
return NotImplemented
else:
# scalars
return Series(na_op(lvalues.values, rvalues),
index=self.index, name=self.name)
index=self.index, name=self.name, dtype=dtype)
return wrapper
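
A hedged sketch of the arithmetic the wrapper above now supports for datetime64[ns]
Series (illustrative; it follows the rhs dispatch shown above and the release notes):

    from datetime import datetime, timedelta
    from pandas import Series

    s = Series([datetime(2012, 1, 1), datetime(2012, 1, 2)])

    (s - s).dtype                  # datetime64 - datetime64 gives timedelta64[ns]
    (s + timedelta(days=1)).dtype  # a timedelta rhs keeps the result datetime64[ns]
    # any other rhs (e.g. a plain number) raises ValueError per the check above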


@@ -777,7 +788,7 @@ def astype(self, dtype):
See numpy.ndarray.astype
"""
casted = com._astype_nansafe(self.values, dtype)
return self._constructor(casted, index=self.index, name=self.name)
return self._constructor(casted, index=self.index, name=self.name, dtype=casted.dtype)

def convert_objects(self, convert_dates=True):
"""
@@ -1195,7 +1206,7 @@ def tolist(self):
Overrides numpy.ndarray.tolist
"""
if com.is_datetime64_dtype(self):
return self.astype(object).values.tolist()
return list(self)
return self.values.tolist()

def to_dict(self):
@@ -3083,8 +3094,12 @@ def _try_cast(arr):
raise TypeError('Cannot cast datetime64 to %s' % dtype)
else:
subarr = _try_cast(data)
elif copy:
else:
subarr = _try_cast(data)

if copy:
subarr = data.copy()

elif isinstance(data, list) and len(data) > 0:
if dtype is not None:
try:
@@ -3094,12 +3109,15 @@ def _try_cast(arr):
raise
subarr = np.array(data, dtype=object, copy=copy)
subarr = lib.maybe_convert_objects(subarr)
subarr = com._possibly_cast_to_datetime(subarr, dtype)
else:
subarr = lib.list_to_object_array(data)
subarr = lib.maybe_convert_objects(subarr)
subarr = com._possibly_cast_to_datetime(subarr, dtype)
else:
subarr = _try_cast(data)

# scalar like
if subarr.ndim == 0:
if isinstance(data, list): # pragma: no cover
subarr = np.array(data, dtype=object)
@@ -3115,7 +3133,14 @@ def _try_cast(arr):
dtype = np.object_

if dtype is None:
value, dtype = _dtype_from_scalar(value)

# a 1-element ndarray
if isinstance(value, np.ndarray):
dtype = value.dtype
value = value.item()
else:
value, dtype = _dtype_from_scalar(value)

subarr = np.empty(len(index), dtype=dtype)
else:
# need to possibly convert the value here
@@ -3124,6 +3149,17 @@ def _try_cast(arr):
subarr.fill(value)
else:
return subarr.item()

# the result that we want
elif subarr.ndim == 1:
if index is not None:

# a 1-element ndarray
if len(subarr) != len(index) and len(subarr) == 1:
value = subarr[0]
subarr = np.empty(len(index), dtype=subarr.dtype)
subarr.fill(value)

elif subarr.ndim > 1:
if isinstance(data, np.ndarray):
raise Exception('Data must be 1-dimensional')
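
A brief sketch of the scalar-like constructor cases handled above (illustrative; both
examples come from the release notes for this fix):

    import numpy as np
    from datetime import datetime
    from pandas import Series

    # a 0-dim ndarray with a dtype is broadcast over the index and keeps that dtype
    Series(np.array(0., dtype='float32'), index=np.arange(3))

    # a 0-dim ndarray holding a single datetime object, with no dtype passed
    Series(np.array(datetime(2001, 1, 1, 0, 0)), index=np.arange(3))
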
11 changes: 11 additions & 0 deletions pandas/src/inference.pyx
@@ -265,6 +265,17 @@ def is_datetime64_array(ndarray values):
return False
return True

def is_timedelta_array(ndarray values):
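    # True only when every element is a datetime.timedelta (an empty array returns False);
    # the Series arithmetic wrapper uses this to recognise a timedelta right-hand side.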
    import datetime
    cdef int i, n = len(values)
    if n == 0:
        return False
    for i in range(n):
        if not isinstance(values[i],datetime.timedelta):
            return False
    return True


def is_date_array(ndarray[object] values):
    cdef int i, n = len(values)
    if n == 0:
39 changes: 36 additions & 3 deletions pandas/tests/test_frame.py
@@ -47,7 +47,6 @@ def _skip_if_no_scipy():

JOIN_TYPES = ['inner', 'outer', 'left', 'right']


class CheckIndexing(object):

_multiprocess_can_split_ = True
@@ -6484,14 +6483,18 @@ def test_get_X_columns(self):
['a', 'e']))

def test_get_numeric_data(self):
df = DataFrame({'a': 1., 'b': 2, 'c': 'foo'},

df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'f' : Timestamp('20010102')},
index=np.arange(10))
result = df.get_dtype_counts()
expected = Series({'int64': 1, 'float64' : 1, 'datetime64[ns]': 1, 'object' : 1})
assert_series_equal(result, expected)

result = df._get_numeric_data()
expected = df.ix[:, ['a', 'b']]
assert_frame_equal(result, expected)

only_obj = df.ix[:, ['c']]
only_obj = df.ix[:, ['c','f']]
result = only_obj._get_numeric_data()
expected = df.ix[:, []]
assert_frame_equal(result, expected)
@@ -7367,6 +7370,36 @@ def test_as_matrix_numeric_cols(self):
values = self.frame.as_matrix(['A', 'B', 'C', 'D'])
self.assert_(values.dtype == np.float64)


def test_constructor_with_datetimes(self):

# single item
df = DataFrame({'A' : 1, 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime(2001,1,2,0,0) },
index=np.arange(10))
result = df.get_dtype_counts()
expected = Series({'int64': 1, 'datetime64[ns]': 2, 'object' : 2})
assert_series_equal(result, expected)

# check with ndarray construction ndim==0 (e.g. we are passing a ndim 0 ndarray with a dtype specified)
df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'float64' : np.array(1.,dtype='float64'),
'int64' : np.array(1,dtype='int64')}, index=np.arange(10))
result = df.get_dtype_counts()
expected = Series({'int64': 2, 'float64' : 2, 'object' : 1})
assert_series_equal(result, expected)

# check with ndarray construction ndim>0
df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'float64' : np.array([1.]*10,dtype='float64'),
'int64' : np.array([1]*10,dtype='int64')}, index=np.arange(10))
result = df.get_dtype_counts()
expected = Series({'int64': 2, 'float64' : 2, 'object' : 1})
assert_series_equal(result, expected)

# GH #2751 (construction with no index specified)
df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3], 'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)] })
result = df.get_dtype_counts()
expected = Series({'int64': 1, 'float64' : 1, 'datetime64[ns]': 1, 'object' : 1})
assert_series_equal(result, expected)

def test_constructor_frame_copy(self):
cop = DataFrame(self.frame, copy=True)
cop['A'] = 5
