Skip to content

Commit

Permalink
BUG: fixup GH pandas-dev#2751; make sure that we cast to platform num…
Browse files Browse the repository at this point in the history
…eric

     when a list is specified; use the Series codepath
     for initial list conversion (change from using DataFrame)
TST: added test for overflow in df creation
  • Loading branch information
jreback committed Feb 13, 2013
1 parent fc8de6d commit 3c345a1
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 17 deletions.
16 changes: 13 additions & 3 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -805,10 +805,11 @@ def _consensus_name_attr(objs):
# Lots of little utilities


def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True):
def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, convert_platform=False):
""" if we have an object dtype, try to coerce dates and/or numbers """

if values.dtype == np.object_ and convert_dates:
# convert dates
if convert_dates and getattr(values,'dtype',None) == np.object_:

# we take an aggressive stance and convert to datetime64[ns]
if convert_dates == 'coerce':
Expand All @@ -821,7 +822,8 @@ def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True):
else:
values = lib.maybe_convert_objects(values, convert_datetime=convert_dates)

if values.dtype == np.object_ and convert_numeric:
# convert to numeric
if convert_numeric and getattr(values,'dtype',None) == np.object_:
try:
new_values = lib.maybe_convert_numeric(values,set(),coerce_numeric=True)

Expand All @@ -832,6 +834,14 @@ def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True):
except:
pass

# platform conversion
# allow ndarray or list here
if convert_platform:
if isinstance(values, (list,tuple)):
values = lib.list_to_object_array(values)
if values.dtype == np.object_:
values = lib.maybe_convert_objects(values)

return values


Expand Down
20 changes: 15 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5460,11 +5460,21 @@ def _prep_ndarray(values, copy=True):
if len(values) == 0:
return np.empty((0, 0), dtype=object)

arr = np.asarray(values)
# NumPy strings are a pain, convert to object
if issubclass(arr.dtype.type, basestring):
arr = np.array(values, dtype=object, copy=True)
values = arr
def convert(v):
return com._possibly_convert_objects(v,
convert_dates=False,
convert_numeric=False,
convert_platform=True)


# we could have a 1-dim or 2-dim list here
# this is equiv of np.asarray, but does object conversion
# and platform dtype preservation
if com.is_list_like(values[0]) or hasattr(values[0],'len'):
values = np.array([ convert(v) for v in values])
else:
values = convert(values)

else:
# drop subclass info, do not copy data
values = np.asarray(values)
Expand Down
12 changes: 8 additions & 4 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3111,11 +3111,15 @@ def _try_cast(arr):
raise
subarr = pa.array(data, dtype=object, copy=copy)
subarr = lib.maybe_convert_objects(subarr)
subarr = com._possibly_cast_to_datetime(subarr, dtype)

else:
subarr = lib.list_to_object_array(data)
subarr = lib.maybe_convert_objects(subarr)
subarr = com._possibly_cast_to_datetime(subarr, dtype)
subarr = com._possibly_convert_objects(data,
convert_dates=False,
convert_numeric=False,
convert_platform=True)

subarr = com._possibly_cast_to_datetime(subarr, dtype)

else:
subarr = _try_cast(data)

Expand Down
41 changes: 36 additions & 5 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8133,12 +8133,43 @@ def test_constructor_with_datetimes(self):
expected.sort()
assert_series_equal(result, expected)

# GH #2751 (construction with no index specified)
df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3], 'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)] })
def test_constructor_for_list_with_dtypes(self):
intname = np.dtype(np.int_).name
floatname = np.dtype(np.float_).name
datetime64name = np.dtype('M8[ns]').name
objectname = np.dtype(np.object_).name

# test list of lists/ndarrays
df = DataFrame([np.arange(5) for x in range(5)])
result = df.get_dtype_counts()
expected = Series({'int64' : 5})

df = DataFrame([np.array(np.arange(5),dtype='int32') for x in range(5)])
result = df.get_dtype_counts()
expected = Series({'int32' : 5})

# overflow issue? (we always expect int64 upcasting here)
df = DataFrame({'a' : [2**31,2**31+1]})
result = df.get_dtype_counts()
expected = Series({'int64' : 1 })
assert_series_equal(result, expected)

# GH #2751 (construction with no index specified), make sure we cast to platform values
df = DataFrame([1, 2])
result = df.get_dtype_counts()
expected = Series({'int64': 1 })
assert_series_equal(result, expected)

df = DataFrame({'a' : [1, 2]})
result = df.get_dtype_counts()
expected = Series({'int64': 1 })
assert_series_equal(result, expected)

df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3],
'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)],
'e' : [1.,2,4.,7]})
result = df.get_dtype_counts()
# TODO: fix this on 32-bit (or decide it's ok behavior?)
# expected = Series({intname: 1, floatname : 1, datetime64name: 1, objectname : 1})
expected = Series({'int64': 1, floatname : 1, datetime64name: 1, objectname : 1})
expected = Series({'int64': 1, 'float64' : 2, datetime64name: 1, objectname : 1})
result.sort()
expected.sort()
assert_series_equal(result, expected)
Expand Down

0 comments on commit 3c345a1

Please sign in to comment.