Commit 68cde94

Merge remote-tracking branch 'upstream/master' into disown-tz-only-rebased

TomAugspurger committed Dec 28, 2018
2 parents 4d3b55e + c1af4f5
Showing 21 changed files with 380 additions and 109 deletions.
48 changes: 48 additions & 0 deletions doc/source/io.rst
@@ -4989,6 +4989,54 @@ with respect to the timezone.
timezone aware or naive. When reading ``TIMESTAMP WITH TIME ZONE`` types, pandas
will convert the data to UTC.

.. _io.sql.method:

Insertion Method
++++++++++++++++

.. versionadded:: 0.24.0

The parameter ``method`` controls the SQL insertion clause used.
Possible values are:

- ``None``: Uses standard SQL ``INSERT`` clause (one per row).
- ``'multi'``: Pass multiple values in a single ``INSERT`` clause.
It uses a *special* SQL syntax not supported by all backends.
This usually provides better performance for analytic databases
like *Presto* and *Redshift*, but has worse performance for
traditional SQL backends if the table contains many columns.
For more information check the SQLAlchemy `documentation
<http://docs.sqlalchemy.org/en/latest/core/dml.html#sqlalchemy.sql.expression.Insert.values.params.*args>`__.
- callable with signature ``(pd_table, conn, keys, data_iter)``:
This can be used to implement a more performant insertion method based on
specific backend dialect features.

Example of a callable using PostgreSQL `COPY clause
<https://www.postgresql.org/docs/current/static/sql-copy.html>`__::

# Alternative to_sql() *method* for DBs that support COPY FROM
import csv
from io import StringIO

def psql_insert_copy(table, conn, keys, data_iter):
# gets a DBAPI connection that can provide a cursor
dbapi_conn = conn.connection
with dbapi_conn.cursor() as cur:
s_buf = StringIO()
writer = csv.writer(s_buf)
writer.writerows(data_iter)
s_buf.seek(0)

columns = ', '.join('"{}"'.format(k) for k in keys)
if table.schema:
table_name = '{}.{}'.format(table.schema, table.name)
else:
table_name = table.name

sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format(
table_name, columns)
cur.copy_expert(sql=sql, file=s_buf)
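
To see what the callable receives, here is a stdlib-only sketch of the buffer-building half of ``psql_insert_copy``, with hypothetical column names and rows standing in for what pandas passes: ``keys`` is the list of column names and ``data_iter`` yields one tuple per row, and the CSV buffer built below is what ``copy_expert`` streams to ``COPY ... FROM STDIN``.

```python
# Stdlib-only sketch: ``keys`` and the rows in ``data_iter`` are
# hypothetical stand-ins for what pandas hands to the callable.
import csv
from io import StringIO

keys = ['id', 'name']                           # column names
data_iter = iter([(1, 'alice'), (2, 'bob')])    # one tuple per row

# Serialize the rows into an in-memory CSV buffer
s_buf = StringIO()
csv.writer(s_buf).writerows(data_iter)
s_buf.seek(0)

# Build the COPY statement the same way psql_insert_copy does
columns = ', '.join('"{}"'.format(k) for k in keys)
sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format('my_table', columns)

print(sql)                        # COPY my_table ("id", "name") FROM STDIN WITH CSV
print(s_buf.read().splitlines())  # ['1,alice', '2,bob']
```

Against a real PostgreSQL connection the function itself would simply be passed through, e.g. ``df.to_sql('my_table', engine, method=psql_insert_copy)``.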

Reading Tables
''''''''''''''

2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.rst
@@ -377,6 +377,7 @@ Other Enhancements
- :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`)
- The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`)
- :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`)
- :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method <io.sql.method>` section in the documentation. (:issue:`8953`)

.. _whatsnew_0240.api_breaking:

@@ -1356,6 +1357,7 @@ Datetimelike
- Bug in :func:`to_datetime` where ``box`` and ``utc`` arguments were ignored when passing a :class:`DataFrame` or ``dict`` of unit mappings (:issue:`23760`)
- Bug in :attr:`Series.dt` where the cache would not update properly after an in-place operation (:issue:`24408`)
- Bug in :class:`PeriodIndex` where comparisons against an array-like object with length 1 failed to raise ``ValueError`` (:issue:`23078`)
- Bug in :meth:`DatetimeIndex.astype`, :meth:`PeriodIndex.astype` and :meth:`TimedeltaIndex.astype` ignoring the sign of the ``dtype`` for unsigned integer dtypes (:issue:`24405`).
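
The unsigned-integer fix above amounts to viewing the underlying signed ``i8`` epoch values as ``uint64`` (the diff adds ``values.view("uint64")``) instead of handing back signed integers. A stdlib-only illustration of that reinterpretation step, using ``struct`` as a stand-in for ``ndarray.view``:

```python
# Reinterpret the 8 bytes of a signed 64-bit integer as unsigned,
# mimicking values.view("uint64") in the astype fix.
import struct

def view_as_uint64(i8_value):
    # pack as little-endian signed int64, unpack the same bytes as uint64
    return struct.unpack('<Q', struct.pack('<q', i8_value))[0]

print(view_as_uint64(1545868800000000000))  # positive epoch values are unchanged
print(view_as_uint64(-1))                   # a negative sentinel wraps to 2**64 - 1
```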

Timedelta
^^^^^^^^^
47 changes: 30 additions & 17 deletions pandas/core/arrays/datetimelike.py
@@ -23,7 +23,8 @@
is_datetime64_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype,
is_dtype_equal, is_extension_array_dtype, is_float_dtype, is_integer_dtype,
is_list_like, is_object_dtype, is_offsetlike, is_period_dtype,
is_string_dtype, is_timedelta64_dtype, needs_i8_conversion, pandas_dtype)
is_string_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype,
needs_i8_conversion, pandas_dtype)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.inference import is_array_like
@@ -397,7 +398,7 @@ def _ndarray_values(self):
# ----------------------------------------------------------------
# Rendering Methods

def _format_native_types(self, na_rep=u'NaT', date_format=None):
def _format_native_types(self, na_rep='NaT', date_format=None):
"""
Helper method for astype when converting to strings.
@@ -598,6 +599,11 @@ def astype(self, dtype, copy=True):
# we deliberately ignore int32 vs. int64 here.
# See https://github.com/pandas-dev/pandas/issues/24381 for more.
values = self.asi8

if is_unsigned_integer_dtype(dtype):
# Again, we ignore int32 vs. int64
values = values.view("uint64")

if copy:
values = values.copy()
return values
@@ -612,6 +618,28 @@ def astype(self, dtype, copy=True):
else:
return np.asarray(self, dtype=dtype)

def view(self, dtype=None):
"""
New view on this array with the same data.
Parameters
----------
dtype : numpy dtype, optional
Returns
-------
ndarray
With the specified `dtype`.
"""
return self._data.view(dtype=dtype)

# ------------------------------------------------------------------
# ExtensionArray Interface
# TODO:
# * _from_sequence
# * argsort / _values_for_argsort
# * _reduce

def unique(self):
result = unique1d(self.asi8)
return type(self)(result, dtype=self.dtype)
@@ -674,21 +702,6 @@ def _values_for_argsort(self):
# These are not part of the EA API, but we implement them because
# pandas currently assumes they're there.

def view(self, dtype=None):
"""
New view on this array with the same data.
Parameters
----------
dtype : numpy dtype, optional
Returns
-------
ndarray
With the specified `dtype`.
"""
return self._data.view(dtype=dtype)

def value_counts(self, dropna=False):
"""
Return a Series containing counts of unique values.
60 changes: 30 additions & 30 deletions pandas/core/arrays/datetimes.py
@@ -563,6 +563,35 @@ def __iter__(self):
for v in converted:
yield v

def astype(self, dtype, copy=True):
# We handle
# --> datetime
# --> period
# DatetimeLikeArrayMixin Super handles the rest.
dtype = pandas_dtype(dtype)

if (is_datetime64_ns_dtype(dtype) and
not is_dtype_equal(dtype, self.dtype)):
# GH#18951: datetime64_ns dtype but not equal means different tz
new_tz = getattr(dtype, 'tz', None)
if getattr(self.dtype, 'tz', None) is None:
return self.tz_localize(new_tz)
result = self.tz_convert(new_tz)
if new_tz is None:
# Do we want .astype('datetime64[ns]') to be an ndarray.
# The astype in Block._astype expects this to return an
# ndarray, but we could maybe work around it there.
result = result._data
return result
elif is_datetime64tz_dtype(self.dtype) and is_dtype_equal(self.dtype,
dtype):
if copy:
return self.copy()
return self
elif is_period_dtype(dtype):
return self.to_period(freq=dtype.freq)
return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy)

# ----------------------------------------------------------------
# ExtensionArray Interface

@@ -581,7 +610,7 @@ def _validate_fill_value(self, fill_value):
# -----------------------------------------------------------------
# Rendering Methods

def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs):
def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs):
from pandas.io.formats.format import _get_format_datetime64_from_values
fmt = _get_format_datetime64_from_values(self, date_format)

@@ -1095,35 +1124,6 @@ def to_perioddelta(self, freq):
m8delta = i8delta.view('m8[ns]')
return TimedeltaArrayMixin(m8delta)

def astype(self, dtype, copy=True):
# We handle
# --> datetime
# --> period
# Super handles the rest.
dtype = pandas_dtype(dtype)

if (is_datetime64_ns_dtype(dtype) and
not is_dtype_equal(dtype, self.dtype)):
# GH 18951: datetime64_ns dtype but not equal means different tz
new_tz = getattr(dtype, 'tz', None)
if getattr(self.dtype, 'tz', None) is None:
return self.tz_localize(new_tz)
result = self.tz_convert(new_tz)
if new_tz is None:
# Do we want .astype('datetime64[ns]') to be an ndarray.
# The astype in Block._astype expects this to return an
# ndarray, but we could maybe work around it there.
result = result._data
return result
elif is_datetime64tz_dtype(self.dtype) and is_dtype_equal(self.dtype,
dtype):
if copy:
return self.copy()
return self
elif is_period_dtype(dtype):
return self.to_period(freq=dtype.freq)
return super(DatetimeArrayMixin, self).astype(dtype, copy)

# -----------------------------------------------------------------
# Properties - Vectorized Timestamp Properties/Methods

56 changes: 32 additions & 24 deletions pandas/core/arrays/timedeltas.py
@@ -297,16 +297,45 @@ def _validate_fill_value(self, fill_value):
"Got '{got}'.".format(got=fill_value))
return fill_value

def astype(self, dtype, copy=True):
# We handle
# --> timedelta64[ns]
# --> timedelta64
# DatetimeLikeArrayMixin super call handles other cases
dtype = pandas_dtype(dtype)

if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype):
# by pandas convention, converting to non-nano timedelta64
# returns an int64-dtyped array with ints representing multiples
# of the desired timedelta unit. This is essentially division
if self._hasnans:
# avoid double-copying
result = self._data.astype(dtype, copy=False)
values = self._maybe_mask_results(result,
fill_value=None,
convert='float64')
return values
result = self._data.astype(dtype, copy=copy)
return result.astype('i8')
elif is_timedelta64_ns_dtype(dtype):
if copy:
return self.copy()
return self
return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)

# ----------------------------------------------------------------
# Rendering Methods

def _format_native_types(self):
return self.astype(object)

def _formatter(self, boxed=False):
from pandas.io.formats.format import _get_format_timedelta64
return _get_format_timedelta64(self, box=True)

def _format_native_types(self, na_rep='NaT', date_format=None):
from pandas.io.formats.format import _get_format_timedelta64

formatter = _get_format_timedelta64(self._data, na_rep)
return np.array([formatter(x) for x in self._data])

# ----------------------------------------------------------------
# Arithmetic Methods

@@ -755,27 +784,6 @@ def to_pytimedelta(self):
"""
return tslibs.ints_to_pytimedelta(self.asi8)

def astype(self, dtype, copy=True):
# We handle
# --> timedelta64[ns]
# --> timedelta64
dtype = pandas_dtype(dtype)

if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype):
# essentially this is division
result = self._data.astype(dtype, copy=copy)
if self._hasnans:
values = self._maybe_mask_results(result,
fill_value=None,
convert='float64')
return values
return result.astype('i8')
elif is_timedelta64_ns_dtype(dtype):
if copy:
return self.copy()
return self
return super(TimedeltaArrayMixin, self).astype(dtype, copy=copy)

days = _field_accessor("days", "days",
"Number of days for each element.")
seconds = _field_accessor("seconds", "seconds",
6 changes: 4 additions & 2 deletions pandas/core/dtypes/missing.py
@@ -14,7 +14,8 @@
is_period_dtype, is_scalar, is_string_dtype, is_string_like_dtype,
is_timedelta64_dtype, needs_i8_conversion, pandas_dtype)
from .generic import (
ABCExtensionArray, ABCGeneric, ABCIndexClass, ABCMultiIndex, ABCSeries)
ABCDatetimeArray, ABCExtensionArray, ABCGeneric, ABCIndexClass,
ABCMultiIndex, ABCSeries, ABCTimedeltaArray)
from .inference import is_list_like

isposinf_scalar = libmissing.isposinf_scalar
@@ -108,7 +109,8 @@ def _isna_new(obj):
elif isinstance(obj, ABCMultiIndex):
raise NotImplementedError("isna is not defined for MultiIndex")
elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass,
ABCExtensionArray)):
ABCExtensionArray,
ABCDatetimeArray, ABCTimedeltaArray)):
return _isna_ndarraylike(obj)
elif isinstance(obj, ABCGeneric):
return obj._constructor(obj._data.isna(func=isna))
15 changes: 13 additions & 2 deletions pandas/core/generic.py
@@ -2386,7 +2386,7 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs):
**kwargs)

def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
index_label=None, chunksize=None, dtype=None):
index_label=None, chunksize=None, dtype=None, method=None):
"""
Write records stored in a DataFrame to a SQL database.
@@ -2424,6 +2424,17 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
Specifying the datatype for columns. The keys should be the column
names and the values should be the SQLAlchemy types or strings for
the sqlite3 legacy mode.
method : {None, 'multi', callable}, default None
Controls the SQL insertion clause used:
* None : Uses standard SQL ``INSERT`` clause (one per row).
* 'multi': Pass multiple values in a single ``INSERT`` clause.
* callable with signature ``(pd_table, conn, keys, data_iter)``.
Details and a sample callable implementation can be found in the
section :ref:`insert method <io.sql.method>`.
.. versionadded:: 0.24.0
Raises
------
@@ -2505,7 +2516,7 @@ def to_sql(self, name, con, schema=None, if_exists='fail', index=True,
from pandas.io import sql
sql.to_sql(self, name, con, schema=schema, if_exists=if_exists,
index=index, index_label=index_label, chunksize=chunksize,
dtype=dtype)
dtype=dtype, method=method)

def to_pickle(self, path, compression='infer',
protocol=pkl.HIGHEST_PROTOCOL):
5 changes: 3 additions & 2 deletions pandas/core/indexes/base.py
@@ -712,8 +712,9 @@ def view(self, cls=None):
Parameters
----------
dtype : numpy dtype or pandas type
Note that any integer `dtype` is treated as ``'int64'``,
regardless of the sign and size.
Note that any signed integer `dtype` is treated as ``'int64'``,
and any unsigned integer `dtype` is treated as ``'uint64'``,
regardless of the size.
copy : bool, default True
By default, astype always returns a newly allocated object.
If copy is set to False and internal requirements on dtype are