Merge pull request #867 from koordinates/numeric-fix

Fix: whole-number numerics read from PG were losing trailing zeroes
koordinates · Jun 13, 2023 · 4fbaba8 · 4fbaba8
2 parents bf5c4b0 + 9da20b8
commit 4fbaba8
Show file tree

Hide file tree

Showing 8 changed files with 53 additions and 49 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
 - Fixes a bug where git-lfs (and potentially other subprocesses) wasn't working reliably with helper mode, affects macOS and Linux. [#853](https://github.com/koordinates/kart/issues/853)
 - `import` now informs the user to use `add-dataset` instead if they try to import a table that is already inside the working copy. [#249](https://github.com/koordinates/kart/issues/249)
 - Fixes a bug where datasets with no CRS couldn't be checked out into a Geopackage working copy. [#855](https://github.com/koordinates/kart/issues/855)
+- Fixes a bad data-loss bug where whole-number numerics read from a Postgres database were losing trailing zeroes (eg 100 was read as 1). [#863](https://github.com/koordinates/kart/issues/863)
 
 ## 0.12.3
 

diff --git a/kart/sqlalchemy/adapter/mysql.py b/kart/sqlalchemy/adapter/mysql.py
@@ -1,5 +1,3 @@
-import decimal
-
 import sqlalchemy as sa
 from kart import crs_util
 from kart.exceptions import NotYetImplemented
@@ -324,7 +322,7 @@ def _type_def_for_column_schema(self, col, dataset=None):
         elif col.data_type == "date":
             return DateType
         elif col.data_type == "numeric":
-            return NumericType
+            return TextType
         elif col.data_type == "time":
             return TimeType
         elif col.data_type == "timestamp":
@@ -394,18 +392,6 @@ def sql_read(self, col):
         return sa.cast(col, DOUBLE)
 
 
-@aliased_converter_type
-class NumericType(ConverterType):
-    """ConverterType to read numerics as text. They are stored in MySQL as NUMERIC but we read them back as text."""
-
-    def python_postread(self, value):
-        return (
-            str(value).rstrip("0").rstrip(".")
-            if isinstance(value, decimal.Decimal)
-            else value
-        )
-
-
 @aliased_converter_type
 class TimeType(ConverterType):
     # ConverterType to read Times as text. They are stored in MySQL as Times but we read them back as text.
@@ -435,7 +421,10 @@ def sql_read(self, col):
 
 @aliased_converter_type
 class TextType(ConverterType):
-    """ConverterType to that casts everything to text in the Python layer. Handles things like UUIDs."""
+    """
+    ConverterType to that casts everything to text in the Python layer. Handles NUMERICs (which Kart stores as text),
+    sometimes rarer things like UUIDs.
+    """
 
     def python_postread(self, value):
         return str(value) if value is not None else None
diff --git a/kart/sqlalchemy/adapter/postgis.py b/kart/sqlalchemy/adapter/postgis.py
@@ -1,5 +1,5 @@
-import decimal
 import re
+import struct
 
 from osgeo.osr import SpatialReference
 from psycopg2.extensions import Binary
@@ -368,10 +368,12 @@ def _type_def_for_column_schema(cls, col, dataset=None):
             return BlobType
         elif col.data_type == "date":
             return DateType
+        elif col.data_type == "float":
+            return FloatType(col.get("size", 32))
         elif col.data_type == "interval":
             return IntervalType
         elif col.data_type == "numeric":
-            return NumericType
+            return TextType
         elif col.data_type == "time":
             return TimeType
         elif col.data_type == "timestamp":
@@ -409,23 +411,39 @@ def sql_read(self, column):
 
 
 @aliased_converter_type
-class IntervalType(ConverterType):
-    """ConverterType to that casts intervals to text - ISO8601 mode is set for durations so this does what we want."""
+class FloatType(ConverterType):
+    """
+    When reading floats from Postgres, the float data is (apparently) read using decimal, instead of the storage format (IEEE binary).
+    However, PG ensures that the number of decimal digits output is sufficient to uniquely describe the IEEE float32 or float64,
+    so no data is lost.
+    By the time we get to python_postread, value is a Python float (which is a float64) and contains the stored data.
+    The float64 values at this point as exactly as we would want. However, the float32 values have extra bits as a result
+    of briefly being transferred in decimal, which means they are differ from the stored float32 value by a tiny amount -
+    less than the precision of a float32. Rounding the result to be a float32 and so setting all the extra bits back to zero
+    means that the float32 we store in Kart is the same as what was stored in PG, which is what we want, as opposed to storing
+    the float32 plus the decimal-artifact extra bits.
+    """
 
-    def sql_read(self, column):
-        return sa.cast(column, TEXT)
+    float32_packer = struct.Struct("@f")
+
+    def __init__(self, size):
+        self.size = size
+
+    def python_postread(self, value):
+        if value is not None and self.size == 32:
+            return self.float32_packer.unpack(self.float32_packer.pack(value))[0]
+        return value
 
 
 @aliased_converter_type
-class NumericType(ConverterType):
-    """ConverterType to read numerics as text. They are stored in PG as NUMERIC but we read them back as text."""
+class IntervalType(ConverterType):
+    """
+    ConverterType to that casts intervals to text - ISO8601 mode is set for durations so this does what we want.
+    An ISO8601 duration (which is what we store in the Kart model) looks something like this: P1DT12H36M
+    """
 
-    def python_postread(self, value):
-        return (
-            str(value).rstrip("0").rstrip(".")
-            if isinstance(value, decimal.Decimal)
-            else value
-        )
+    def sql_read(self, column):
+        return sa.cast(column, TEXT)
 
 
 @aliased_converter_type
@@ -460,7 +478,10 @@ def sql_read(self, column):
 
 @aliased_converter_type
 class TextType(ConverterType):
-    """ConverterType to that casts everything to text in the Python layer. Handles things like UUIDs."""
+    """
+    ConverterType to that casts everything to text in the Python layer. Handles NUMERICs (which Kart stores as text),
+    sometimes rarer things like UUIDs.
+    """
 
     def python_postread(self, value):
         return str(value) if value is not None else None

diff --git a/kart/sqlalchemy/adapter/sqlserver.py b/kart/sqlalchemy/adapter/sqlserver.py
@@ -1,5 +1,3 @@
-import decimal
-
 import sqlalchemy as sa
 from kart import crs_util
 from kart.geometry import Geometry
@@ -357,7 +355,7 @@ def _type_def_for_column_schema(cls, col, dataset):
         elif col.data_type == "date":
             return DateType
         elif col.data_type == "numeric":
-            return NumericType
+            return TextType
         elif col.data_type == "time":
             return TimeType
         elif col.data_type == "timestamp":
@@ -444,18 +442,6 @@ def sql_read(self, column):
         return Function("FORMAT", column, "yyyy-MM-dd", type_=self)
 
 
-@aliased_converter_type
-class NumericType(ConverterType):
-    """ConverterType to read numerics as text. They are stored in MS as NUMERIC but we read them back as text."""
-
-    def python_postread(self, value):
-        return (
-            str(value).rstrip("0").rstrip(".")
-            if isinstance(value, decimal.Decimal)
-            else value
-        )
-
-
 @aliased_converter_type
 class TimeType(ConverterType):
     # ConverterType to read times as text. They are stored in MS as TIME but we read them back as text.
@@ -488,7 +474,10 @@ def sql_read(self, column):
 
 @aliased_converter_type
 class TextType(ConverterType):
-    """ConverterType to that casts everything to text in the Python layer. Handles things like UUIDs."""
+    """
+    ConverterType to that casts everything to text in the Python layer. Handles NUMERICs (which Kart stores as text),
+    sometimes rarer things like UUIDs.
+    """
 
     def python_postread(self, value):
         return str(value) if value is not None else None
diff --git a/kart/sqlalchemy/mysql.py b/kart/sqlalchemy/mysql.py
@@ -20,7 +20,7 @@ def _on_checkout(mysql_conn, connection_record, connection_proxy):
             dbcur = mysql_conn.cursor()
             # +00:00 is UTC, but unlike UTC, it works even without a timezone DB.
             dbcur.execute("SET time_zone='+00:00';")
-            dbcur.execute("SET sql_mode = 'ANSI_QUOTES';")
+            dbcur.execute("SET sql_mode = 'ANSI_QUOTES,NO_AUTO_VALUE_ON_ZERO';")
 
         url = urlsplit(msurl)
         if url.scheme != cls.CANONICAL_SCHEME:

diff --git a/tests/data/types.tgz b/tests/data/types.tgz
diff --git a/tests/test_working_copy_gpkg.py b/tests/test_working_copy_gpkg.py
@@ -906,6 +906,8 @@ def test_values_roundtrip(data_working_copy, cli_runner):
     # If values roundtripping code isn't working for certain types,
     # we could get spurious diffs on those values.
     with data_working_copy("types") as (repo_path, wc_path):
+        # Geopackage can store any type of numerics it likes since it approximates them as text.
+        r = cli_runner.invoke(["checkout", "unconstrained-numerics"])
         repo = KartRepo(repo_path)
         with repo.working_copy.tabular.session() as sess:
             # We don't diff values unless they're marked as dirty in the WC - move the row to make it dirty.

diff --git a/tests/test_working_copy_postgis.py b/tests/test_working_copy_postgis.py
@@ -598,7 +598,9 @@ def test_values_roundtrip(data_archive, cli_runner, new_postgis_db_schema):
 
         with new_postgis_db_schema() as (postgres_url, postgres_schema):
             repo.config["kart.workingcopy.location"] = postgres_url
-            r = cli_runner.invoke(["checkout"])
+            # Postgres default NUMERIC (no precision or scale provided) can store decimal places, unlike in other DBs.
+            # Make sure we roundtrip them properly.
+            r = cli_runner.invoke(["checkout", "unconstrained-numerics"])
 
             with repo.working_copy.tabular.session() as sess:
                 # We don't diff values unless they're marked as dirty in the WC - move the row to make it dirty.