Skip to content

Commit

Permalink
Merge pull request #867 from koordinates/numeric-fix
Browse files Browse the repository at this point in the history
Fix: whole-number numerics read from PG were losing trailing zeroes
  • Loading branch information
olsen232 committed Jun 13, 2023
2 parents bf5c4b0 + 9da20b8 commit 4fbaba8
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 49 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
- Fixes a bug where git-lfs (and potentially other subprocesses) wasn't working reliably with helper mode, affects macOS and Linux. [#853](https://github.com/koordinates/kart/issues/853)
- `import` now informs the user to use `add-dataset` instead if they try to import a table that is already inside the working copy. [#249](https://github.com/koordinates/kart/issues/249)
- Fixes a bug where datasets with no CRS couldn't be checked out into a Geopackage working copy. [#855](https://github.com/koordinates/kart/issues/855)
- Fixes a bad data-loss bug where whole-number numerics read from a Postgres database were losing trailing zeroes (eg 100 was read as 1). [#863](https://github.com/koordinates/kart/issues/863)

## 0.12.3

Expand Down
21 changes: 5 additions & 16 deletions kart/sqlalchemy/adapter/mysql.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import decimal

import sqlalchemy as sa
from kart import crs_util
from kart.exceptions import NotYetImplemented
Expand Down Expand Up @@ -324,7 +322,7 @@ def _type_def_for_column_schema(self, col, dataset=None):
elif col.data_type == "date":
return DateType
elif col.data_type == "numeric":
return NumericType
return TextType
elif col.data_type == "time":
return TimeType
elif col.data_type == "timestamp":
Expand Down Expand Up @@ -394,18 +392,6 @@ def sql_read(self, col):
return sa.cast(col, DOUBLE)


@aliased_converter_type
class NumericType(ConverterType):
"""ConverterType to read numerics as text. They are stored in MySQL as NUMERIC but we read them back as text."""

def python_postread(self, value):
return (
str(value).rstrip("0").rstrip(".")
if isinstance(value, decimal.Decimal)
else value
)


@aliased_converter_type
class TimeType(ConverterType):
# ConverterType to read Times as text. They are stored in MySQL as Times but we read them back as text.
Expand Down Expand Up @@ -435,7 +421,10 @@ def sql_read(self, col):

@aliased_converter_type
class TextType(ConverterType):
"""ConverterType to that casts everything to text in the Python layer. Handles things like UUIDs."""
"""
ConverterType to that casts everything to text in the Python layer. Handles NUMERICs (which Kart stores as text),
sometimes rarer things like UUIDs.
"""

def python_postread(self, value):
return str(value) if value is not None else None
51 changes: 36 additions & 15 deletions kart/sqlalchemy/adapter/postgis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import decimal
import re
import struct

from osgeo.osr import SpatialReference
from psycopg2.extensions import Binary
Expand Down Expand Up @@ -368,10 +368,12 @@ def _type_def_for_column_schema(cls, col, dataset=None):
return BlobType
elif col.data_type == "date":
return DateType
elif col.data_type == "float":
return FloatType(col.get("size", 32))
elif col.data_type == "interval":
return IntervalType
elif col.data_type == "numeric":
return NumericType
return TextType
elif col.data_type == "time":
return TimeType
elif col.data_type == "timestamp":
Expand Down Expand Up @@ -409,23 +411,39 @@ def sql_read(self, column):


@aliased_converter_type
class IntervalType(ConverterType):
"""ConverterType to that casts intervals to text - ISO8601 mode is set for durations so this does what we want."""
class FloatType(ConverterType):
"""
When reading floats from Postgres, the float data is (apparently) read using decimal, instead of the storage format (IEEE binary).
However, PG ensures that the number of decimal digits output is sufficient to uniquely describe the IEEE float32 or float64,
so no data is lost.
By the time we get to python_postread, value is a Python float (which is a float64) and contains the stored data.
The float64 values at this point as exactly as we would want. However, the float32 values have extra bits as a result
of briefly being transferred in decimal, which means they are differ from the stored float32 value by a tiny amount -
less than the precision of a float32. Rounding the result to be a float32 and so setting all the extra bits back to zero
means that the float32 we store in Kart is the same as what was stored in PG, which is what we want, as opposed to storing
the float32 plus the decimal-artifact extra bits.
"""

def sql_read(self, column):
return sa.cast(column, TEXT)
float32_packer = struct.Struct("@f")

def __init__(self, size):
self.size = size

def python_postread(self, value):
if value is not None and self.size == 32:
return self.float32_packer.unpack(self.float32_packer.pack(value))[0]
return value


@aliased_converter_type
class NumericType(ConverterType):
"""ConverterType to read numerics as text. They are stored in PG as NUMERIC but we read them back as text."""
class IntervalType(ConverterType):
"""
ConverterType to that casts intervals to text - ISO8601 mode is set for durations so this does what we want.
An ISO8601 duration (which is what we store in the Kart model) looks something like this: P1DT12H36M
"""

def python_postread(self, value):
return (
str(value).rstrip("0").rstrip(".")
if isinstance(value, decimal.Decimal)
else value
)
def sql_read(self, column):
return sa.cast(column, TEXT)


@aliased_converter_type
Expand Down Expand Up @@ -460,7 +478,10 @@ def sql_read(self, column):

@aliased_converter_type
class TextType(ConverterType):
"""ConverterType to that casts everything to text in the Python layer. Handles things like UUIDs."""
"""
ConverterType to that casts everything to text in the Python layer. Handles NUMERICs (which Kart stores as text),
sometimes rarer things like UUIDs.
"""

def python_postread(self, value):
return str(value) if value is not None else None
Expand Down
21 changes: 5 additions & 16 deletions kart/sqlalchemy/adapter/sqlserver.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import decimal

import sqlalchemy as sa
from kart import crs_util
from kart.geometry import Geometry
Expand Down Expand Up @@ -357,7 +355,7 @@ def _type_def_for_column_schema(cls, col, dataset):
elif col.data_type == "date":
return DateType
elif col.data_type == "numeric":
return NumericType
return TextType
elif col.data_type == "time":
return TimeType
elif col.data_type == "timestamp":
Expand Down Expand Up @@ -444,18 +442,6 @@ def sql_read(self, column):
return Function("FORMAT", column, "yyyy-MM-dd", type_=self)


@aliased_converter_type
class NumericType(ConverterType):
"""ConverterType to read numerics as text. They are stored in MS as NUMERIC but we read them back as text."""

def python_postread(self, value):
return (
str(value).rstrip("0").rstrip(".")
if isinstance(value, decimal.Decimal)
else value
)


@aliased_converter_type
class TimeType(ConverterType):
# ConverterType to read times as text. They are stored in MS as TIME but we read them back as text.
Expand Down Expand Up @@ -488,7 +474,10 @@ def sql_read(self, column):

@aliased_converter_type
class TextType(ConverterType):
"""ConverterType to that casts everything to text in the Python layer. Handles things like UUIDs."""
"""
ConverterType to that casts everything to text in the Python layer. Handles NUMERICs (which Kart stores as text),
sometimes rarer things like UUIDs.
"""

def python_postread(self, value):
return str(value) if value is not None else None
2 changes: 1 addition & 1 deletion kart/sqlalchemy/mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def _on_checkout(mysql_conn, connection_record, connection_proxy):
dbcur = mysql_conn.cursor()
# +00:00 is UTC, but unlike UTC, it works even without a timezone DB.
dbcur.execute("SET time_zone='+00:00';")
dbcur.execute("SET sql_mode = 'ANSI_QUOTES';")
dbcur.execute("SET sql_mode = 'ANSI_QUOTES,NO_AUTO_VALUE_ON_ZERO';")

url = urlsplit(msurl)
if url.scheme != cls.CANONICAL_SCHEME:
Expand Down
Binary file modified tests/data/types.tgz
Binary file not shown.
2 changes: 2 additions & 0 deletions tests/test_working_copy_gpkg.py
Original file line number Diff line number Diff line change
Expand Up @@ -906,6 +906,8 @@ def test_values_roundtrip(data_working_copy, cli_runner):
# If values roundtripping code isn't working for certain types,
# we could get spurious diffs on those values.
with data_working_copy("types") as (repo_path, wc_path):
# Geopackage can store any type of numerics it likes since it approximates them as text.
r = cli_runner.invoke(["checkout", "unconstrained-numerics"])
repo = KartRepo(repo_path)
with repo.working_copy.tabular.session() as sess:
# We don't diff values unless they're marked as dirty in the WC - move the row to make it dirty.
Expand Down
4 changes: 3 additions & 1 deletion tests/test_working_copy_postgis.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,9 @@ def test_values_roundtrip(data_archive, cli_runner, new_postgis_db_schema):

with new_postgis_db_schema() as (postgres_url, postgres_schema):
repo.config["kart.workingcopy.location"] = postgres_url
r = cli_runner.invoke(["checkout"])
# Postgres default NUMERIC (no precision or scale provided) can store decimal places, unlike in other DBs.
# Make sure we roundtrip them properly.
r = cli_runner.invoke(["checkout", "unconstrained-numerics"])

with repo.working_copy.tabular.session() as sess:
# We don't diff values unless they're marked as dirty in the WC - move the row to make it dirty.
Expand Down

0 comments on commit 4fbaba8

Please sign in to comment.