Make Version hashable (#5238)
* Make Version hashable

* Remove Version.match (unused method)
mariosasko committed Nov 14, 2022
1 parent 1b81805 commit bde7504
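
For context, here is a minimal sketch of what the change enables, using the public `datasets.Version` API (the surrounding usage is illustrative, not part of the commit):

```python
from datasets import Version

# Defining __eq__ without __hash__ left Version unhashable; with __hash__ added,
# instances can now be set members and dict keys.
supported = {Version("1.0.0"), Version("2.0.0")}
configs = {Version("1.0.0"): "default"}

assert Version("1.0.0") == "1.0.0"            # equality against plain strings still works
assert Version("1.0.0") in supported          # membership now goes through __hash__
assert configs[Version("1.0.0")] == "default"
```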
Showing 2 changed files with 17 additions and 39 deletions.
53 changes: 15 additions & 38 deletions src/datasets/utils/version.py
@@ -18,14 +18,14 @@
 import dataclasses
 import re
 from dataclasses import dataclass
+from functools import total_ordering
 from typing import Optional, Union


-_VERSION_TMPL = r"^(?P<major>{v})" r"\.(?P<minor>{v})" r"\.(?P<patch>{v})$"
-_VERSION_WILDCARD_REG = re.compile(_VERSION_TMPL.format(v=r"\d+|\*"))
-_VERSION_RESOLVED_REG = re.compile(_VERSION_TMPL.format(v=r"\d+"))
+_VERSION_REG = re.compile(r"^(?P<major>\d+)" r"\.(?P<minor>\d+)" r"\.(?P<patch>\d+)$")


+@total_ordering
 @dataclass
 class Version:
     """Dataset version MAJOR.MINOR.PATCH.
@@ -55,7 +55,7 @@ class Version:
     patch: Optional[Union[str, int]] = None

     def __post_init__(self):
-        self.major, self.minor, self.patch = _str_to_version(self.version_str)
+        self.major, self.minor, self.patch = _str_to_version_tuple(self.version_str)

     def __repr__(self):
         return f"{self.tuple[0]}.{self.tuple[1]}.{self.tuple[2]}"
@@ -79,34 +79,12 @@ def __eq__(self, other):
         else:
             return self.tuple == other.tuple

-    def __ne__(self, other):
-        return not self.__eq__(other)
-
     def __lt__(self, other):
         other = self._validate_operand(other)
         return self.tuple < other.tuple

-    def __le__(self, other):
-        other = self._validate_operand(other)
-        return self.tuple <= other.tuple
-
-    def __gt__(self, other):
-        other = self._validate_operand(other)
-        return self.tuple > other.tuple
-
-    def __ge__(self, other):
-        other = self._validate_operand(other)
-        return self.tuple >= other.tuple
-
-    def match(self, other_version):
-        """Returns True if other_version matches.
-
-        Args:
-            other_version: string, of the form "x[.y[.x]]" where {x,y,z} can be a
-                number or a wildcard.
-        """
-        major, minor, patch = _str_to_version(other_version, allow_wildcard=True)
-        return major in [self.major, "*"] and minor in [self.minor, "*"] and patch in [self.patch, "*"]
+    def __hash__(self):
+        return hash(_version_tuple_to_str(self.tuple))

     @classmethod
     def from_dict(cls, dic):
@@ -117,15 +95,14 @@ def _to_yaml_string(self) -> str:
         return self.version_str


-def _str_to_version(version_str, allow_wildcard=False):
+def _str_to_version_tuple(version_str):
     """Return the tuple (major, minor, patch) version extracted from the str."""
-    reg = _VERSION_WILDCARD_REG if allow_wildcard else _VERSION_RESOLVED_REG
-    res = reg.match(version_str)
+    res = _VERSION_REG.match(version_str)
     if not res:
-        msg = f"Invalid version '{version_str}'. Format should be x.y.z"
-        if allow_wildcard:
-            msg += " with {x,y,z} being digits or wildcard."
-        else:
-            msg += " with {x,y,z} being digits."
-        raise ValueError(msg)
-    return tuple(v if v == "*" else int(v) for v in [res.group("major"), res.group("minor"), res.group("patch")])
+        raise ValueError(f"Invalid version '{version_str}'. Format should be x.y.z with {{x,y,z}} being digits.")
+    return tuple(int(v) for v in [res.group("major"), res.group("minor"), res.group("patch")])
+
+
+def _version_tuple_to_str(version_tuple):
+    """Return the str version from the version tuple (major, minor, patch)."""
+    return ".".join(str(v) for v in version_tuple)
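
Taken together, the module now keeps only `__eq__` and `__lt__`, lets `functools.total_ordering` derive the remaining comparisons, and hashes the canonical `major.minor.patch` string. A quick sketch of the resulting behavior (the private helpers are imported purely for illustration):

```python
from datasets.utils.version import Version, _str_to_version_tuple, _version_tuple_to_str

v = Version("1.2.3")

# Equal versions hash equally because __hash__ is built from the canonical version string.
assert hash(v) == hash(Version("1.2.3"))

# @total_ordering derives <=, >, >= from the explicit __eq__ and __lt__;
# comparisons are numeric on the (major, minor, patch) tuple, not lexicographic.
assert Version("1.2.3") < Version("1.10.0")
assert Version("2.0.0") >= "1.9.9"  # strings are coerced via _validate_operand

# Wildcards are gone: parsing is strict x.y.z with digit components.
assert _str_to_version_tuple("1.2.3") == (1, 2, 3)
assert _version_tuple_to_str((1, 2, 3)) == "1.2.3"
```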
3 changes: 2 additions & 1 deletion tests/test_version.py
@@ -16,7 +16,8 @@
         (None, False),
     ],
 )
-def test_version_equalities(other, expected_equality):
+def test_version_equality_and_hash(other, expected_equality):
     version = Version("1.0.0")
     assert (version == other) is expected_equality
     assert (version != other) is not expected_equality
+    assert (hash(version) == hash(other)) is expected_equality
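
A natural companion check (illustrative only, not part of this commit) would exercise `Version` as a dict key now that it is hashable:

```python
import pytest

from datasets import Version


@pytest.mark.parametrize(
    "other, expected_hit",
    [(Version("1.0.0"), True), ("1.0.0", True), (Version("2.0.0"), False), (None, False)],
)
def test_version_usable_as_dict_key(other, expected_hit):
    # Hashability lets Version key a dict; lookups by an equal key
    # (another Version or the raw version string) should find the entry.
    table = {Version("1.0.0"): "v1"}
    assert (table.get(other) == "v1") is expected_hit
```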

1 comment on commit bde7504

@github-actions

PyArrow==6.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.008791 / 0.011353 (-0.002562) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004641 / 0.011008 (-0.006367) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.100391 / 0.038508 (0.061883) |
| read_batch_unformated after write_array2d | 0.032053 / 0.023109 (0.008944) |
| read_batch_unformated after write_flattened_sequence | 0.304398 / 0.275898 (0.028500) |
| read_batch_unformated after write_nested_sequence | 0.376397 / 0.323480 (0.052917) |
| read_col_formatted_as_numpy after write_array2d | 0.007505 / 0.007986 (-0.000480) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004835 / 0.004328 (0.000506) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.077002 / 0.004250 (0.072751) |
| read_col_unformated after write_array2d | 0.037909 / 0.037052 (0.000856) |
| read_col_unformated after write_flattened_sequence | 0.313728 / 0.258489 (0.055239) |
| read_col_unformated after write_nested_sequence | 0.350539 / 0.293841 (0.056698) |
| read_formatted_as_numpy after write_array2d | 0.037492 / 0.128546 (-0.091054) |
| read_formatted_as_numpy after write_flattened_sequence | 0.014552 / 0.075646 (-0.061095) |
| read_formatted_as_numpy after write_nested_sequence | 0.325051 / 0.419271 (-0.094220) |
| read_unformated after write_array2d | 0.044654 / 0.043533 (0.001121) |
| read_unformated after write_flattened_sequence | 0.307866 / 0.255139 (0.052727) |
| read_unformated after write_nested_sequence | 0.331527 / 0.283200 (0.048327) |
| write_array2d | 0.088675 / 0.141683 (-0.053008) |
| write_flattened_sequence | 1.480155 / 1.452155 (0.028000) |
| write_nested_sequence | 1.507464 / 1.492716 (0.014748) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.194849 / 0.018006 (0.176843) |
| get_batch_of_1024_rows | 0.441031 / 0.000490 (0.440542) |
| get_first_row | 0.001212 / 0.000200 (0.001012) |
| get_last_row | 0.000075 / 0.000054 (0.000020) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.023675 / 0.037411 (-0.013736) |
| shard | 0.095056 / 0.014526 (0.080530) |
| shuffle | 0.105448 / 0.176557 (-0.071108) |
| sort | 0.140146 / 0.737135 (-0.596989) |
| train_test_split | 0.106957 / 0.296338 (-0.189382) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.415382 / 0.215209 (0.200172) |
| read 50000 | 4.126971 / 2.077655 (2.049316) |
| read_batch 50000 10 | 1.874806 / 1.504120 (0.370686) |
| read_batch 50000 100 | 1.682002 / 1.541195 (0.140808) |
| read_batch 50000 1000 | 1.749537 / 1.468490 (0.281047) |
| read_formatted numpy 5000 | 0.689640 / 4.584777 (-3.895137) |
| read_formatted pandas 5000 | 3.341567 / 3.745712 (-0.404145) |
| read_formatted tensorflow 5000 | 2.985051 / 5.269862 (-2.284810) |
| read_formatted torch 5000 | 1.558213 / 4.565676 (-3.007463) |
| read_formatted_batch numpy 5000 10 | 0.080845 / 0.424275 (-0.343431) |
| read_formatted_batch numpy 5000 1000 | 0.011916 / 0.007607 (0.004309) |
| shuffled read 5000 | 0.528399 / 0.226044 (0.302355) |
| shuffled read 50000 | 5.288938 / 2.268929 (3.020009) |
| shuffled read_batch 50000 10 | 2.324948 / 55.444624 (-53.119677) |
| shuffled read_batch 50000 100 | 1.960174 / 6.876477 (-4.916303) |
| shuffled read_batch 50000 1000 | 2.039365 / 2.142072 (-0.102708) |
| shuffled read_formatted numpy 5000 | 0.804088 / 4.805227 (-4.001140) |
| shuffled read_formatted_batch numpy 5000 10 | 0.147084 / 6.500664 (-6.353580) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.063564 / 0.075469 (-0.011905) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.552270 / 1.841788 (-0.289518) |
| map fast-tokenizer batched | 13.267146 / 8.074308 (5.192838) |
| map identity | 26.368772 / 10.191392 (16.177380) |
| map identity batched | 0.867935 / 0.680424 (0.187511) |
| map no-op batched | 0.616967 / 0.534201 (0.082766) |
| map no-op batched numpy | 0.386486 / 0.579283 (-0.192797) |
| map no-op batched pandas | 0.401526 / 0.434364 (-0.032838) |
| map no-op batched pytorch | 0.238472 / 0.540337 (-0.301866) |
| map no-op batched tensorflow | 0.235140 / 1.386936 (-1.151796) |

PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.006854 / 0.011353 (-0.004499) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004617 / 0.011008 (-0.006391) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.097636 / 0.038508 (0.059128) |
| read_batch_unformated after write_array2d | 0.028471 / 0.023109 (0.005362) |
| read_batch_unformated after write_flattened_sequence | 0.346824 / 0.275898 (0.070926) |
| read_batch_unformated after write_nested_sequence | 0.383489 / 0.323480 (0.060009) |
| read_col_formatted_as_numpy after write_array2d | 0.005883 / 0.007986 (-0.002103) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004934 / 0.004328 (0.000606) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.074822 / 0.004250 (0.070571) |
| read_col_unformated after write_array2d | 0.034951 / 0.037052 (-0.002101) |
| read_col_unformated after write_flattened_sequence | 0.346923 / 0.258489 (0.088434) |
| read_col_unformated after write_nested_sequence | 0.390326 / 0.293841 (0.096485) |
| read_formatted_as_numpy after write_array2d | 0.031883 / 0.128546 (-0.096663) |
| read_formatted_as_numpy after write_flattened_sequence | 0.011741 / 0.075646 (-0.063906) |
| read_formatted_as_numpy after write_nested_sequence | 0.317912 / 0.419271 (-0.101359) |
| read_unformated after write_array2d | 0.040855 / 0.043533 (-0.002678) |
| read_unformated after write_flattened_sequence | 0.343263 / 0.255139 (0.088124) |
| read_unformated after write_nested_sequence | 0.371610 / 0.283200 (0.088410) |
| write_array2d | 0.092025 / 0.141683 (-0.049657) |
| write_flattened_sequence | 1.533109 / 1.452155 (0.080955) |
| write_nested_sequence | 1.630519 / 1.492716 (0.137803) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.225118 / 0.018006 (0.207112) |
| get_batch_of_1024_rows | 0.418104 / 0.000490 (0.417614) |
| get_first_row | 0.004539 / 0.000200 (0.004339) |
| get_last_row | 0.000083 / 0.000054 (0.000029) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.025749 / 0.037411 (-0.011663) |
| shard | 0.104856 / 0.014526 (0.090330) |
| shuffle | 0.112603 / 0.176557 (-0.063953) |
| sort | 0.148155 / 0.737135 (-0.588980) |
| train_test_split | 0.114787 / 0.296338 (-0.181552) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.440025 / 0.215209 (0.224816) |
| read 50000 | 4.406242 / 2.077655 (2.328587) |
| read_batch 50000 10 | 2.071048 / 1.504120 (0.566928) |
| read_batch 50000 100 | 1.869892 / 1.541195 (0.328697) |
| read_batch 50000 1000 | 1.969293 / 1.468490 (0.500803) |
| read_formatted numpy 5000 | 0.693298 / 4.584777 (-3.891479) |
| read_formatted pandas 5000 | 3.383328 / 3.745712 (-0.362384) |
| read_formatted tensorflow 5000 | 3.078037 / 5.269862 (-2.191824) |
| read_formatted torch 5000 | 1.354447 / 4.565676 (-3.211230) |
| read_formatted_batch numpy 5000 10 | 0.081563 / 0.424275 (-0.342712) |
| read_formatted_batch numpy 5000 1000 | 0.012092 / 0.007607 (0.004485) |
| shuffled read 5000 | 0.544867 / 0.226044 (0.318823) |
| shuffled read 50000 | 5.481963 / 2.268929 (3.213035) |
| shuffled read_batch 50000 10 | 2.577710 / 55.444624 (-52.866914) |
| shuffled read_batch 50000 100 | 2.230735 / 6.876477 (-4.645741) |
| shuffled read_batch 50000 1000 | 2.286849 / 2.142072 (0.144776) |
| shuffled read_formatted numpy 5000 | 0.797119 / 4.805227 (-4.008109) |
| shuffled read_formatted_batch numpy 5000 10 | 0.151184 / 6.500664 (-6.349481) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.068172 / 0.075469 (-0.007297) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.570003 / 1.841788 (-0.271785) |
| map fast-tokenizer batched | 12.521704 / 8.074308 (4.447396) |
| map identity | 12.344371 / 10.191392 (2.152979) |
| map identity batched | 0.905896 / 0.680424 (0.225472) |
| map no-op batched | 0.639997 / 0.534201 (0.105796) |
| map no-op batched numpy | 0.371715 / 0.579283 (-0.207568) |
| map no-op batched pandas | 0.375202 / 0.434364 (-0.059162) |
| map no-op batched pytorch | 0.217864 / 0.540337 (-0.322473) |
| map no-op batched tensorflow | 0.223731 / 1.386936 (-1.163205) |
