Fix bug related to boolean in GAP dataset. (#680)

* Fix bug related to booleans: the values in row["A-coref"] and row["B-coref"] are the strings 'TRUE' or 'FALSE'. Because any non-empty string is truthy in Python, bool('FALSE') evaluates to True, so both fields were always converted to `True`. The values are now compared against the string "TRUE" instead.
* Change single quotes to double quotes.
* Update the GAP information in datasets/gap/dataset_infos.json.
huggingface · Sep 29, 2020 · c1ed514 · c1ed514 · github-actions · Sep 29, 2020
1 parent a3576b4
commit c1ed514
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 3 deletions.
diff --git a/datasets/gap/dataset_infos.json b/datasets/gap/dataset_infos.json
@@ -1 +1 @@
-{"default": {"description": "\nGAP is a gender-balanced dataset containing 8,908 coreference-labeled pairs of \n(ambiguous pronoun, antecedent name), sampled from Wikipedia and released by \nGoogle AI Language for the evaluation of coreference resolution in practical \napplications.\n", "citation": "\n@article{DBLP:journals/corr/abs-1810-05201,\n  author    = {Kellie Webster and\n               Marta Recasens and\n               Vera Axelrod and\n               Jason Baldridge},\n  title     = {Mind the {GAP:} {A} Balanced Corpus of Gendered Ambiguous Pronouns},\n  journal   = {CoRR},\n  volume    = {abs/1810.05201},\n  year      = {2018},\n  url       = {http://arxiv.org/abs/1810.05201},\n  archivePrefix = {arXiv},\n  eprint    = {1810.05201},\n  timestamp = {Tue, 30 Oct 2018 20:39:56 +0100},\n  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1810-05201},\n  bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "homepage": "https://github.com/google-research-datasets/gap-coreference", "license": "", "features": {"ID": {"dtype": "string", "id": null, "_type": "Value"}, "Text": {"dtype": "string", "id": null, "_type": "Value"}, "Pronoun": {"dtype": "string", "id": null, "_type": "Value"}, "Pronoun-offset": {"dtype": "int32", "id": null, "_type": "Value"}, "A": {"dtype": "string", "id": null, "_type": "Value"}, "A-offset": {"dtype": "int32", "id": null, "_type": "Value"}, "A-coref": {"dtype": "bool", "id": null, "_type": "Value"}, "B": {"dtype": "string", "id": null, "_type": "Value"}, "B-offset": {"dtype": "int32", "id": null, "_type": "Value"}, "B-coref": {"dtype": "bool", "id": null, "_type": "Value"}, "URL": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "gap", "config_name": "default", "version": {"version_str": "0.1.0", "description": null, "datasets_version_to_prepare": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 1093462, "num_examples": 
2000, "dataset_name": "gap"}, "train": {"name": "train", "num_bytes": 1098623, "num_examples": 2000, "dataset_name": "gap"}, "validation": {"name": "validation", "num_bytes": 249013, "num_examples": 454, "dataset_name": "gap"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv": {"num_bytes": 1080993, "checksum": "b9a01434fcf58d8c2f9bc762480c27e58ce466cf1ffe8b09cfecbc7a20d2d634"}, "https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv": {"num_bytes": 245089, "checksum": "2d784f66b390404f554704b9aef6dcde8845e79dda9886b8391cf7e9a24fdb98"}, "https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv": {"num_bytes": 1075889, "checksum": "1c35e36d5b14f6313ec3f6cd67b275de282595dd59e59390e00cfff9897a6819"}}, "download_size": 2401971, "dataset_size": 2441098, "size_in_bytes": 4843069}}
+{"default": {"description": "\nGAP is a gender-balanced dataset containing 8,908 coreference-labeled pairs of\n(ambiguous pronoun, antecedent name), sampled from Wikipedia and released by\nGoogle AI Language for the evaluation of coreference resolution in practical\napplications.\n", "citation": "\n@article{DBLP:journals/corr/abs-1810-05201,\n  author    = {Kellie Webster and\n               Marta Recasens and\n               Vera Axelrod and\n               Jason Baldridge},\n  title     = {Mind the {GAP:} {A} Balanced Corpus of Gendered Ambiguous Pronouns},\n  journal   = {CoRR},\n  volume    = {abs/1810.05201},\n  year      = {2018},\n  url       = {http://arxiv.org/abs/1810.05201},\n  archivePrefix = {arXiv},\n  eprint    = {1810.05201},\n  timestamp = {Tue, 30 Oct 2018 20:39:56 +0100},\n  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1810-05201},\n  bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "homepage": "https://github.com/google-research-datasets/gap-coreference", "license": "", "features": {"ID": {"dtype": "string", "id": null, "_type": "Value"}, "Text": {"dtype": "string", "id": null, "_type": "Value"}, "Pronoun": {"dtype": "string", "id": null, "_type": "Value"}, "Pronoun-offset": {"dtype": "int32", "id": null, "_type": "Value"}, "A": {"dtype": "string", "id": null, "_type": "Value"}, "A-offset": {"dtype": "int32", "id": null, "_type": "Value"}, "A-coref": {"dtype": "bool", "id": null, "_type": "Value"}, "B": {"dtype": "string", "id": null, "_type": "Value"}, "B-offset": {"dtype": "int32", "id": null, "_type": "Value"}, "B-coref": {"dtype": "bool", "id": null, "_type": "Value"}, "URL": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "gap", "config_name": "default", "version": {"version_str": "0.1.0", "description": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1095623, "num_examples": 2000, 
"dataset_name": "gap"}, "validation": {"name": "validation", "num_bytes": 248329, "num_examples": 454, "dataset_name": "gap"}, "test": {"name": "test", "num_bytes": 1090462, "num_examples": 2000, "dataset_name": "gap"}}, "download_checksums": {"https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv": {"num_bytes": 1080993, "checksum": "b9a01434fcf58d8c2f9bc762480c27e58ce466cf1ffe8b09cfecbc7a20d2d634"}, "https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv": {"num_bytes": 245089, "checksum": "2d784f66b390404f554704b9aef6dcde8845e79dda9886b8391cf7e9a24fdb98"}, "https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv": {"num_bytes": 1075889, "checksum": "1c35e36d5b14f6313ec3f6cd67b275de282595dd59e59390e00cfff9897a6819"}}, "download_size": 2401971, "post_processing_size": null, "dataset_size": 2434414, "size_in_bytes": 4836385}}
diff --git a/datasets/gap/gap.py b/datasets/gap/gap.py
@@ -111,8 +111,8 @@ def _generate_examples(self, filepath):
         with open(filepath, encoding="utf-8") as tsvfile:
             reader = csv.DictReader(tsvfile, dialect="excel-tab")
             for i, row in enumerate(reader):
-                row["A-coref"] = bool(row["A-coref"])
-                row["B-coref"] = bool(row["B-coref"])
+                row["A-coref"] = row["A-coref"] == "TRUE"
+                row["B-coref"] = row["B-coref"] == "TRUE"
                 row["A-offset"] = int(row["A-offset"])
                 row["B-offset"] = int(row["B-offset"])
                 row["Pronoun-offset"] = int(row["Pronoun-offset"])