huggingface · lhoestq · May 6, 2021 · Apr 19, 2021 · Apr 21, 2021 · Apr 21, 2021
diff --git a/datasets/bbaw_egyptian/README.md b/datasets/bbaw_egyptian/README.md
@@ -0,0 +1,188 @@
+---
+annotations_creators:
+- expert-generated
+language_creators:
+- found
+languages:
+- de
+- en
+- egy
+licenses:
+- cc-by-4.0
+multilinguality:
+- multilingual
+size_categories:
+- 100K<n<1M
+source_datasets:
+- extended|wikipedia
+task_categories:
+- conditional-text-generation
+task_ids:
+- machine-translation
+---
+
+# Dataset Card for "bbaw_egyptian"
+
+## Table of Contents
+- [Dataset Description](#dataset-description)
+  - [Dataset Summary](#dataset-summary)
+  - [Supported Tasks](#supported-tasks)
+  - [Languages](#languages)
+- [Dataset Structure](#dataset-structure)
+  - [Data Instances](#data-instances)
+  - [Data Fields](#data-fields)
+  - [Data Splits Sample Size](#data-splits-sample-size)
+- [Dataset Creation](#dataset-creation)
+  - [Curation Rationale](#curation-rationale)
+  - [Source Data](#source-data)
+  - [Annotations](#annotations)
+  - [Personal and Sensitive Information](#personal-and-sensitive-information)
+- [Considerations for Using the Data](#considerations-for-using-the-data)
+  - [Social Impact of Dataset](#social-impact-of-dataset)
+  - [Discussion of Biases](#discussion-of-biases)
+  - [Other Known Limitations](#other-known-limitations)
+- [Additional Information](#additional-information)
+  - [Dataset Curators](#dataset-curators)
+  - [Licensing Information](#licensing-information)
+  - [Citation Information](#citation-information)
+  - [Contributions](#contributions)
+
+## Dataset Description
+
+- **Homepage:** [https://edoc.bbaw.de/frontdoor/index/index/docId/2919](https://edoc.bbaw.de/frontdoor/index/index/docId/2919)
+- **Repository:** [Github](https://phiwi.github.io/all.json)
+- **Paper:** [Multi-Task Modeling of Phonographic Languages: Translating Middle Egyptian Hieroglyphs](https://zenodo.org/record/3524924)
+- **Point of Contact:** [Philipp Wiesenbach](https://www.cl.uni-heidelberg.de/~wiesenbach/index.html)
+- **Size of downloaded dataset files:** 34 MB
+
+
+### Dataset Summary
+
+This dataset comprises parallel sentences of hieroglyphic encodings, transcription and translation as used in the paper [Multi-Task Modeling of Phonographic Languages: Translating Middle Egyptian Hieroglyphs](https://zenodo.org/record/3524924). The data triples are extracted from the [digital corpus of Egyptian texts](https://edoc.bbaw.de/frontdoor/index/index/docId/2919) compiled by the project "Strukturen und Transformationen des Wortschatzes der ägyptischen Sprache".
+
+### Supported Tasks
+
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Languages
+
+The dataset consists of parallel triples of:
+- `hieroglyphs`: Encoding of the hieroglyphs according to [Gardiner's sign list](https://en.wikipedia.org/wiki/Gardiner%27s_sign_list)
+- `transcription`: Transliteration of the above-mentioned hieroglyphs with a [transliteration scheme](https://en.wikipedia.org/wiki/Transliteration_of_Ancient_Egyptian)
+- `translation`: Translation, mostly into German (with some English mixed in)
+## Dataset Structure
+
+The dataset is not divided into 'train', 'dev' and 'test' splits as it was not built for competitive purposes and we encourage all scientists to use individual partitioning schemes to suit their needs (due to the low-resource setting it might be advisable to use cross-validation anyway). The single available split, which contains all of the data and is exposed as `train` by the loading script, therefore comprises the full 100,708 translation triples, 35,503 of which possess hieroglyphic encodings (the remaining 65,205 triples have empty `hieroglyphs` entries).
+
+### Data Instances
+
+An example of a data triple looks as follows:
+
+```
+{
+    "transcription": "n rḏi̯(.w) gꜣ =j r dbḥ.t m pr-ḥḏ",
+    "translation": "I was not let to suffer lack in the treasury with respect to what was needed;",
+    "hieroglyphs": "D35 D21 -D37 G1&W11 -V32B A1 D21 D46 -D58 *V28 -F18 *X1 -A2 G17 [? *O2 *?]"
+}
+
+```
+
+*Important*: Only about a third of the instances actually contain hieroglyphic encodings (for the rest, the field is the empty string `""`), as the remaining encodings have not yet been incorporated into the BBAW's project database.
+
+### Data Fields
+
+#### plain_text
+- `transcription`: a `string` feature.
+- `translation`: a `string` feature.
+- `hieroglyphs`: a `string` feature.
+
+
+### Data Splits Sample Size
+
+|   name   |all|
+|----------|----:|
+|plain_text|100708|
+
+## Dataset Creation
+
+### Curation Rationale
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Source Data
+
+The data source comes from the project "Strukturen und Transformationen des Wortschatzes der ägyptischen Sprache" which is compiling an extensively annotated digital corpus of Egyptian texts. Their [publication](https://edoc.bbaw.de/frontdoor/index/index/docId/2919) comprises an excerpt of the internal database's contents.
+
+### Annotations
+
+The corpus has not been preprocessed, as we encourage every scientist to prepare the corpus according to their own needs. This means that all text-critical symbols are still included in the transliteration and translation. This concerns the following annotations:
+
+- `()`: defective
+- `[]`: lost
+- `{}`: surplus
+- `〈〉`: omitted
+- `⸢⸣`: damaged
+- `⸮?`: unclear
+- `{{}}`: erasure
+- `(())`: above
+- `[[]]`: overstrike
+- `〈〈〉〉`: haplography
+
+There exists a similar sign list for the annotation of the hieroglyphic encoding. If you wish to access this list, please get in contact with the author.
+
+### Personal and Sensitive Information
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+## Considerations for Using the Data
+
+### Social Impact of Dataset
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Discussion of Biases
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Other Known Limitations
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+## Additional Information
+
+### Dataset Curators
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Licensing Information
+
+[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
+
+### Citation Information
+Source corpus:
+```
+@misc{OPUS4-2919,
+  title     = {Teilauszug der Datenbank des Vorhabens "Strukturen und Transformationen des Wortschatzes der {\"a}gyptischen Sprache" vom Januar 2018},
+  institution = {Akademienvorhaben Strukturen und Transformationen des Wortschatzes der {\"a}gyptischen Sprache. Text- und Wissenskultur im alten {\"A}gypten},
+  type      = {other},
+  year        = {2018},
+}
+
+```
+
+Translation paper:
+```
+@article{wiesenbach19,
+  title = {Multi-Task Modeling of Phonographic Languages: Translating Middle Egyptian Hieroglyphs},
+  author = {Wiesenbach, Philipp and Riezler, Stefan},
+  journal = {Proceedings of the International Workshop on Spoken Language Translation},
+  journal-abbrev = {IWSLT},
+  year = {2019},
+  url = {https://www.cl.uni-heidelberg.de/statnlpgroup/publications/IWSLT2019_v2.pdf}
+}
+```
+
+### Contributions
+
+Thanks to [@phiwi](https://github.com/phiwi) for adding this dataset.
diff --git a/datasets/bbaw_egyptian/bbaw_egyptian.py b/datasets/bbaw_egyptian/bbaw_egyptian.py
@@ -0,0 +1,89 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Middle Egyptian dataset as used in the paper on translating Middle Egyptian hieroglyphs (Wiesenbach and Riezler, 2019)."""
+import json
+
+import datasets
+
+
+_CITATION = """\
+@misc{OPUS4-2919,
+title  = {Teilauszug der Datenbank des Vorhabens "Strukturen und Transformationen des Wortschatzes der {\\"a}gyptischen Sprache" vom Januar 2018},
+institution = {Akademienvorhaben Strukturen und Transformationen des Wortschatzes der {\\"a}gyptischen Sprache. Text- und Wissenskultur im alten {\\"A}gypten},
+type = {other},
+year = {2018},
+}
+"""  # backslashes are doubled so the BibTeX umlaut macros keep their backslash in the runtime string
+
+_DESCRIPTION = """\
+This dataset comprises parallel sentences of hieroglyphic encodings, transcription and translation
+as used in the paper Multi-Task Modeling of Phonographic Languages: Translating Middle Egyptian
+Hieroglyph. The data triples are extracted from the digital corpus of Egyptian texts compiled by
+the project "Strukturen und Transformationen des Wortschatzes der ägyptischen Sprache".
+"""
+
+_HOMEPAGE = "https://edoc.bbaw.de/frontdoor/index/index/docId/2919"  # passed to DatasetInfo as the dataset homepage
+
+_LICENSE = "Creative Commons-Lizenz - CC BY-SA - 4.0 International"  # passed verbatim to DatasetInfo as the license
+
+
+class BbawEgyptian(datasets.GeneratorBasedBuilder):
+    """
+    Dataset builder for an excerpt of the digital corpus of Egyptian texts compiled by the
+    project `Strukturen und Transformationen des Wortschatzes der ägyptischen Sprache`.
+    Each example holds three string fields: transcription, translation and hieroglyphs.
+    """
+
+    _URL = "https://phiwi.github.io/"  # base URL the data file is downloaded from
+    _URLS = {"all": _URL + "all.json"}  # single download, keyed "all"
+
+    def _info(self):  # declares the feature schema and the dataset-level metadata
+        features = datasets.Features(
+            {
+                "transcription": datasets.Value("string"),
+                "translation": datasets.Value("string"),
+                "hieroglyphs": datasets.Value("string"),
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            supervised_keys=None,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Downloads the JSON file and returns a single TRAIN SplitGenerator pointing at it."""
+        my_urls = self._URLS
+        data_dir = dl_manager.download(my_urls)  # result is indexed by the same "all" key below
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": data_dir["all"]},
+            )
+        ]
+
+    def _generate_examples(self, filepath):
+        """Loads the JSON file at `filepath` and yields (index, example) tuples, one per entry."""
+        with open(filepath, "r", encoding="utf-8") as f:
+            data = json.load(f)  # the whole file is parsed into memory at once
+        for id_, row in enumerate(data):  # assumes the top-level JSON value is a list of dicts -- TODO confirm
+            yield id_, {
+                "translation": row["translation"],
+                "transcription": row["transcription"],
+                "hieroglyphs": row["hieroglyphs"],
+            }
diff --git a/datasets/bbaw_egyptian/dataset_infos.json b/datasets/bbaw_egyptian/dataset_infos.json
@@ -0,0 +1 @@
+{"default": {"description": "The project `Strukturen und Transformationen des Wortschatzes der \u00e4gyptischen Sprache`\nis compiling an extensively annotated digital corpus of Egyptian texts.\nThis publication comprises an excerpt of the internal database's contents.\n", "citation": "@misc{OPUS4-2919,\ntitle     = {Teilauszug der Datenbank des Vorhabens \"Strukturen und Transformationen des Wortschatzes der {\"a}gyptischen Sprache\" vom Januar 2018},\ninstitution = {Akademienvorhaben Strukturen und Transformationen des Wortschatzes der {\"a}gyptischen Sprache. Text- und Wissenskultur im alten {\"A}gypten},\ntype      = {other},\nyear        = {2018},\n}\n", "homepage": "https://aaew.bbaw.de/tla/index.html", "license": "Creative Commons-Lizenz - CC BY-SA - 4.0 International", "features": {"transcription": {"dtype": "string", "id": null, "_type": "Value"}, "translation": {"dtype": "string", "id": null, "_type": "Value"}, "hieroglyphs": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "builder_name": "bbaw_egyptian", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 18546162, "num_examples": 100736, "dataset_name": "bbaw_egyptian"}}, "download_checksums": {"https://phiwi.github.io/all.json": {"num_bytes": 35348686, "checksum": "cd2b2396e4c08d96ba035d2efd33c5c1a755f0e45a7e2857a6c9afc0065ca6d9"}}, "download_size": 35348686, "post_processing_size": null, "dataset_size": 18546162, "size_in_bytes": 53894848}}
diff --git a/datasets/bbaw_egyptian/dummy/0.0.0/dummy_data.zip b/datasets/bbaw_egyptian/dummy/0.0.0/dummy_data.zip
diff --git a/src/datasets/utils/resources/languages.json b/src/datasets/utils/resources/languages.json
@@ -129,6 +129,7 @@
     "ee": "Ewe",
     "ee-GH": "Ewe (Ghana)",
     "ee-TG": "Ewe (Togo)",
+    "egy": "Egyptian (Ancient)",
     "el": "Greek",
     "el-CY": "Greek (Cyprus)",
     "el-GR": "Greek (Greece)",
@@ -789,4 +790,4 @@
     "zh-Hant-TW": "Chinese (Traditional, Taiwan)",
     "zu": "Zulu",
     "zu-ZA": "Zulu (South Africa)"
-}
+}