Merge branch 'master' into iter_archive
severo committed Oct 14, 2021
2 parents 5528318 + f839338 commit d860b49
Showing 59 changed files with 5,720 additions and 232 deletions.
1 change: 1 addition & 0 deletions .circleci/deploy.sh
@@ -34,6 +34,7 @@ deploy_doc "master" master

# Example of how to deploy a doc on a certain commit (the commit doesn't have to be on the master branch).
# The following commit would live on huggingface.co/docs/datasets/v1.0.0
deploy_doc "38ec259" v1.13.0
deploy_doc "2c1fc9c" v1.12.1
deploy_doc "c65dccc" v1.12.0
deploy_doc "ea7f0b8" v1.11.0
31 changes: 31 additions & 0 deletions .github/workflows/test-audio.yml
@@ -0,0 +1,31 @@
name: Test audio

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: "3.6"
      - name: Install OS dependencies
        run: |
          sudo apt-get update
          sudo apt-get install libsndfile1 sox
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[tests,audio]
          pip install pyarrow --upgrade
      - name: Test audio with pytest
        run: |
          HF_SCRIPTS_VERSION=master python -m pytest -n 2 -sv ./tests/features/test_audio.py
8 changes: 4 additions & 4 deletions datasets/biosses/biosses.py
@@ -67,8 +67,8 @@ class Biosses(datasets.GeneratorBasedBuilder):
    def _info(self):
        features = datasets.Features(
            {
-               "sentence 1": datasets.Value("string"),
-               "sentence 2": datasets.Value("string"),
+               "sentence1": datasets.Value("string"),
+               "sentence2": datasets.Value("string"),
                "score": datasets.Value("float32"),
            }
        )
@@ -93,7 +93,7 @@ def _generate_examples(self, filepath):
        df = pd.read_csv(filepath, sep="\t", encoding="utf-8")
        for idx, row in df.iterrows():
            yield idx, {
-               "sentence 1": row["sentence1"],
-               "sentence 2": row["sentence2"],
+               "sentence1": row["sentence1"],
+               "sentence2": row["sentence2"],
                "score": row["score"],
            }
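The renamed keys can be exercised with a small stand-alone sketch of the generator logic. The TSV content below is made-up stand-in data, not the real BIOSSES file, and `generate_examples` here is a simplified stdlib-only mirror of the builder's `_generate_examples` (which uses pandas):

```python
# Minimal sketch: the generator now yields "sentence1"/"sentence2" (no spaces),
# matching the TSV column names. Hypothetical data, stdlib csv instead of pandas.
import csv
import io

TSV = "sentence1\tsentence2\tscore\nA cell divides.\tA cell splits.\t3.8\n"

def generate_examples(fileobj):
    reader = csv.DictReader(fileobj, delimiter="\t")
    for idx, row in enumerate(reader):
        yield idx, {
            "sentence1": row["sentence1"],
            "sentence2": row["sentence2"],
            "score": float(row["score"]),
        }

examples = dict(generate_examples(io.StringIO(TSV)))
print(sorted(examples[0]))  # keys no longer contain spaces
```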
2 changes: 1 addition & 1 deletion datasets/biosses/dataset_infos.json
@@ -1 +1 @@
{"default": {"description": "BIOSSES is a benchmark dataset for biomedical sentence similarity estimation. The dataset comprises 100 sentence pairs, in which each sentence was selected from the TAC (Text Analysis Conference) Biomedical Summarization Track Training Dataset containing articles from the biomedical domain. The sentence pairs were evaluated by five different human experts that judged their similarity and gave scores ranging from 0 (no relation) to 4 (equivalent).\n", "citation": "@article{souganciouglu2017biosses,\n title={BIOSSES: a semantic sentence similarity estimation system for the biomedical domain},\n author={So{\\u{g}}anc{\\i}o{\\u{g}}lu, Gizem and {\\\"O}zt{\\\"u}rk, Hakime and {\\\"O}zg{\\\"u}r, Arzucan},\n journal={Bioinformatics},\n volume={33},\n number={14},\n pages={i49--i58},\n year={2017},\n publisher={Oxford University Press}\n}\n", "homepage": "https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html", "license": "BIOSSES is made available under the terms of The GNU Common Public License v.3.0.\n", "features": {"sentence 1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence 2": {"dtype": "string", "id": null, "_type": "Value"}, "score": {"dtype": "float32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "biosses", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 32783, "num_examples": 100, "dataset_name": "biosses"}}, "download_checksums": {"https://raw.githubusercontent.com/Markus-Zlabinger/ssts/fce78a649ab90269950aaf44ce20a36e94409392/data/biosses/all.tsv": {"num_bytes": 36324, "checksum": "e0f7b235e4bc9a76ad4bd170bf0da2f449ec6ea677a9a4b5dcb7be6687775906"}}, "download_size": 36324, "post_processing_size": null, "dataset_size": 32783, "size_in_bytes": 69107}}
{"default": {"description": "BIOSSES is a benchmark dataset for biomedical sentence similarity estimation. The dataset comprises 100 sentence pairs, in which each sentence was selected from the TAC (Text Analysis Conference) Biomedical Summarization Track Training Dataset containing articles from the biomedical domain. The sentence pairs were evaluated by five different human experts that judged their similarity and gave scores ranging from 0 (no relation) to 4 (equivalent).\n", "citation": "@article{souganciouglu2017biosses,\n title={BIOSSES: a semantic sentence similarity estimation system for the biomedical domain},\n author={So{\\u{g}}anc{\\i}o{\\u{g}}lu, Gizem and {\\\"O}zt{\\\"u}rk, Hakime and {\\\"O}zg{\\\"u}r, Arzucan},\n journal={Bioinformatics},\n volume={33},\n number={14},\n pages={i49--i58},\n year={2017},\n publisher={Oxford University Press}\n}\n", "homepage": "https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html", "license": "BIOSSES is made available under the terms of The GNU Common Public License v.3.0.\n", "features": {"sentence1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence2": {"dtype": "string", "id": null, "_type": "Value"}, "score": {"dtype": "float32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "biosses", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 32783, "num_examples": 100, "dataset_name": "biosses"}}, "download_checksums": {"https://raw.githubusercontent.com/Markus-Zlabinger/ssts/fce78a649ab90269950aaf44ce20a36e94409392/data/biosses/all.tsv": {"num_bytes": 36324, "checksum": "e0f7b235e4bc9a76ad4bd170bf0da2f449ec6ea677a9a4b5dcb7be6687775906"}}, "download_size": 36324, "post_processing_size": null, "dataset_size": 32783, "size_in_bytes": 69107}}
Binary file modified datasets/biosses/dummy/0.0.0/dummy_data.zip
240 changes: 240 additions & 0 deletions datasets/greek_legal_code/README.md
@@ -0,0 +1,240 @@
---
pretty_name: Greek Legal Code
annotations_creators:
- found
language_creators:
- found
languages:
- el
licenses:
- cc-by-4.0
multilinguality:
- monolingual
size_categories:
- 10K<n<100K
source_datasets:
- original
task_categories:
- text-classification
task_ids:
- multi-class-classification
- topic-classification
---

# Dataset Card for Greek Legal Code

## Table of Contents
- [Dataset Description](#dataset-description)
- [Dataset Summary](#dataset-summary)
- [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
- [Languages](#languages)
- [Dataset Structure](#dataset-structure)
- [Data Instances](#data-instances)
- [Data Fields](#data-fields)
- [Data Splits](#data-splits)
- [Dataset Creation](#dataset-creation)
- [Curation Rationale](#curation-rationale)
- [Source Data](#source-data)
- [Annotations](#annotations)
- [Personal and Sensitive Information](#personal-and-sensitive-information)
- [Considerations for Using the Data](#considerations-for-using-the-data)
- [Social Impact of Dataset](#social-impact-of-dataset)
- [Discussion of Biases](#discussion-of-biases)
- [Other Known Limitations](#other-known-limitations)
- [Additional Information](#additional-information)
- [Dataset Curators](#dataset-curators)
- [Licensing Information](#licensing-information)
- [Citation Information](#citation-information)
- [Contributions](#contributions)

## Dataset Description

- **Homepage:** https://doi.org/10.5281/zenodo.5528002
- **Repository:** https://github.com/christospi/glc-nllp-21
- **Paper:** TBA
- **Leaderboard:** N/A
- **Point of Contact:** [Christos Papaloukas](mailto:christospap@di.uoa.gr)

### Dataset Summary

Greek_Legal_Code (GLC) is a dataset consisting of approx. 47k legal resources from Greek legislation. The origin of GLC is “Permanent Greek Legislation Code - Raptarchis”, a collection of Greek legislative documents classified into multi-level (from broader to more specialized) categories.

**Topics**

GLC consists of 47 legislative volumes, each corresponding to a main thematic topic. Each volume is divided into thematic sub-categories called chapters, and each chapter in turn breaks down into subjects, which contain the legal resources. There are 389 chapters and 2285 subjects in total, forming an interlinked thematic hierarchy: the upper level (volume) has 47 classes, the next level (chapter) has 389 classes, and the innermost level (subject) has 2285 classes.

GLC classes are divided into three categories at each thematic level: frequent classes, which occur in more than 10 training documents and can be found in all three subsets (training, development and test); few-shot classes, which appear in 1 to 10 training documents and also appear in the development and test sets; and zero-shot classes, which appear in the development and/or test set but not in the training documents.


### Supported Tasks and Leaderboards

The dataset supports:

**Multi-class Text Classification:** Given the text of a document, a model predicts the corresponding class.

**Few-shot and Zero-shot learning:** As already noted, the classes can be divided into three groups: frequent, few-shot, and zero-shot, depending on whether they were assigned to more than 10, fewer than 10 but at least one, or no training documents, respectively.

| Level   | Total | Frequent | Few-Shot (<10) | Zero-Shot |
|---------|-------|----------|----------------|-----------|
| Volume  | 47    | 47       | 0              | 0         |
| Chapter | 389   | 333      | 53             | 3         |
| Subject | 2285  | 712      | 1431           | 142       |
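The grouping above can be sketched as a small helper. The function name and the toy labels are illustrative, not taken from the GLC codebase:

```python
from collections import Counter

def partition_classes(train_labels, all_labels):
    """Split classes into frequent (>10 train docs), few-shot (1-10 train
    docs) and zero-shot (absent from training), following the card's grouping."""
    counts = Counter(train_labels)
    frequent = {c for c in all_labels if counts[c] > 10}
    few_shot = {c for c in all_labels if 1 <= counts[c] <= 10}
    zero_shot = {c for c in all_labels if counts[c] == 0}
    return frequent, few_shot, zero_shot

# Toy example with hypothetical labels:
train = ["a"] * 12 + ["b"] * 3
freq, few, zero = partition_classes(train, {"a", "b", "c"})
print(freq, few, zero)
```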

### Languages

All documents are written in Greek.

## Dataset Structure

### Data Instances


```json
{
  "text": "179. ΑΠΟΦΑΣΗ ΥΠΟΥΡΓΟΥ ΜΕΤΑΦΟΡΩΝ ΚΑΙ ΕΠΙΚΟΙΝΩΝΙΩΝ Αριθ. Β-οικ. 68425/4765 της 2/17 Νοεμ. 2000 (ΦΕΚ Β΄ 1404) Τροποποίηση της 42000/2030/81 κοιν. απόφασης του Υπουργού Συγκοινωνιών «Κωδικοποίηση και συμπλήρωση καν. Αποφάσεων» που εκδόθηκαν κατ’ εξουσιοδότηση του Ν.Δ. 102/73 «περί οργανώσεως των δια λεωφορείων αυτοκινήτων εκτελουμένων επιβατικών συγκοινωνιών». ",
  "volume": 24
}
```

Here, `"volume": 24` is the class index of the volume "ΣΥΓΚΟΙΝΩΝΙΕΣ".

### Data Fields

The following data fields are provided for documents (`train`, `dev`, `test`):

`text`: (**str**) The full content of each document, represented by its `header` and `articles` (i.e., the `main_body`).\
`label`: (**class label**) Depending on the configuration, the volume-, chapter- or subject-level class of the document. For the volume-level configuration, the label is one of: ["ΚΟΙΝΩΝΙΚΗ ΠΡΟΝΟΙΑ",
"ΓΕΩΡΓΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΡΑΔΙΟΦΩΝΙΑ ΚΑΙ ΤΥΠΟΣ",
"ΒΙΟΜΗΧΑΝΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΥΓΕΙΟΝΟΜΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΠΟΛΕΜΙΚΟ ΝΑΥΤΙΚΟ",
"ΤΑΧΥΔΡΟΜΕΙΑ - ΤΗΛΕΠΙΚΟΙΝΩΝΙΕΣ",
"ΔΑΣΗ ΚΑΙ ΚΤΗΝΟΤΡΟΦΙΑ",
"ΕΛΕΓΚΤΙΚΟ ΣΥΝΕΔΡΙΟ ΚΑΙ ΣΥΝΤΑΞΕΙΣ",
"ΠΟΛΕΜΙΚΗ ΑΕΡΟΠΟΡΙΑ",
"ΝΟΜΙΚΑ ΠΡΟΣΩΠΑ ΔΗΜΟΣΙΟΥ ΔΙΚΑΙΟΥ",
"ΝΟΜΟΘΕΣΙΑ ΑΝΩΝΥΜΩΝ ΕΤΑΙΡΕΙΩΝ ΤΡΑΠΕΖΩΝ ΚΑΙ ΧΡΗΜΑΤΙΣΤΗΡΙΩΝ",
"ΠΟΛΙΤΙΚΗ ΑΕΡΟΠΟΡΙΑ",
"ΕΜΜΕΣΗ ΦΟΡΟΛΟΓΙΑ",
"ΚΟΙΝΩΝΙΚΕΣ ΑΣΦΑΛΙΣΕΙΣ",
"ΝΟΜΟΘΕΣΙΑ ΔΗΜΩΝ ΚΑΙ ΚΟΙΝΟΤΗΤΩΝ",
"ΝΟΜΟΘΕΣΙΑ ΕΠΙΜΕΛΗΤΗΡΙΩΝ ΣΥΝΕΤΑΙΡΙΣΜΩΝ ΚΑΙ ΣΩΜΑΤΕΙΩΝ",
"ΔΗΜΟΣΙΑ ΕΡΓΑ",
"ΔΙΟΙΚΗΣΗ ΔΙΚΑΙΟΣΥΝΗΣ",
"ΑΣΦΑΛΙΣΤΙΚΑ ΤΑΜΕΙΑ",
"ΕΚΚΛΗΣΙΑΣΤΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΕΚΠΑΙΔΕΥΤΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΔΗΜΟΣΙΟ ΛΟΓΙΣΤΙΚΟ",
"ΤΕΛΩΝΕΙΑΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΣΥΓΚΟΙΝΩΝΙΕΣ",
"ΕΘΝΙΚΗ ΑΜΥΝΑ",
"ΣΤΡΑΤΟΣ ΞΗΡΑΣ",
"ΑΓΟΡΑΝΟΜΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΔΗΜΟΣΙΟΙ ΥΠΑΛΛΗΛΟΙ",
"ΠΕΡΙΟΥΣΙΑ ΔΗΜΟΣΙΟΥ ΚΑΙ ΝΟΜΙΣΜΑ",
"ΟΙΚΟΝΟΜΙΚΗ ΔΙΟΙΚΗΣΗ",
"ΛΙΜΕΝΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΑΣΤΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΠΟΛΙΤΙΚΗ ΔΙΚΟΝΟΜΙΑ",
"ΔΙΠΛΩΜΑΤΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΔΙΟΙΚΗΤΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΑΜΕΣΗ ΦΟΡΟΛΟΓΙΑ",
"ΤΥΠΟΣ ΚΑΙ ΤΟΥΡΙΣΜΟΣ",
"ΕΘΝΙΚΗ ΟΙΚΟΝΟΜΙΑ",
"ΑΣΤΥΝΟΜΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΑΓΡΟΤΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΕΡΓΑΤΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΠΟΙΝΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΕΜΠΟΡΙΚΗ ΝΟΜΟΘΕΣΙΑ",
"ΕΠΙΣΤΗΜΕΣ ΚΑΙ ΤΕΧΝΕΣ",
"ΕΜΠΟΡΙΚΗ ΝΑΥΤΙΛΙΑ",
"ΣΥΝΤΑΓΜΑΤΙΚΗ ΝΟΜΟΘΕΣΙΑ"
]

The label can also be the chapter-level or subject-level class of the document. The chapter-level (389 classes) and subject-level (2285 classes) label lists are omitted here due to their size.
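As a rough sketch, the integer labels map to names by list position, in the spirit of `datasets.ClassLabel`. The three-name list below is a truncated, hypothetical stand-in for the full 47-name list above:

```python
# Illustrative int <-> str label mapping; the real dataset uses a ClassLabel
# feature over all 47 volume names. This truncated list is hypothetical.
names = ["ΚΟΙΝΩΝΙΚΗ ΠΡΟΝΟΙΑ", "ΓΕΩΡΓΙΚΗ ΝΟΜΟΘΕΣΙΑ", "ΣΥΓΚΟΙΝΩΝΙΕΣ"]

def int2str(i):
    """Return the class name for an integer label."""
    return names[i]

def str2int(name):
    """Return the integer label for a class name."""
    return names.index(name)

print(int2str(2))  # last name in this toy list
```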

### Data Splits

| Split       | No. of Documents | Avg. words |
| ----------- | ---------------- | ---------- |
| Train       | 28,536           | 600        |
| Development | 9,511            | 574        |
| Test        | 9,516            | 595        |

## Dataset Creation

### Curation Rationale

The dataset was curated by Papaloukas et al. (2021) to support and encourage further research in NLP for the Greek language.

### Source Data

#### Initial Data Collection and Normalization

The ``Permanent Greek Legislation Code - Raptarchis`` is a thorough catalogue of Greek legislation from the creation of the Greek state in 1834 until 2015. It includes Laws, Royal and Presidential Decrees, Regulations and Decisions, retrieved from the Official Government Gazette, where Greek legislation is published. This collection is one of the official, publicly available sources of classified Greek legislation suitable for classification tasks.

Currently, the original catalogue is publicly offered in MS Word (.doc) format through the portal e-Themis, the legal database and management service of the Ministry of the Interior. E-Themis primarily provides legislation across a multitude of predefined thematic categories, as described in the catalogue, with the main goal of helping users find legislation of interest via the thematic index.

#### Who are the source language producers?

[More Information Needed]

### Annotations

#### Annotation process

[More Information Needed]

#### Who are the annotators?

[More Information Needed]

### Personal and Sensitive Information

The dataset does not include personal or sensitive information.

## Considerations for Using the Data

### Social Impact of Dataset

[More Information Needed]

### Discussion of Biases

[More Information Needed]

### Other Known Limitations

[More Information Needed]

## Additional Information

### Dataset Curators

Papaloukas et al. (2021)

### Licensing Information

[More Information Needed]

### Citation Information

*Christos Papaloukas, Ilias Chalkidis, Konstantinos Athinaios, Despina-Athanasia Pantazi and Manolis Koubarakis.*
*Multi-granular Legal Topic Classification on Greek Legislation.*
*Proceedings of the 3rd Natural Legal Language Processing (NLLP) Workshop, Punta Cana, Dominican Republic, 2021*
```
@inproceedings{papaloukas-etal-2021-glc,
title = "Multi-granular Legal Topic Classification on Greek Legislation",
author = "Papaloukas, Christos and Chalkidis, Ilias and Athinaios, Konstantinos and Pantazi, Despina-Athanasia and Koubarakis, Manolis",
booktitle = "Proceedings of the 3rd Natural Legal Language Processing (NLLP) Workshop",
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "",
url = "https://arxiv.org/abs/2109.15298",
doi = "",
pages = ""
}
```

### Contributions

Thanks to [@christospi](https://github.com/christospi) for adding this dataset.
1 change: 1 addition & 0 deletions datasets/greek_legal_code/dataset_infos.json

Large diffs are not rendered by default.


1 comment on commit d860b49

@github-actions

PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.008419 / 0.011353 (-0.002934) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003706 / 0.011008 (-0.007302) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.029323 / 0.038508 (-0.009185) |
| read_batch_unformated after write_array2d | 0.031520 / 0.023109 (0.008411) |
| read_batch_unformated after write_flattened_sequence | 0.279482 / 0.275898 (0.003584) |
| read_batch_unformated after write_nested_sequence | 0.316590 / 0.323480 (-0.006890) |
| read_col_formatted_as_numpy after write_array2d | 0.007391 / 0.007986 (-0.000595) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004644 / 0.004328 (0.000316) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.008523 / 0.004250 (0.004273) |
| read_col_unformated after write_array2d | 0.045000 / 0.037052 (0.007948) |
| read_col_unformated after write_flattened_sequence | 0.286068 / 0.258489 (0.027579) |
| read_col_unformated after write_nested_sequence | 0.324423 / 0.293841 (0.030582) |
| read_formatted_as_numpy after write_array2d | 0.021659 / 0.128546 (-0.106887) |
| read_formatted_as_numpy after write_flattened_sequence | 0.007838 / 0.075646 (-0.067808) |
| read_formatted_as_numpy after write_nested_sequence | 0.234480 / 0.419271 (-0.184792) |
| read_unformated after write_array2d | 0.043084 / 0.043533 (-0.000449) |
| read_unformated after write_flattened_sequence | 0.275165 / 0.255139 (0.020026) |
| read_unformated after write_nested_sequence | 0.318836 / 0.283200 (0.035636) |
| write_array2d | 0.074954 / 0.141683 (-0.066729) |
| write_flattened_sequence | 1.495317 / 1.452155 (0.043163) |
| write_nested_sequence | 1.616519 / 1.492716 (0.123803) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.270083 / 0.018006 (0.252077) |
| get_batch_of_1024_rows | 0.546353 / 0.000490 (0.545863) |
| get_first_row | 0.004564 / 0.000200 (0.004364) |
| get_last_row | 0.000103 / 0.000054 (0.000049) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.032718 / 0.037411 (-0.004693) |
| shard | 0.020614 / 0.014526 (0.006088) |
| shuffle | 0.025222 / 0.176557 (-0.151335) |
| sort | 0.112214 / 0.737135 (-0.624921) |
| train_test_split | 0.025595 / 0.296338 (-0.270743) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.370861 / 0.215209 (0.155652) |
| read 50000 | 3.721305 / 2.077655 (1.643650) |
| read_batch 50000 10 | 1.627367 / 1.504120 (0.123247) |
| read_batch 50000 100 | 1.440308 / 1.541195 (-0.100887) |
| read_batch 50000 1000 | 1.509516 / 1.468490 (0.041026) |
| read_formatted numpy 5000 | 0.332795 / 4.584777 (-4.251982) |
| read_formatted pandas 5000 | 4.412442 / 3.745712 (0.666730) |
| read_formatted tensorflow 5000 | 0.890163 / 5.269862 (-4.379699) |
| read_formatted torch 5000 | 0.826536 / 4.565676 (-3.739141) |
| read_formatted_batch numpy 5000 10 | 0.041796 / 0.424275 (-0.382479) |
| read_formatted_batch numpy 5000 1000 | 0.004837 / 0.007607 (-0.002770) |
| shuffled read 5000 | 0.521316 / 0.226044 (0.295271) |
| shuffled read 50000 | 5.210336 / 2.268929 (2.941407) |
| shuffled read_batch 50000 10 | 2.257490 / 55.444624 (-53.187134) |
| shuffled read_batch 50000 100 | 1.917940 / 6.876477 (-4.958536) |
| shuffled read_batch 50000 1000 | 1.974619 / 2.142072 (-0.167453) |
| shuffled read_formatted numpy 5000 | 0.479769 / 4.805227 (-4.325458) |
| shuffled read_formatted_batch numpy 5000 10 | 0.101900 / 6.500664 (-6.398764) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.051618 / 0.075469 (-0.023851) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.357908 / 1.841788 (-0.483879) |
| map fast-tokenizer batched | 12.169950 / 8.074308 (4.095642) |
| map identity | 23.820521 / 10.191392 (13.629129) |
| map identity batched | 0.682934 / 0.680424 (0.002510) |
| map no-op batched | 0.464809 / 0.534201 (-0.069392) |
| map no-op batched numpy | 0.202114 / 0.579283 (-0.377169) |
| map no-op batched pandas | 0.447935 / 0.434364 (0.013571) |
| map no-op batched pytorch | 0.160318 / 0.540337 (-0.380020) |
| map no-op batched tensorflow | 0.169695 / 1.386936 (-1.217241) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.008628 / 0.011353 (-0.002725) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.003664 / 0.011008 (-0.007344) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.028422 / 0.038508 (-0.010086) |
| read_batch_unformated after write_array2d | 0.032455 / 0.023109 (0.009346) |
| read_batch_unformated after write_flattened_sequence | 0.254660 / 0.275898 (-0.021238) |
| read_batch_unformated after write_nested_sequence | 0.294258 / 0.323480 (-0.029222) |
| read_col_formatted_as_numpy after write_array2d | 0.007508 / 0.007986 (-0.000477) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004790 / 0.004328 (0.000461) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.008507 / 0.004250 (0.004257) |
| read_col_unformated after write_array2d | 0.044949 / 0.037052 (0.007897) |
| read_col_unformated after write_flattened_sequence | 0.251771 / 0.258489 (-0.006718) |
| read_col_unformated after write_nested_sequence | 0.294005 / 0.293841 (0.000164) |
| read_formatted_as_numpy after write_array2d | 0.021779 / 0.128546 (-0.106767) |
| read_formatted_as_numpy after write_flattened_sequence | 0.007627 / 0.075646 (-0.068020) |
| read_formatted_as_numpy after write_nested_sequence | 0.225486 / 0.419271 (-0.193785) |
| read_unformated after write_array2d | 0.042216 / 0.043533 (-0.001317) |
| read_unformated after write_flattened_sequence | 0.254729 / 0.255139 (-0.000410) |
| read_unformated after write_nested_sequence | 0.273239 / 0.283200 (-0.009961) |
| write_array2d | 0.082798 / 0.141683 (-0.058884) |
| write_flattened_sequence | 1.503944 / 1.452155 (0.051789) |
| write_nested_sequence | 1.640053 / 1.492716 (0.147337) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.436013 / 0.018006 (0.418007) |
| get_batch_of_1024_rows | 0.533820 / 0.000490 (0.533330) |
| get_first_row | 0.093687 / 0.000200 (0.093487) |
| get_last_row | 0.001505 / 0.000054 (0.001450) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.031461 / 0.037411 (-0.005951) |
| shard | 0.019058 / 0.014526 (0.004532) |
| shuffle | 0.027123 / 0.176557 (-0.149434) |
| sort | 0.111707 / 0.737135 (-0.625429) |
| train_test_split | 0.026508 / 0.296338 (-0.269830) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.360658 / 0.215209 (0.145449) |
| read 50000 | 3.604223 / 2.077655 (1.526568) |
| read_batch 50000 10 | 1.557324 / 1.504120 (0.053204) |
| read_batch 50000 100 | 1.370400 / 1.541195 (-0.170795) |
| read_batch 50000 1000 | 1.399101 / 1.468490 (-0.069389) |
| read_formatted numpy 5000 | 0.330974 / 4.584777 (-4.253803) |
| read_formatted pandas 5000 | 4.661074 / 3.745712 (0.915362) |
| read_formatted tensorflow 5000 | 0.905855 / 5.269862 (-4.364007) |
| read_formatted torch 5000 | 0.833279 / 4.565676 (-3.732398) |
| read_formatted_batch numpy 5000 10 | 0.041748 / 0.424275 (-0.382527) |
| read_formatted_batch numpy 5000 1000 | 0.004735 / 0.007607 (-0.002872) |
| shuffled read 5000 | 0.518413 / 0.226044 (0.292369) |
| shuffled read 50000 | 5.181240 / 2.268929 (2.912311) |
| shuffled read_batch 50000 10 | 2.200605 / 55.444624 (-53.244020) |
| shuffled read_batch 50000 100 | 1.844412 / 6.876477 (-5.032065) |
| shuffled read_batch 50000 1000 | 1.872178 / 2.142072 (-0.269894) |
| shuffled read_formatted numpy 5000 | 0.482797 / 4.805227 (-4.322430) |
| shuffled read_formatted_batch numpy 5000 10 | 0.104077 / 6.500664 (-6.396587) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.052933 / 0.075469 (-0.022536) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.362033 / 1.841788 (-0.479755) |
| map fast-tokenizer batched | 12.564077 / 8.074308 (4.489769) |
| map identity | 27.160574 / 10.191392 (16.969182) |
| map identity batched | 0.686919 / 0.680424 (0.006495) |
| map no-op batched | 0.511293 / 0.534201 (-0.022908) |
| map no-op batched numpy | 0.229093 / 0.579283 (-0.350190) |
| map no-op batched pandas | 0.513949 / 0.434364 (0.079585) |
| map no-op batched pytorch | 0.186765 / 0.540337 (-0.353573) |
| map no-op batched tensorflow | 0.208861 / 1.386936 (-1.178075) |
