/
translation.py
130 lines (102 loc) 路 4.44 KB
/
translation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Union
import pyarrow as pa
if TYPE_CHECKING:
from .features import FeatureType
@dataclass
class Translation:
"""`FeatureConnector` for translations with fixed languages per example.
Here for compatiblity with tfds.
Input: The Translate feature accepts a dictionary for each example mapping
string language codes to string translations.
Output: A dictionary mapping string language codes to translations as `Text`
features.
Example:
```python
>>> # At construction time:
>>> datasets.features.Translation(languages=['en', 'fr', 'de'])
>>> # During data generation:
>>> yield {
... 'en': 'the cat',
... 'fr': 'le chat',
... 'de': 'die katze'
... }
```
"""
languages: List[str]
id: Optional[str] = None
# Automatically constructed
dtype: ClassVar[str] = "dict"
pa_type: ClassVar[Any] = None
_type: str = field(default="Translation", init=False, repr=False)
def __call__(self):
return pa.struct({lang: pa.string() for lang in sorted(self.languages)})
def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
"""Flatten the Translation feature into a dictionary."""
from .features import Value
return {k: Value("string") for k in sorted(self.languages)}
@dataclass
class TranslationVariableLanguages:
"""`FeatureConnector` for translations with variable languages per example.
Here for compatiblity with tfds.
Input: The TranslationVariableLanguages feature accepts a dictionary for each
example mapping string language codes to one or more string translations.
The languages present may vary from example to example.
Output:
language: variable-length 1D tf.Tensor of tf.string language codes, sorted
in ascending order.
translation: variable-length 1D tf.Tensor of tf.string plain text
translations, sorted to align with language codes.
Example:
```python
>>> # At construction time:
>>> datasets.features.TranslationVariableLanguages(languages=['en', 'fr', 'de'])
>>> # During data generation:
>>> yield {
... 'en': 'the cat',
... 'fr': ['le chat', 'la chatte,']
... 'de': 'die katze'
... }
>>> # Tensor returned :
>>> {
... 'language': ['en', 'de', 'fr', 'fr'],
... 'translation': ['the cat', 'die katze', 'la chatte', 'le chat'],
... }
```
"""
languages: Optional[List] = None
num_languages: Optional[int] = None
id: Optional[str] = None
# Automatically constructed
dtype: ClassVar[str] = "dict"
pa_type: ClassVar[Any] = None
_type: str = field(default="TranslationVariableLanguages", init=False, repr=False)
def __post_init__(self):
self.languages = list(sorted(list(set(self.languages)))) if self.languages else None
self.num_languages = len(self.languages) if self.languages else None
def __call__(self):
return pa.struct({"language": pa.list_(pa.string()), "translation": pa.list_(pa.string())})
def encode_example(self, translation_dict):
lang_set = set(self.languages)
if self.languages and set(translation_dict) - lang_set:
raise ValueError(
f'Some languages in example ({", ".join(sorted(set(translation_dict) - lang_set))}) are not in valid set ({", ".join(lang_set)}).'
)
# Convert dictionary into tuples, splitting out cases where there are
# multiple translations for a single language.
translation_tuples = []
for lang, text in translation_dict.items():
if isinstance(text, str):
translation_tuples.append((lang, text))
else:
translation_tuples.extend([(lang, el) for el in text])
# Ensure translations are in ascending order by language code.
languages, translations = zip(*sorted(translation_tuples))
return {"language": languages, "translation": translations}
def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
"""Flatten the TranslationVariableLanguages feature into a dictionary."""
from .features import Sequence, Value
return {
"language": Sequence(Value("string")),
"translation": Sequence(Value("string")),
}