diff --git a/docs/conf.py b/docs/conf.py index 28a90b0..c82d854 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,7 +12,7 @@ # import os import sys -sys.path.insert(0, os.path.abspath('../kaplan')) +sys.path.insert(0, os.path.abspath('../')) # -- Project information ----------------------------------------------------- @@ -22,7 +22,7 @@ author = 'Çağatay Onur Şengör' # The full version, including alpha/beta/rc tags -release = '0.14.0' +release = '0.16.0' # -- General configuration --------------------------------------------------- @@ -43,7 +43,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '.venv'] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/index.rst b/docs/index.rst index b20c457..142c58d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,3 +6,4 @@ kaplan bilingual_files translation_memories project_exchange + tools diff --git a/docs/requirements.txt b/docs/requirements.txt index 6551aa9..a7ca464 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ -sphinx==4.4.0 +sphinx==4.5.0 sphinx_rtd_theme -kaplan +lxml +regex diff --git a/docs/tools.rst b/docs/tools.rst new file mode 100644 index 0000000..ae6fd06 --- /dev/null +++ b/docs/tools.rst @@ -0,0 +1,5 @@ +Tools +===== + +.. 
autoclass:: kaplan.tools.QAChecker + :members: diff --git a/kaplan/__init__.py b/kaplan/__init__.py index 14bf9cc..6d3f750 100644 --- a/kaplan/__init__.py +++ b/kaplan/__init__.py @@ -1,4 +1,4 @@ -__version__ = '0.15.1' +__version__ = '0.16.0' def can_process(input_file): ''' diff --git a/kaplan/kxliff.py b/kaplan/kxliff.py index 6c30617..f85e4e5 100644 --- a/kaplan/kxliff.py +++ b/kaplan/kxliff.py @@ -654,6 +654,8 @@ def merge_segments(self, list_of_segments): list_of_segments: List containing segment IDs. ''' + assert len(list_of_segments) > 1, 'list_of_segments contains less than 2 segments.' + def transfer_children(source_parent, target_parent): if source_parent.text is not None: if len(target_parent) == 0: @@ -673,10 +675,18 @@ def transfer_children(source_parent, target_parent): segments = [] segment_ids = [] for segment_id in list_of_segments: - segment = self.xml_root.xpath('.//xliff:segment[@id="{0}"]'.format(str(segment_id)), namespaces=nsmap)[0] + segment = self.xml_root.xpath('.//xliff:segment[@id="{0}"]'.format(str(segment_id)), namespaces=nsmap) + + if segment == []: + raise ValueError('Segment #{} does not exist.'.format(segment_id)) + + segment = segment[0] + + assert 'locked' not in segment.attrib.get('subState', ''), 'Segment #{} is locked'.format(segment_id) if translation_unit is None: translation_unit = segment.getparent() + else: assert translation_unit == segment.getparent(), 'Segments are not of the same translation unit.' 
class QAChecker:
    '''
    Lightweight QA checker for translated segments.

    Compares each source/target pair for capitalization and trailing
    punctuation mismatches and, once a vocabulary has been supplied through
    ``build`` or ``open``, reports target words missing from that vocabulary
    together with frequency-ranked spelling suggestions.
    '''

    # Characters treated as sentence-final punctuation by ``check``.
    _FINAL_PUNCTUATION = '.!?:'

    def __init__(self):
        '''
        Creates a QAChecker instance with an empty vocabulary.
        '''
        # A set and a Counter (rather than plain dicts) so that an instance
        # that was never built can still be saved and queried.
        self.letters = set()
        self.word_counter = Counter()

    def build(self, target_segments):
        '''
        Builds the vocabulary from existing target segments.

        Args:
            target_segments: Iterable of target segment strings.
        '''
        text = '\n'.join(target_segments)
        self.letters = {char for char in set(text)
                        if regex.match(r'\p{L}', char)}
        self.word_counter = Counter(self.__words(text))

    def check(self, segments: dict):
        '''
        Checks a dict of segments.

        Args:
            segments: Dict mapping a segment key to a dict that holds the
                'source' and 'target' strings of that segment.

        Returns:
            Dict mapping each checked segment key to a list of findings.
            Segments without a source are skipped; segments without a target
            get a single 'Segment not translated.' finding.
        '''
        results = {}
        # Typo detection needs both an alphabet and a vocabulary.
        spellcheck = bool(self.letters) and bool(self.word_counter)

        for i, segment in segments.items():
            source = segment.get('source')
            target = segment.get('target')

            if not source:
                continue
            if not target:
                results[i] = [{'level':'info',
                               'message':'Segment not translated.'}]
                continue

            segment_results = []

            # Flag segments where only one side starts with a lowercase
            # (or caseless) character.
            if (source[0].lower() == source[0]) != (target[0].lower() == target[0]):
                segment_results.append({'level':'info',
                                        'type':'capitalization'})

            # Comparing the runs as strings handles the "neither side ends
            # with punctuation" case, which used to dereference None.
            if (self._trailing_punctuation(source)
                    != self._trailing_punctuation(target)):
                segment_results.append({'level':'info',
                                        'type':'punctuation'})

            if spellcheck:
                for correction in self.corrections_for_sentence(target):
                    segment_results.append({'level':'info',
                                            'type':'typo',
                                            'word':correction['word'],
                                            'suggestions':correction['suggestions']})

            results[i] = segment_results

        return results

    @staticmethod
    def _trailing_punctuation(text):
        '''
        Returns the run of '.', '!', '?' and ':' characters ending ``text``.

        Mirrors the regular expression ``([.!?:]+)$``: a single trailing
        newline is ignored, and an empty string means "no punctuation".
        '''
        if text.endswith('\n'):
            text = text[:-1]
        return text[len(text.rstrip(QAChecker._FINAL_PUNCTUATION)):]

    def __P(self, word):
        '''
        Returns the relative frequency of ``word`` in the vocabulary.
        '''
        total = sum(self.word_counter.values())
        if not total:
            return 0.0
        return self.word_counter.get(word, 0) / total

    def corrections(self, word, n=5):
        '''
        Suggests known words that are close to ``word``.

        Args:
            word (str): The word to correct.
            n (int): Maximum number of suggestions.

        Returns:
            List of at most ``n`` known words, most frequent first; words
            with equal counts are ordered alphabetically.
        '''
        counts = self.word_counter
        # Ranking by raw count is equivalent to ranking by probability (the
        # denominator is shared) and avoids re-summing the counter for every
        # candidate. The secondary key keeps the result deterministic.
        ranked = sorted(self.__candidates(word),
                        key=lambda known: (-counts[known], known))
        return ranked[:n]

    def corrections_for_sentence(self, sentence, n=5):
        '''
        Yields suggestions for every unknown word of a sentence.

        Args:
            sentence (str): Text to inspect.
            n (int): Maximum number of suggestions per word.

        Yields:
            Dicts with the keys 'word' and 'suggestions'.
        '''
        for word in self.__words(sentence):
            if word in self.word_counter:
                continue
            yield {'word':word, 'suggestions':self.corrections(word, n)}

    @classmethod
    def open(cls, path):
        '''
        Loads a QAChecker previously stored with ``save``.

        Args:
            path: Path to a .kqac archive.

        Returns:
            A QAChecker holding the stored letters and word counts.
        '''
        with zipfile.ZipFile(path) as zf:
            letters = set(zf.read('letters.txt').decode('UTF-8').strip())

            with zf.open('word_counter.csv') as csvfile:
                fieldnames = ['word', 'count']
                # newline='' lets the csv module do its own newline handling.
                csvreader = csv.DictReader(io.TextIOWrapper(csvfile,
                                                            encoding='UTF-8',
                                                            newline=''),
                                           fieldnames=fieldnames)
                word_counter = Counter({row['word']:int(row['count'])
                                        for row in csvreader})

        qac = cls()
        qac.letters = letters
        qac.word_counter = word_counter

        return qac

    def save(self, path):
        '''
        Stores the vocabulary as a .kqac archive.

        Args:
            path: Target path; its suffix is replaced with '.kqac'.
        '''
        path = Path(path).with_suffix('.kqac')

        # Serialise in memory; newline='' keeps the csv row terminators
        # untouched on every platform.
        buffer = io.StringIO(newline='')
        csvwriter = csv.DictWriter(buffer, fieldnames=['word', 'count'])
        for word, count in Counter(self.word_counter).most_common():
            csvwriter.writerow({'word':word, 'count':count})

        with zipfile.ZipFile(path, 'w') as zf:
            # str payloads are stored UTF-8 encoded by writestr.
            zf.writestr('letters.txt', ''.join(sorted(self.letters)))
            zf.writestr('word_counter.csv', buffer.getvalue())

    def __candidates(self, word, n_edits=2):
        '''
        Returns the known words within ``n_edits`` edits of ``word``.
        '''
        seen = {word}
        frontier = {word}

        for _ in range(n_edits):
            # Expand only the strings discovered in the previous round so
            # that every round really adds one more edit of distance.
            frontier = {edit
                        for current in frontier
                        for edit in self.__edits(current)} - seen
            seen |= frontier

        return self.__known(seen)

    def __known(self, words):
        '''
        Returns the subset of ``words`` present in the vocabulary.
        '''
        return {w for w in words if w in self.word_counter}

    def __edits(self, word):
        '''
        Returns every string one edit away from ``word``.

        An edit is a deletion, a transposition of two adjacent characters,
        a replacement or an insertion using the known letters.
        '''
        letters = self.letters
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        edits = set()
        for left, right in splits:
            if right:
                edits.add(left + right[1:])
                edits.update(left + c + right[1:] for c in letters)
            if len(right) > 1:
                edits.add(left + right[1] + right[0] + right[2:])
            edits.update(left + c + right for c in letters)
        return edits

    def __words(self, text):
        '''
        Returns an iterator over the word tokens of ``text``.

        Punctuation is dropped first; tokens must be longer than one
        character and consist of letters, apostrophes and hyphens only.
        '''
        cleaned = regex.sub(r"[^\p{L}\p{N}\s'-]", '', text)
        return (token for token in cleaned.split()
                if len(token) > 1 and regex.match(r"^[\p{L}'-]+$", token))
+ elif not lock and is_locked: + segment.attrib['state'] = cur_state[9:] + def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=None, submitted_by=None): ''' Updates a target segment. @@ -184,6 +220,49 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N assert etree.QName(target_segment).localname == 'target' + segment = None + if self.xliff_version >= 2.0: + translation_unit = self.xml_root.find('.//unit[@id="{0}"]'.format(tu_no), self.nsmap) + if segment_no: + segment = translation_unit.find('segment[@id="{0}"]'.format(segment_no), self.nsmap) + else: + segment = translation_unit.find('segment', self.nsmap) + + attribute = 'subState' + else: + translation_unit = self.xml_root.find('.//trans-unit[@id="{0}"]'.format(tu_no), self.nsmap) + if segment_no: + segment = translation_unit.find('target//mrk[@mid="{0}"][@mtype="seg"]'.format(segment_no), self.nsmap) + else: + segment = translation_unit.find('target//mrk[@mtype="seg"]', self.nsmap) + + attribute = 'state' + + if segment is None: + raise ValueError('Segment does not exist.') + + assert 'locked' not in segment.attrib.get(attribute, ''), 'Segment is locked.' 
+ + segment_substate = None + if segment_state: + segment_state = segment_state.lower() + if self.xliff_version >= 2.0: + if segment_state == 'blank': + segment_state = 'initial' + segment_substate = 'initial-blank' + elif segment_state == 'draft': + segment_state = 'initial' + segment_substate = 'initial-draft' + elif segment_state in ('translated', 'reviewed'): + segment_substate = segment_state + else: + segment_substate = segment_state + segment_state = None + + else: + if segment_state not in ('new', 'translated', 'signed-off'): + segment_state = 'x-{}'.format(segment_state) + for any_child in target_segment: if 'dataref' in any_child.attrib: any_child.attrib['dataRef'] = any_child.attrib.pop('dataref') @@ -194,7 +273,11 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N if self.xliff_version < 2.0: active_g_tags = [] for any_child in target_segment: - if any_child.tag.endswith('g'): + any_child_tag = etree.QName(any_child).localname + if any_child_tag not in ('g', 'x', 'bx', 'ex', 'bpt', 'ept', 'ph', 'it', 'mrk'): + raise ValueError('Target has unrecognized child: {}'.format(any_child_tag)) + + if any_child_tag == 'g': if any_child.tail is not None: if len(active_g_tags) > 0: active_g_tag = active_g_tags[0] @@ -263,7 +346,10 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N else: target_segment.tag = '{{{0}}}target'.format(self.nsmap[None]) for child in target_segment: - child.tag = '{{{0}}}{1}'.format(self.nsmap[None], etree.QName(child).localname) + child_tag = etree.QName(child).localname + if child_tag not in ('ec', 'sc', 'ph'): + raise ValueError('Target has unrecognized child: {}'.format(child_tag)) + child.tag = '{{{0}}}{1}'.format(self.nsmap[None], child_tag) _target_segment = deepcopy(target_segment) @@ -277,6 +363,7 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N if segment_state and submitted_by: _segment.attrib['state'] = segment_state + 
_segment.attrib['subState'] = segment_substate _segment.attrib['modified_on'] = datetime.utcnow().isoformat() _segment.attrib['modified_by'] = submitted_by