kaplanPRO · csengor · Apr 29, 2022 · Apr 15, 2022 · Apr 18, 2022 · Apr 18, 2022
diff --git a/docs/conf.py b/docs/conf.py
@@ -12,7 +12,7 @@
 #
 import os
 import sys
-sys.path.insert(0, os.path.abspath('../kaplan'))
+sys.path.insert(0, os.path.abspath('../'))
 
 
 # -- Project information -----------------------------------------------------
@@ -22,7 +22,7 @@
 author = 'Çağatay Onur Şengör'
 
 # The full version, including alpha/beta/rc tags
-release = '0.14.0'
+release = '0.16.0'
 
 
 # -- General configuration ---------------------------------------------------
@@ -43,7 +43,7 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '.venv']
 
 
 # -- Options for HTML output -------------------------------------------------

diff --git a/docs/index.rst b/docs/index.rst
@@ -6,3 +6,4 @@ kaplan
    bilingual_files
    translation_memories
    project_exchange
+   tools
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,3 +1,4 @@
-sphinx==4.4.0
+sphinx==4.5.0
 sphinx_rtd_theme
-kaplan
+lxml
+regex
diff --git a/docs/tools.rst b/docs/tools.rst
@@ -0,0 +1,5 @@
+Tools
+=====
+
+.. autoclass:: kaplan.tools.QAChecker
+   :members:
diff --git a/kaplan/__init__.py b/kaplan/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '0.15.1'
+__version__ = '0.16.0'
 
 def can_process(input_file):
     '''

diff --git a/kaplan/kxliff.py b/kaplan/kxliff.py
@@ -654,6 +654,8 @@ def merge_segments(self, list_of_segments):
             list_of_segments: List containing segment IDs.
         '''
 
+        assert len(list_of_segments) > 1, 'list_of_segments contains less than 2 segments.'
+
         def transfer_children(source_parent, target_parent):
             if source_parent.text is not None:
                 if len(target_parent) == 0:
@@ -673,10 +675,18 @@ def transfer_children(source_parent, target_parent):
         segments = []
         segment_ids = []
         for segment_id in list_of_segments:
-            segment = self.xml_root.xpath('.//xliff:segment[@id="{0}"]'.format(str(segment_id)), namespaces=nsmap)[0]
+            segment = self.xml_root.xpath('.//xliff:segment[@id="{0}"]'.format(str(segment_id)), namespaces=nsmap)
+
+            if segment == []:
+                raise ValueError('Segment #{} does not exist.'.format(segment_id))
+
+            segment = segment[0]
+
+            assert 'locked' not in segment.attrib.get('subState', ''), 'Segment #{} is locked'.format(segment_id)
 
             if translation_unit is None:
                 translation_unit = segment.getparent()
+
             else:
                 assert translation_unit == segment.getparent(), 'Segments are not of the same translation unit.'
 

diff --git a/kaplan/sdlxliff.py b/kaplan/sdlxliff.py
@@ -23,8 +23,13 @@ def gen_translation_units(self, include_segments_wo_id=False):
                     continue
                 seg_defs = self.xml_root.xpath('.//sdl:seg-defs/sdl:seg[@id="{0}"]'.format(segment.attrib['id']), namespaces={'sdl':self.nsmap['sdl']})[0]
                 segment_state = seg_defs.attrib.get('conf', None)
+                segment_lock = seg_defs.attrib.get('locked', 'false').lower() == 'true'
                 if segment_state is not None:
                     segment.attrib['state'] = segment_state.lower()
+                    if segment_lock:
+                        segment.attrib['state'] += '-locked'
+                elif segment_lock:
+                    segment.attrib['state'] = 'locked'
 
             yield translation_unit
 
@@ -39,6 +44,30 @@ def get_translation_units(self, include_segments_wo_id=False):
 
         return translation_units
 
+    def set_segment_lock(self, segment_no, lock=True):
+        '''
+        Sets the lock status for a segment.
+
+        The lock is stored as the locked="true" attribute of the matching
+        sdl:seg element; unlocking removes that attribute.
+
+        Args:
+            segment_no (str or int): The number of the segment.
+            lock (bool): Whether the segment should be locked.
+
+        Raises:
+            ValueError: If there is no sdl:seg element with the given id.
+        '''
+        matches = self.xml_root.xpath('.//sdl:seg[@id="{0}"]'.format(segment_no), namespaces={'sdl':self.nsmap['sdl']})
+        if not matches:
+            raise ValueError('Segment #{} does not exist.'.format(segment_no))
+
+        segment_details = matches[0]
+        if lock:
+            segment_details.attrib['locked'] = 'true'
+        else:
+            segment_details.attrib.pop('locked', None)
+
     def update_segment(self, target_segment, tu_no, segment_no, segment_state, submitted_by):
         '''
         Updates a target segment.

diff --git a/kaplan/tools.py b/kaplan/tools.py
@@ -0,0 +1,220 @@
+import regex
+
+from collections import Counter
+import csv
+import io
+from pathlib import Path
+import string
+import tempfile
+import zipfile
+
+
+class QAChecker:
+    '''
+    Runs basic QA checks on source/target segment pairs.
+
+    Capitalization and final punctuation are always compared. Typos are
+    reported only once a word model has been created with build() or
+    loaded with open().
+    '''
+
+    # Matches the run of closing punctuation at the very end of a segment.
+    _FINAL_PUNCTUATION = regex.compile(r'([\.\!\?\:]+)$')
+
+    def __init__(self):
+        '''
+        Creates a QAChecker instance.
+        '''
+
+        # Letters seen in the reference text; the alphabet used to generate edits.
+        self.letters = set()
+        # Frequency of every word seen in the reference text.
+        self.word_counter = Counter()
+
+    def build(self, target_segments):
+        '''
+        Builds the word model from known-good target segments.
+
+        Args:
+            target_segments: Iterable of target segment strings.
+        '''
+        text = '\n'.join(target_segments)
+        self.letters = {char for char in set(text) if regex.match(r'\p{L}', char)}
+        self.word_counter = Counter(self.__words(text))
+
+    def check(self, segments: dict):
+        '''
+        Checks a dict of segments.
+
+        Args:
+            segments (dict): Maps a segment key to a dict holding the
+                'source' and 'target' text of that segment.
+
+        Returns:
+            dict: Maps each segment key to a list of findings. Segments
+            with an empty source are left out; untranslated segments get
+            a single 'Segment not translated.' finding.
+        '''
+        results = {}
+        check_typos = bool(self.letters and self.word_counter)
+
+        for i, segment in segments.items():
+            # A missing key and an explicit None both count as empty.
+            source = segment.get('source') or ''
+            target = segment.get('target') or ''
+
+            if source == '':
+                continue
+            if target == '':
+                results[i] = [{'level':'info',
+                               'message':'Segment not translated.'}]
+                continue
+
+            segment_results = []
+
+            if (source[0].lower() == source[0]) != (target[0].lower() == target[0]):
+                segment_results.append({'level':'info',
+                                        'type':'capitalization'})
+
+            # search() returns None for text without final punctuation, so
+            # compare the matched text and treat "no match" as ''.
+            source_punctuation = self._FINAL_PUNCTUATION.search(source)
+            target_punctuation = self._FINAL_PUNCTUATION.search(target)
+            source_punctuation = source_punctuation.group(1) if source_punctuation else ''
+            target_punctuation = target_punctuation.group(1) if target_punctuation else ''
+
+            if source_punctuation != target_punctuation:
+                segment_results.append({'level':'info',
+                                        'type':'punctuation'})
+
+            if check_typos:
+                for correction in self.corrections_for_sentence(target):
+                    segment_results.append({'level':'info',
+                                            'type':'typo',
+                                            'word':correction['word'],
+                                            'suggestions':correction['suggestions']})
+
+            results[i] = segment_results
+
+        return results
+
+    def __P(self, word):
+        # Relative frequency of word; 0 while the model is still empty.
+        total = sum(self.word_counter.values())
+        return self.word_counter[word] / total if total else 0
+
+    def corrections(self, word, n=5):
+        '''
+        Suggests known words that are at most two edits away from word.
+
+        Args:
+            word (str): The word to correct.
+            n (int): Maximum number of suggestions.
+
+        Returns:
+            list: Up to n known words, most frequent first.
+        '''
+        # Ranking by raw count gives the same order as ranking by __P
+        # without re-summing the whole counter for every candidate.
+        return sorted(self.__candidates(word),
+                      key=self.word_counter.__getitem__, reverse=True)[:n]
+
+    def corrections_for_sentence(self, sentence, n=5):
+        '''
+        Yields suggestions for every unknown word of a sentence.
+
+        Args:
+            sentence (str): The text to check.
+            n (int): Maximum number of suggestions per word.
+
+        Yields:
+            dict: {'word': unknown word, 'suggestions': list of known words}.
+        '''
+        for word in self.__words(sentence):
+            if word in self.word_counter:
+                continue
+            yield {'word':word, 'suggestions':self.corrections(word, n)}
+
+    @classmethod
+    def open(cls, path):
+        '''
+        Loads a QAChecker written by save().
+
+        Args:
+            path: Path to a .kqac archive.
+
+        Returns:
+            QAChecker: An instance holding the stored letters and word counts.
+        '''
+        with zipfile.ZipFile(path) as zf:
+            letters = set(zf.read('letters.txt').decode('UTF-8').strip())
+
+            with zf.open('word_counter.csv') as csvfile:
+                # The archive has no header row, so the field names are given here.
+                csvreader = csv.DictReader(io.TextIOWrapper(csvfile, 'UTF-8', newline=''),
+                                           fieldnames=['word', 'count'])
+                word_counter = Counter({row['word']:int(row['count']) for row in csvreader})
+
+        qac = cls()
+        qac.letters = letters
+        qac.word_counter = word_counter
+
+        return qac
+
+    def save(self, path):
+        '''
+        Saves the word model as a .kqac archive.
+
+        Args:
+            path: Target path; its suffix is replaced with '.kqac'.
+
+        Returns:
+            pathlib.Path: The path that was written.
+        '''
+        path = Path(path).with_suffix('.kqac')
+
+        # newline='' keeps the line endings exactly as the csv module writes them.
+        csv_buffer = io.StringIO(newline='')
+        csvwriter = csv.DictWriter(csv_buffer, fieldnames=['word', 'count'])
+        for word, count in self.word_counter.most_common():
+            csvwriter.writerow({'word':word, 'count':count})
+
+        # Write the members straight into the archive; no temporary files needed.
+        with zipfile.ZipFile(path, 'w') as zf:
+            zf.writestr('letters.txt', ''.join(sorted(self.letters)).encode('UTF-8'))
+            zf.writestr('word_counter.csv', csv_buffer.getvalue().encode('UTF-8'))
+
+        return path
+
+    def __candidates(self, word, n_edits=2):
+        # Known words reachable from word in at most n_edits edits.
+        words = {word}
+        frontier = {word}
+
+        for _ in range(n_edits):
+            # Expand only the strings found in the previous round, so every
+            # round reaches one edit further than the one before it.
+            frontier = {edit for w in frontier for edit in self.__edits(w)} - words
+            words |= frontier
+
+        return self.__known(words)
+
+    def __known(self, words):
+        # Subset of words present in the word model.
+        return {w for w in words if w in self.word_counter}
+
+    def __edits(self, word):
+        # All strings one delete, transpose, replace or insert away from word.
+        letters    = self.letters
+        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
+        deletes    = [L + R[1:]               for L, R in splits if R]
+        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
+        replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
+        inserts    = [L + c + R               for L, R in splits for c in letters]
+        return set(deletes + transposes + replaces + inserts)
+
+    def __words(self, text):
+        # Words of two or more characters made up of letters, apostrophes and hyphens.
+        stripped = regex.sub(r"[^\p{L}\p{N}\s'-]", '', text)
+        return (token for token in stripped.split()
+                if len(token) > 1 and regex.match(r"^[\p{L}'-]+$", token))
diff --git a/kaplan/xliff.py b/kaplan/xliff.py
@@ -42,6 +42,9 @@ def gen_translation_units(self, include_segments_wo_id=True):
                 for _child in _translation_unit:
                     if not _child.tag.endswith(('}segment', '}ignorable')):
                         _translation_unit.remove(_child)
+                        continue
+                    _child.attrib['state'] = _child.attrib.get('subState', _child.attrib.get('state', 'initial-blank'))
+                    _child.attrib.pop('subState', None)
                 for _any_child in _translation_unit.findall('.//'):
                     if 'equiv' in _any_child.attrib:
                         _any_child.text = html.unescape(_any_child.attrib['equiv'])
@@ -167,6 +170,39 @@ def save(self, output_directory):
                                           encoding='UTF-8',
                                           xml_declaration=True)
 
+    def set_segment_lock(self, segment_no, lock=True):
+        '''
+        Sets the lock status for a segment.
+
+        XLIFF 2.x marks a lock with a '-locked' suffix on subState and
+        XLIFF 1.x with an 'x-locked-' prefix on the state of the target mrk.
+
+        Args:
+            segment_no (str or int): The number of the segment.
+            lock (bool): Whether the segment should be locked.
+
+        Raises:
+            ValueError: If the segment does not exist.
+        '''
+        if self.xliff_version >= 2.0:
+            segment = self.xml_root.find('.//segment[@id="{0}"]'.format(segment_no), self.nsmap)
+        else:
+            segment = self.xml_root.find('.//target//mrk[@mid="{0}"][@mtype="seg"]'.format(segment_no), self.nsmap)
+        if segment is None:
+            raise ValueError('Segment #{} does not exist.'.format(segment_no))
+        if self.xliff_version >= 2.0:
+            attribute = 'subState'
+            current = segment.attrib.get('subState', segment.attrib.get('state', 'initial-blank'))
+            is_locked = current.lower().endswith('-locked')
+            flipped = current[:-7] if is_locked else current + '-locked'  # 7 == len('-locked')
+        else:
+            attribute = 'state'
+            current = segment.attrib.get('state', 'new')
+            is_locked = current.lower().startswith('x-locked')
+            flipped = (current[9:] or 'new') if is_locked else 'x-locked-' + current  # bare 'x-locked' -> 'new'
+        if bool(lock) != is_locked:  # nothing to write when the status already matches
+            segment.attrib[attribute] = flipped
+
     def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=None, submitted_by=None):
         '''
         Updates a target segment.
@@ -184,6 +220,49 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N
 
         assert etree.QName(target_segment).localname == 'target'
 
+        segment = None
+        if self.xliff_version >= 2.0:
+            translation_unit = self.xml_root.find('.//unit[@id="{0}"]'.format(tu_no), self.nsmap)
+            if segment_no:
+                segment = translation_unit.find('segment[@id="{0}"]'.format(segment_no), self.nsmap)
+            else:
+                segment = translation_unit.find('segment', self.nsmap)
+
+            attribute = 'subState'
+        else:
+            translation_unit = self.xml_root.find('.//trans-unit[@id="{0}"]'.format(tu_no), self.nsmap)
+            if segment_no:
+                segment = translation_unit.find('target//mrk[@mid="{0}"][@mtype="seg"]'.format(segment_no), self.nsmap)
+            else:
+                segment = translation_unit.find('target//mrk[@mtype="seg"]', self.nsmap)
+
+            attribute = 'state'
+
+        if segment is None:
+            raise ValueError('Segment does not exist.')
+
+        assert 'locked' not in segment.attrib.get(attribute, ''), 'Segment is locked.'
+
+        segment_substate = None
+        if segment_state:
+            segment_state = segment_state.lower()
+            if self.xliff_version >= 2.0:
+                if segment_state == 'blank':
+                    segment_state = 'initial'
+                    segment_substate = 'initial-blank'
+                elif segment_state == 'draft':
+                    segment_state = 'initial'
+                    segment_substate = 'initial-draft'
+                elif segment_state in ('translated', 'reviewed'):
+                    segment_substate = segment_state
+                else:
+                    segment_substate = segment_state
+                    segment_state = None
+
+            else:
+                if segment_state not in ('new', 'translated', 'signed-off'):
+                    segment_state = 'x-{}'.format(segment_state)
+
         for any_child in target_segment:
             if 'dataref' in any_child.attrib:
                 any_child.attrib['dataRef'] = any_child.attrib.pop('dataref')
@@ -194,7 +273,11 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N
         if self.xliff_version < 2.0:
             active_g_tags = []
             for any_child in target_segment:
-                if any_child.tag.endswith('g'):
+                any_child_tag = etree.QName(any_child).localname
+                if any_child_tag not in ('g', 'x', 'bx', 'ex', 'bpt', 'ept', 'ph', 'it', 'mrk'):
+                    raise ValueError('Target has unrecognized child: {}'.format(any_child_tag))
+
+                if any_child_tag == 'g':
                     if any_child.tail is not None:
                         if len(active_g_tags) > 0:
                             active_g_tag = active_g_tags[0]
@@ -263,7 +346,10 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N
         else:
             target_segment.tag = '{{{0}}}target'.format(self.nsmap[None])
             for child in target_segment:
-                child.tag = '{{{0}}}{1}'.format(self.nsmap[None], etree.QName(child).localname)
+                child_tag = etree.QName(child).localname
+                if child_tag not in ('ec', 'sc', 'ph'):
+                    raise ValueError('Target has unrecognized child: {}'.format(child_tag))
+                child.tag = '{{{0}}}{1}'.format(self.nsmap[None], child_tag)
 
         _target_segment = deepcopy(target_segment)
 
@@ -277,6 +363,7 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N
 
             if segment_state and submitted_by:
                 _segment.attrib['state'] = segment_state
+                _segment.attrib['subState'] = segment_substate
                 _segment.attrib['modified_on'] = datetime.utcnow().isoformat()
                 _segment.attrib['modified_by'] = submitted_by