Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#
import os
import sys
sys.path.insert(0, os.path.abspath('../kaplan'))
sys.path.insert(0, os.path.abspath('../'))


# -- Project information -----------------------------------------------------
Expand All @@ -22,7 +22,7 @@
author = 'Çağatay Onur Şengör'

# The full version, including alpha/beta/rc tags
release = '0.14.0'
release = '0.16.0'


# -- General configuration ---------------------------------------------------
Expand All @@ -43,7 +43,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '.venv']


# -- Options for HTML output -------------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ kaplan
bilingual_files
translation_memories
project_exchange
tools
5 changes: 3 additions & 2 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
sphinx==4.4.0
sphinx==4.5.0
sphinx_rtd_theme
kaplan
lxml
regex
5 changes: 5 additions & 0 deletions docs/tools.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Tools
=====

.. autoclass:: kaplan.tools.QAChecker
:members:
2 changes: 1 addition & 1 deletion kaplan/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.15.1'
__version__ = '0.16.0'

def can_process(input_file):
'''
Expand Down
12 changes: 11 additions & 1 deletion kaplan/kxliff.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,8 @@ def merge_segments(self, list_of_segments):
list_of_segments: List containing segment IDs.
'''

assert len(list_of_segments) > 1, 'list_of_segments contains less than 2 segments.'

def transfer_children(source_parent, target_parent):
if source_parent.text is not None:
if len(target_parent) == 0:
Expand All @@ -673,10 +675,18 @@ def transfer_children(source_parent, target_parent):
segments = []
segment_ids = []
for segment_id in list_of_segments:
segment = self.xml_root.xpath('.//xliff:segment[@id="{0}"]'.format(str(segment_id)), namespaces=nsmap)[0]
segment = self.xml_root.xpath('.//xliff:segment[@id="{0}"]'.format(str(segment_id)), namespaces=nsmap)

if segment == []:
raise ValueError('Segment #{} does not exist.'.format(segment_id))

segment = segment[0]

assert 'locked' not in segment.attrib.get('subState', ''), 'Segment #{} is locked'.format(segment_id)

if translation_unit is None:
translation_unit = segment.getparent()

else:
assert translation_unit == segment.getparent(), 'Segments are not of the same translation unit.'

Expand Down
19 changes: 19 additions & 0 deletions kaplan/sdlxliff.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,13 @@ def gen_translation_units(self, include_segments_wo_id=False):
continue
seg_defs = self.xml_root.xpath('.//sdl:seg-defs/sdl:seg[@id="{0}"]'.format(segment.attrib['id']), namespaces={'sdl':self.nsmap['sdl']})[0]
segment_state = seg_defs.attrib.get('conf', None)
segment_lock = seg_defs.attrib.get('locked', 'false').lower() == 'true'
if segment_state is not None:
segment.attrib['state'] = segment_state.lower()
if segment_lock:
segment.attrib['state'] += '-locked'
elif segment_lock:
segment.attrib['state'] = 'locked'

yield translation_unit

Expand All @@ -39,6 +44,20 @@ def get_translation_units(self, include_segments_wo_id=False):

return translation_units

def set_segment_lock(self, segment_no, lock=True):
    '''
    Locks or unlocks a segment through its sdl:seg definition.

    The first sdl:seg element whose id equals segment_no gets its
    'locked' attribute set to 'true' when locking; when unlocking,
    the attribute is removed if present.

    Args:
        segment_no (str or int): The number of the segment.
        lock (bool): Whether the segment should be locked.
    '''
    query = './/sdl:seg[@id="{0}"]'.format(segment_no)
    matches = self.xml_root.xpath(query, namespaces={'sdl':self.nsmap['sdl']})
    seg_attributes = matches[0].attrib

    if not lock:
        # Unlocking is expressed by the absence of the attribute.
        seg_attributes.pop('locked', None)
        return

    seg_attributes['locked'] = 'true'

def update_segment(self, target_segment, tu_no, segment_no, segment_state, submitted_by):
'''
Updates a target segment.
Expand Down
149 changes: 149 additions & 0 deletions kaplan/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import regex

from collections import Counter
import csv
import io
from pathlib import Path
import string
import tempfile
import zipfile


class QAChecker:
    '''
    Quality-assurance checker for translated segments.

    Compares source and target texts for capitalization and trailing
    punctuation mismatches and, once a vocabulary has been built or
    loaded, reports target words missing from that vocabulary together
    with spelling suggestions.
    '''

    def __init__(self):
        '''
        Creates a QAChecker instance.
        '''

        # Use the same container types that build() and open() produce so
        # that save() and the spelling helpers work on a fresh instance.
        self.letters = set()
        self.word_counter = Counter()

    def build(self, target_segments):
        '''
        Builds the alphabet and the word frequencies from target segments.

        Args:
            target_segments: Iterable of target segment strings.
        '''
        text = '\n'.join(target_segments)
        self.letters = {char for char in set(text)
                        if regex.match(r'\p{L}', char)}

        self.word_counter = Counter(self.__words(text))

    def check(self, segments: dict):
        '''
        Checks a dict of segments.

        Args:
            segments (dict): Maps a segment identifier to a dict holding
                the 'source' and 'target' texts of that segment.

        Returns:
            dict: Maps each checked segment identifier to a list of
            findings. Every finding is a dict with a 'level' key and
            either a 'message' or a 'type' key; typo findings also carry
            'word' and 'suggestions'. Segments without a source are
            left out.
        '''
        results = {}

        trailing_punctuation = regex.compile(r'([\.\!\?\:]+)$')

        for i, segment in segments.items():
            if not segment.get('source'):
                continue
            if not segment.get('target'):
                results[i] = [{'level':'info',
                               'message':'Segment not translated.'}]
                continue

            segment_results = []

            source = segment['source']
            target = segment['target']

            # Flag when exactly one side starts with a lowercase
            # (or caseless) character.
            if (source[0].lower() == source[0]) != (target[0].lower() == target[0]):
                segment_results.append({'level':'info',
                                        'type':'capitalization'})

            source_match = trailing_punctuation.search(source)
            target_match = trailing_punctuation.search(target)

            # Either side may lack trailing punctuation, in which case the
            # search returns None; compare the captured text, not the
            # match objects, so that two unpunctuated texts do not crash.
            source_ending = source_match.group(1) if source_match else None
            target_ending = target_match.group(1) if target_match else None

            if source_ending != target_ending:
                segment_results.append({'level':'info',
                                        'type':'punctuation'})

            # Spell checking needs both an alphabet and a vocabulary.
            if self.letters and self.word_counter:
                for correction in self.corrections_for_sentence(target):
                    segment_results.append({'level':'info',
                                            'type':'typo',
                                            'word':correction['word'],
                                            'suggestions':correction['suggestions']})

            results[i] = segment_results

        return results

    def __P(self, word):
        '''
        Returns the relative frequency of word in the vocabulary.
        '''
        total = sum(self.word_counter.values())
        return self.word_counter[word] / total if total else 0.0

    def corrections(self, word, n=5):
        '''
        Returns up to n known words close to word, most frequent first.

        Args:
            word (str): The word to find suggestions for.
            n (int): Maximum number of suggestions.
        '''
        # Ranking by raw count gives the same order as ranking by relative
        # frequency, without re-summing the vocabulary for every candidate.
        return sorted(self.__candidates(word),
                      key=lambda candidate: self.word_counter[candidate],
                      reverse=True)[:n]

    def corrections_for_sentence(self, sentence, n=5):
        '''
        Yields a suggestion record for every unknown word in sentence.

        Args:
            sentence (str): The text to check.
            n (int): Maximum number of suggestions per word.

        Yields:
            dict: {'word': <unknown word>, 'suggestions': <list of words>}
        '''
        for word in self.__words(sentence):
            if word in self.word_counter:
                continue
            yield {'word':word, 'suggestions':self.corrections(word, n)}

    @classmethod
    def open(cls, path):
        '''
        Loads a QAChecker from an archive written by save().

        Args:
            path: Path to the archive containing 'letters.txt' and
                'word_counter.csv'.
        '''
        with zipfile.ZipFile(path) as zf:
            letters = set(zf.read('letters.txt').decode('UTF-8').strip())

            with zf.open('word_counter.csv') as csvfile:
                fieldnames = ['word', 'count']
                # The csv module expects newline='' so that it can handle
                # line endings itself.
                csvreader = csv.DictReader(io.TextIOWrapper(csvfile, encoding='UTF-8', newline=''),
                                           fieldnames=fieldnames)
                word_counter = Counter({row['word']:int(row['count']) for row in csvreader})

        qac = cls()
        qac.letters = letters
        qac.word_counter = word_counter

        return qac

    def save(self, path):
        '''
        Saves the alphabet and the word frequencies as a zip archive.

        Args:
            path: Target path; its suffix is replaced with '.kqac'.
        '''
        path = Path(path).with_suffix('.kqac')

        with tempfile.TemporaryDirectory() as tmpdir:
            path_letters = Path(tmpdir, 'letters.txt')
            with open(path_letters, 'w', encoding='UTF-8') as f:
                f.write(''.join(self.letters))

            path_word_counter = Path(tmpdir, 'word_counter.csv')
            # newline='' keeps the csv writer's own line terminator from
            # being translated a second time on some platforms.
            with open(path_word_counter, 'w', encoding='UTF-8', newline='') as csvfile:
                fieldnames = ['word', 'count']
                csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)

                # No header row is written; open() supplies the field names.
                for word, count in self.word_counter.most_common():
                    csvwriter.writerow({'word':word, 'count':count})

            with zipfile.ZipFile(path, 'w') as zf:
                zf.write(path_letters, 'letters.txt')
                zf.write(path_word_counter, 'word_counter.csv')

    def __candidates(self, word, n_edits=2):
        '''
        Returns the known words within n_edits single-character edits.
        '''
        words = {word}
        frontier = {word}

        for _ in range(n_edits):
            # Expand only the words discovered in the previous round; each
            # round therefore reaches one additional edit of distance.
            frontier = {edit
                        for reached in frontier
                        for edit in self.__edits(reached)} - words
            words |= frontier

        return self.__known(words)

    def __known(self, words):
        '''
        Returns the subset of words present in the vocabulary.
        '''
        return {w for w in words if w in self.word_counter}

    def __edits(self, word):
        '''
        Returns every string one delete, transpose, replace or insert away.
        '''
        letters = self.letters
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def __words(self, text):
        '''
        Returns an iterator over the words of text.

        Characters other than letters, digits, whitespace, apostrophes and
        hyphens are removed first; tokens shorter than two characters or
        containing digits are then dropped.
        '''
        cleaned = regex.sub(r"[^\p{L}\p{N}\s'-]", '', text)
        return filter(lambda x: len(x) > 1 and regex.match(r"^[\p{L}'-]+$", x),
                      cleaned.split())
91 changes: 89 additions & 2 deletions kaplan/xliff.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ def gen_translation_units(self, include_segments_wo_id=True):
for _child in _translation_unit:
if not _child.tag.endswith(('}segment', '}ignorable')):
_translation_unit.remove(_child)
continue
_child.attrib['state'] = _child.attrib.get('subState', _child.attrib.get('state', 'initial-blank'))
_child.attrib.pop('subState', None)
for _any_child in _translation_unit.findall('.//'):
if 'equiv' in _any_child.attrib:
_any_child.text = html.unescape(_any_child.attrib['equiv'])
Expand Down Expand Up @@ -167,6 +170,39 @@ def save(self, output_directory):
encoding='UTF-8',
xml_declaration=True)

def set_segment_lock(self, segment_no, lock=True):
    '''
    Sets the lock status for a segment

    For XLIFF 2.0 and later the lock is stored as a '-locked' suffix on
    the segment's subState attribute; for earlier versions it is stored
    as an 'x-locked-' prefix on the state attribute of the target's
    mrk element. Nothing is changed when the segment is already in the
    requested state.

    Args:
        segment_no (str or int): The number of the segment.
        lock (bool): Whether the segment should be locked.

    Raises:
        ValueError: If no segment with the given number is found.
    '''
    if self.xliff_version >= 2.0:
        segment = self.xml_root.find('.//segment[@id="{0}"]'.format(segment_no), self.nsmap)
        if segment is None:
            raise ValueError('Segment #{} does not exist.'.format(segment_no))

        suffix = '-locked'
        cur_substate = segment.attrib.get('subState', segment.attrib.get('state', 'initial-blank'))
        is_locked = cur_substate.lower().endswith(suffix)

        if lock and not is_locked:
            segment.attrib['subState'] = cur_substate + suffix
        elif not lock and is_locked:
            # Fall back to the default instead of writing an empty value
            # when nothing but the lock marker was stored.
            segment.attrib['subState'] = cur_substate[:-len(suffix)] or 'initial-blank'
    else:
        segment = self.xml_root.find('.//target//mrk[@mid="{0}"][@mtype="seg"]'.format(segment_no), self.nsmap)
        if segment is None:
            raise ValueError('Segment #{} does not exist.'.format(segment_no))

        prefix = 'x-locked-'
        cur_state = segment.attrib.get('state', 'new')
        is_locked = cur_state.lower().startswith('x-locked')

        if lock and not is_locked:
            segment.attrib['state'] = prefix + cur_state
        elif not lock and is_locked:
            # A bare 'x-locked' leaves nothing after the prefix; fall
            # back to the default instead of writing an empty value.
            segment.attrib['state'] = cur_state[len(prefix):] or 'new'

def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=None, submitted_by=None):
'''
Updates a target segment.
Expand All @@ -184,6 +220,49 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N

assert etree.QName(target_segment).localname == 'target'

segment = None
if self.xliff_version >= 2.0:
translation_unit = self.xml_root.find('.//unit[@id="{0}"]'.format(tu_no), self.nsmap)
if segment_no:
segment = translation_unit.find('segment[@id="{0}"]'.format(segment_no), self.nsmap)
else:
segment = translation_unit.find('segment', self.nsmap)

attribute = 'subState'
else:
translation_unit = self.xml_root.find('.//trans-unit[@id="{0}"]'.format(tu_no), self.nsmap)
if segment_no:
segment = translation_unit.find('target//mrk[@mid="{0}"][@mtype="seg"]'.format(segment_no), self.nsmap)
else:
segment = translation_unit.find('target//mrk[@mtype="seg"]', self.nsmap)

attribute = 'state'

if segment is None:
raise ValueError('Segment does not exist.')

assert 'locked' not in segment.attrib.get(attribute, ''), 'Segment is locked.'

segment_substate = None
if segment_state:
segment_state = segment_state.lower()
if self.xliff_version >= 2.0:
if segment_state == 'blank':
segment_state = 'initial'
segment_substate = 'initial-blank'
elif segment_state == 'draft':
segment_state = 'initial'
segment_substate = 'initial-draft'
elif segment_state in ('translated', 'reviewed'):
segment_substate = segment_state
else:
segment_substate = segment_state
segment_state = None

else:
if segment_state not in ('new', 'translated', 'signed-off'):
segment_state = 'x-{}'.format(segment_state)

for any_child in target_segment:
if 'dataref' in any_child.attrib:
any_child.attrib['dataRef'] = any_child.attrib.pop('dataref')
Expand All @@ -194,7 +273,11 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N
if self.xliff_version < 2.0:
active_g_tags = []
for any_child in target_segment:
if any_child.tag.endswith('g'):
any_child_tag = etree.QName(any_child).localname
if any_child_tag not in ('g', 'x', 'bx', 'ex', 'bpt', 'ept', 'ph', 'it', 'mrk'):
raise ValueError('Target has unrecognized child: {}'.format(any_child_tag))

if any_child_tag == 'g':
if any_child.tail is not None:
if len(active_g_tags) > 0:
active_g_tag = active_g_tags[0]
Expand Down Expand Up @@ -263,7 +346,10 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N
else:
target_segment.tag = '{{{0}}}target'.format(self.nsmap[None])
for child in target_segment:
child.tag = '{{{0}}}{1}'.format(self.nsmap[None], etree.QName(child).localname)
child_tag = etree.QName(child).localname
if child_tag not in ('ec', 'sc', 'ph'):
raise ValueError('Target has unrecognized child: {}'.format(child_tag))
child.tag = '{{{0}}}{1}'.format(self.nsmap[None], child_tag)

_target_segment = deepcopy(target_segment)

Expand All @@ -277,6 +363,7 @@ def update_segment(self, target_segment, tu_no, segment_no=None, segment_state=N

if segment_state and submitted_by:
_segment.attrib['state'] = segment_state
_segment.attrib['subState'] = segment_substate
_segment.attrib['modified_on'] = datetime.utcnow().isoformat()
_segment.attrib['modified_by'] = submitted_by

Expand Down