In [None]:
import glob
import json
import os
from dataclasses import dataclass, asdict

import pykakasi
from pyknp import KNP, DrawTree, BList, Juman
from pymystem3 import Mystem
from rnnmorph.predictor import RNNMorphPredictor

# --- Japanese tooling -------------------------------------------------------
# KNP parser; process_ja_string() calls knp.parse() on every Japanese string.
knp = KNP()
# kakasi transliterator: hiragana ("H") and katakana ("K") are converted to
# ASCII ("a") using Hepburn romanisation ("r"). The resulting `converter` is
# used by process_ja_string() to romanise the reading of each morpheme.
# NOTE(review): setMode()/getConverter() look like the legacy pykakasi API --
# confirm they are still available in the installed pykakasi version.
transliterator = pykakasi.kakasi()
transliterator.setMode("H","a")
transliterator.setMode("K","a")
transliterator.setMode("r","Hepburn")
converter = transliterator.getConverter()
# Instantiated here but not referenced by any of the cells shown below.
jumanpp = Juman()

# --- Russian tooling --------------------------------------------------------
# RNNMorph tagger; process_rus_string() uses its tags to disambiguate Mystem.
rnnm = RNNMorphPredictor(language="ru")
# Mystem analyser; process_rus_string() calls mystem.analyze().
mystem = Mystem(
        mystem_bin=None,
        grammar_info=True,
        disambiguation=True,
        entire_input=True,
#         glue_grammar_info=True,
#         weight=False,
         generate_all=True,
         no_bastards=False,
#         end_of_sentence=False,
#         fixlist=None,
        # English tag names (e.g. 'nom', 'praes') are what
        # ru_sem_info_mystem_nested() knows how to classify.
        use_english_names=True
)

  from collections import Mapping, defaultdict
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``set``/``frozenset`` values as JSON arrays.

    Pass it as ``cls=SetEncoder`` to ``json.dump``/``json.dumps``. Every other
    unsupported type is delegated to the base class, which raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Sets become lists (element order follows set iteration order); the
        previous debugging ``print`` of every encountered set has been removed
        so that serialisation no longer writes to stdout.
        """
        if isinstance(obj, (set, frozenset)):
            return list(obj)
        return super().default(obj)

@dataclass
class Sentence:
    """Record with an id plus a Russian and a Japanese sentence string.

    NOTE(review): this class is only used as an annotation on
    ``Entry.contents``; the processing cell indexes each element of
    ``contents`` positionally (``line[0]``, ``line[1][0]``, ``line[1][1]``),
    so the JSON files presumably do not store this exact shape -- confirm.
    """
    id: int
    sent_ru: str  # presumably the Russian sentence text
    sent_jp: str  # presumably the Japanese sentence text

@dataclass
class Ids:
    """A pair of integer bounds, used as the element type of the
    ``join_ids_*`` / ``erase_ids_*`` annotations on ``Entry``.

    NOTE(review): whether ``end`` is inclusive is not visible in this
    notebook -- confirm against the code that produces the JSON files.
    """
    start: int
    end: int

@dataclass
class Entry:
    """One raw bilingual entry as stored in a JSON file.

    The field names must match the keys of the JSON object exactly, because
    ``from_json`` passes the decoded object straight to the constructor.
    The bracketed annotations are informal hints and are not enforced.
    """
    id: int
    title_ru: str
    title_jp: str
    link_ru: str
    link_jp: str
    contents: [Sentence]
    join_ids_ru: [Ids]
    join_ids_jp: [Ids]
    erase_ids_ru: [Ids]
    erase_ids_jp: [Ids]

    @staticmethod
    def from_json(filename: str, encoding: str = 'utf-8'):
        """Build an ``Entry`` from the JSON object stored in ``filename``."""
        with open(filename, 'r', encoding=encoding) as source:
            fields = json.load(source)
        return Entry(**fields)

    def to_json(self, filename: str, encoding: str = 'utf-8'):
        """Write this entry to ``filename`` as JSON, keeping non-ASCII text as is."""
        with open(filename, 'w', encoding=encoding) as sink:
            json.dump(asdict(self), sink, ensure_ascii=False)

@dataclass
class jaToken:
    """Container for the attributes of one Japanese token.

    NOTE(review): not instantiated anywhere in the visible cells;
    process_ja_string() builds plain dicts instead.
    """
    id: int
    word: str
    reading: str
    lexeme: str
    normalized_lexeme: str
    pos: str
    pos_extended: str
    # Informal annotation: a mapping from str to str (not enforced).
    semantic_info: {str: str}

@dataclass
class ProcessedEntry:
    """A bilingual entry after morphological processing.

    The processing cell fills ``title_ru`` / ``title_jp`` with the token lists
    returned by ``process_rus_string`` / ``process_ja_string`` and ``contents``
    with a dict mapping a line key to a ``(russian_tokens, japanese_tokens)``
    pair. The ``{}`` annotations are informal placeholders and are not enforced.
    """
    id: int
    title_ru: {}
    title_jp: {}
    link_ru: str
    link_jp: str
    contents: {}

    @staticmethod
    def from_json(filename: str, encoding: str = 'utf-8'):
        """Load a ``ProcessedEntry`` from the JSON object stored in ``filename``.

        The JSON keys must match the field names of this class. (Previously
        this constructed an ``Entry``, whose fields differ, so loading a
        processed file could not work.)
        """
        with open(filename, 'r', encoding=encoding) as f:
            obj = json.load(f)
        return ProcessedEntry(**obj)

    def to_json(self, filename: str, encoding: str = 'utf-8'):
        """Write this entry to ``filename`` as JSON.

        Non-ASCII text is written verbatim; ``SetEncoder`` turns any remaining
        ``set`` values into lists.
        """
        with open(filename, 'w', encoding=encoding) as f:
            json.dump(asdict(self), f, ensure_ascii=False, cls=SetEncoder)

def normalize_lexeme(lexeme, sem_str):
    """Return the normalised form encoded in ``sem_str``, or ``lexeme``.

    Only the first whitespace-separated field of ``sem_str`` is inspected. If
    it has the shape ``key:value[...]``, the part of ``value`` before the first
    ``/`` is returned. In every other case -- including an empty, blank or
    ``None`` ``sem_str``, which used to raise -- ``lexeme`` is returned as is.
    """
    fields = sem_str.split() if sem_str else []
    if not fields:
        return lexeme
    parts = fields[0].split(':')
    if len(parts) > 1:
        return parts[1].split('/')[0]
    return lexeme

def generate_sem_info(sem_str):
    """Turn a whitespace-separated tag string into a ``{name: [values]}`` dict.

    The first field is skipped. Every remaining field is split on ``:``; the
    first piece is the key and the remaining pieces (possibly none) form the
    value list. A repeated key keeps the values of its last occurrence.
    """
    parsed = {}
    for field in sem_str.split()[1:]:
        name, *values = field.split(':')
        parsed[name] = values
    return parsed

def generate_pos_extended(posx_str):
    """Map the placeholder ``'*'`` to an empty string; pass anything else through."""
    return '' if posx_str == '*' else posx_str


# Part-of-speech tags; these are matched in upper case and reported in lower case.
_MYSTEM_POS_TAGS = ('A', 'ADV', 'ADVPRO', 'ANUM', 'APRO', 'COM', 'CONJ',
                    'INTJ', 'NUM', 'PART', 'PR', 'S', 'SPRO', 'V')

# (category, tags) pairs, checked in this order; the tag itself is the value.
_MYSTEM_TAG_GROUPS = (
    ('tense', ('praes', 'inpraes', 'praet')),
    ('case', ('nom', 'gen', 'dat', 'acc', 'ins', 'abl', 'part', 'loc', 'voc')),
    ('number', ('sg', 'pl')),
    ('mood', ('ger', 'inf', 'partcp', 'indic', 'imper')),
    ('variant', ('brev', 'plen', 'poss')),
    ('degree', ('supr', 'comp')),
    ('person', ('1p', '2p', '3p')),
    ('gender', ('m', 'f', 'n')),
    ('aspect', ('ipf', 'pf')),
    ('voice', ('act', 'pass')),
    ('anim', ('anim', 'inan')),
    ('transit', ('tran', 'intr')),
)


def ru_sem_info_mystem_nested(token):
    """Classify a single Mystem grammar tag.

    Returns a ``(category, value)`` tuple: ``('pos', <lower-cased tag>)`` for a
    part-of-speech tag, ``(<category>, tag)`` for a tag listed in one of the
    known groups, and ``('extra', tag)`` for any other non-empty tag.
    An empty string yields ``None``.
    """
    if token in _MYSTEM_POS_TAGS:
        return 'pos', token.lower()
    for category, tags in _MYSTEM_TAG_GROUPS:
        if token in tags:
            return category, token
    if token != "":
        return 'extra', token
    return None

def _collect_mystem_tags(tag_string):
    """Parse one comma-separated Mystem tag list into a ``{category: value}`` dict.

    Tags classified as ``'extra'`` are accumulated in a list under the
    ``'extra'`` key; empty tags (e.g. from a stray comma) are skipped.
    """
    collected = {}
    for tag in tag_string.split(','):
        classified = ru_sem_info_mystem_nested(tag)
        if classified is None:
            # ru_sem_info_mystem_nested returns None for an empty tag;
            # unpacking that used to raise TypeError.
            continue
        category, value = classified
        if category == 'extra':
            collected.setdefault('extra', []).append(value)
        else:
            collected[category] = value
    return collected


def ru_generate_sem_info_mystem(info):
    """Convert a Mystem ``gr`` string into a grammar dict.

    ``info`` has the shape ``<certain tags>=<variants>`` where the variants
    part is either one comma-separated tag list or several lists separated by
    ``|`` and wrapped in parentheses.

    Returns ``None`` when ``info`` is ``None``. Otherwise returns a dict of the
    certain tags. If exactly one variant is present its tags are merged into
    that dict (``'extra'`` lists are concatenated rather than overwritten); if
    several are present they are stored as a list of dicts under
    ``'uncertain'``.
    """
    if info is None:
        return None
    # partition() tolerates a missing '=' (no variants) and additional '='
    # characters, both of which broke the former two-way tuple unpacking.
    certain_info, _, uncertain_info = info.partition('=')
    res = _collect_mystem_tags(certain_info)

    # NOTE(review): the ``> 2`` threshold is kept from the original code; it
    # also skips a lone variant tag of one or two characters (e.g. 'sg') --
    # confirm that this is intended.
    if len(uncertain_info) > 2:
        if uncertain_info.startswith('('):
            uncertain_info = uncertain_info[1:-1]
        variants = [_collect_mystem_tags(variant)
                    for variant in uncertain_info.split('|')
                    if variant != '']
        if len(variants) > 1:
            res['uncertain'] = variants
        elif variants:
            only = variants[0]
            merged = {**res, **only}
            if 'extra' in res and 'extra' in only:
                # keep the certain-part extras instead of silently dropping them
                merged['extra'] = res['extra'] + only['extra']
            return merged
        # no usable variant at all: fall through and return the certain tags
    return res

# https://github.com/olesar/ruUD/blob/master/conversion/RNCtoUD.md
# Translation table from lower-cased RNNMorph feature values to the tag names
# used for the Mystem analysis. ru_generate_sem_info_rnnmp() looks up every
# feature *value* here (keys of the feature string are left untouched);
# values that are not listed are kept unchanged.
rnnmp2mystem = {'fut': 'inpraes', 'pres': 'praes', 'past': 'praet', 'unknown': 'unknown',
'masc': 'm', 'neut': 'n', 'fem': 'f',
'ind': 'indic', 'imp': 'imper',
'2': '2p', '3': '3p', '1': '1p',
'plur': 'pl', 'sing': 'sg',
'part': 'partcp', 'trans': 'tran',
'sup': 'supr', 'cmp': 'comp',
'adj': 'a', 'adp': 'pr', 'det': 'apro', 'verb': 'v', 'pron': 'spro', 'noun': 's'}

def ru_generate_sem_info_rnnmp(info):
    """Parse an RNNMorph tag string into a ``{feature: value}`` dict.

    ``info`` is lower-cased and split on ``|``; only chunks of the exact shape
    ``feature=value`` are kept. Each value is translated through
    ``rnnmp2mystem`` when it is listed there and left unchanged otherwise.
    """
    features = {}
    for chunk in info.lower().split('|'):
        pieces = chunk.split('=')
        if len(pieces) != 2:
            continue
        name, value = pieces
        features[name] = rnnmp2mystem.get(value, value)
    return features

def remove_extras(mst_info):
    """Drop empty variant dicts from ``mst_info['uncertain']``.

    The list is replaced on the dict that was passed in, and that same dict
    is returned.
    """
    kept = []
    for variant in mst_info['uncertain']:
        if len(variant.items()) > 0:
            kept.append(variant)
    mst_info['uncertain'] = kept
    return mst_info

def remove_uncertainty(mystem_info, rnnmp_info):
    """Try to collapse the alternative readings in ``mystem_info['uncertain']``.

    ``mystem_info`` is a grammar dict that contains an ``'uncertain'`` list of
    variant dicts; ``rnnmp_info`` is the ``{feature: value}`` dict derived from
    the RNNMorph tag of the same word. Returns a new top-level dict in which
    every attribute that could be decided is stored directly; ``'uncertain'``
    is removed when all attributes of the first variant were decided.

    Note that ``mystem_info`` and its variant dicts are modified in place.
    """
    # Drop variants without any attribute (this mutates the caller's dict).
    mystem_info = remove_extras(mystem_info)

    # (attribute count, index) per variant; only the count is used below.
    # The loop variable ``id`` shadows the builtin inside the comprehension.
    length_cmp = [(len(item.keys()), id) for id, item in enumerate(mystem_info['uncertain'])]
    # If the variants differ in attribute count, keep only the shortest ones.
    if len(set([item[0] for item in length_cmp])) > 1:
        min_attrs = min([item[0] for item in length_cmp])
        mystem_info['uncertain'] = [item for item in mystem_info['uncertain'] if len(item.keys()) == min_attrs]

    # For every attribute of the FIRST variant, the list of values that the
    # variants assign to it.
    # NOTE(review): raises IndexError if no variant is left, and KeyError if a
    # later variant lacks an attribute of the first one -- confirm that the
    # input guarantees neither can happen.
    sorted_keywise = {k: [dic[k] for dic in mystem_info['uncertain']] for k in mystem_info['uncertain'][0]}

    # Shallow copy: res['uncertain'] is the same list object (and the same
    # variant dicts) as mystem_info['uncertain'].
    res = dict(mystem_info)

    for key in sorted_keywise:
        # NOTE(review): set() needs hashable values; an 'extra' attribute is a
        # list when produced by ru_generate_sem_info_mystem and would raise
        # TypeError here -- confirm whether variants can carry 'extra'.
        if len(set(sorted_keywise[key])) == 1:
            # All variants agree on this attribute.
            res[key] = sorted_keywise[key][0]
        elif key in rnnmp_info.keys():
            # Variants disagree: trust RNNMorph unless it reports 'unknown'.
            # The check that its value is one of the candidates is disabled.
            if rnnmp_info[key] != 'unknown':# and rnnmp_info[key] in sorted_keywise[key]:
                res[key] = rnnmp_info[key]
    uncertainty_removed = True
    for key in sorted_keywise:
        # An attribute counts as decided as soon as it is present in res,
        # including when it was already there before this function ran.
        if not key in res.keys():
            # Stop at the first undecided attribute; attributes after it are
            # left in the variant dicts even if they were decided.
            uncertainty_removed = False
            break
        else:
            # Decided: strip the attribute from every remaining variant.
            for item in res['uncertain']:
                del(item[key])
    if uncertainty_removed:
        del(res['uncertain'])

    # RNNMorph's grammatical number always wins over the stored one.
    if 'number' in rnnmp_info.keys() and 'number' in res.keys() and rnnmp_info['number'] != res['number']:
        res['number'] = rnnmp_info['number']

    return res

In [3]:
def extract_gr(token):
    """Return the ``gr`` string of the first analysis of a Mystem token.

    ``None`` is returned when the token's ``analysis`` list is empty.
    """
    analyses = token['analysis']
    return analyses[0]['gr'] if len(analyses) > 0 else None

def extract_lex(token):
    """Return the lexeme of the first Mystem analysis, or ``''`` if there is none.

    ``token`` is a nested pair whose ``[0][0]`` element is the Mystem token
    dict (the shape built inside ``process_rus_string``).
    """
    analyses = token[0][0]['analysis']
    if len(analyses) == 0:
        return ''
    return analyses[0]['lex']

def process_rus_string(rus_string):
    """Analyse a Russian string with Mystem and disambiguate it with RNNMorph.

    Returns a list with one item per Mystem token, in the original order:

    * tokens that carry an ``analysis`` key (words) become
      ``{'text': ..., 'lexeme': ..., 'gr': {...}}``, where ``gr`` holds the
      grammar attributes plus ``'reliability_score'`` (the RNNMorph score as
      a string);
    * all other tokens (spacing, punctuation, ...) are passed through
      unchanged.

    Raises ``ValueError`` if RNNMorph does not return exactly one prediction
    per word (the former ``zip`` truncated silently and failed later with an
    unrelated ``IndexError``).
    """
    mystem_analysis = mystem.analyze(rus_string)
    word_tokens = [tok for tok in mystem_analysis if 'analysis' in tok.keys()]

    # Do not call the tagger at all when the string contains no words.
    forms = rnnm.predict([tok['text'] for tok in word_tokens]) if word_tokens else []
    if len(forms) != len(word_tokens):
        raise ValueError(
            "RNNMorph returned {} predictions for {} Mystem words".format(
                len(forms), len(word_tokens)))

    words = []
    for tok, form in zip(word_tokens, forms):
        mystem_info = ru_generate_sem_info_mystem(extract_gr(tok))
        rnnmp_info = ru_generate_sem_info_rnnmp(form.tag)

        if mystem_info is None:
            # Mystem produced no analysis for this word.
            gr = {}
        elif 'uncertain' in mystem_info.keys():
            gr = remove_uncertainty(mystem_info, rnnmp_info)
        else:
            gr = mystem_info
        gr['reliability_score'] = str(form.score)

        analyses = tok['analysis']
        lexeme = analyses[0]['lex'] if len(analyses) > 0 else ''

        # ``gr`` keeps dict insertion order; it used to be rebuilt by iterating
        # over a set of keys, which made the JSON key order vary between runs.
        words.append({'text': tok['text'], 'lexeme': lexeme, 'gr': gr})

    # Re-interleave the processed words with the non-word tokens.
    final_res = []
    idx = 0
    for item in mystem_analysis:
        if 'analysis' in item.keys():
            final_res.append(words[idx])
            idx = idx + 1
        else:
            final_res.append(item)

    return final_res

def reconstruct_text(processed_line, pr=False):
    """Concatenate the ``'text'`` field of every token in ``processed_line``.

    With ``pr=False`` (default) the joined string is returned; with a truthy
    ``pr`` it is printed instead and ``None`` is returned.
    """
    text = "".join(item['text'] for item in processed_line)
    if pr:
        print(text)
        return None
    return text

In [4]:
# http://nlp.ist.i.kyoto-u.ac.jp/index.php?plugin=attach&refer=KNP&openfile=knp_feature.pdf
# Translation table from the Japanese labels emitted by KNP (part of speech,
# conjugation, semantic features) to short English labels. conv2eng() looks
# labels up here and returns unknown labels unchanged.
# Several Japanese labels intentionally share one English label (e.g.
# 'nominal', 'predicative', 'temporal'); process_ja_string() collects them in
# a set so that such duplicates collapse.
jp_conv = {'略称': 'abbr',
           '語幹': 'stem',
           #content word
           '内容語': 'content',
           # Parts of speech
           #adjective
           '形容詞': 'a',
           #adnominal adjective
           '連体詞': 'ada',
           #adjective that ends with 'i'
           'イ形容詞イ段': 'iadj',
           #adjective that ends with 'na'
           'ナ形容詞': 'naadj',
           #adverb
           '副詞': 'adv',
           #copula ("judgemental" word)
           '判定詞': 'jud',
           #auxiliary verb
           '助動詞': 'av',
           #conjunction
           '接続詞': 'conj',
           #demonstrative
           '指示詞': 'dem',
           #nominal demonstrative
           '名詞形態指示詞': 'sdem',
           #adnominal adjective demonstrative
           '連体詞形態指示詞': 'adadem',
           #adverbial demonstrative
           '副詞形態指示詞': 'advdem',
           #interjection
           '感動詞': 'interj',
           #noun
           '名詞': 's',
           #common noun
           '普通名詞': 'common',
           #adverbial noun
           '副詞的名詞': 'advs',
           #expletive noun
           '形式名詞': 'expletive',
           #proper noun
           '固有名詞': 'props',
           '組織名': 'organisation',
           #toponym
           '地名': 'geo',
           #name and name retrieved through Wikipedia
           '人名': 'human_name',
           'Wikipedia人名': 'human_name',
           #can form a verb by adding "suru" to a noun, nominal verb
           'サ変名詞': 'sv',
           #"suru" verb (sa-row irregular verb)
           'サ変動詞': 'vs',
           #quasi content word (labelled here as auxiliary noun)
           '準内容語': 'as',
           #numeral
           '数詞': 'num',
           #temporal noun
           '時相名詞': 'temporal',
           #'weak' temporal noun
           '弱時相名詞': 'temporal',
           #verb
           '動詞': 'v',
#            #consonantal verb
#            '子音動詞ラ行': 'consonantal-ra',\
#            '子音動詞ワ行': 'consonantal-wa',\
#            '子音動詞マ行': 'consonantal-ma',\
#            '母音動詞': 'vowel-stem',\
           #conditional judgemental
           'デアル列基本条件形': 'cond',
           #past judgemental
           'デアル列タ形': 'praet',
           #continuous judgemental
           'ダ列タ系連用テ形': 'cont',
           #labelled "imperfective form"; process_ja_string() stores 'ipf' as aspect
           #NOTE(review): 未然形 is the irrealis stem of a verb -- confirm that
           #treating it as imperfective aspect is intended
           '未然形': 'ipf',
           #plain form, also applicable to adjectives
           '基本形': 'plain',
           #irregular verb
           'カ変動詞': 'irregular',
           #conjunctive form (when occurring within verbal composites) that acts like a verb(?)
           '基本連用形': 'conjunctive-v',
            #conjunctive form that acts like a noun(?)
           '連用形名詞化': 'conjunctive-s',
           #"ta"-form; past form (process_ja_string() stores 'praet' as tense)
           'タ形': 'praet',
           #continuous tense
           'タ系連用テ形': 'cont',
           #particle
           '助詞': 'part',
           #case-marking particle
           '格助詞': 'case-marking',
           #adverbial particle
           '副助詞': 'adverbial',
           #conjunctive particle
           '接続助詞': 'conjunctive',
           #sentence-ending particle
           '終助詞': 'sentence-ending',
           #prefix
           '接頭辞': 'pref',
           #nominal prefix
           '名詞接頭辞': 'nominal',
           #verbal prefix
           '動詞接頭辞': 'verbal',
           #prefix for adjectives ending with 'i'
           'イ形容詞接頭辞': 'iadj',
           #prefix for adjectives ending with 'na'
           'ナ形容詞接頭辞': 'naadj',
           #suffix
           '接尾辞': 'suff',
           #nominal suffix
           '名詞性名詞接尾辞': 'nominal',
           #predicative nominal suffix
           '名詞性述語接尾辞': 'predicative',
           #counter suffix (used when counting)
           '名詞性名詞助数辞': 'counting',
           #special nominal suffix
           '名詞性特殊接尾辞': 'special',
           #predicative adjective suffix
           '形容詞性述語接尾辞': 'predicative',
           #suffix that turns a noun into an adjective ('-like')
           '形容詞性名詞接尾辞': 'adjectivising',
           #verbal suffix
           '動詞性接尾辞': 'verbial',
           #special symbol; process_ja_string() skips tokens whose POS maps to 'special'
           '特殊': 'special',
           '句点': 'period',
           '読点': 'comma',
           #opening parenthesis
           '括弧始': 'parenthop',
           #closing parenthesis
           '括弧終': 'parenthed',
           #sign/mark
           '記号': 'sign',
           '空白': 'whitespace',
           #unidentified symbols
           '未定義語': 'unspecified',
           '未知語': 'unknown_characters',
           #katakana characters
           'カタカナ': 'katakana',
           #alphanumeric characters
           'アルファベット': 'alpha',
           '数字': 'numeric',
           #other characters
           'その他': 'other',
           #suspected representation (cases such as when 2 is written instead of 二 [two])
           '疑似代表表記': 'suspected_representation',
           #category
           'カテゴリ': 'category',
           '人工物-その他': 'artificial-other',
           '抽象物': 'abstract',
           '組織・団体': 'organisation',
           '場所-その他': 'place-other',
           '場所-施設': 'place-establishment',
           '人': 'human',
           '数量': 'quantity',
           '時間': 'time',
           '場所-機能': 'place-facility',
           '自然物': 'natural',
           '場所-自然': 'place-natural',
           #domain
           'ドメイン': 'domain',
           '政治': 'politics',
           '家庭・暮らし': 'household',
           #representative notation; process_ja_string() takes 'normal_form' from it
           '代表表記': 'writrepr',
           '料理・食事': 'cooking-meals',
           '文化・芸術': 'culture-art',
           'メディア': 'media',
           'ビジネス': 'business',
           '交通': 'traffic',
           #family name
           'Wikipedia姓': 'famn',
           '姓': 'famn',
           #first name
           'Wikipedia名': 'persn',
           '名': 'persn',
           #name type
           '日本': 'japanese',
           '外国': 'foreign',
           #special reading
           '漢字読み': 'reading_type',
           '音': 'on',
           '訓': 'kun',
           #address ender
           '住所末尾': 'address_ender',
           #toponym ender
           '地名末尾': 'toponym_ender',
           #reading unknown
           '読み不明': 'unknown_reading',
           '自動獲得': 'acquired_automatically',
           #transitivity (transitive/intransitive verb pairs)
           #NOTE(review): process_ja_string() stores the OPPOSITE label of the
           #one looked up here ('intran' -> "tran", 'tran' -> "intran"); confirm
           #which of the two mappings reflects the intended meaning of 自/他
           '自他動詞': 'transitivity',
           '自': 'tran',
           '他': 'intran',
           '同形': 'same_form',
           #word looked up at Wikipedia
           'Wikipediaリダイレクト': 'wiki_redirected',
           #word derived from verb
           '動詞派生': 'derivative',
           #potential form of a verb
           '可能動詞': 'potential'
          }

def conv2eng(string):
    """Translate a KNP label through ``jp_conv``; unknown labels are returned unchanged."""
    return jp_conv.get(string, string)

def is_suspected(token):
    """Tell whether ``token.imis`` carries the 'suspected_representation' feature.

    ``token.imis`` is either the literal ``'NIL'`` or a whitespace-separated
    list of ``name[:value...]`` features; only the feature names are inspected.
    """
    if token.imis == 'NIL':
        return False
    return any(
        conv2eng(feature.split(':')[0]) == 'suspected_representation'
        for feature in token.imis.split()
    )

def process_ja_string(ja_string):
    """Parse a Japanese string with KNP and describe every morpheme as a dict.

    Each morpheme yields ``{'text': <surface form>}``. Unless its part of
    speech maps to ``'special'`` or its surface form is ``～``, the dict also
    gets ``'lexeme'``, ``'reading'`` (romanised through ``converter``),
    optionally ``'normal_form'``, and a ``'gr'`` dict with ``'pos'`` plus
    whatever could be derived from the conjugation fields and the semantic
    feature string (``imis``): ``'aspect'``, ``'tense'``, ``'category'``,
    ``'domain'``, ``'reading_type'``, ``'transit'``, ``'reliability_score'``,
    ``'extra'`` (translated labels) and ``'unsorted'`` (labels that have no
    translation in ``jp_conv``). Empty ``'extra'`` / ``'unsorted'`` are removed.

    ``'extra'`` is collected in a set and converted to a list before the
    result is returned.
    """
    parsed_str = knp.parse(ja_string)
    res = []
    for item in parsed_str.mrph_list():
        entry = {'text': item.midasi}
        res.append(entry)
        # Symbols and the wave dash carry no grammatical information.
        if conv2eng(item.hinsi) == "special" or item.midasi == "～":
            continue

        # Keys are inserted in the same order as before so that the JSON
        # layout of the output files does not change.
        extra = set([])
        unsorted = []
        gr = {'extra': extra}
        entry['gr'] = gr
        entry['lexeme'] = item.genkei
        entry['reading'] = converter.do(item.yomi)
        gr['unsorted'] = unsorted
        # TODO
        #entry['translation']
        gr['pos'] = conv2eng(item.hinsi)

        if item.bunrui != '*':
            extra.add(conv2eng(item.bunrui))

        if item.katuyou1 != '*':
            katuyou1 = conv2eng(item.katuyou1)
            if katuyou1 != item.katuyou1:
                extra.add(katuyou1)
            else:
                unsorted.append(item.katuyou1)

        if item.katuyou2 != '*':
            katuyou2 = conv2eng(item.katuyou2)
            if katuyou2 in ['ipf', 'pf']:
                gr['aspect'] = katuyou2
            elif katuyou2 in ['inpraes', 'praet']:
                gr['tense'] = katuyou2
            elif katuyou2 != item.katuyou2:
                extra.add(katuyou2)
            else:
                unsorted.append(item.katuyou2)

        if item.imis != 'NIL':
            # Each feature looks like "name" or "name:value[:value...]".
            for sem_it in (it.split(':') for it in item.imis.split()):
                tag = conv2eng(sem_it[0])
                if tag == 'writrepr':
                    entry['normal_form'] = sem_it[1].split('/')[0]
                elif tag == 'category':
                    gr['category'] = [conv2eng(cat_it) for cat_it in sem_it[1].split(';')]
                elif tag == 'domain':
                    gr['domain'] = [conv2eng(dom_it) for dom_it in sem_it[1].split(';')]
                elif tag == 'geo':
                    # An abbreviation marker is followed by the full form.
                    for pos, geo_it in enumerate(sem_it):
                        if conv2eng(geo_it) == 'abbr':
                            extra.add('abbr')
                            entry['normal_form'] = sem_it[pos + 1]
                            break
                elif tag == 'reading_type':
                    gr['reading_type'] = conv2eng(sem_it[1])
                elif tag == 'conjunctive-s':
                    entry['normal_form'] = sem_it[1].split('/')[0]
                elif tag == 'transitivity':
                    if conv2eng(sem_it[1]) == 'intran':
                        #sem_it[1] can also be equal to '同形',
                        #apparently meaning that transitive and intransitive forms are the same
                        #since knp provides no information on whether the verb in question
                        #is transitive or intransitive, the 'transitivity' field is left out
                        # NOTE(review): the stored label is deliberately the
                        # opposite of the looked-up one (kept as in the
                        # original) -- confirm against jp_conv.
                        gr['transit'] = "tran"
                        entry['normal_form'] = sem_it[2].split('/')[0]
                    elif conv2eng(sem_it[1]) == "tran":
                        gr['transit'] = "intran"
                elif tag == 'derivative':
                    extra.add(tag)
                    entry['normal_form'] = sem_it[1].split('/')[0]
                elif tag == 'potential':
                    extra.add(tag)
                    entry['normal_form'] = sem_it[1].split('/')[0]
                elif tag == 'human_name':
                    if not sem_it[0].startswith('Wikipedia'):
                        if conv2eng(sem_it[1]) == "japanese":
                            extra.add(conv2eng(sem_it[2]))
                            # NOTE(review): stored as the raw string here but
                            # as the integer 0 in the other branch.
                            gr['reliability_score'] = sem_it[4]
                        else:
                            extra.add(tag)
                            extra.add(conv2eng(sem_it[1]))
                            gr['reliability_score'] = 0
                    else:
                        extra.add(tag)
                elif tag == 'wiki_redirected':
                    extra.add('acquired_automatically')
                    entry['normal_form'] = sem_it[1]
                elif tag != sem_it[0]:
                    # Any other feature that has a translation.
                    extra.add(tag)
                else:
                    # No translation available: keep the raw pieces.
                    unsorted.append(list(sem_it))

            if "unknown_characters" in extra:
                # Unknown words have no trustworthy lexeme or normal form.
                # 'normal_form' is only set by some features, so it may be
                # absent here; a plain ``del`` used to raise KeyError then.
                entry.pop('normal_form', None)
                del entry['lexeme']
                unk_type = conv2eng([it.split(':')[1] for it in item.imis.split(' ')
                                     if conv2eng(it.split(':')[0]) == "unknown_characters"][0])
                extra.add(unk_type)
                # Keep the reading only for katakana words whose romanisation
                # differs from the surface form.
                if unk_type != "katakana" or entry['reading'] == entry['text']:
                    del entry['reading']

        if len(unsorted) == 0:
            del gr['unsorted']
        if len(extra) == 0:
            del gr['extra']

    return ja_replace_sets_with_lists(res)

#sets are used because some pieces of information that KNP yields have identical mappings
def ja_replace_sets_with_lists(token_set):
    """Convert every ``token['gr']['extra']`` collection into a list, in place.

    Tokens without a ``'gr'`` dict, or whose ``'gr'`` has no ``'extra'`` key,
    are left untouched. The same list object that was passed in is returned.
    """
    for token in token_set:
        if 'gr' not in token.keys():
            continue
        grammar = token['gr']
        if 'extra' in grammar.keys():
            grammar['extra'] = list(grammar['extra'])
    return token_set

In [5]:
# Collect the raw entry files from the current working directory.
# glob returns matches in arbitrary order, so sort them: the numeric keys of
# `entries` and the meaning of `processed[-1]` further down are then the same
# on every run.
txts = sorted(glob.glob('*.txt'))
fnames = list(enumerate(txts))

# index -> Entry, one per file (each file holds one JSON object).
entries = {idx: Entry.from_json(path) for (idx, path) in fnames}

In [None]:
# Directory that receives one JSON file per processed entry. It is created
# up front so that the first to_json() call cannot fail on a missing folder
# after the (slow) morphological analysis has already been done.
OUTPUT_DIR = "./processed_biling"
os.makedirs(OUTPUT_DIR, exist_ok=True)

processed = []

for entry in entries.values():
    title_ja = process_ja_string(entry.title_jp)
    title_ru = process_rus_string(entry.title_ru)

    # line[0] is used as the key; line[1] holds the (Russian, Japanese) texts.
    body = {}
    for line in entry.contents:
        body[line[0]] = (process_rus_string(line[1][0]), process_ja_string(line[1][1]))

    processed.append(ProcessedEntry(id = entry.id,
                                   title_ru = title_ru,
                                   contents = body,
                                   title_jp = title_ja,
                                   link_jp = entry.link_jp,
                                   link_ru = entry.link_ru))

    # Write each entry as soon as it is ready so that partial progress survives.
    processed[-1].to_json("{}/processed_entry_{}.json".format(OUTPUT_DIR, processed[-1].id))
    print(entry.id, ": Success")

print('All done!')

In [12]:
# Sanity check: rebuild both titles of the most recently processed entry
# from their token lists and print them.
reconstruct_text(processed[-1].title_ru, pr=True)
reconstruct_text(processed[-1].title_jp, pr=True)

В городе Хакусан нашлись кровати русских военнопленных, арестованных во время русско-японской войны

ロシア捕虜のベッドか　日露戦争時、金沢で収容　白山市で見つかる
