We add for given properties/relations:
- alias from wikidata
- the google translation (unless its already part of the aliases)
- translate every word in the relation

We do this to reduce the selectional bias of property names.

In [15]:
from tqdm import tqdm
import pandas as pd
from wikidata.client import Client
import translators as ts
from googletrans import Translator
from collections import defaultdict

In [16]:
wikiclient = Client()
translator = Translator()

#### Load Properties

In [142]:
relations = pd.read_csv('../data/knowledge/zh_ja_relations.csv')

In [143]:
relations

Unnamed: 0,id,zh,ja,count
0,P1034,主要食物來源,主な食料源,326
1,P457,成立文书,設立文,50541
2,P532,船籍港,船籍港,25629
3,P740,成立地點,結成場所,44482
4,P1817,收件人,宛名,49021
...,...,...,...,...
509,P31,隶属于,分類,103114325
510,P1441,登场作品,登場する作品,126749
511,P828,起因,原因,9257
512,P1542,導致,以下の原因,5715


In [144]:
languages = ['zh', 'ja']

In [145]:
relations = relations.drop(relations[relations.id == 'P8929'].index)

#### Load aliases

In [146]:
alias_dict = defaultdict(list)

for idx, relation in tqdm(relations.iterrows(), total=relations.shape[0]):
    
    entity = wikiclient.get(relation['id'], load=True)
    aliases = entity.attributes['aliases']
    
    for lang in languages:
        # Does language have aliases? 
        if lang in aliases:
            alias_dict[lang].append([al['value'] for al in aliases[lang]])
        else:
            alias_dict[lang].append(None)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 514/514 [02:34<00:00,  3.34it/s]


In [147]:
for lang in languages:
    relations[lang + '_alias'] = alias_dict[lang]

In [148]:
relations

Unnamed: 0,id,zh,ja,count,zh_alias,ja_alias
0,P1034,主要食物來源,主な食料源,326,[主要食物来源],[主食]
1,P457,成立文书,設立文,50541,,
2,P532,船籍港,船籍港,25629,"[船舶注册港, 船舶註冊港, 註冊港, 注册港]",
3,P740,成立地點,結成場所,44482,,[創設場所]
4,P1817,收件人,宛名,49021,,"[宛名書き, 宛先]"
...,...,...,...,...,...,...
509,P31,隶属于,分類,103114325,"[是一個, 屬於, 屬性, ∈, 性质, 属性, 是一个, 性質, 属于]","[種類, インスタンスの元, 実体の元, is a, is-a, クラス, 以下の実体, ∈..."
510,P1441,登场作品,登場する作品,126749,[出场作品],
511,P828,起因,原因,9257,"[原因, 由于]",
512,P1542,導致,以下の原因,5715,"[结果, 造成, 导致, 影响, 影響]","[結果, 帰結, 成り行き]"


#### Load Translations Aliases
Translation of relation in all other languages into the target language (e.g. P3719 target 'de', translate 'en', 'es' and 'fr' to 'de' and put them into a list. Again to reduce selection bias. Manual cleanup required!

In [149]:
translate_dict = defaultdict(list)

In [150]:
for target_lang in languages:

    # Iterate over relations
    for idx, relation in tqdm(relations.iterrows(), total=relations.shape[0]):

        translations = []
        for source_lang in languages:
            if source_lang == target_lang:
                continue                

            # Translate and add if not same as relation
            # try:
            #     translation = ts.google(relation[source_lang], from_language=source_lang, to_language=target_lang)
            # except:
            if source_lang == 'zh':
                translation = translator.translate(relation[source_lang], src='zh-tw', dest=target_lang).text
            elif target_lang == 'zh':
                translation = translator.translate(relation[source_lang], src=source_lang, dest='zh-tw').text
            else:
                translation = translator.translate(relation[source_lang], src=source_lang, dest=target_lang).text

            if translation != relation[target_lang] and translation.lower() != relation[target_lang].lower():
                translations.append(translation)

        # Remove duplicates
        translations = list(set(translations))

        # If empty, use None instead
        if not translations:
            translations = None

        # Add translations for this language for that 
        translate_dict[target_lang].append(translations)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 514/514 [01:02<00:00,  8.27it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 514/514 [00:56<00:00,  9.11it/s]


In [151]:
translate_dict.keys()

dict_keys(['zh', 'ja'])

In [152]:
for lang in languages:
    relations[lang + '_translate_alias'] = translate_dict[lang]

In [153]:
relations

Unnamed: 0,id,zh,ja,count,zh_alias,ja_alias,zh_translate_alias,ja_translate_alias
0,P1034,主要食物來源,主な食料源,326,[主要食物来源],[主食],,[主食源]
1,P457,成立文书,設立文,50541,,,[成立聲明],[設立書類]
2,P532,船籍港,船籍港,25629,"[船舶注册港, 船舶註冊港, 註冊港, 注册港]",,[船舶登記港],[ホームポート]
3,P740,成立地點,結成場所,44482,,[創設場所],[形成地點],[設立]
4,P1817,收件人,宛名,49021,,"[宛名書き, 宛先]",[地址],[受信者]
...,...,...,...,...,...,...,...,...
509,P31,隶属于,分類,103114325,"[是一個, 屬於, 屬性, ∈, 性质, 属性, 是一个, 性質, 属于]","[種類, インスタンスの元, 実体の元, is a, is-a, クラス, 以下の実体, ∈...",[分類],[に属する]
510,P1441,登场作品,登場する作品,126749,[出场作品],,[出現的作品],[デビュー作]
511,P828,起因,原因,9257,"[原因, 由于]",,[原因],
512,P1542,導致,以下の原因,5715,"[结果, 造成, 导致, 影响, 影響]","[結果, 帰結, 成り行き]",[以下原因],[引き起こす]


#### Load subword translations

For every relation we take every possible target_language that we want to know if it is being shared into. We iterate over all words of the relation in all other languages and translate them into the target language and save them as subword aliases. My reasoning is that mBERT might sometimes just ignore certain words in a relation and understand them not as a whole in its context but just makse connects with some words.

In [154]:
subword_dict = defaultdict(list)

In [155]:
# Iterate over relations
for idx, relation in tqdm(relations.iterrows(), total=relations.shape[0]):
    
    # For every language
    for target_lang in languages:
        
        translations = []
        for source_lang in languages:
            if source_lang == target_lang:
                continue
            
            if source_lang == 'zh' or source_lang == 'ja':
                words = list(relation[source_lang])
            else:
                words = relation[source_lang].split()
            
            if len(words) == 1:
                continue
            
            for word in words:
                # Translate
                if source_lang == 'zh':
                    translation = translator.translate(word, src='zh-tw', dest=target_lang).text
                elif target_lang == 'zh':
                    translation = translator.translate(word, src=source_lang, dest='zh-tw').text
                else:
                    translation = translator.translate(word, src=source_lang, dest=target_lang).text

                translations.append(translation)

        # Remove duplicates
        translations = list(set(translations))

        # If empty, use None instead
        if not translations:
            translations = None

        # Add translations for this language for that 
        subword_dict[target_lang].append(translations)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 514/514 [04:42<00:00,  1.82it/s]


In [156]:
subword_dict.keys()

dict_keys(['zh', 'ja'])

In [157]:
for lang in languages:
    relations[lang + '_subword_alias'] = subword_dict[lang]

#### Save

In [158]:
relations

Unnamed: 0,id,zh,ja,count,zh_alias,ja_alias,zh_translate_alias,ja_translate_alias,zh_subword_alias,ja_subword_alias
0,P1034,主要食物來源,主な食料源,326,[主要食物来源],[主食],,[主食源],"[費用, 食物, 來源, 主要的, 娜娜]","[食べ物, 亭主, ソース, 欲しいです, もの, 来て]"
1,P457,成立文书,設立文,50541,,,[成立聲明],[設立書類],"[常設, 句子, 環境]","[芸術, 台, 本, なる]"
2,P532,船籍港,船籍港,25629,"[船舶注册港, 船舶註冊港, 註冊港, 注册港]",,[船舶登記港],[ホームポート],"[登記, 港口, 船]","[メンバーシップ, ポート, ボート]"
3,P740,成立地點,結成場所,44482,,[創設場所],[形成地點],[設立],"[地方, 鳴, 結論]","[台, 土地, なる, 点]"
4,P1817,收件人,宛名,49021,,"[宛名書き, 宛先]",[地址],[受信者],"[姓名, 到]","[ピース, 受け取る, 人]"
...,...,...,...,...,...,...,...,...,...,...
509,P31,隶属于,分類,103114325,"[是一個, 屬於, 屬性, ∈, 性质, 属性, 是一个, 性質, 属于]","[種類, インスタンスの元, 実体の元, is a, is-a, クラス, 以下の実体, ∈...",[分類],[に属する],"[分鐘, 種類]","[で, スクライブ, 属]"
510,P1441,登场作品,登場する作品,126749,[出场作品],,[出現的作品],[デビュー作],"[地方, 醋, 商品, 登, 茹, 書面]","[登る, 味, 行う, 分野]"
511,P828,起因,原因,9257,"[原因, 由于]",,[原因],,"[原來的, 原因]","[上昇, なぜなら]"
512,P1542,導致,以下の原因,5715,"[结果, 造成, 导致, 影响, 影響]","[結果, 帰結, 成り行き]",[以下原因],[引き起こす],"[在下面, 原來的, 的, 之後, 原因]","[に, ガイド]"


In [159]:
lang_path = ''
for i, lang in enumerate(languages):
        lang_path += lang + '_'

relations.to_json('../data/knowledge/' + lang_path + 'relations_w_aliases_new.json')