For the given properties/relations we add:
- the aliases from Wikidata
- the Google translation (unless it is already part of the aliases)
- a translation of every individual word in the relation

We do this to reduce the selection bias of property names.

In [10]:
from tqdm import tqdm
import pandas as pd
from wikidata.client import Client
import translators as ts
from googletrans import Translator

In [11]:
# Shared handles used by the cells below:
# `translator` performs the translate calls, `wikiclient` fetches the Wikidata entities.
translator = Translator()
wikiclient = Client()

#### Load Properties

In [7]:
# Property table with one label column per language.
# An earlier variant read '../data/Knowledge/properties_nonsymmetric_multilingual_clean.csv' instead.
relations = pd.read_json(path_or_buf='property_notsymmetric_labeled_w_aliases2.json')

In [8]:
# Inspect the loaded table (the rendered output abbreviates the middle rows).
relations

Unnamed: 0,id,en,de,es,fr,count,en_alias,de_alias,es_alias,fr_alias,en_translate_alias,de_translate_alias,es_translate_alias,fr_translate_alias
0,P3719,regulated by,reguliert von,regulado por,régi par,461,"[regulator, regulatory authority]",,[regulada por],"[régie par, régulé par, régulée par, réglement...",[governed by],[regiert durch],"[regulada por, gobernado por]",[régulée par]
1,P1704,is pollinator of,Bestäuber von,polinizador,pollinisateur de,58,"[pollinator of, pollinates]",,,"[pollinise, plante pollinisée]","[pollinator of, pollinator]","[Bestäuber, ist Bestäuber von]","[es polinizador de, polinizador de]","[est pollinisateur de, pollinisateur]"
2,P3173,offers view on,bietet Sicht auf,ofrece vista a,offre une vue sur,935,"[has view of, has views of, offers view of, vi...","[Sicht auf, Aussicht]","[tiene vista a, ofrece una vista de, ofrece vi...","[offre un panorama sur, vue sur, a vue sur, jo...","[offers view to, offers a view of]","[bietet Aussicht auf, bietet einen Ausblick auf]","[ofrece ver en, ofrece una vista de]",
3,P3085,qualifies for event,qualifiziert für Turnier,califica para el evento,qualifie pour,2770,[league this event qualifies for],,,[ligue à laquelle l'événement qualifie],"[qualified for tournament, qualify for the eve...","[qualifiziert sich für die Veranstaltung, sich...","[calificado para el torneo, califica para]","[se qualifier pour l'événement, qualifié pour ..."
4,P4646,representation of,Vorstellung von,representación de,représentation de,167,,,,,[idea of],[Repräsentation von],"[idea de, representacion de]","[idée de, la représentation de]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,P5769,editor-in-chief,Chefredakteur,redactor en jefe,responsable éditorial,2248,"[executive editor, editor in chief, lead edito...",[Chefredakteurin],"[editor en jefe, editor ejecutivo, editor jefe]","[directeur de la rédaction, directrice de la r...","[editorial manager, chief editor]",[Redaktionsleiter],"[gerente editorial, editor en jefe]","[éditeur en chef, rédacteur en chef]"
730,P8627,closing time,Schließzeit,hora de cierre,heure de fermeture,5,[closes at],"[geschlossen um, geschlossen ab, geöffnet bis]",[cierra a],[moment de fermeture],[closure hour],[Geschäftsschluss],,
731,P2770,source of income,Herkunft des Einkommens,fuente de ingreso,source de revenu,494,"[income source, source of revenue, revenue sou...",[Einkommensherkunft],[fuente de renta],,[Origin of Income],[Einkommensquelle],[Origen de los Ingresos],[Origine du revenu]
732,P4794,season starts,Saisonstart,inicio de la temporada,début de la saison,2049,[season begins],[Saisonbeginn],[comienzo de la temporada],,[start of the season],"[Saison beginnt, Beginn der Saison]","[inicio de temporada, comienza la temporada]","[début de saison, la saison commence]"


In [12]:
# Language codes handled throughout the notebook; each one is also the name of
# the label column for that language in `relations`.
languages = [
    'en',
    'de',
    'es',
    'fr',
]

#### Load aliases

In [8]:
# One list per language; entry i belongs to the i-th row of `relations`.
alias_dict = {lang: list() for lang in languages}

for property_id in relations['id']:

    # Fetch the Wikidata entity for this property and take its alias table.
    prop_entity = wikiclient.get(property_id, load=True)
    aliases_by_lang = prop_entity.attributes['aliases']

    for lang in languages:
        # Keep only the alias strings; a language without aliases gets a None placeholder
        # so that every list stays aligned with the rows of `relations`.
        values = [al['value'] for al in aliases_by_lang[lang]] if lang in aliases_by_lang else None
        alias_dict[lang].append(values)

In [47]:
# Store the fetched aliases of every language as a '<lang>_alias' column.
# Writing only 'en' here would leave the freshly fetched de/es/fr aliases out of the table.
for lang in languages:
    relations[lang + '_alias'] = alias_dict[lang]

In [48]:
# Check the table after the alias columns have been attached.
relations

Unnamed: 0,id,en,de,es,fr,count,en_alias,de_alias,es_alias,fr_alias
0,P3719,regulated by,reguliert von,regulado por,régi par,461,"[regulator, regulatory authority]",,[regulada por],"[régie par, régulé par, régulée par, réglement..."
1,P1704,is pollinator of,Bestäuber von,polinizador,pollinisateur de,58,"[pollinator of, pollinates]",,,"[pollinise, plante pollinisée]"
2,P3173,offers view on,bietet Sicht auf,ofrece vista a,offre une vue sur,935,"[has view of, has views of, offers view of, vi...","[Sicht auf, Aussicht]","[tiene vista a, ofrece una vista de, ofrece vi...","[offre un panorama sur, vue sur, a vue sur, jo..."
3,P3085,qualifies for event,qualifiziert für Turnier,califica para el evento,qualifie pour,2770,[league this event qualifies for],,,[ligue à laquelle l'événement qualifie]
4,P4646,representation of,Vorstellung von,representación de,représentation de,167,,,,
...,...,...,...,...,...,...,...,...,...,...
729,P5769,editor-in-chief,Chefredakteur,redactor en jefe,responsable éditorial,2248,"[executive editor, editor in chief, lead edito...",[Chefredakteurin],"[editor en jefe, editor ejecutivo, editor jefe]","[directeur de la rédaction, directrice de la r..."
730,P8627,closing time,Schließzeit,hora de cierre,heure de fermeture,5,[closes at],"[geschlossen um, geschlossen ab, geöffnet bis]",[cierra a],[moment de fermeture]
731,P2770,source of income,Herkunft des Einkommens,fuente de ingreso,source de revenu,494,"[income source, source of revenue, revenue sou...",[Einkommensherkunft],[fuente de renta],
732,P4794,season starts,Saisonstart,inicio de la temporada,début de la saison,2049,[season begins],[Saisonbeginn],[comienzo de la temporada],


#### Load Translations Aliases
Translate the relation label from all other languages into the target language (e.g. for P3719 with target 'de', translate the 'en', 'es' and 'fr' labels to 'de') and collect the results in a list. Again, this is done to reduce selection bias. Manual cleanup required!

In [14]:
# Per-language result lists for the whole-label translations; filled by the translation cell.
translate_dict = {lang: list() for lang in languages}

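Unfortunately, this has to be done once per language: set `target_lang` in the cell below and re-run it (together with the column assignment that follows) for each target language.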

In [22]:
# Target language of this pass; change it and re-run the cell for every language.
target_lang = 'fr'

# Results are collected in a fresh list and stored only after the pass has finished,
# so re-running the cell replaces the previous results instead of appending to them
# (appending would make the list longer than `relations`).
target_translations = []

# Iterate over relations
for idx, relation in tqdm(relations.iterrows(), total=relations.shape[0]):

    translations = []
    for source_lang in languages:
        if source_lang == target_lang:
            continue

        # Translate the label of the source language into the target language
        translation = translator.translate(relation[source_lang], src=source_lang, dest=target_lang).text

        # A translation identical to the existing target label adds nothing new
        if translation != relation[target_lang]:
            translations.append(translation)

    # Remove duplicates while keeping first-seen order, so repeated runs yield the same lists
    translations = list(dict.fromkeys(translations))

    # None marks relations for which no new translation was found
    target_translations.append(translations or None)

translate_dict[target_lang] = target_translations

100%|███████████████████████████████████████████████████████████████████████| 734/734 [04:59<00:00,  2.45it/s]


In [26]:
# Spot-check the Spanish results (filled by running the translation cell with target_lang = 'es').
translate_dict['es']

[['regulada por', 'gobernado por'],
 ['es polinizador de', 'polinizador de'],
 ['ofrece ver en', 'ofrece una vista de'],
 ['calificado para el torneo', 'califica para'],
 ['idea de', 'representacion de'],
 ['emulado'],
 ['principal fuente de alimento'],
 ['contrato de constitución'],
 ['Puerto de registro', 'distrito de registro'],
 ['lugar de incorporacion', 'lugar de fundación'],
 ['beneficiario'],
 ['régimen político', 'forma basica de gobierno'],
 None,
 ['competición deportiva',
  'Participación en evento deportivo.',
  'competencia deportiva compitió en'],
 ['derivado del tipo de organismo'],
 None,
 ['mundo natal'],
 ['empuñadura de raqueta', 'jugando a la mano'],
 ['a como una persona gramatical', 'tiene persona gramatical'],
 None,
 None,
 ['ubicación de grabación', 'lugar de registro', 'lugar de grabacion'],
 ['artista del juego', 'artista multimedia'],
 ['color de linea', 'color del trazo'],
 ['no causa', 'no tiene efecto'],
 ['inversor'],
 None,
 ['praenomen romano', 'nombr

In [30]:
# Store the translations of the language that was just processed. Following `target_lang`
# keeps this cell in step with the translation cell instead of hard-coding one language.
relations[target_lang + '_translate_alias'] = translate_dict[target_lang]

#### Load subword translations

For every relation we go through every target language that it might be shared into. We iterate over all words of the relation's label in each of the other languages, translate each word into the target language, and save the results as subword aliases. My reasoning is that mBERT might sometimes ignore certain words in a relation: instead of understanding the relation as a whole in its context, it may just make connections with some of its words.

In [13]:
# Per-language result lists for the word-by-word translations; filled by the subword cell.
subword_dict = {lang: list() for lang in languages}

In [14]:
# Results are collected in fresh lists and copied into `subword_dict` only after the
# whole pass has finished, so re-running the cell replaces earlier results instead of
# appending a second copy to them.
subword_results = {lang: [] for lang in languages}

# (word, source language, target language) -> translation. The same words recur in many
# labels, so each combination is requested from the translator only once.
word_cache = {}

# Iterate over relations
for idx, relation in tqdm(relations.iterrows(), total=relations.shape[0]):

    # Split every label once per relation instead of once per target language
    words_by_lang = {lang: relation[lang].split() for lang in languages}

    # For every language
    for target_lang in languages:

        translations = []
        for source_lang in languages:
            if source_lang == target_lang:
                continue

            words = words_by_lang[source_lang]

            # Single-word labels are skipped: translating them word by word
            # would just repeat the translation of the whole label.
            if len(words) == 1:
                continue

            for word in words:
                # Translate (or reuse the cached result)
                key = (word, source_lang, target_lang)
                if key not in word_cache:
                    word_cache[key] = translator.translate(word, src=source_lang, dest=target_lang).text
                translations.append(word_cache[key])

        # Remove duplicates while keeping first-seen order, so repeated runs yield the same lists
        translations = list(dict.fromkeys(translations))

        # None marks relations without any subword translation for this language
        subword_results[target_lang].append(translations or None)

subword_dict.update(subword_results)

100%|███████████████████████████████████████████████████████████████████████| 734/734 [25:01<00:00,  2.05s/it]


In [19]:
# The subword cell fills every language in one pass, so store all of them as
# '<lang>_subword_alias' columns rather than only the French one.
for lang in languages:
    relations[lang + '_subword_alias'] = subword_dict[lang]

#### Save

In [20]:
# Final check of the table before it is written to disk.
relations

Unnamed: 0,id,en,de,es,fr,count,en_alias,de_alias,es_alias,fr_alias,en_translate_alias,de_translate_alias,es_translate_alias,fr_translate_alias,en_subword_alias,de_subword_alias,es_subword_alias,fr_subword_alias
0,P3719,regulated by,reguliert von,regulado por,régi par,461,"[regulator, regulatory authority]",,[regulada por],"[régie par, régulé par, régulée par, réglement...",[governed by],[regiert durch],"[regulada por, gobernado por]",[régulée par],"[by, from, governed, regulated]","[regiert, durch, geregelt]","[regulado, desde, por, gobernado]","[à partir de, pour, réglementé, par]"
1,P1704,is pollinator of,Bestäuber von,polinizador,pollinisateur de,58,"[pollinator of, pollinates]",,,"[pollinise, plante pollinisée]","[pollinator of, pollinator]","[Bestäuber, ist Bestäuber von]","[es polinizador de, polinizador de]","[est pollinisateur de, pollinisateur]","[of, from, pollinator]","[von, Bestäuber, ist]","[de, desde, es, polinizador]","[pollinisateur, à partir de, de, est]"
2,P3173,offers view on,bietet Sicht auf,ofrece vista a,offre une vue sur,935,"[has view of, has views of, offers view of, vi...","[Sicht auf, Aussicht]","[tiene vista a, ofrece una vista de, ofrece vi...","[offre un panorama sur, vue sur, a vue sur, jo...","[offers view to, offers a view of]","[bietet Aussicht auf, bietet einen Ausblick auf]","[ofrece ver en, ofrece una vista de]",,"[to, view, offer, on, a, offers]","[bietet an, zu, an, ein, Ansicht, Angebot]","[seguro, oferta, en, vista, a, ofertas]","[au, pour, des offres, voir]"
3,P3085,qualifies for event,qualifiziert für Turnier,califica para el evento,qualifie pour,2770,[league this event qualifies for],,,[ligue à laquelle l'événement qualifie],"[qualified for tournament, qualify for the eve...","[qualifiziert sich für die Veranstaltung, sich...","[calificado para el torneo, califica para]","[se qualifier pour l'événement, qualifié pour ...","[event, qualifies, the, for, competition, qual...","[zum, Veranstaltung, qualifiziert, der]","[competencia, por, calificado, para, evento, c...","[se qualifie, la, pour, un événement, qualifié..."
4,P4646,representation of,Vorstellung von,representación de,représentation de,167,,,,,[idea of],[Repräsentation von],"[idea de, representacion de]","[idée de, la représentation de]","[representation, of, idea, from]","[von, Darstellung]","[desde, de, representación, idea]","[à partir de, de, représentation, idée]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,P5769,editor-in-chief,Chefredakteur,redactor en jefe,responsable éditorial,2248,"[executive editor, editor in chief, lead edito...",[Chefredakteurin],"[editor en jefe, editor ejecutivo, editor jefe]","[directeur de la rédaction, directrice de la r...","[editorial manager, chief editor]",[Redaktionsleiter],"[gerente editorial, editor en jefe]","[éditeur en chef, rédacteur en chef]","[boss, editorial, editor, on, responsible]","[Boss, verantwortlich, an, Editor, redaktionell]","[editorial, responsable]","[au, éditeur, chef]"
730,P8627,closing time,Schließzeit,hora de cierre,heure de fermeture,5,[closes at],"[geschlossen um, geschlossen ab, geöffnet bis]",[cierra a],[moment de fermeture],[closure hour],[Geschäftsschluss],,,"[of, time, closing, from, hour]","[Schließen, von, Zeit, Stunde]","[de, hora, clausura]","[heure, à partir de, fermeture, temps]"
731,P2770,source of income,Herkunft des Einkommens,fuente de ingreso,source de revenu,494,"[income source, source of revenue, revenue sou...",[Einkommensherkunft],[fuente de renta],,[Origin of Income],[Einkommensquelle],[Origen de los Ingresos],[Origine du revenu],"[of, from, origin, source, income, entry]","[Einkommen, von, Quelle, Eintrag]","[de, fuente, ingreso, origen]","[à partir de, de, la source, origine, le reven..."
732,P4794,season starts,Saisonstart,inicio de la temporada,début de la saison,2049,[season begins],[Saisonbeginn],[comienzo de la temporada],,[start of the season],"[Saison beginnt, Beginn der Saison]","[inicio de temporada, comienza la temporada]","[début de saison, la saison commence]","[of, beginning, season, from, the, start]","[von, Anfang, Jahreszeit, der, beginnt]","[inicio, los, de, empieza, temporada]","[début, départs, à partir de, saison, la]"


In [21]:
# Persist the enriched table (labels, aliases, translation and subword aliases) as JSON.
relations.to_json(path_or_buf='properties_w_aliases_full.json')