### Multilingual labels for Symmetry and Equivalence

In [1]:
import pandas as pd
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

In [2]:
# Open a MongoDB connection; no host/port is passed, so the driver's default
# connection settings are used.
from pymongo import MongoClient
client = MongoClient()

In [3]:
# Select the `wikidata` database and list the collections it contains
db = client.wikidata
db.list_collection_names()

['entity2_triples',
 'entity_counts',
 'triplesInvert',
 'triples',
 'equivalence',
 'nsproperties',
 'inversion',
 'entity_subset',
 'antonym',
 'relation_subset',
 'entityList',
 'entity1_triples',
 'properties',
 'symmetry']

In [4]:
# Total number of documents in the `properties` collection
db.properties.count_documents({})

1371

In [6]:
# Peek at one document to see what a `properties` entry looks like
db.properties.find_one({})

{'_id': 'P1034'}

In [12]:
# Total number of documents in the `relation_subset` collection
db.relation_subset.count_documents({})

1142

In [10]:
# Peek at one document to see what a `relation_subset` entry looks like
db.relation_subset.find_one({})

{'_id': ObjectId('6210c432cf55a123777d8516'),
 'id': 'P1034',
 'type': 'property',
 'labels': {'en': {'language': 'en', 'value': 'main food source'},
  'nb': {'language': 'nb', 'value': 'hovednæringskilde'},
  'ca': {'language': 'ca', 'value': "principal font d'alimentació"},
  'pt': {'language': 'pt', 'value': 'principal fonte alimentar'},
  'de': {'language': 'de', 'value': 'Hauptnahrungsquelle'},
  'it': {'language': 'it', 'value': 'principale fonte di nutrizione'},
  'hu': {'language': 'hu', 'value': 'fő élelemforrás'},
  'fr': {'language': 'fr', 'value': 'source principale de nourriture'},
  'pl': {'language': 'pl', 'value': 'główne źródło pożywienia'},
  'es': {'language': 'es', 'value': 'fuente de alimento principal'},
  'ru': {'language': 'ru', 'value': 'основной источник питания'},
  'fa': {'language': 'fa', 'value': 'منبع غذایی اصلی'},
  'nl': {'language': 'nl', 'value': 'belangrijkste voedselbron'},
  'br': {'language': 'br', 'value': 'Mammenn emvagañ bennañ'},
  'mk': {'lan

In [12]:
# Distinct values of the `relation` field across the `triples` collection
test  = db.triples.distinct('relation')

In [13]:
# Number of distinct relations found in `triples`
len(test)

1371

In [11]:
# Load the properties into MongoDB, keeping only id, type and labels.
# NOTE(review): `propertyList` is not defined in any visible cell before this
# one; this loop needs an iterable of property IDs (e.g. 'P530') — TODO confirm
# where it comes from so the notebook survives Restart & Run All.

for prop in propertyList:
    # Fetch the full entity dictionary for this property
    p_dict = get_entity_dict_from_api(prop)

    # Take only a subset of keys
    p_subset = {k: p_dict[k] for k in ('id', 'type', 'labels')}

    # Upsert keyed on the property id so that re-running this cell replaces
    # the existing document instead of inserting a duplicate
    db.symmetric.replace_one({'id': p_subset['id']}, p_subset, upsert=True)

In [12]:
# TODO: Maybe also use less used relations?
# Peek at one stored document to check the id/type/labels layout
db.symmetric.find_one({})

{'_id': ObjectId('6210fede67e09f0d729923b3'),
 'id': 'P530',
 'type': 'property',
 'labels': {'en': {'language': 'en', 'value': 'diplomatic relation'},
  'nl': {'language': 'nl', 'value': 'diplomatieke relatie'},
  'de': {'language': 'de', 'value': 'diplomatische Beziehung'},
  'fa': {'language': 'fa', 'value': 'رابطۀ دیپلماتیک'},
  'hu': {'language': 'hu', 'value': 'diplomáciai kapcsolat'},
  'ca': {'language': 'ca', 'value': 'relació diplomàtica'},
  'fr': {'language': 'fr', 'value': 'relation diplomatique'},
  'it': {'language': 'it', 'value': 'relazione diplomatica'},
  'sl': {'language': 'sl', 'value': 'diplomatski odnosi'},
  'es': {'language': 'es', 'value': 'relación diplomática'},
  'ru': {'language': 'ru', 'value': 'дипломатические отношения'},
  'nn': {'language': 'nn', 'value': 'diplomatisk samband'},
  'gu': {'language': 'gu', 'value': 'રાજદ્વારી સંબંધો'},
  'sr': {'language': 'sr', 'value': 'дипломатски односи'},
  'sr-ec': {'language': 'sr-ec', 'value': 'дипломатски одно

#### Utils

In [19]:
def isLatin(s):
    """Return True if every character of `s` can be encoded as Latin-1.

    This is an encodability check, not a script check: characters outside
    ISO-8859-1 make it return False even when they are Latin letters.
    """
    try:
        s.encode('latin-1')
        return True
    except UnicodeEncodeError:
        return False

In [20]:
def has_numbers(inputString):
    """Return True if at least one character of `inputString` is a digit."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

In [18]:
def all_same(items):
    """Return True if every element of the sequence equals its first element.

    An empty sequence yields True, since there is nothing to compare.
    """
    for item in items:
        if not item == items[0]:
            return False
    return True

#### Processing

In [16]:
# Language codes a relation must have a label for in order to be kept;
# the candidate cells below emit the labels in this same order.
query_languages = ('en','es','fr','de')

In [21]:
# Collect symmetric relations that have a label in every query language.
# Relations whose label is identical in all of those languages are dropped.
cand = []
candNon = []

for document in db.symmetric.find({}):
    # Skip relations missing a label in any query language
    if not all(lang in document['labels'] for lang in query_languages):
        continue

    # Skip relations whose label is the same in all languages
    values = [document['labels'][lang]['value'] for lang in query_languages]
    if all_same(values):
        continue

    # One row per relation: the id followed by one column per language
    row = {'id': document['id']}
    row.update(zip(query_languages, values))

    # Sanity check on the English label: rows with digits or characters
    # outside Latin-1 are set aside in candNon instead of cand
    english = document['labels']['en']['value']
    if has_numbers(english) or not isLatin(english):
        candNon.append(row)
    else:
        cand.append(row)

In [22]:
# Inspect the accepted symmetric candidates
cand

[{'id': 'P530',
  'en': 'diplomatic relation',
  'es': 'relación diplomática',
  'fr': 'relation diplomatique',
  'de': 'diplomatische Beziehung'},
 {'id': 'P47',
  'en': 'shares border with',
  'es': 'comparte fronteras con',
  'fr': 'limitrophe de',
  'de': 'gemeinsame Grenze mit'},
 {'id': 'P461',
  'en': 'opposite of',
  'es': 'opuesto a',
  'fr': 'contraire',
  'de': 'Gegenteil von'},
 {'id': 'P460',
  'en': 'said to be the same as',
  'es': 'se dice que es lo mismo que',
  'fr': 'réputé identique à',
  'de': 'eventuell gleichwertig'},
 {'id': 'P3373',
  'en': 'sibling',
  'es': 'hermano o hermana',
  'fr': 'frère ou sœur',
  'de': 'Geschwister'},
 {'id': 'P26',
  'en': 'spouse',
  'es': 'cónyuge',
  'fr': 'conjoint',
  'de': 'Ehepartner(in)'},
 {'id': 'P7047',
  'en': 'enemy of',
  'es': 'enemigo de',
  'fr': 'ennemi de',
  'de': 'Feind von'},
 {'id': 'P451',
  'en': 'unmarried partner',
  'es': 'pareja',
  'fr': 'partenaire',
  'de': 'Lebenspartner/-in'},
 {'id': 'P112',
  'en':

In [25]:
# Inspect the candidates that failed the English-label sanity check
candNon

[]

#### Postprocess and Save

In [23]:
# Relation candidates to DataFrame: one row per property, with the id and
# one label column per query language
df = pd.DataFrame(cand)
df

Unnamed: 0,id,en,es,fr,de
0,P530,diplomatic relation,relación diplomática,relation diplomatique,diplomatische Beziehung
1,P47,shares border with,comparte fronteras con,limitrophe de,gemeinsame Grenze mit
2,P461,opposite of,opuesto a,contraire,Gegenteil von
3,P460,said to be the same as,se dice que es lo mismo que,réputé identique à,eventuell gleichwertig
4,P3373,sibling,hermano o hermana,frère ou sœur,Geschwister
...,...,...,...,...,...
214,P971,category combines topics,temas asociados por la categoría,sujets associés à la catégorie,Kategorie kombiniert die Themen
215,P797,authority,autoridad ejecutiva,autorité exécutive,Exekutive
216,P5869,model item,elemento ejemplo,élément Wikidata exemplaire,Beispielelement
217,P1304,central bank,banco central,banque centrale,Zentralbank (Land)


In [26]:
# Save as CSV for manual cleanup (the DataFrame index is not written)
df.to_csv('Relations/symmetric_multilingual.csv', index=False)

#### Equivalence

In [38]:
# Read the equivalence relation pairs and shorten the two id column names
properties = (
    pd.read_csv('Relations/equivalence_relations.csv')
    .rename(columns={'r1.id': 'id1', 'r2.id': 'id2'})
)
properties

Unnamed: 0,id1,id2
0,P150,P527
1,P361,P463
2,P47,P530
3,P2936,P37
4,P30,P361
...,...,...
7374,P17,P669
7375,P50,P943
7376,P123,P725
7377,P5056,P735


In [39]:
# Convert the frame to a dict mapping each column name ('id1', 'id2') to the
# list of its values, so the pairs can be zipped together when loading.
# NOTE(review): the symmetric-loading cell iterates `propertyList` as a flat
# sequence of property IDs; here the same name is bound to a dict of columns.
propertyList = properties.to_dict('list')
propertyList

{'id1': ['P150',
  'P361',
  'P47',
  'P2936',
  'P30',
  'P1412',
  'P119',
  'P119',
  'P119',
  'P103',
  'P103',
  'P1830',
  'P1830',
  'P20',
  'P19',
  'P551',
  'P101',
  'P2652',
  'P101',
  'P170',
  'P112',
  'P170',
  'P127',
  'P127',
  'P127',
  'P137',
  'P137',
  'P112',
  'P112',
  'P112',
  'P112',
  'P2184',
  'P2283',
  'P1589',
  'P1365',
  'P1889',
  'P1889',
  'P2283',
  'P676',
  'P279',
  'P1889',
  'P1889',
  'P1336',
  'P122',
  'P361',
  'P150',
  'P1344',
  'P35',
  'P1313',
  'P2012',
  'P186',
  'P150',
  'P1889',
  'P1622',
  'P1365',
  'P1365',
  'P27',
  'P30',
  'P131',
  'P20',
  'P1066',
  'P19',
  'P119',
  'P1344',
  'P180',
  'P279',
  'P279',
  'P19',
  'P47',
  'P155',
  'P155',
  'P156',
  'P1376',
  'P840',
  'P57',
  'P272',
  'P17',
  'P185',
  'P108',
  'P1066',
  'P1066',
  'P184',
  'P101',
  'P1411',
  'P17',
  'P31',
  'P131',
  'P1376',
  'P131',
  'P112',
  'P112',
  'P138',
  'P20',
  'P19',
  'P1376',
  'P1376',
  'P131',
  'P2789'

In [54]:
# Load the relation pairs into MongoDB, keeping the id and labels of each side.

# The same property ID occurs in many pairs, so remember the (id, labels)
# subset of each API response and fetch every property only once.
entity_cache = {}

for rel1, rel2 in zip(propertyList['id1'], propertyList['id2']):
    for rel in (rel1, rel2):
        if rel not in entity_cache:
            entity = get_entity_dict_from_api(rel)
            entity_cache[rel] = {k: entity[k] for k in ('id', 'labels')}

    r1_subset = entity_cache[rel1]
    r2_subset = entity_cache[rel2]

    # Merge both sides into one document with suffixed keys
    r_subset = {
        'id1': r1_subset['id'],
        'labels1': r1_subset['labels'],
        'id2': r2_subset['id'],
        'labels2': r2_subset['labels'],
    }

    # Upsert keyed on the (id1, id2) pair so that re-running this cell
    # replaces the existing document instead of inserting a duplicate
    db.equivalence.replace_one(
        {'id1': r_subset['id1'], 'id2': r_subset['id2']},
        r_subset,
        upsert=True,
    )

In [56]:
# Peek at one stored pair to check the id1/labels1/id2/labels2 layout
db.equivalence.find_one({})

{'_id': ObjectId('621104eb67e09f0d72992496'),
 'id1': 'P150',
 'labels1': {'cs': {'language': 'cs', 'value': 'nižší správní celky'},
  'en': {'language': 'en',
   'value': 'contains administrative territorial entity'},
  'it': {'language': 'it', 'value': 'sottodivisioni amministrative'},
  'fr': {'language': 'fr',
   'value': 'contient les subdivisions territoriales administratives'},
  'zh-hans': {'language': 'zh-hans', 'value': '包含行政领土实体'},
  'zh-hant': {'language': 'zh-hant', 'value': '包含行政領土實體'},
  'pt-br': {'language': 'pt-br', 'value': 'subdivide-se em'},
  'uk': {'language': 'uk',
   'value': 'адміністративно-територіально поділяється на'},
  'eo': {'language': 'eo', 'value': 'subdividoj'},
  'ca': {'language': 'ca', 'value': 'conté la subdivisió administrativa'},
  'hr': {'language': 'hr', 'value': 'podjedinice'},
  'fa': {'language': 'fa', 'value': 'زیربخش'},
  'nds': {'language': 'nds', 'value': 'Ünnerafdelen'},
  'lv': {'language': 'lv', 'value': 'apakšiedalījumi'},
  'be': 

In [57]:
# Collect relation pairs where both sides have a label in every query
# language. A pair is dropped if either side's label is identical in all
# of those languages.
cand = []
candNon = []

for document in db.equivalence.find({}):
    # Skip pairs where either side is missing a label in any query language
    if not all(lang in document['labels1'] for lang in query_languages):
        continue
    if not all(lang in document['labels2'] for lang in query_languages):
        continue

    # Skip pairs where one side's label is the same in all languages
    values1 = [document['labels1'][lang]['value'] for lang in query_languages]
    values2 = [document['labels2'][lang]['value'] for lang in query_languages]
    if all_same(values1) or all_same(values2):
        continue

    # Sanity check on both English labels: no digits, and encodable as Latin-1
    english_labels = (
        document['labels1']['en']['value'],
        document['labels2']['en']['value'],
    )
    is_clean = all(
        not has_numbers(text) and isLatin(text) for text in english_labels
    )

    # One row per pair: id and labels of side 1, then id and labels of side 2
    row = {'id1': document['id1']}
    row.update((lang + '1', value) for lang, value in zip(query_languages, values1))
    row['id2'] = document['id2']
    row.update((lang + '2', value) for lang, value in zip(query_languages, values2))

    if is_clean:
        cand.append(row)
    else:
        candNon.append(row)

In [58]:
# Inspect the accepted equivalence candidates
cand

[{'id1': 'P150',
  'en1': 'contains administrative territorial entity',
  'es1': 'subdividido en (división administrativa)',
  'fr1': 'contient les subdivisions territoriales administratives',
  'de1': 'Untereinheit (administrative Einheit)',
  'id2': 'P527',
  'en2': 'has part',
  'es2': 'compuesto por',
  'fr2': 'comprend',
  'de2': 'besteht aus'},
 {'id1': 'P361',
  'en1': 'part of',
  'es1': 'forma parte de',
  'fr1': 'partie de',
  'de1': 'ist Teil von',
  'id2': 'P463',
  'en2': 'member of',
  'es2': 'miembro de',
  'fr2': 'membre de',
  'de2': 'Mitglied von'},
 {'id1': 'P47',
  'en1': 'shares border with',
  'es1': 'comparte fronteras con',
  'fr1': 'limitrophe de',
  'de1': 'gemeinsame Grenze mit',
  'id2': 'P530',
  'en2': 'diplomatic relation',
  'es2': 'relación diplomática',
  'fr2': 'relation diplomatique',
  'de2': 'diplomatische Beziehung'},
 {'id1': 'P2936',
  'en1': 'language used',
  'es1': 'idioma usado',
  'fr1': 'langue utilisée',
  'de1': 'genutzte Sprache',
  'id

In [60]:
# Number of relation pairs that passed all filters
len(cand)

7242

In [59]:
# Inspect the pairs that failed the English-label sanity check
candNon

[]

In [61]:
# Relation-pair candidates to DataFrame: one row per pair, with the id and
# per-language labels of side 1 followed by those of side 2
df = pd.DataFrame(cand)
df

Unnamed: 0,id1,en1,es1,fr1,de1,id2,en2,es2,fr2,de2
0,P150,contains administrative territorial entity,subdividido en (división administrativa),contient les subdivisions territoriales admini...,Untereinheit (administrative Einheit),P527,has part,compuesto por,comprend,besteht aus
1,P361,part of,forma parte de,partie de,ist Teil von,P463,member of,miembro de,membre de,Mitglied von
2,P47,shares border with,comparte fronteras con,limitrophe de,gemeinsame Grenze mit,P530,diplomatic relation,relación diplomática,relation diplomatique,diplomatische Beziehung
3,P2936,language used,idioma usado,langue utilisée,genutzte Sprache,P37,official language,idioma oficial,langue officielle,Amtssprache
4,P30,continent,continente,continent,Kontinent,P361,part of,forma parte de,partie de,ist Teil von
...,...,...,...,...,...,...,...,...,...,...
7237,P17,country,país,pays,Staat,P669,located on street or square,situado en la calle,voie,Straße oder Platz
7238,P50,author,autor,auteur,Autor,P943,programmer,programador,programmeur,Programmierer
7239,P123,publisher,editorial,éditeur,Verlag,P725,voice actor,actor de voz,comédien de doublage,Sprecher
7240,P5056,patronym or matronym for this person,patronímico o matronímico para esta persona,patronyme ou matronyme pour cette personne,Patronym oder Matronym dieser Person,P735,given name,nombre de pila,prénom,Vorname


In [62]:
# Save as CSV for manual cleanup (the DataFrame index is not written)
df.to_csv('Relations/equivalence_multilingual.csv', index=False)