### Prepare inversion relations: from Wikidata query to MongoDB to relation list

This workflow is also used for equivalence relations.

In [1]:
import pandas as pd
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

In [2]:
from pymongo import MongoClient
# No host or port is passed, so the client connects with pymongo's defaults
client = MongoClient()

In [3]:
# Select the `wikidata` database; every collection used below is read from it
db = client.wikidata
# Show which collections exist (the cell's last expression is its output)
db.list_collection_names()

['entity2_triples',
 'entity_counts',
 'triplesInvert',
 'triples',
 'symmetric',
 'nsproperties',
 'inversion',
 'entity_subset',
 'antonym',
 'equivalence',
 'relation_subset',
 'entityList',
 'entity1_triples',
 'properties']

In [6]:
# Load relations
# Candidate inversion pairs: columns item1/item2 hold the property URIs,
# item1Label/item2Label their labels
inversion = pd.read_csv('../data/Relations/inversion_relation_candidates.csv')
inversion

Unnamed: 0,item1,item1Label,item2,item2Label
0,http://www.wikidata.org/entity/P1535,used by,http://www.wikidata.org/entity/P2283,uses
1,http://www.wikidata.org/entity/P1411,nominated for,http://www.wikidata.org/entity/P2453,nominee
2,http://www.wikidata.org/entity/P3148,repeals,http://www.wikidata.org/entity/P2568,repealed by
3,http://www.wikidata.org/entity/P2579,studied by,http://www.wikidata.org/entity/P2578,studies
4,http://www.wikidata.org/entity/P2682,is verso of,http://www.wikidata.org/entity/P2681,is recto of
5,http://www.wikidata.org/entity/P770,cause of destruction,http://www.wikidata.org/entity/P3082,destroyed
6,http://www.wikidata.org/entity/P425,field of this occupation,http://www.wikidata.org/entity/P3095,practiced by
7,http://www.wikidata.org/entity/P3730,next higher rank,http://www.wikidata.org/entity/P3729,next lower rank
8,http://www.wikidata.org/entity/P7376,worn by,http://www.wikidata.org/entity/P3828,wears
9,http://www.wikidata.org/entity/P144,based on,http://www.wikidata.org/entity/P4969,derivative work


In [7]:
# Turn the frame into a dict of columns: each column name maps to the list of
# its values, so 'item1' and 'item2' become two parallel lists of URIs
inversionList = inversion.to_dict(orient='list')
inversionList

{'item1': ['http://www.wikidata.org/entity/P1535',
  'http://www.wikidata.org/entity/P1411',
  'http://www.wikidata.org/entity/P3148',
  'http://www.wikidata.org/entity/P2579',
  'http://www.wikidata.org/entity/P2682',
  'http://www.wikidata.org/entity/P770',
  'http://www.wikidata.org/entity/P425',
  'http://www.wikidata.org/entity/P3730',
  'http://www.wikidata.org/entity/P7376',
  'http://www.wikidata.org/entity/P144',
  'http://www.wikidata.org/entity/P5135',
  'http://www.wikidata.org/entity/P1027',
  'http://www.wikidata.org/entity/P7904',
  'http://www.wikidata.org/entity/P8032',
  'http://www.wikidata.org/entity/P1308',
  'http://www.wikidata.org/entity/P22',
  'http://www.wikidata.org/entity/P25',
  'http://www.wikidata.org/entity/P8810',
  'http://www.wikidata.org/entity/P1830',
  'http://www.wikidata.org/entity/P156',
  'http://www.wikidata.org/entity/P453',
  'http://www.wikidata.org/entity/P1299',
  'http://www.wikidata.org/entity/P184',
  'http://www.wikidata.org/entity/P

In [8]:
# Peek at one stored document: the empty filter matches any document
db.inversion.find_one({})

{'_id': ObjectId('6211121842d84bceca31e7c5'),
 'id1': 'P1535',
 'labels1': {'en': {'language': 'en', 'value': 'used by'},
  'fr': {'language': 'fr', 'value': 'utilisé par'},
  'nb': {'language': 'nb', 'value': 'brukes av'},
  'nl': {'language': 'nl', 'value': 'gebruikt door'},
  'mk': {'language': 'mk', 'value': 'користено од'},
  'ru': {'language': 'ru', 'value': 'находится в пользовании у'},
  'be': {'language': 'be', 'value': 'выкарыстоўваецца'},
  'de': {'language': 'de', 'value': 'verwendet von'},
  'sr': {'language': 'sr', 'value': 'коришћено од'},
  'sr-ec': {'language': 'sr-ec', 'value': 'коришћено од'},
  'sr-el': {'language': 'sr-el', 'value': 'korišćeno od'},
  'zh-hans': {'language': 'zh-hans', 'value': '使用者'},
  'el': {'language': 'el', 'value': 'χρησιμοποιείται από'},
  'sv': {'language': 'sv', 'value': 'används av'},
  'cs': {'language': 'cs', 'value': 'uživatel'},
  'hu': {'language': 'hu', 'value': 'használó'},
  'scn': {'language': 'scn', 'value': 'adupiratu di'},
  '

In [10]:
# Load inversion relations from wikidata query into mongoDB
# `inversionList` holds two parallel lists of property URIs under 'item1' and
# 'item2'; each (item1, item2) pair is one inversion relation.
for relation1, relation2 in zip(inversionList['item1'], inversionList['item2']):
    # The property id (e.g. 'P1535') is the last path segment of the URI
    rel1 = relation1.rsplit('/', 1)[1]
    rel2 = relation2.rsplit('/', 1)[1]

    # Look up both properties through the qwikidata API helper
    r1_dict = get_entity_dict_from_api(rel1)
    r2_dict = get_entity_dict_from_api(rel2)

    # Keep only id and labels of each side, suffixed with 1 / 2
    r_subset = {
        'id1': r1_dict['id'],
        'labels1': r1_dict['labels'],
        'id2': r2_dict['id'],
        'labels2': r2_dict['labels'],
    }

    # Upsert keyed on the pair of ids: a first run inserts the document,
    # a re-run replaces it instead of adding a duplicate
    db.inversion.replace_one(
        {'id1': r_subset['id1'], 'id2': r_subset['id2']},
        r_subset,
        upsert=True,
    )

In [9]:
# Count every document in the inversion collection (empty filter matches all)
db.inversion.count_documents({})

57

#### Preprocess

In [10]:
def isLatin(s):
    """Return True if `s` can be encoded as latin-1, otherwise False."""
    try:
        s.encode('latin-1')
        return True
    except UnicodeEncodeError:
        # At least one character falls outside the latin-1 range
        return False

In [11]:
def has_numbers(inputString):
    """Return True if at least one character of `inputString` is a digit."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

In [12]:
def all_same(items):
    """Return True if every element of `items` equals its first element.

    An empty sequence yields True, since there is nothing to compare.
    """
    for x in items:
        if not (x == items[0]):
            return False
    return True

#### Process

In [25]:
# Look at one stored document again before filtering the labels by language
db.inversion.find_one({})

{'_id': ObjectId('6211121842d84bceca31e7c5'),
 'id1': 'P1535',
 'labels1': {'en': {'language': 'en', 'value': 'used by'},
  'fr': {'language': 'fr', 'value': 'utilisé par'},
  'nb': {'language': 'nb', 'value': 'brukes av'},
  'nl': {'language': 'nl', 'value': 'gebruikt door'},
  'mk': {'language': 'mk', 'value': 'користено од'},
  'ru': {'language': 'ru', 'value': 'находится в пользовании у'},
  'be': {'language': 'be', 'value': 'выкарыстоўваецца'},
  'de': {'language': 'de', 'value': 'verwendet von'},
  'sr': {'language': 'sr', 'value': 'коришћено од'},
  'sr-ec': {'language': 'sr-ec', 'value': 'коришћено од'},
  'sr-el': {'language': 'sr-el', 'value': 'korišćeno od'},
  'zh-hans': {'language': 'zh-hans', 'value': '使用者'},
  'el': {'language': 'el', 'value': 'χρησιμοποιείται από'},
  'sv': {'language': 'sv', 'value': 'används av'},
  'cs': {'language': 'cs', 'value': 'uživatel'},
  'hu': {'language': 'hu', 'value': 'használó'},
  'scn': {'language': 'scn', 'value': 'adupiratu di'},
  '

In [13]:
# Only get relations that are available in query_languages
# Should not be the same in all languages
# Each kept pair is flattened into one record: id1, <lang>1..., id2, <lang>2...
query_languages = ('en','es','fr','de')
cand = []      # pairs whose English labels pass the sanity check
candNon = []   # pairs whose English labels fail it

for document in db.inversion.find({}):
    side1 = document['labels1']
    side2 = document['labels2']

    # Skip pairs that lack a label in any query language on either side
    if not (all(k in side1 for k in query_languages) and all(k in side2 for k in query_languages)):
        continue

    # Label text per query language, in query_languages order
    values1 = [side1[k]['value'] for k in query_languages]
    values2 = [side2[k]['value'] for k in query_languages]

    # Ignore relations that are the same in all languages
    if all_same(values1) or all_same(values2):
        continue

    # Build the flat record once; it is the same whichever list receives it
    record = {'id1': document['id1']}
    for k, value in zip(query_languages, values1):
        record[k + '1'] = value
    record['id2'] = document['id2']
    for k, value in zip(query_languages, values2):
        record[k + '2'] = value

    # Sanity check if there are properties that are not in latin script or broken
    # (only the English labels are inspected)
    en1 = side1['en']['value']
    en2 = side2['en']['value']
    if not has_numbers(en1) and isLatin(en1) and not has_numbers(en2) and isLatin(en2):
        cand.append(record)
    else:
        candNon.append(record)

In [14]:
# Relation candidates to DataFrame: one row per record in `cand`,
# with the record keys (id1, <lang>1..., id2, <lang>2...) as columns
df = pd.DataFrame(cand)
df

Unnamed: 0,id1,en1,es1,fr1,de1,id2,en2,es2,fr2,de2
0,P1535,used by,utilizado por,utilisé par,verwendet von,P2283,uses,utiliza,utilise,benutzt
1,P1411,nominated for,nominado a,nommé pour,nominiert für,P2453,nominee,nominado,nommé,Nominierter
2,P3148,repeals,revoca,abroge,hebt auf,P2568,repealed by,abrogado por,abrogé par,aufgehoben durch
3,P2579,studied by,estudiado por,discipline dont c'est l'objet,Forschungsthema in,P2578,studies,estudia,objet de la discipline,studiert
4,P2682,is verso of,es verso de,est le verso,ist Rückseite von,P2681,is recto of,es recto de,est le recto,ist Vorderseite von
5,P770,cause of destruction,causa de destrucción,cause de la destruction,Grund der Zerstörung,P3082,destroyed,ha destruido,détruit,zerstört
6,P425,field of this occupation,ámbito de la ocupación,domaine d'occupation,Betätigungsfeld des Berufes,P3095,practiced by,practicado por,pratiqué par,praktiziert von
7,P3730,next higher rank,rango inmediatamente superior,rang immédiatement supérieur,nächsthöherer Rang,P3729,next lower rank,rango inmediatamente inferior,rang immédiatement inférieur,nächstniedrigerer Rang
8,P144,based on,basado en,basé sur,basiert auf,P4969,derivative work,obra derivada,œuvre dérivée,Bearbeitung
9,P5135,greater than,mayor que,supérieur à,größer als,P5136,less than,menor que,inférieur à,weniger als


In [17]:
# Save as CSV for manual cleanup
# index=False leaves the DataFrame's row index out of the file
df.to_csv('../data/Relations/inversion_multilingual.csv', index=False)