In [13]:
import pandas as pd
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

In [3]:
from transformers import TrainingArguments, DataCollatorForLanguageModeling, IntervalStrategy

In [3]:
# Input locations for the language-agnostic entity table and the cleaned
# property/alias table.
ENTITIES_CSV = '../data/entities/SingleToken/entities_languageAgnostic_clean.csv'
RELATIONS_JSON = '../data/knowledge/properties_w_aliases_full_cleaned.json'

# Read both tables into DataFrames.
entities = pd.read_csv(ENTITIES_CSV)
relations = pd.read_json(RELATIONS_JSON)

In [91]:
# Rebind `entities` to the English/Japanese table and preview it.
EN_JA_CSV = '../data/entities/SingleToken/multilingual/en_ja.csv'
entities = pd.read_csv(EN_JA_CSV)
entities

Unnamed: 0.1,Unnamed: 0,id,en,ja
0,0,Q230634,Asia,„Ç¢„Ç∏„Ç¢
1,1,Q383842,Africa,„Ç¢„Éï„É™„Ç´
2,2,Q4742725,America,„Ç¢„É°„É™„Ç´
3,3,Q1572339,Album,„Ç¢„É´„Éê„É†
4,4,Q233567,Italia,„Ç§„Çø„É™„Ç¢
...,...,...,...,...
550,550,Q10931275,Mo,Â¢®
551,551,Q1204499,Ding,Èºé
552,552,Q2751034,mouse,Èº†
553,553,Q7363,nose,Èºª


In [92]:
# Remove the leftover 'Unnamed: 0' column (presumably an index written out by
# an earlier save -- its values mirror the row index in the preview).
# errors='ignore' keeps this cell idempotent: once the cleaned file has been
# written back without that column, or if this cell is simply run twice,
# a plain drop would raise KeyError.
entities = entities.drop(columns=['Unnamed: 0'], errors='ignore')
entities

Unnamed: 0,id,en,ja
0,Q230634,Asia,„Ç¢„Ç∏„Ç¢
1,Q383842,Africa,„Ç¢„Éï„É™„Ç´
2,Q4742725,America,„Ç¢„É°„É™„Ç´
3,Q1572339,Album,„Ç¢„É´„Éê„É†
4,Q233567,Italia,„Ç§„Çø„É™„Ç¢
...,...,...,...
550,Q10931275,Mo,Â¢®
551,Q1204499,Ding,Èºé
552,Q2751034,mouse,Èº†
553,Q7363,nose,Èºª


In [93]:
# Write the cleaned table back over the source file; index=False stops pandas
# from adding the row index as an extra column.
EN_JA_CSV = '../data/entities/SingleToken/multilingual/en_ja.csv'
entities.to_csv(EN_JA_CSV, index=False)

In [7]:
# Fetch the raw entity dict for one sample Wikidata id and show its
# per-language labels.
SAMPLE_ENTITY_ID = 'Q144'
p_dict = get_entity_dict_from_api(SAMPLE_ENTITY_ID)
p_dict['labels']

{'en': {'language': 'en', 'value': 'dog'},
 'fr': {'language': 'fr', 'value': 'chien'},
 'it': {'language': 'it', 'value': 'cane'},
 'nb': {'language': 'nb', 'value': 'tamhund'},
 'be-tarask': {'language': 'be-tarask', 'value': '—Å–∞–±–∞–∫–∞'},
 'yue': {'language': 'yue', 'value': 'Áãó'},
 'sgs': {'language': 'sgs', 'value': '≈†ova'},
 'nan': {'language': 'nan', 'value': 'K√°u'},
 'nds-nl': {'language': 'nds-nl', 'value': 'Hond'},
 'vro': {'language': 'vro', 'value': 'Pini'},
 'de': {'language': 'de', 'value': 'Hund'},
 'br': {'language': 'br', 'value': 'ki'},
 'ru': {'language': 'ru', 'value': '—Å–æ–±–∞–∫–∞'},
 'hif': {'language': 'hif', 'value': 'Kuttaa'},
 'es': {'language': 'es', 'value': 'perro'},
 'ab': {'language': 'ab', 'value': '–ê–ª–∞'},
 'ace': {'language': 'ace', 'value': 'As√®√´'},
 'af': {'language': 'af', 'value': 'hond'},
 'am': {'language': 'am', 'value': '·ãç·àª'},
 'an': {'language': 'an', 'value': 'Canis lupus familiaris'},
 'arc': {'language': 'arc', 'value': '‹ü‹†

In [9]:
# Show the label text stored under the 'zh-hans' language code for the entity
# currently held in `p_dict`.
# NOTE(review): the result depends on whichever entity `p_dict` was last bound
# to; the lookup fails with KeyError if that entity has no 'zh-hans' label.
p_dict['labels']['zh-hans']['value']

'Áä¨'

**Load mBERT**

In [14]:
import torch
from transformers import BertTokenizerFast, BertForMaskedLM

In [15]:
# Load the mBERT tokenizer and masked-LM model from the same checkpoint name.
MBERT_CHECKPOINT = 'bert-base-multilingual-cased'
tokenizer = BertTokenizerFast.from_pretrained(MBERT_CHECKPOINT)
model = BertForMaskedLM.from_pretrained(MBERT_CHECKPOINT)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
# Run the loaded tokenizer on the 'zh-hant' label of the entity in `p_dict`
# to see how many tokens the label is split into.
# NOTE(review): depends on the current binding of `p_dict`; raises KeyError if
# that entity has no 'zh-hant' label.
tokenizer.tokenize(p_dict['labels']['zh-hant']['value'])

['Ê≤ñ', 'Áπ©']

In [5]:
# Display the relations table read from properties_w_aliases_full_cleaned.json.
relations

Unnamed: 0,id,en,de,es,fr,count,en_alias,de_alias,es_alias,fr_alias,en_translate_alias,de_translate_alias,es_translate_alias,fr_translate_alias,en_subword_alias,de_subword_alias,es_subword_alias,fr_subword_alias
0,P3719,regulated by,reguliert von,regulado por,r√©gi par,461,"[regulator, regulatory authority]",,[regulada por],"[r√©gie par, r√©gul√© par, r√©gul√©e par, r√©glement...",[governed by],[regiert durch],"[regulada por, gobernado por]",[r√©gul√©e par],"[by, from, governed, regulated]","[regiert, durch, geregelt]","[regulado, desde, por, gobernado]","[√† partir de, pour, r√©glement√©, par]"
1,P1704,is pollinator of,Best√§uber von,polinizador,pollinisateur de,58,"[pollinator of, pollinates]",,,"[pollinise, plante pollinis√©e]","[pollinator of, pollinator]","[Best√§uber, ist Best√§uber von]","[es polinizador de, polinizador de]","[est pollinisateur de, pollinisateur]","[of, from, pollinator]","[von, Best√§uber, ist]","[de, desde, es, polinizador]","[pollinisateur, √† partir de, de, est]"
2,P3173,offers view on,bietet Sicht auf,ofrece vista a,offre une vue sur,935,"[has view of, has views of, offers view of, vi...","[Sicht auf, Aussicht]","[tiene vista a, ofrece una vista de, ofrece vi...","[offre un panorama sur, vue sur, a vue sur, jo...","[offers view to, offers a view of]","[bietet Aussicht auf, bietet einen Ausblick auf]","[ofrece ver en, ofrece una vista de]",,"[to, view, offer, on, a, offers]","[bietet an, zu, an, ein, Ansicht, Angebot]","[seguro, oferta, en, vista, a, ofertas]","[au, pour, des offres, voir]"
3,P3085,qualifies for event,qualifiziert f√ºr Turnier,califica para el evento,qualifie pour,2770,[league this event qualifies for],,,[ligue √† laquelle l'√©v√©nement qualifie],"[qualified for tournament, qualify for the eve...","[qualifiziert sich f√ºr die Veranstaltung, sich...","[calificado para el torneo, califica para]","[se qualifier pour l'√©v√©nement, qualifi√© pour ...","[event, qualifies, the, for, competition, qual...","[zum, Veranstaltung, qualifiziert, der]","[competencia, por, calificado, para, evento, c...","[se qualifie, la, pour, un √©v√©nement, qualifi√©..."
4,P4646,representation of,Vorstellung von,representaci√≥n de,repr√©sentation de,167,,,,,[idea of],[Repr√§sentation von],"[idea de, representacion de]","[id√©e de, la repr√©sentation de]","[representation, of, idea, from]","[von, Darstellung]","[desde, de, representaci√≥n, idea]","[√† partir de, de, repr√©sentation, id√©e]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,P5769,editor-in-chief,Chefredakteur,redactor en jefe,responsable √©ditorial,2248,"[executive editor, editor in chief, lead edito...",[Chefredakteurin],"[editor en jefe, editor ejecutivo, editor jefe]","[directeur de la r√©daction, directrice de la r...","[editorial manager, chief editor]",[Redaktionsleiter],"[gerente editorial, editor en jefe]","[√©diteur en chef, r√©dacteur en chef]","[boss, editorial, editor, on, responsible]","[Boss, verantwortlich, an, Editor, redaktionell]","[editorial, responsable]","[au, √©diteur, chef]"
730,P8627,closing time,Schlie√üzeit,hora de cierre,heure de fermeture,5,[closes at],"[geschlossen um, geschlossen ab, ge√∂ffnet bis]",[cierra a],[moment de fermeture],[closure hour],[Gesch√§ftsschluss],,,"[of, time, closing, from, hour]","[Schlie√üen, von, Zeit, Stunde]","[de, hora, clausura]","[heure, √† partir de, fermeture, temps]"
731,P2770,source of income,Herkunft des Einkommens,fuente de ingreso,source de revenu,494,"[income source, source of revenue, revenue sou...",[Einkommensherkunft],[fuente de renta],,[Origin of Income],[Einkommensquelle],[Origen de los Ingresos],[Origine du revenu],"[of, from, origin, source, income, entry]","[Einkommen, von, Quelle, Eintrag]","[de, fuente, ingreso, origen]","[√† partir de, de, la source, origine, le reven..."
732,P4794,season starts,Saisonstart,inicio de la temporada,d√©but de la saison,2049,[season begins],[Saisonbeginn],[comienzo de la temporada],,[start of the season],"[Saison beginnt, Beginn der Saison]","[inicio de temporada, comienza la temporada]","[d√©but de saison, la saison commence]","[of, beginning, season, from, the, start]","[von, Anfang, Jahreszeit, der, beginnt]","[inicio, los, de, empieza, temporada]","[d√©but, d√©parts, √† partir de, saison, la]"


In [8]:
# Show only the identifying fields of the entity dict, in this order.
{field: p_dict[field] for field in ('id', 'type', 'labels')}

{'id': 'P8005',
 'type': 'property',
 'labels': {'en': {'language': 'en',
   'value': 'research subject recruitment status'},
  'mk': {'language': 'mk', 'value': '—Å–æ—Å—Ç–æ—ò–±–∞ –Ω–∞ –ø—Ä–∏–±–∏—Ä–∞—ö–µ—Ç–æ –Ω–∞ –∏—Å–ø–∏—Ç–∞–Ω–∏—Ü–∏'},
  'sr': {'language': 'sr', 'value': '—Å—Ç–∞—Ç—É—Å —Ä–µ–≥—Ä—É—Ç–æ–≤–∞—ö–∞ —Å—É–±—ò–µ–∫—Ç–∞ –∏—Å—Ç—Ä–∞–∂–∏–≤–∞—ö–∞'},
  'nl': {'language': 'nl',
   'value': 'aanwervingsstatus van het onderzoeksonderwerp'},
  'ca': {'language': 'ca', 'value': 'estat de contractaci√≥ de la recerca'},
  'be-tarask': {'language': 'be-tarask',
   'value': '—Å—Ç–∞—Ç—É—Å –Ω–∞–±–æ—Ä—É –¥–∞—Å—å–ª–µ–¥–Ω—ñ—Ü–∫–∞–≥–∞ –ø—Ä–∞–¥–º–µ—Ç—É'},
  'tr': {'language': 'tr', 'value': 'ara≈ütƒ±rma konusu i≈üe alƒ±m durumu'},
  'uk': {'language': 'uk', 'value': '—Å—Ç–∞—Ç—É—Å –Ω–∞–±–æ—Ä—É –æ–±‚Äô—î–∫—Ç–∞ –¥–æ—Å–ª—ñ–¥–∂–µ–Ω–Ω—è'},
  'fr': {'language': 'fr',
   'value': 'statut de recrutement des sujets de recherche'},
  'bn': {'language': 'bn', 'value': '‡¶ó‡¶¨‡ßá‡¶∑‡¶£‡¶æ ‡¶¨‡¶ø‡¶∑‡¶Ø‡¶º ‡¶®‡¶ø‡¶Ø

In [None]:
# Wrap the raw entity dict in qwikidata's WikidataProperty helper.
# NOTE(review): `p_dict` is whatever entity was fetched last; presumably this
# needs a property (P...) dict rather than the Q144 item fetched earlier in
# the notebook -- confirm before running.
p = WikidataProperty(p_dict)

In [23]:
# Read the list of properties from a CSV in the working directory.
PROPERTIES_CSV = 'properties_notsymmetric.csv'
properties = pd.read_csv(PROPERTIES_CSV)

In [24]:
# Extract the '_id' column as a plain Python list of property ids.
# Selecting the single column first avoids converting every column of
# `properties` to a list just to read one of them.
propertyList = properties['_id'].tolist()
propertyList

['P1034',
 'P5008',
 'P457',
 'P8004',
 'P7442',
 'P532',
 'P740',
 'P1817',
 'P1425',
 'P122',
 'P58',
 'P5249',
 'P1629',
 'P2876',
 'P2597',
 'P9072',
 'P3085',
 'P1704',
 'P8263',
 'P1716',
 'P3173',
 'P4099',
 'P1465',
 'P1165',
 'P741',
 'P4043',
 'P5110',
 'P6889',
 'P3719',
 'P8839',
 'P4646',
 'P3823',
 'P3772',
 'P8652',
 'P4329',
 'P8546',
 'P3080',
 'P534',
 'P4323',
 'P141',
 'P7167',
 'P1951',
 'P1035',
 'P2554',
 'P2358',
 'P9887',
 'P3774',
 'P66',
 'P8006',
 'P2853',
 'P7163',
 'P3275',
 'P7938',
 'P786',
 'P106',
 'P636',
 'P85',
 'P2841',
 'P1811',
 'P10067',
 'P631',
 'P9325',
 'P5588',
 'P1158',
 'P4608',
 'P103',
 'P908',
 'P5995',
 'P37',
 'P6563',
 'P1435',
 'P689',
 'P568',
 'P2502',
 'P3158',
 'P2453',
 'P3402',
 'P9813',
 'P1855',
 'P2614',
 'P38',
 'P183',
 'P5996',
 'P163',
 'P5841',
 'P4805',
 'P556',
 'P6902',
 'P4988',
 'P6948',
 'P2992',
 'P2438',
 'P3491',
 'P282',
 'P291',
 'P2548',
 'P6275',
 'P20',
 'P3680',
 'P5202',
 'P2184',
 'P3027',
 'P411',
 '

In [25]:
def _fetch_property_label(prop_id):
    """Fetch one Wikidata property and return its id/label record.

    Parameters
    ----------
    prop_id : str
        Wikidata property identifier taken from ``propertyList``.

    Returns
    -------
    dict
        ``{'id': prop_id, 'label': <label>}``, where the label is whatever
        ``WikidataProperty.get_label()`` returns when called with its default
        arguments (English in the recorded output -- TODO confirm default).
    """
    # Local names on purpose: the inspection cells earlier in the notebook read
    # the notebook-level `p_dict` / `p`, and rebinding them on every iteration
    # here made those cells depend on execution order.
    property_dict = get_entity_dict_from_api(prop_id)
    return {'id': prop_id, 'label': WikidataProperty(property_dict).get_label()}


# One API request per property id, in list order. Appending inside a loop
# (rather than building the list in one expression) keeps the records gathered
# so far in `cand`, and the failing id in `prop`, if a request raises part-way.
cand = []

for prop in propertyList:
    cand.append(_fetch_property_label(prop))

In [26]:
# Turn the collected id/label records into a table and preview it.
df = pd.DataFrame(data=cand)
df

Unnamed: 0,id,label
0,P1034,main food source
1,P5008,on focus list of Wikimedia project
2,P457,foundational text
3,P8004,ordered by
4,P7442,glacier status
...,...,...
1137,P8627,closing time
1138,P7152,copyright exemption
1139,P2770,source of income
1140,P4794,season starts


In [27]:
# Export the labelled properties (without the row index) so they can be
# cleaned up by hand.
LABELED_PROPERTIES_CSV = 'property_notsymmetric_labeled.csv'
df.to_csv(LABELED_PROPERTIES_CSV, index=False)