### Antonym Entity Generation Notebook
Using the entity generation notebook and the wikidata query results that generated antonym to make a antonym csv with multiple language labels.

We take the entities by the number of occurences and do some preprocessing, essentially checking if the requirements of our entities are fulfilled and if the entities are in latin characters.

For this we use a MongoDB Database that has the entities and cleaned entity/property triples. We count the occurences of entities in the triples and then join the collection with the entity collection so that every entity is also associated with its occurence value.

Requirements:
- labels the same in en, fr, es, de
- single tokens

Cleanup:
- No non-ascii symbols
- No numbers

In [109]:
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

In [110]:
import pandas as pd
from pymongo import MongoClient
client = MongoClient()

In [111]:
db = client.wikidata

In [3]:
db.list_collection_names()

['entity2_triples',
 'entity_counts',
 'triplesInvert',
 'triples',
 'symmetric',
 'nsproperties',
 'inversion',
 'entity_subset',
 'equivalence',
 'relation_subset',
 'entityList',
 'entity1_triples',
 'properties']

In [7]:
# Load relations
properties = pd.read_csv('antonyms.csv')
properties

Unnamed: 0,item1,item1Label,item2,item2Label
0,http://www.wikidata.org/entity/Q106521687,antiverse,http://www.wikidata.org/entity/Q1,universe
1,http://www.wikidata.org/entity/Q527,sky,http://www.wikidata.org/entity/Q2,Earth
2,http://www.wikidata.org/entity/Q4,death,http://www.wikidata.org/entity/Q3,life
3,http://www.wikidata.org/entity/Q3,life,http://www.wikidata.org/entity/Q4,death
4,http://www.wikidata.org/entity/Q14819852,birth,http://www.wikidata.org/entity/Q4,death
...,...,...,...,...
23088,http://www.wikidata.org/entity/Q2213986,overachievement,http://www.wikidata.org/entity/Q110687770,underachievement
23089,http://www.wikidata.org/entity/Q110693173,Spanish ambassador to the Austrian Empire,http://www.wikidata.org/entity/Q110693166,Ambassador of the Austrian Empire in Spain
23090,http://www.wikidata.org/entity/Q110693166,Ambassador of the Austrian Empire in Spain,http://www.wikidata.org/entity/Q110693173,Spanish ambassador to the Austrian Empire
23091,http://www.wikidata.org/entity/Q110693217,Ambassador of Spain in the Archduchy of Austria,http://www.wikidata.org/entity/Q110693205,Ambassador of the Archduchy of Austria in Spain


In [26]:
db.entityList.find_one({})

{'_id': ObjectId('6206731a578a2e954363d9da'),
 'id': 'Q28638',
 'type': 'item',
 'labels': {'ru': {'language': 'ru', 'value': 'Панславянские языки'},
  'en': {'language': 'en', 'value': 'Pan-Slavic language'},
  'de': {'language': 'de', 'value': 'Panslawische Sprache'},
  'es': {'language': 'es', 'value': 'paneslávico'},
  'fr': {'language': 'fr', 'value': 'langue pan-slaves'},
  'it': {'language': 'it', 'value': 'Lingue artificiali a base slava'},
  'zh': {'language': 'zh', 'value': '泛斯拉夫語言'},
  'hu': {'language': 'hu', 'value': 'pánszláv nyelv'},
  'pt': {'language': 'pt', 'value': 'língua pan-eslava'}}}

In [38]:
propertyList = properties.to_dict('list')
propertyList

{'item1': ['http://www.wikidata.org/entity/Q106521687',
  'http://www.wikidata.org/entity/Q527',
  'http://www.wikidata.org/entity/Q4',
  'http://www.wikidata.org/entity/Q3',
  'http://www.wikidata.org/entity/Q14819852',
  'http://www.wikidata.org/entity/Q18170708',
  'http://www.wikidata.org/entity/L291757',
  'http://www.wikidata.org/entity/Q1598829',
  'http://www.wikidata.org/entity/Q105839683',
  'http://www.wikidata.org/entity/Q55386374',
  'http://www.wikidata.org/entity/Q2647467',
  'http://www.wikidata.org/entity/Q454',
  'http://www.wikidata.org/entity/Q2114394',
  'http://www.wikidata.org/entity/Q3827340',
  'http://www.wikidata.org/entity/Q10728826',
  'http://www.wikidata.org/entity/Q20021125',
  'http://www.wikidata.org/entity/Q2310416',
  'http://www.wikidata.org/entity/Q10728827',
  'http://www.wikidata.org/entity/Q204',
  'http://www.wikidata.org/entity/Q204',
  'http://www.wikidata.org/entity/Q60981852',
  'http://www.wikidata.org/entity/Q3250296',
  'http://www.wikid

In [56]:
def numRelations(s):
    try:
        entry = db.entityList.find_one({'id':s})
        if entry is not None:
            entry['value']
    except KeyError:
        return 0
    else:
        return 0

In [81]:
for relation1, relation2 in zip(propertyList['item1'][22986:], propertyList['item2'][22986:]):
    # Get Property
    rel1 = relation1.rsplit('/', 1)[1]
    rel2 = relation2.rsplit('/', 1)[1]
    
    if (rel1[0] != 'P' or rel2[0] != 'P'): continue
    
    r1_dict = get_entity_dict_from_api(rel1)
    r2_dict = get_entity_dict_from_api(rel2)
    
    if r1_dict['type'] != 'item' or r2_dict['type'] != 'item': continue
    
    # Take only a subset of keys
    r1_subset = dict((k, r1_dict[k]) for k in ('id', 'labels'))
    r1_subset['id1'] = r1_subset.pop('id')
    r1_subset['labels1'] = r1_subset.pop('labels')

    r2_subset = dict((k, r2_dict[k]) for k in ('id', 'labels'))
    r2_subset['id2'] = r2_subset.pop('id')
    r2_subset['labels2'] = r2_subset.pop('labels')
    
    r_subset = {**r1_subset, **r2_subset}
    
    # Insert into collection
    db.antonym.insert_one(r_subset)

In [85]:
len(propertyList['item1'])

23093

In [87]:
db.antonym.count_documents({})

22983

In [101]:
db.antonym.find_one({})

{'_id': ObjectId('621141b55258e6dc526cbcef'),
 'id1': 'Q106521687',
 'labels1': {'pt': {'language': 'pt', 'value': 'antiverso'},
  'en': {'language': 'en', 'value': 'antiverse'},
  'ru': {'language': 'ru', 'value': 'антивселенная'},
  'sv': {'language': 'sv', 'value': 'antiversum'},
  'kw': {'language': 'kw', 'value': 'gorthbys'},
  'tt': {'language': 'tt', 'value': 'антигаләм'},
  'uk': {'language': 'uk', 'value': 'антивсесвіт'},
  'ja': {'language': 'ja', 'value': '反宇宙'}},
 'id2': 'Q1',
 'labels2': {'fr': {'language': 'fr', 'value': 'univers'},
  'la': {'language': 'la', 'value': 'universum'},
  'uz': {'language': 'uz', 'value': 'Olam'},
  'ru': {'language': 'ru', 'value': 'Вселенная'},
  'pl': {'language': 'pl', 'value': 'Wszechświat'},
  'nb': {'language': 'nb', 'value': 'universet'},
  'eo': {'language': 'eo', 'value': 'universo'},
  'it': {'language': 'it', 'value': 'universo'},
  'es': {'language': 'es', 'value': 'universo'},
  'de': {'language': 'de', 'value': 'Universum'},
  '

In [103]:
i = 0
for anton in db.antonym.find({}):
    print(i)
    i += 1
    q_id1 = anton['id1']
    q_id2 = anton['id2']
    entity1 = db.entityList.find_one({'id':q_id1})
    entity2 = db.entityList.find_one({'id':q_id2})
    db.antonym.update_one(
        {'id1':q_id1, 'id2':q_id2},
        {'$set': {'value': entity1['value'] + entity2['value']}}
    )


0
1
2
3
4
5


KeyboardInterrupt: 

#### Prepare mBERT

In [9]:
import torch
from transformers import BertTokenizer, BertModel

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [11]:
tokenizer.tokenize('bélier')

['bé', '##lier']

In [12]:
s = 'bélier'

#### Preprocessing

In [31]:
def isLatin(s):
    try:
        s.encode('latin-1')
    except UnicodeEncodeError:
        return False
    else:
        return True

In [32]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except AttributeError:
        return False
    except UnicodeDecodeError:
        return False
    else:
        return True

In [33]:
def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

In [34]:
def all_same(items):
    return all(x == items[0] for x in items)

#### Tokenize and Check

In [115]:
# Parameters
# Get all entities for maxNumRelations
maxNumRelations = 5
query_languages = ('en','es','fr','de')

In [119]:
# Single-Token
# Language Agnostic labels (same label across languages)
cand = []
candNon = []

for document in db.antonym.find({}):
    if all(k in document['labels1'] for k in query_languages) and all(k in document['labels2'] for k in query_languages):
        # Test if labels in all languages are the same
        labels1 = [document['labels1'][k]['value'] for k in query_languages]
        labels2 = [document['labels2'][k]['value'] for k in query_languages]
        if not all_same(labels1) or not all_same(labels2): continue
        
        # Tokenize the universal label
        wordToken1 = tokenizer.tokenize(document['labels1']['en']['value'])
        wordToken2 = tokenizer.tokenize(document['labels2']['en']['value'])
        if (len(wordToken1) == 1 and len(wordToken2) == 1):
            if document['labels1']['en']['value'].isalpha() and isLatin(document['labels1']['en']['value']) and document['labels2']['en']['value'].isalpha() and isLatin(document['labels2']['en']['value']):
                j = {'id1': document['id1']}
                for k in query_languages: j[k + '1'] = document['labels1'][k]['value']
                j['id2'] = document['id2']
                for k in query_languages: j[k + '2'] = document['labels2'][k]['value']

                cand.append(j)
            else:
                j = {'id1': document['id1']}
                for k in query_languages: j[k + '1'] = document['labels1'][k]['value']
                j['id2'] = document['id2']
                for k in query_languages: j[k + '2'] = document['labels2'][k]['value']

                candNon.append(j)

In [121]:
len(cand)

42

In [122]:
len(candNon)

12

In [126]:
# Single-Token
# English labels but labels exist in all languages
cand = []
candNon = []

for document in db.antonym.find({}):
    if all(k in document['labels1'] for k in query_languages) and all(k in document['labels2'] for k in query_languages):
        
        # Tokenize label
        wordToken1 = tokenizer.tokenize(document['labels1']['en']['value'])
        wordToken2 = tokenizer.tokenize(document['labels2']['en']['value'])
        if (len(wordToken1) == 1 and len(wordToken2) == 1):
            if document['labels1']['en']['value'].isalpha() and isLatin(document['labels1']['en']['value']) and document['labels2']['en']['value'].isalpha() and isLatin(document['labels2']['en']['value']):
                j = {'id1': document['id1']}
                j['en1'] = document['labels1']['en']['value']
                j['id2'] = document['id2']
                j['en2'] = document['labels2']['en']['value']

                cand.append(j)
            else:
                j = {'id1': document['id1']}
                j['en1'] = document['labels1']['en']['value']
                j['id2'] = document['id2']
                j['en2'] = document['labels2']['en']['value']

                candNon.append(j)

In [135]:
len(cand)

107

In [128]:
cand

[{'id1': 'Q527', 'en1': 'sky', 'id2': 'Q2', 'en2': 'Earth'},
 {'id1': 'Q4', 'en1': 'death', 'id2': 'Q3', 'en2': 'life'},
 {'id1': 'Q3', 'en1': 'life', 'id2': 'Q4', 'en2': 'death'},
 {'id1': 'Q14819852', 'en1': 'birth', 'id2': 'Q4', 'en2': 'death'},
 {'id1': 'Q454', 'en1': 'peace', 'id2': 'Q198', 'en2': 'war'},
 {'id1': 'Q204', 'en1': 'zero', 'id2': 'Q204', 'en2': 'zero'},
 {'id1': 'Q192630', 'en1': 'past', 'id2': 'Q344', 'en2': 'future'},
 {'id1': 'Q198', 'en1': 'war', 'id2': 'Q454', 'en2': 'peace'},
 {'id1': 'Q8441', 'en1': 'man', 'id2': 'Q467', 'en2': 'woman'},
 {'id1': 'Q2', 'en1': 'Earth', 'id2': 'Q527', 'en2': 'sky'},
 {'id1': 'Q667', 'en1': 'south', 'id2': 'Q659', 'en2': 'north'},
 {'id1': 'Q679', 'en1': 'west', 'id2': 'Q659', 'en2': 'north'},
 {'id1': 'Q684', 'en1': 'east', 'id2': 'Q659', 'en2': 'north'},
 {'id1': 'Q659', 'en1': 'north', 'id2': 'Q667', 'en2': 'south'},
 {'id1': 'Q679', 'en1': 'west', 'id2': 'Q667', 'en2': 'south'},
 {'id1': 'Q684', 'en1': 'east', 'id2': 'Q667', 

In [142]:
# Single-Token
# All language labels
cand = []
candNon = []

for document in db.antonym.find({}):
    if all(k in document['labels1'] for k in query_languages) and all(k in document['labels2'] for k in query_languages):
        
        # Tokenize label
        wordTokens1 = [tokenizer.tokenize(document['labels1'][k]['value']) for k in query_languages]
        wordTokens2 = [tokenizer.tokenize(document['labels2'][k]['value']) for k in query_languages]
        
        if (all(len(l) == 1 for l in wordTokens1) and all(len(l) == 1 for l in wordTokens2)):
            if document['labels1']['en']['value'].isalpha() and isLatin(document['labels1']['en']['value']) and document['labels2']['en']['value'].isalpha() and isLatin(document['labels2']['en']['value']):
                j = {'id1': document['id1']}
                for k in query_languages: j[k + '1'] = document['labels1'][k]['value']
                j['id2'] = document['id2']
                for k in query_languages: j[k + '2'] = document['labels2'][k]['value']

                cand.append(j)
            else:
                j = {'id1': document['id1']}
                for k in query_languages: j[k + '1'] = document['labels1'][k]['value']
                j['id2'] = document['id2']
                for k in query_languages: j[k + '2'] = document['labels2'][k]['value']

                candNon.append(j)

In [143]:
len(cand)

107

In [144]:
cand

[{'id1': 'Q527',
  'en1': 'sky',
  'es1': 'cielo',
  'fr1': 'ciel',
  'de1': 'Himmel',
  'id2': 'Q2',
  'en2': 'Earth',
  'es2': 'Tierra',
  'fr2': 'Terre',
  'de2': 'Erde'},
 {'id1': 'Q4',
  'en1': 'death',
  'es1': 'muerte',
  'fr1': 'mort',
  'de1': 'Tod',
  'id2': 'Q3',
  'en2': 'life',
  'es2': 'vida',
  'fr2': 'vie',
  'de2': 'Leben'},
 {'id1': 'Q3',
  'en1': 'life',
  'es1': 'vida',
  'fr1': 'vie',
  'de1': 'Leben',
  'id2': 'Q4',
  'en2': 'death',
  'es2': 'muerte',
  'fr2': 'mort',
  'de2': 'Tod'},
 {'id1': 'Q14819852',
  'en1': 'birth',
  'es1': 'nacimiento',
  'fr1': 'naissance',
  'de1': 'Geburt',
  'id2': 'Q4',
  'en2': 'death',
  'es2': 'muerte',
  'fr2': 'mort',
  'de2': 'Tod'},
 {'id1': 'Q454',
  'en1': 'peace',
  'es1': 'paz',
  'fr1': 'paix',
  'de1': 'Frieden',
  'id2': 'Q198',
  'en2': 'war',
  'es2': 'guerra',
  'fr2': 'guerre',
  'de2': 'Krieg'},
 {'id1': 'Q192630',
  'en1': 'past',
  'es1': 'pasado',
  'fr1': 'passé',
  'de1': 'Vergangenheit',
  'id2': 'Q344',
  

### Postprocess and Save

In [136]:
import pandas as pd

In [145]:
# Entity Candidates to Dataframe
df = pd.DataFrame(cand)
df

Unnamed: 0,id1,en1,es1,fr1,de1,id2,en2,es2,fr2,de2
0,Q527,sky,cielo,ciel,Himmel,Q2,Earth,Tierra,Terre,Erde
1,Q4,death,muerte,mort,Tod,Q3,life,vida,vie,Leben
2,Q3,life,vida,vie,Leben,Q4,death,muerte,mort,Tod
3,Q14819852,birth,nacimiento,naissance,Geburt,Q4,death,muerte,mort,Tod
4,Q454,peace,paz,paix,Frieden,Q198,war,guerra,guerre,Krieg
...,...,...,...,...,...,...,...,...,...,...
102,Q29043257,José,José,José,José,Q29043256,José,José,José,José
103,Q29043256,José,José,José,José,Q29043257,José,José,José,José
104,Q1300961,Augustine,Augustine,Augustine,Augustine,Q54617672,Augustine,Augustine,Augustine,Augustine
105,Q18563993,Michal,Michal,Michal,Michal,Q104771451,Michal,Michal,Michal,Michal


In [141]:
# Remove duplicate labels
df_wo_duplicates = df.drop_duplicates(subset='en1', keep='first')
df_wo_duplicates

Unnamed: 0,id1,en1,es1,fr1,de1,id2,en2,es2,fr2,de2
0,Q527,sky,cielo,ciel,Himmel,Q2,Earth,Tierra,Terre,Erde
1,Q4,death,muerte,mort,Tod,Q3,life,vida,vie,Leben
2,Q3,life,vida,vie,Leben,Q4,death,muerte,mort,Tod
3,Q14819852,birth,nacimiento,naissance,Geburt,Q4,death,muerte,mort,Tod
4,Q454,peace,paz,paix,Frieden,Q198,war,guerra,guerre,Krieg
...,...,...,...,...,...,...,...,...,...,...
98,Q24759380,remain,permanecer,rester,Pro,Q24759450,leave,salir,quitter,Contra
100,Q28962312,often,frecuentemente,souvent,häufig,Q28962310,rarely,raramente,rarement,kaum
101,Q28962310,rarely,raramente,rarement,kaum,Q28962312,often,frecuentemente,souvent,häufig
102,Q29043257,José,José,José,José,Q29043256,José,José,José,José


In [146]:
# Save as CSV for manual cleanup
df_wo_duplicates.to_csv('SingleToken/antonym_multilingual_uncleaned.csv', index=False)