### Entity Generation Notebook
We take the entities by their number of occurrences and do some preprocessing, essentially checking whether the requirements for our entities are fulfilled and whether the entity labels consist of Latin characters.

For this we use a MongoDB database that holds the entities and the cleaned entity/property triples. We count the occurrences of entities in the triples and then join that count collection with the entity collection, so that every entity is also associated with its occurrence value.

Requirements:
- labels the same in en, fr, es, de
- single tokens

Cleanup:
- No characters outside Latin-1 (accented Latin letters such as `ö` are kept)
- No numbers

In [1]:
from pathlib import Path

from pymongo import MongoClient
# Open a MongoDB client; no arguments are passed, so pymongo's default connection settings apply.
client = MongoClient()

In [2]:
# Handle to the `wikidata` database used throughout the notebook.
db = client.wikidata
# Entity collection; documents carry `id`, `type`, `labels` and (for most entities) a `value` field.
entityList = db.entityList

In [3]:
db.list_collection_names()

['entity2_triples',
 'entity_counts',
 'triplesInvert',
 'triples',
 'equivalence',
 'nsproperties',
 'inversion',
 'entity_subset',
 'antonym',
 'relation_subset',
 'entityList',
 'entity1_triples',
 'properties',
 'symmetry']

##### Properties

In [29]:
db.triplesInvert.find_one({})

{'_id': ObjectId('62078f16162b2abbc195d3d0'),
 'entity1': 'Q31',
 'relation': 'P530',
 'entity2': 'Q43'}

In [28]:
db.nsproperties.find_one({'_id': 'P530'})

In [10]:
db.entity2_triples.count_documents({})

3871818

In [20]:
db.entity2_triples.find_one({})

{'_id': 'Q10000000', 'count': 1}

In [22]:
db.entity_counts.find_one({'_id': 'Q10000000'})

{'_id': 'Q10000000', 'value': 3}

In [323]:
db.symmetric.find_one({})

{'_id': ObjectId('620fb94eae35c2d89a319a2a'), 'r1': {'id': 'P530'}}

In [332]:
# Unique properties
len(db.triples.distinct('relation'))

1371

In [364]:
# List of non-symmetric properties
#
# For every document in `properties`, look up entries of the `symmetric`
# collection whose `r1.id` equals the property's `_id`. Properties without any
# match are treated as non-symmetric and written to the `nsproperties`
# collection.
#
# NOTE(review): the collection listing near the top of this notebook shows a
# collection named 'symmetry', not 'symmetric' -- confirm the lookup source
# still exists; a $lookup against a missing collection matches nothing, which
# would mark every property as non-symmetric.
pipeline = [
    {
        # Left outer join: adds a `matched_relations` array to each property.
        '$lookup':
          {
            'from': "symmetric",
            'localField': "_id",
            'foreignField': "r1.id",
            'as': "matched_relations"
          }
    },
    {
      # Keep only the properties for which the join found nothing.
      '$match': {
        "matched_relations": { '$eq': [] }
      }
    },
    # Drop the (always empty) helper array so the output collection needs no
    # separate clean-up pass afterwards.
    { '$project': { "matched_relations": 0 } },
    # $out writes the result into `nsproperties`.
    { '$out':'nsproperties' }
]

db.properties.aggregate(pipeline, allowDiskUse=True)

<pymongo.command_cursor.CommandCursor at 0x7f0541e74c40>

In [371]:
db.nsproperties.find_one({})

{'_id': 'P1034'}

In [370]:
# Delete matched_relations
# Removes the `matched_relations` field (the array added by the $lookup stage that
# produced `nsproperties`) from every document; the empty filter `{}` selects all documents.
db.nsproperties.update_many({}, {'$unset':{"matched_relations":1}})

<pymongo.results.UpdateResult at 0x7f0559de4e40>

##### Playing around

In [9]:
db.entityList.find_one({'value':{'$exists':True}})

{'_id': ObjectId('62067319578a2e954363cfc4'),
 'value': 1,
 'id': 'Q3276',
 'type': 'item',
 'labels': {'fr': {'language': 'fr', 'value': 'exploration de Jupiter'},
  'en': {'language': 'en', 'value': 'exploration of Jupiter'},
  'de': {'language': 'de', 'value': 'Erforschung des Jupiters'},
  'es': {'language': 'es', 'value': 'exploración de Júpiter'},
  'it': {'language': 'it', 'value': 'esplorazione di Giove'},
  'pt': {'language': 'pt', 'value': 'exploração de Júpiter'},
  'ru': {'language': 'ru',
   'value': 'Исследование Юпитера межпланетными аппаратами'},
  'vi': {'language': 'vi', 'value': 'Thăm dò Sao Mộc'},
  'zh': {'language': 'zh', 'value': '木星探測'},
  'id': {'language': 'id', 'value': 'penjelajahan Yupiter'},
  'ja': {'language': 'ja', 'value': '木星探査'},
  'ar': {'language': 'ar', 'value': 'استكشاف كوكب المشتري'},
  'ko': {'language': 'ko', 'value': '목성 탐사'}}}

In [14]:
db.equivalence.find_one({})

{'_id': ObjectId('621104eb67e09f0d72992496'),
 'id1': 'P150',
 'labels1': {'cs': {'language': 'cs', 'value': 'nižší správní celky'},
  'en': {'language': 'en',
   'value': 'contains administrative territorial entity'},
  'it': {'language': 'it', 'value': 'sottodivisioni amministrative'},
  'fr': {'language': 'fr',
   'value': 'contient les subdivisions territoriales administratives'},
  'zh-hans': {'language': 'zh-hans', 'value': '包含行政领土实体'},
  'zh-hant': {'language': 'zh-hant', 'value': '包含行政領土實體'},
  'pt-br': {'language': 'pt-br', 'value': 'subdivide-se em'},
  'uk': {'language': 'uk',
   'value': 'адміністративно-територіально поділяється на'},
  'eo': {'language': 'eo', 'value': 'subdividoj'},
  'ca': {'language': 'ca', 'value': 'conté la subdivisió administrativa'},
  'hr': {'language': 'hr', 'value': 'podjedinice'},
  'fa': {'language': 'fa', 'value': 'زیربخش'},
  'nds': {'language': 'nds', 'value': 'Ünnerafdelen'},
  'lv': {'language': 'lv', 'value': 'apakšiedalījumi'},
  'be': 

In [27]:
db.antonym.count_documents({})

15094

In [3]:
db.triples.count_documents({})

67108576

In [7]:
db.triples.find_one({})

{'_id': ObjectId('62078f16162b2abbc195d3d0'),
 'entity1': 'Q31',
 'relation': 'P530',
 'entity2': 'Q43'}

In [8]:
db.triples.find_one({'relation':'P1696'})

{'_id': ObjectId('62078f16162b2abbc195d3d0'),
 'entity1': 'Q31',
 'relation': 'P530',
 'entity2': 'Q43'}

In [4]:
entityList.count_documents({'value':{'$exists':False}})

4500

In [24]:
entityList.count_documents({'value':{'$exists':True}})

10156980

In [91]:
entityList.count_documents({'value':{'$eq':1}})

428282

In [93]:
entityList.count_documents({'value':{'$exists': False}})

4500

In [112]:
entityList.count_documents({'value':{'$lte':3}})

2985115

In [106]:
db.triples.count_documents({'$or': [ { 'entity1': 'Q27213'}, { 'entity2': 'Q27213'} ] } )

1

In [31]:
entityList.find_one({"value":10})

{'_id': ObjectId('62067319578a2e954363cf10'),
 'value': 10,
 'id': 'Q1672',
 'type': 'item',
 'labels': {'it': {'language': 'it', 'value': 'Dejen Gebremeskel'},
  'en': {'language': 'en', 'value': 'Dejen Gebremeskel'},
  'fr': {'language': 'fr', 'value': 'Dejen Gebremeskel'},
  'de': {'language': 'de', 'value': 'Dejen Gebremeskel'},
  'fi': {'language': 'fi', 'value': 'Dejen Gebremeskel'},
  'es': {'language': 'es', 'value': 'Dejen Gebremeskel'},
  'ru': {'language': 'ru', 'value': 'Деджен Гебремескель'},
  'nl': {'language': 'nl', 'value': 'Dejen Gebremeskel'},
  'ja': {'language': 'ja', 'value': 'デジェン・ゲブレメスケル'},
  'tr': {'language': 'tr', 'value': 'Dejen Gebremeskel'},
  'hu': {'language': 'hu', 'value': 'Dejen Gebremeskel'},
  'fa': {'language': 'fa', 'value': 'دیژن گبرمسکل'},
  'he': {'language': 'he', 'value': "דג'ן גברמסקל"},
  'pt': {'language': 'pt', 'value': 'Dejen Gebremeskel'}}}

In [25]:
db.triples.find_one()

{'_id': ObjectId('62078f16162b2abbc195d3d0'),
 'entity1': 'Q31',
 'relation': 'P530',
 'entity2': 'Q43'}

In [26]:
# Copy `triples` into a second collection named `triplesInvert`.
# The empty $match selects every document and $out writes them to the target collection.
# Note: nothing is inverted here -- the documents are copied unchanged
# (entity1/relation/entity2 keep their positions).
pipeline = [ {"$match": {}}, 
             {"$out": "triplesInvert"},
]
db.triples.aggregate(pipeline)

<pymongo.command_cursor.CommandCursor at 0x7fcd8f7a38e0>

In [46]:
db.triplesInvert.find_one()

{'_id': ObjectId('62078f16162b2abbc195d3d0'),
 'entity1': 'Q31',
 'relation': 'P530',
 'entity2': 'Q43'}

In [37]:
# Self-join of the triples: for each triple, collect in `out` all documents of
# `triplesInvert` whose `entity2` equals this triple's `entity1`, i.e. the
# triples in which this triple's subject appears as the object.
pipeline = [
    {
        '$lookup':
          {
            'from': "triplesInvert",
            'localField': "entity1",
            'foreignField': "entity2",
            'as': "out"
          }
    }
]

# The pipeline has no $out stage, so the joined documents are only available
# through the returned cursor.
# NOTE(review): with tens of millions of triples this join is presumably very
# slow unless `triplesInvert.entity2` is indexed -- confirm before re-running.
results = db.triples.aggregate(pipeline, allowDiskUse=True)

In [42]:
result = results.next()

In [45]:
result

{'_id': ObjectId('62078f16162b2abbc195d3d2'),
 'entity1': 'Q31',
 'relation': 'P530',
 'entity2': 'Q77',
 'out': [{'_id': ObjectId('62078f16162b2abbc195d406'),
   'entity1': 'Q31',
   'relation': 'P17',
   'entity2': 'Q31'},
  {'_id': ObjectId('62078f16162b2abbc195d558'),
   'entity1': 'Q1868',
   'relation': 'P27',
   'entity2': 'Q31'},
  {'_id': ObjectId('62078f16162b2abbc195d7b8'),
   'entity1': 'Q148',
   'relation': 'P530',
   'entity2': 'Q31'},
  {'_id': ObjectId('62078f16162b2abbc195dbc7'),
   'entity1': 'Q183',
   'relation': 'P530',
   'entity2': 'Q31'},
  {'_id': ObjectId('62078f16162b2abbc195dcd1'),
   'entity1': 'Q183',
   'relation': 'P47',
   'entity2': 'Q31'},
  {'_id': ObjectId('62078f16162b2abbc195e695'),
   'entity1': 'Q801',
   'relation': 'P530',
   'entity2': 'Q31'},
  {'_id': ObjectId('62078f17162b2abbc195f927'),
   'entity1': 'Q1693',
   'relation': 'P205',
   'entity2': 'Q31'},
  {'_id': ObjectId('62078f17162b2abbc196256d'),
   'entity1': 'Q7318',
   'relation':

#### Prepare mBERT

In [4]:
import torch
from transformers import BertTokenizer, BertModel

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [6]:
tokenizer.tokenize('bélier')

['bé', '##lier']

#### Preprocessing

In [176]:
def isLatin(s):
    """Return True if every character of `s` can be encoded as Latin-1.

    Latin-1 (ISO-8859-1) covers ASCII plus Western European accented letters,
    so strings such as 'Körper' pass while CJK or Cyrillic text does not.
    """
    try:
        s.encode('latin-1')
        return True
    except UnicodeEncodeError:
        # At least one character lies outside the Latin-1 range.
        return False

In [87]:
def isEnglish(s):
    """Return True if `s` is a string made up of ASCII characters only.

    Despite the name, this does not detect the English language; it only
    checks that the text survives a UTF-8 encode / ASCII decode round trip.
    Objects without an `encode` method (e.g. None) yield False.
    """
    try:
        s.encode(encoding='utf-8').decode('ascii')
        return True
    except (AttributeError, UnicodeDecodeError):
        # AttributeError: not a string; UnicodeDecodeError: non-ASCII content.
        return False

In [16]:
def has_numbers(inputString):
    """Return True if at least one character of `inputString` is a digit.

    Uses `str.isdigit`, so Unicode digit characters (e.g. superscripts)
    count as digits too.
    """
    for char in inputString:
        if char.isdigit():
            return True
    return False

In [225]:
def all_same(items):
    """Return True if every element of the sequence `items` equals the first one.

    An empty sequence yields True, because there is nothing to compare.
    """
    for item in items:
        if not (item == items[0]):
            return False
    return True

#### Tokenize and Check

In [289]:
# Parameters
# maxNumRelations: upper bound for an entity's `value` field; the candidate loops
#   query entities with `value <= maxNumRelations` (presumably the occurrence
#   count described in the introduction -- confirm).
# languages: language codes for which an entity must provide a label.
maxNumRelations = 5
languages = ('en','es','fr','de')

In [290]:
# Single-token, language-agnostic candidates:
# the label must exist in every language of `languages`, be identical in all of
# them, and be tokenized into exactly one token.
# `cand` receives purely alphabetic Latin-1 labels, `candNon` the remaining ones.
cand = []
candNon = []

for document in entityList.find({'value':{'$lte':maxNumRelations}}):
    names = document['labels']

    # Skip entities lacking a label in one of the required languages.
    if not all(k in names for k in languages):
        continue

    # Skip entities whose label differs between languages.
    labels = [names[k]['value'] for k in languages]
    if not all_same(labels):
        continue

    # Tokenize the universal label and keep single-token labels only.
    label = names['en']['value']
    wordToken = tokenizer.tokenize(label)
    if len(wordToken) != 1:
        continue

    entry = {'id': document['id'], 'label': label}
    if label.isalpha() and isLatin(label):
        cand.append(entry)
    else:
        candNon.append(entry)

In [277]:
# Single-token candidates based on the English label:
# a label must exist in every language of `languages`, but only the English
# label is tokenized and checked.
# `cand` receives purely alphabetic Latin-1 labels, `candNon` the remaining ones.
cand = []
candNon = []

for document in entityList.find({'value':{'$lte':maxNumRelations}}):
    names = document['labels']

    # Skip entities lacking a label in one of the required languages.
    if not all(k in names for k in languages):
        continue

    # Keep only English labels that are tokenized into exactly one token.
    label = names['en']['value']
    wordToken = tokenizer.tokenize(label)
    if len(wordToken) != 1:
        continue

    entry = {'id': document['id'], 'label': label}
    if label.isalpha() and isLatin(label):
        cand.append(entry)
    else:
        candNon.append(entry)

In [254]:
# Single-token candidates checked in every language:
# the label of each language in `languages` must be tokenized into exactly one
# token. Each record stores the entity id plus one label per language.
# `cand` receives entities whose tokens are all alphabetic Latin-1,
# `candNon` the remaining ones.
cand = []
candNon = []

for document in entityList.find({'value':{'$lte':maxNumRelations}}):
    names = document['labels']

    # Skip entities lacking a label in one of the required languages.
    if not all(k in names for k in languages):
        continue

    # One token list per language; every list must hold exactly one token.
    wordTokens = [tokenizer.tokenize(names[k]['value']) for k in languages]
    if not all(len(tokens) == 1 for tokens in wordTokens):
        continue

    is_clean = all(tokens[0].isalpha() and isLatin(tokens[0]) for tokens in wordTokens)

    # The record is the same for both result lists, so build it once.
    entry = {'id': document['id']}
    for k in languages:
        entry[k] = names[k]['value']

    if is_clean:
        cand.append(entry)
    else:
        candNon.append(entry)

In [291]:
len(cand)

9934

In [292]:
cand

[{'id': 'Q27278', 'label': 'H'},
 {'id': 'Q27931', 'label': 'Okinawa'},
 {'id': 'Q30011', 'label': 'K'},
 {'id': 'Q127942', 'label': 'JR'},
 {'id': 'Q138698', 'label': 'Khmer'},
 {'id': 'Q171494', 'label': 'Winston'},
 {'id': 'Q216756', 'label': 'DDR'},
 {'id': 'Q217417', 'label': 'HF'},
 {'id': 'Q219240', 'label': 'AK'},
 {'id': 'Q224432', 'label': 'RMS'},
 {'id': 'Q224763', 'label': 'ST'},
 {'id': 'Q225140', 'label': 'SF'},
 {'id': 'Q225202', 'label': 'IPA'},
 {'id': 'Q224949', 'label': 'LP'},
 {'id': 'Q225590', 'label': 'CT'},
 {'id': 'Q226388', 'label': 'RF'},
 {'id': 'Q226675', 'label': 'EF'},
 {'id': 'Q226684', 'label': 'NH'},
 {'id': 'Q229045', 'label': 'ITF'},
 {'id': 'Q229602', 'label': 'Sister'},
 {'id': 'Q230009', 'label': 'SNP'},
 {'id': 'Q229997', 'label': 'Moravia'},
 {'id': 'Q231449', 'label': 'Satan'},
 {'id': 'Q233378', 'label': 'RTL'},
 {'id': 'Q233713', 'label': 'FCC'},
 {'id': 'Q233980', 'label': 'Bali'},
 {'id': 'Q234404', 'label': 'Marathon'},
 {'id': 'Q236040', '

### Postprocess and Save

In [217]:
import pandas as pd

In [293]:
# Entity candidates to DataFrame (one row per candidate, columns taken from the dict keys)
df = pd.DataFrame(cand)
# Display the frame that was just built; as the last expression of the cell it
# is rendered with the notebook's rich (truncated) table view.
df

              id    label
0         Q27213  Formula
1         Q27278        H
2         Q27931  Okinawa
3         Q30011        K
4        Q127942       JR
...          ...      ...
8187  Q106120386    Order
8188  Q106763844    Front
8189  Q107145221   Alonso
8190  Q108271743      Mem
8191  Q108492091    Tartu

[8192 rows x 2 columns]


In [294]:
# Drop rows whose label already occurred; the first entity seen for each label is kept.
df_wo_duplicates = df.drop_duplicates(subset=['label'])
df_wo_duplicates

Unnamed: 0,id,label
0,Q27278,H
1,Q27931,Okinawa
2,Q30011,K
3,Q127942,JR
4,Q138698,Khmer
...,...,...
9924,Q104592425,Luigi
9925,Q106118969,Körper
9928,Q107441724,Drug
9930,Q107443100,Irina


In [295]:
# Spot check: show the remaining row(s) labelled 'Alice'.
# The boolean mask is built from the same frame that is being indexed, so the
# mask and the frame always share one index.
df_wo_duplicates.loc[df_wo_duplicates['label'] == 'Alice']

Unnamed: 0,id,label
1986,Q3611661,Alice


In [1]:
# Save as CSV for manual cleanup
output_path = Path('SingleToken/entities_languageAgnostic_uncleaned.csv')
# to_csv does not create missing directories, so make sure the target folder exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the DataFrame's row index is not written to the file.
df_wo_duplicates.to_csv(output_path, index=False)

NameError: name 'df' is not defined