### Multilingual Entity Generation Notebook
We take the entities by their number of occurrences and do some preprocessing, essentially checking whether each entity fulfils our requirements and whether its label is written in Latin characters.

For this we use a MongoDB database that holds the entities and the cleaned entity/property triples. We count the occurrences of entities in the triples and then join that collection with the entity collection, so that every entity is also associated with its occurrence value.

---

This is a multilingual version of the original notebook.
Here we generate entities in an English-centric way: for example, choosing Chinese and Japanese yields the datasets (en, zh) and (en, ja).

- single tokens

Cleanup:
- No numbers

In [1]:
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm

In [2]:
from pymongo import MongoClient
# No arguments are passed, so the client falls back to pymongo's default connection settings.
client = MongoClient()

In [3]:
# Handles to the `wikidata` database and its `entityList` collection,
# which every query in this notebook runs against.
db = client.wikidata
entityList = db.entityList

In [4]:
# Show which collections exist in the `wikidata` database.
db.list_collection_names()

['entity2_triples',
 'entity_counts',
 'triplesInvert',
 'triples',
 'equivalence',
 'nsproperties',
 'inversion',
 'entity_subset',
 'antonym',
 'relation_subset',
 'entityList',
 'entity1_triples',
 'properties',
 'symmetry']

##### Properties

In [4]:
# Number of top-level fields in one sample document that has a `value` field
# (`find_one` returns a single document and `len` counts its keys).
# NOTE(review): `find_one` returns None when nothing matches, in which case `len` raises — confirm the collection is populated.
len(db.entityList.find_one({'value':{'$exists':True}}))

5

##### Which languages are represented the most in the dataset?

In [13]:
# How many entities have a `value` field plus a label in each of en, de, fr, es and zh?
entityList.count_documents(
    dict.fromkeys(
        ['value', 'labels.en', 'labels.de', 'labels.fr', 'labels.es', 'labels.zh'],
        {'$exists': True},
    )
)

875581

In [14]:
# How many entities have a `value` field plus a label in each of en, de, fr, es and ja?
entityList.count_documents(
    dict.fromkeys(
        ['value', 'labels.en', 'labels.de', 'labels.fr', 'labels.es', 'labels.ja'],
        {'$exists': True},
    )
)

791977

In [15]:
# How many entities have a `value` field plus a label in each of en, de, fr, es and ko?
entityList.count_documents(
    dict.fromkeys(
        ['value', 'labels.en', 'labels.de', 'labels.fr', 'labels.es', 'labels.ko'],
        {'$exists': True},
    )
)

447043

In [16]:
# How many entities have a `value` field plus a label in each of en, de, fr, es and ru?
entityList.count_documents(
    dict.fromkeys(
        ['value', 'labels.en', 'labels.de', 'labels.fr', 'labels.es', 'labels.ru'],
        {'$exists': True},
    )
)

1216431

In [None]:
# How many entities have a `value` field plus a label in each of en, de, fr, es, zh and ja?
entityList.count_documents(
    dict.fromkeys(
        ['value', 'labels.en', 'labels.de', 'labels.fr', 'labels.es', 'labels.zh', 'labels.ja'],
        {'$exists': True},
    )
)

In [29]:
# Select entities that have a `value` field and a label in each of en, de, fr, es, zh and ja.
# NOTE(review): `find` returns a cursor, not a number, yet the recorded output below is a
# count (577665) — it looks like it was produced by a `count_documents` call; confirm by re-running.
entityList.find({'value':{'$exists':True}, 
                 'labels.en':{'$exists':True},
                 'labels.de':{'$exists':True},
                 'labels.fr':{'$exists':True},
                 'labels.es':{'$exists':True},
                 'labels.zh':{'$exists':True},
                 'labels.ja':{'$exists':True}
                })

577665

In [193]:
# Fetch one complete entity document (id 'Q144') to inspect how labels are stored:
# `labels` maps a language code to a dict holding `language` and `value`.
document = entityList.find_one({'id':'Q144'})
document

{'_id': ObjectId('62067319578a2e954363ce8c'),
 'value': 700,
 'id': 'Q144',
 'type': 'item',
 'labels': {'en': {'language': 'en', 'value': 'dog'},
  'fr': {'language': 'fr', 'value': 'chien'},
  'it': {'language': 'it', 'value': 'cane'},
  'de': {'language': 'de', 'value': 'Hund'},
  'ru': {'language': 'ru', 'value': 'собака'},
  'es': {'language': 'es', 'value': 'perro'},
  'af': {'language': 'af', 'value': 'hond'},
  'ar': {'language': 'ar', 'value': 'كلب'},
  'bg': {'language': 'bg', 'value': 'куче'},
  'bn': {'language': 'bn', 'value': 'কুকুর'},
  'el': {'language': 'el', 'value': 'σκύλος'},
  'et': {'language': 'et', 'value': 'koer'},
  'eu': {'language': 'eu', 'value': 'txakur'},
  'fa': {'language': 'fa', 'value': 'سگ'},
  'fi': {'language': 'fi', 'value': 'koira'},
  'he': {'language': 'he', 'value': 'כלב הבית'},
  'hi': {'language': 'hi', 'value': 'श्वान'},
  'hu': {'language': 'hu', 'value': 'kutya'},
  'id': {'language': 'id', 'value': 'anjing'},
  'ja': {'language': 'ja', '

#### Prepare the tokenizer and model (XLM-RoBERTa)

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [7]:
# Load the tokenizer and masked-LM model for the 'xlm-roberta-base' checkpoint
# (note: this is XLM-RoBERTa, not mBERT)
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModelForMaskedLM.from_pretrained('xlm-roberta-base')

In [8]:
# Sanity check on a single CJK character. The recorded output below has two pieces
# ('▁' plus the character), so this label would NOT pass the `len(tokens) == 1`
# single-token filter applied later in the notebook.
tokenizer.tokenize('犬')

['▁', '犬']

#### Preprocessing

In [9]:
def isLatin(s):
    """Return True if ``s`` can be encoded as Latin-1, otherwise False.

    Only ``UnicodeEncodeError`` is handled; any other error raised by
    ``s.encode`` (for example when ``s`` is not a string) propagates.
    """
    try:
        s.encode('latin-1')
        return True
    except UnicodeEncodeError:
        return False

In [10]:
def isEnglish(s):
    """Return True if ``s`` is a string consisting solely of ASCII characters.

    The string is round-tripped through UTF-8 bytes and decoded as ASCII.
    Objects without an ``encode`` method (e.g. ``None``) and strings holding
    any non-ASCII character both yield False.
    """
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except (AttributeError, UnicodeDecodeError):
        return False
    return True

In [11]:
def has_numbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit
    according to ``str.isdigit``; an empty string yields False."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False

In [12]:
def all_same(items):
    """Return True if every element of ``items`` equals its first element.

    The comparison is exact (case-sensitive for strings). An empty sequence
    yields True because there is nothing to compare.
    """
    for item in items:
        if not item == items[0]:
            return False
    return True

In [13]:
# Tests if the strings are the same, ignoring upper/lower case
def all_same2(items):
    """Return True if all strings in ``items`` are equal after lower-casing.

    Parameters:
        items: a sequence of strings.

    Returns:
        True when every lower-cased string matches the others; an empty
        sequence yields True.
    """
    items_lower = [item.lower() for item in items]
    # String equality is transitive, so comparing each entry against the first
    # is equivalent to comparing every pair, at linear instead of quadratic cost.
    return all(x == items_lower[0] for x in items_lower)

#### Tokenize and Check amount of possible entities

In [14]:
# Language codes an entity must have labels for; the first entry supplies the
# label that the candidate cell below validates and stores.
languages = ['en', 'de', 'fr', 'es']

In [15]:
# Mongo filter: the document must have a `value` field and a label for every
# language code in `languages`.
exist_value = {'$exists':True}
query = {'value': exist_value}
query.update({'labels.' + code: exist_value for code in languages})

query

{'value': {'$exists': True},
 'labels.en': {'$exists': True},
 'labels.de': {'$exists': True},
 'labels.fr': {'$exists': True},
 'labels.es': {'$exists': True}}

In [16]:
# Single-token candidates: the stored label is the one in the first language,
# but a label must exist (and be a single token) in every language.
candidates = []

search = entityList.find(query)

for document in search:
    labels = document['labels']
    if not all(code in labels for code in languages):
        continue

    values = [labels[code]['value'] for code in languages]
    wordTokens = [tokenizer.tokenize(value) for value in values]
    if not all(len(pieces) == 1 for pieces in wordTokens):
        continue

    # Keep alphabetic, Latin-1-encodable labels longer than one character whose
    # translations are not all literally identical.
    primary_label = values[0]
    if (primary_label.isalpha()
            and isLatin(primary_label)
            and len(primary_label) > 1
            and not all_same(values)):
        candidates.append({'id': document['id'], 'label': primary_label})

In [19]:
# Number of entities that passed every filter above.
len(candidates)

1250

In [20]:
# Display the collected candidates (the whole list is shown; slice it, e.g. candidates[:20], for a shorter preview).
candidates

[{'id': 'Q248371', 'label': 'COL'},
 {'id': 'Q307045', 'label': 'mail'},
 {'id': 'Q409797', 'label': 'FEL'},
 {'id': 'Q1761084', 'label': 'length'},
 {'id': 'Q58809888', 'label': 'between'},
 {'id': 'Q218181', 'label': 'CE'},
 {'id': 'Q224853', 'label': 'KR'},
 {'id': 'Q247976', 'label': 'studio'},
 {'id': 'Q255248', 'label': 'IST'},
 {'id': 'Q292419', 'label': 'Fil'},
 {'id': 'Q980612', 'label': 'gun'},
 {'id': 'Q1643175', 'label': 'Committee'},
 {'id': 'Q13537336', 'label': 'english'},
 {'id': 'Q218716', 'label': 'SV'},
 {'id': 'Q361692', 'label': 'Surf'},
 {'id': 'Q1409462', 'label': 'fire'},
 {'id': 'Q2669118', 'label': 'tani'},
 {'id': 'Q3482678', 'label': 'positive'},
 {'id': 'Q217502', 'label': 'MP'},
 {'id': 'Q254624', 'label': 'Micro'},
 {'id': 'Q283937', 'label': 'at'},
 {'id': 'Q350655', 'label': 'SRI'},
 {'id': 'Q398944', 'label': 'Gel'},
 {'id': 'Q446166', 'label': 'Raz'},
 {'id': 'Q483199', 'label': 'It'},
 {'id': 'Q1424205', 'label': 'ulu'},
 {'id': 'Q1570723', 'label': 

**Runs consecutive tests on list of languages, queries them from the database and prints the length - ENGLISH CENTRIC**

In [21]:
language_list = [['en', 'zh'], ['en', 'ja'], ['en', 'ko'], ['en', 'ru']]

for languages in language_list:

    # Mongo filter: a `value` field plus a label for every language of the pair
    exist_value = {'$exists':True}
    query = {'value': exist_value}
    query.update({'labels.' + lang: exist_value for lang in languages})

    # The first language of the pair supplies the label that is validated and stored
    primary = languages[0]

    # Collect the candidates: single token in every language, more than one letter, alphabetic
    candidates = []
    search = entityList.find(query)

    for document in search:
        labels = document['labels']
        if not all(code in labels for code in languages):
            continue

        values = [labels[code]['value'] for code in languages]
        wordTokens = [tokenizer.tokenize(value) for value in values]
        if not all(len(pieces) == 1 for pieces in wordTokens):
            continue

        test_value = labels[primary]['value']
        passes_checks = (
            test_value.isalpha()
            and isLatin(test_value)
            and len(test_value) > 1
            and not all_same(values)
        )
        if passes_checks:
            candidates.append({'id': document['id'], 'label': test_value})

    print(len(candidates))

281
215
1311
3694


**Runs consecutive tests on list of languages, queries them from the database and prints the length - ASIAN CENTRIC**

To get rid of Unicode characters hiding as entities, we also require an English label in the query. This shouldn't be too limiting, since Wikidata is primarily English.

In [23]:
language_list = [['ru', 'zh'], ['zh', 'ja'], ['ru', 'ja']]

for languages in language_list:

    # Mongo filter: a `value` field, a label for every language of the pair,
    # and additionally an English label
    exist_value = {'$exists':True}
    query = {'value': exist_value}
    query.update({'labels.' + lang: exist_value for lang in languages})
    query['labels.en'] = exist_value

    # `primary` is assigned here but never read in this cell:
    # the checks below are applied to the English label instead.
    primary = languages[0]

    # Collect the candidates: single token in every language of the pair,
    # English label alphabetic and longer than one letter
    candidates = []
    search = entityList.find(query)

    for document in search:
        labels = document['labels']
        if not all(code in labels for code in languages):
            continue

        values = [labels[code]['value'] for code in languages]
        wordTokens = [tokenizer.tokenize(value) for value in values]
        if not all(len(pieces) == 1 for pieces in wordTokens):
            continue

        test_value = labels['en']['value']
        passes_checks = (
            test_value.isalpha()
            and isLatin(test_value)
            and len(test_value) > 1
            and not all_same(values)
        )
        if passes_checks:
            candidates.append({'id': document['id'], 'label': test_value})

    print(len(candidates))

301
346
292


#### Compute Language Pair Datasets and save them all in one

In [121]:
# Language groups to export; one CSV is written per inner list by the next cell.
languages_list = [['en', 'de', 'fr', 'es']]

In [None]:
for languages in languages_list:
    entities = defaultdict(list)

    # Mongo filter: a `value` field plus a label for every language of the group
    exist = {'$exists':True}
    query = {'value': exist}
    query.update({'labels.' + lang: exist for lang in languages})

    # Count first so the progress bar knows its total
    length = entityList.count_documents(query)
    search = entityList.find(query)

    for document in tqdm(search, total=length):
        labels = document['labels']
        if not all(code in labels for code in languages):
            continue

        # Tokenize every label; all of them must be exactly one token
        values = [labels[code]['value'] for code in languages]
        wordTokens = [tokenizer.tokenize(value) for value in values]
        if not all(len(pieces) == 1 for pieces in wordTokens):
            continue

        # The label in the first language of the group is the one that is validated
        test_value = values[0]

        # Edge cases: alphabetic only, Latin-1 encodable, longer than one character
        if test_value.isalpha() and isLatin(test_value) and len(test_value) > 1:
            entities['id'].append(document['id'])
            for lang, value in zip(languages, values):
                entities[lang].append(value)

    print("Saving...")
    df = pd.DataFrame(entities)
    # Keep only the first row for each distinct label in the first language
    df_wo_duplicates = df.drop_duplicates(subset=languages[0], keep='first')

    # Save as CSV for manual cleanup; the file name is prefixed with each language code plus '_'
    path = '../data/entities/SingleToken/xlm/'
    lang_path = ''.join(code + '_' for code in languages)

    df_wo_duplicates.to_csv(path + lang_path + 'entities.csv', index=False)

  0%|          | 0/3087705 [00:00<?, ?it/s]

The same procedure, with slight differences, for the Asian language pairs:

In [48]:
# Language pairs to process; the next cell builds one DataFrame per pair, in this order.
languages_list = [['ru', 'zh'], ['zh', 'ja'], ['ru', 'ja']]

In [49]:
# One DataFrame per language pair, in the order of `languages_list`
dfs = []

for languages in languages_list:
    entities = defaultdict(list)

    # Mongo filter: a `value` field, a label for every language of the pair,
    # and additionally an English label (the English label is what gets validated below)
    exist = {'$exists':True}
    query = {'value': exist}

    for lang in languages:
        query['labels.' + lang] = exist

    # Use this cell's own `exist`; `exist_value` is only defined by other cells,
    # so relying on it made this cell depend on leftover kernel state.
    query['labels.en'] = exist

    # Count first so the progress bar knows its total
    length = entityList.count_documents(query)
    search = entityList.find(query)

    for document in tqdm(search, total=length):
        if all(k in document['labels'] for k in languages):
            # Tokenize
            wordTokens = [tokenizer.tokenize(document['labels'][k]['value']) for k in languages]

            # If all wordTokens are singular...
            if (all(len(l) == 1 for l in wordTokens)):

                # Validate the English label (not the first language of the pair)
                test_value = document['labels']['en']['value']

                # Test for edge cases: alphabetic only, Latin-1 encodable, longer than one character
                if test_value.isalpha() and isLatin(test_value) and len(test_value) > 1:
                    entities['id'].append(document['id'])
                    for lang in languages:
                        entities[lang].append(document['labels'][lang]['value'])

    df = pd.DataFrame(entities)
    dfs.append(df)

  0%|          | 0/875905 [00:00<?, ?it/s]

  0%|          | 0/992013 [00:00<?, ?it/s]

  0%|          | 0/868509 [00:00<?, ?it/s]

#### Postprocess and Save

In [107]:
# Entity Candidates to Dataframe
df = pd.DataFrame(entities)
print(df)

             id        en zh
0      Q1155980        se  瑟
1      Q1363153     cream  霜
2       Q980612       gun  棍
3      Q1078168        to  斗
4      Q1759242       Shi  詩
...         ...       ... ..
1275       Q532   village  村
1276  Q14349455  membrane  膜
1277      Q7432   species  种
1278   Q6581072    female  女
1279   Q6581097      male  男

[1280 rows x 3 columns]


In [30]:
# Display the current frame.
df

Unnamed: 0,id,ru,ja
0,Q218181,Ce,CE
1,Q218008,Sr,SR
2,Q218716,Sv,SV
3,Q224441,Fr,FR
4,Q342744,Игра,ゲーム
...,...,...,...
287,Q7432,вид,種
288,Q29,Испания,スペイン
289,Q159,Россия,ロシア
290,Q142,Франция,フランス


In [50]:
# Pick the first collected frame. `languages` must name the label columns of that frame:
# dfs[0] is built from languages_list[0], i.e. ['ru', 'zh'].
df = dfs[0]
languages = ['ru', 'zh']

In [51]:
# Remove caseinsensitive duplicates
df['ru_upper'] = df['ru'].astype(str).str.upper()
df = df.drop_duplicates(subset='ru_upper', keep='first')
df = df.drop(columns=['ru_upper'], axis=1)
df

Unnamed: 0,id,ru,zh
0,Q216756,DDR,DDR
1,Q217417,HF,HF
2,Q218887,SC,SC
3,Q224432,RMS,RMS
4,Q224763,ST,ST
...,...,...,...
879,Q8502,гора,山
881,Q907311,Netflix,Netflix
882,Q11446,судно,船
883,Q532,село,村


In [52]:
# Remove duplicate labels one language column at a time: within each column, only the
# first row carrying a given label is kept.
# TODO: need to look at duplicates in all languages!
for lang in languages:
    df = df.drop_duplicates(subset=lang, keep='first')
df

Unnamed: 0,id,ru,zh
0,Q216756,DDR,DDR
1,Q217417,HF,HF
2,Q218887,SC,SC
3,Q224432,RMS,RMS
4,Q224763,ST,ST
...,...,...,...
877,Q2529610,Карл,Carl
879,Q8502,гора,山
881,Q907311,Netflix,Netflix
882,Q11446,судно,船


In [53]:
# Save as CSV for manual cleanup; the file name is prefixed with each language code plus '_'
path = '../data/entities/SingleToken/multilingual/'
lang_path = ''.join(code + '_' for code in languages)

df.to_csv(path + lang_path + 'entities2.csv', index=False)