In [1]:
import pandas as pd
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

In [2]:
from pymongo import MongoClient
client = MongoClient()

In [3]:
db = client.wikidata
db.list_collection_names()

['entity2_triples',
 'entity_counts',
 'triplesInvert',
 'triples',
 'equivalence',
 'nsproperties',
 'inversion',
 'entity_subset',
 'antonym',
 'relation_subset',
 'entityList',
 'entity1_triples',
 'properties',
 'symmetry']

In [5]:
db.symmetry.find_one({})

{'_id': ObjectId('6213b9f506657e78f384d1f9'), 'id': 'P530'}

In [7]:
db.relation_subset.find_one({'id': 'P530'})

In [17]:
db.relation_subset.count_documents({})

1142

In [6]:
db.relation_subset.find_one({})

{'_id': ObjectId('6210c432cf55a123777d8516'),
 'id': 'P1034',
 'type': 'property',
 'labels': {'en': {'language': 'en', 'value': 'main food source'},
  'nb': {'language': 'nb', 'value': 'hovednæringskilde'},
  'ca': {'language': 'ca', 'value': "principal font d'alimentació"},
  'pt': {'language': 'pt', 'value': 'principal fonte alimentar'},
  'de': {'language': 'de', 'value': 'Hauptnahrungsquelle'},
  'it': {'language': 'it', 'value': 'principale fonte di nutrizione'},
  'hu': {'language': 'hu', 'value': 'fő élelemforrás'},
  'fr': {'language': 'fr', 'value': 'source principale de nourriture'},
  'pl': {'language': 'pl', 'value': 'główne źródło pożywienia'},
  'es': {'language': 'es', 'value': 'fuente de alimento principal'},
  'ru': {'language': 'ru', 'value': 'основной источник питания'},
  'fa': {'language': 'fa', 'value': 'منبع غذایی اصلی'},
  'nl': {'language': 'nl', 'value': 'belangrijkste voedselbron'},
  'br': {'language': 'br', 'value': 'Mammenn emvagañ bennañ'},
  'mk': {'lan

#### Find out how much relations there are...

In [5]:
db.relation_subset.count_documents({
    'labels.en':{'$exists':True},
    'labels.zh':{'$exists':True}
})

0

In [49]:
db.relation_subset.count_documents({
    'labels.en':{'$exists':True},
    'labels.ja':{'$exists':True}
})

857

In [50]:
db.relation_subset.count_documents({
    'labels.en':{'$exists':True},
    'labels.ru':{'$exists':True}
})

1007

#### Generate relations

In [9]:
from collections import defaultdict

In [23]:
language_list = [['ru', 'zh'], ['ru', 'ja'], ['zh', 'ja']]

In [24]:
for languages in language_list:
    # Query
    exist_value = {'$exists':True}
    query = {}

    for lang in languages:
        query['labels.' + lang] = exist_value
    
    # Save in dict
    propertyList = defaultdict(list)
    
    for doc in db.relation_subset.find(query):
        propertyList['id'].append(doc['id'])
        for lang in languages:
            propertyList[lang].append(doc['labels'][lang]['value'])
        propertyList['en'].append(doc['labels']['en']['value'])
        
    print("Saving...")
    df = pd.DataFrame(propertyList)
    
    for lang in languages:
        df = df.drop_duplicates(subset=lang, keep='first')
    
    # Save as CSV for manual cleanup
    path = '../data/knowledge/'

    lang_path = ''
    for i, lang in enumerate(languages):
            lang_path += lang + '_'

    df.to_csv(path + lang_path + 'relations_en.csv', index=False)

Saving...
Saving...
Saving...


In [22]:
for i in propertyList:
    print(len(propertyList[i]))

670
670
1340


In [21]:
propertyList

defaultdict(list,
            {'id': ['P1034',
              'P5008',
              'P457',
              'P532',
              'P740',
              'P1817',
              'P1425',
              'P122',
              'P58',
              'P1629',
              'P2876',
              'P2597',
              'P9072',
              'P1716',
              'P1465',
              'P1165',
              'P4043',
              'P6889',
              'P8839',
              'P3080',
              'P141',
              'P7167',
              'P1951',
              'P2554',
              'P66',
              'P2853',
              'P7163',
              'P3275',
              'P7938',
              'P786',
              'P106',
              'P636',
              'P85',
              'P2841',
              'P1811',
              'P631',
              'P1158',
              'P4608',
              'P103',
              'P908',
              'P37',
              'P1435',
              'P689',
       