In [186]:
import pycurl

# Both DBpedia archives are served from the same directory, so the shared
# prefix is spelled out once and each URL only adds its archive name.
_DBPEDIA_EN_BASE = 'http://downloads.dbpedia.org/2016-10/core-i18n/en/'
INFOBOXPROPS_URL = _DBPEDIA_EN_BASE + 'infobox_properties_en.ttl.bz2'
LABELS_URL = _DBPEDIA_EN_BASE + 'labels_en.ttl.bz2'


def download_tofile(url, filename=None):
    """Download ``url`` with pycurl and write the response body to a local file.

    Parameters
    ----------
    url : str
        Address to fetch.
    filename : str, optional
        Path of the file to (over)write in binary mode. When omitted, the
        text after the last '/' of ``url`` is used.

    Raises
    ------
    pycurl.error
        Propagated from the transfer; the curl handle is released first.
    """
    if filename is None:
        filename = url.split('/')[-1]
    with open(filename, 'wb') as f:
        c = pycurl.Curl()
        try:
            c.setopt(c.VERBOSE, True)
            c.setopt(c.URL, url)
            c.setopt(c.WRITEDATA, f)
            c.perform()
        finally:
            # Release the handle even when setopt/perform raises; previously
            # close() was only reached after a successful transfer.
            c.close()

# Fetch both DBpedia archives. No filename is passed, so each file is saved
# under the last path segment of its URL.
download_tofile(url=INFOBOXPROPS_URL)
download_tofile(url=LABELS_URL)
print('done')

In [98]:
# Local name for the downloaded vectors; the URL's last segment is an opaque
# id, so an explicit filename is passed to the download helper.
# NOTE(review): googledrive.com/host/ hosting is believed to have been
# discontinued - confirm this URL still resolves before relying on the cell.
PRETRAINEDVECTORS = 'freebase-vectors-skipgram1000-en.bin.gz'
PRETRAINEDVECTORS_URL = 'https://googledrive.com/host/0B7XkCwpI5KDYeFdmcVltWkhtbmM'

download_tofile(url=PRETRAINEDVECTORS_URL, filename=PRETRAINEDVECTORS)
print('done')

In [4]:
import pandas as pd
import bz2

# Local archive names. They equal the last path segment of the corresponding
# *_URL constants, which is the name the download helper saves under.
LABELS = 'labels_en.ttl.bz2'
INFOBOXPROPS = 'infobox_properties_en.ttl.bz2'


def bz2_todf(filename, usecols=[0,1,2], colnames=['subject', 'property', 'object']):
    """Read a bz2-compressed, space-separated triple file into a DataFrame.

    Parameters
    ----------
    filename : str
        Path of the ``.bz2`` archive to read.
    usecols : list of int
        Positions of the space-separated fields to keep. The default list is
        never mutated.
    colnames : list of str
        Column names assigned to the kept fields.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line. The first line of the file is always
        skipped. A final row whose first field is a bare '#' (a comment line
        split on spaces) is dropped when field 0 is among ``usecols``.
    """
    # Decode explicitly instead of depending on the platform's locale encoding.
    with bz2.open(filename, mode='rt', encoding='utf-8') as f:
        # pandas interprets skiprows entries as 0-based line numbers, so a
        # negative value never addresses the last line; only the leading
        # line can be skipped here.
        data = pd.read_csv(f, sep=' ', header=None, skiprows=[0],
                           usecols=usecols, names=colnames)
    # Remove the trailing comment line after parsing. Checking just the last
    # row avoids building a full-length mask over a very large frame.
    if len(data) and 0 in usecols and data.iloc[-1, 0] == '#':
        data = data.iloc[:-1]
    return data


# Infobox triples keep all three default columns; for the labels archive only
# fields 0 and 2 are loaded and named subject/label.
infoboxprops = bz2_todf(filename=INFOBOXPROPS)
labelled = bz2_todf(filename=LABELS,
                    colnames=['subject', 'label'],
                    usecols=[0, 2])
print('done')

In [1]:
# Distinct values of the property column reduced to short names: keep the
# text after the final '/' and drop the last character (the closing '>').
property_list = [
    uri.split('/')[-1][:-1]
    for uri in infoboxprops['property'].drop_duplicates()
]

In [22]:
from gensim.models import KeyedVectors

# Same value as the PRETRAINEDVECTORS assignment in the download cell; the two
# must stay in sync because this is the file that cell writes.
PRETRAINEDVECTORS = 'freebase-vectors-skipgram1000-en.bin.gz'
# binary=True asks the loader to read word2vec's binary format.
# `model` is a notebook-level global read by the lookup helpers in later cells.
model = KeyedVectors.load_word2vec_format(PRETRAINEDVECTORS, binary=True)

In [131]:
import re
import numpy as np


def format_DBPtoFreebase(label):
    """Turn a DBpedia label into a Freebase-style '/en/...' key.

    Example: 'Hari Om (2016 film)@en' becomes '/en/hari_om'.

    The last three characters are always removed (the '@en' tag in the
    example), then everything from the first '(' to the last ')' is deleted,
    surrounding whitespace is trimmed, the text is lower-cased and spaces
    become underscores.
    """
    untagged = label[:-3]
    without_parens = re.sub(r"\(.*\)", "", untagged)
    slug = without_parens.strip().lower().replace(' ', '_')
    return '/en/{}'.format(slug)


def format_FreebasetoTitle(label):
    """Turn a Freebase-style key into a title-cased phrase.

    Example: '/en/hari_om' becomes 'Hari Om'. The first four characters (the
    '/en/' prefix in the example) are dropped without being checked.
    """
    words = label[4:].split('_')
    return ' '.join(words).title()


def format_results_towords(ranked_tuples):
    """Return the title-cased form of the first element of every result tuple.

    ``ranked_tuples`` is an iterable of indexable items whose element 0 is a
    Freebase-style key; the remaining elements are ignored.
    """
    titles = []
    for hit in ranked_tuples:
        titles.append(format_FreebasetoTitle(hit[0]))
    return titles


def get_labels(p, n_basis=1000):
    """Return the labels of every subject that carries infobox property ``p``.

    Parameters
    ----------
    p : str
        Short property name, e.g. 'author'. A row matches when its property
        value contains the literal text '/' + p + '>'.
    n_basis : int
        Accepted for interface compatibility but not used by this function.
        NOTE(review): presumably meant to cap how many labels are returned -
        confirm the intent before implementing.

    Returns
    -------
    pandas.Series
        The ``label`` values from the global ``labelled`` frame whose subject
        has at least one matching row in the global ``infoboxprops`` frame.
    """
    ending = '/' + p + '>'
    # Build the mask from the column itself so it shares the frame's index.
    # regex=False keeps the plain substring semantics of the `in` operator,
    # and na=False treats missing property values as non-matching.
    matching = infoboxprops['property'].str.contains(ending, regex=False, na=False)
    objs = infoboxprops.loc[matching, 'subject'].drop_duplicates()
    labels = labelled.loc[labelled['subject'].isin(objs), 'label']
    return labels


def get_w2v(word):
    """Look up the embedding of ``word`` in the global ``model``.

    Returns the stored vector, or ``None`` when the key is not in the
    vocabulary (a ``KeyError`` from the lookup). Any other exception
    propagates, so a misconfigured model is no longer mistaken for
    "word not found".
    """
    # Use model.wv when the object exposes it and the object itself otherwise,
    # so both a full model and a bare set of keyed vectors work.
    vectors = getattr(model, 'wv', model)
    try:
        return vectors[word]
    except KeyError:
        return None


def similar_by_property(prop, n_results=10, n_basis=1000, justwords=False):
    """Rank vocabulary entries by similarity to the centroid of a property.

    The labels returned by ``get_labels(prop)`` are converted with
    ``format_DBPtoFreebase`` and looked up with ``get_w2v``. The vectors that
    are found are averaged, and the global ``model`` is asked for the entries
    closest to that average.

    Parameters
    ----------
    prop : str
        Short infobox property name, e.g. 'author'.
    n_results : int
        Number of neighbours requested from the model.
    n_basis : int
        Forwarded unchanged to ``get_labels``.
    justwords : bool
        When truthy, return title-cased words via ``format_results_towords``
        instead of the model's raw result.

    Returns
    -------
    list
        Whatever ``similar_by_vector`` returns, or a list of words when
        ``justwords`` is truthy.

    Raises
    ------
    ValueError
        If none of the labels has a vector, so no centroid can be formed.
    """
    labels = get_labels(prop, n_basis=n_basis)
    formatted_labels = labels.apply(format_DBPtoFreebase)

    # Running sum in float64: avoids stacking every vector into one large
    # matrix and does not depend on pandas averaging an object-dtype Series.
    total = None
    n_found = 0
    for key in formatted_labels:
        vec = get_w2v(key)
        if vec is None:
            continue
        vec = np.asarray(vec, dtype=np.float64)
        total = vec if total is None else total + vec
        n_found += 1
    if n_found == 0:
        raise ValueError(
            'no word vectors found for any label of property %r' % (prop,))
    centroid = total / n_found

    # Use model.wv when present and the model itself otherwise, so both a
    # full model and a bare set of keyed vectors work.
    vectors = getattr(model, 'wv', model)
    results = vectors.similar_by_vector(centroid, n_results)
    if justwords:
        return format_results_towords(results)
    return results

In [132]:
# Sample infobox property names; pass a different one below to change the query.
m = 'manufacturer'
p = 'director'
q = 'author'
r = 'capital'
s = 'population'

results = similar_by_property(q)
# One line for the header, then one line per result.
print('\n'.join(['Results for ' + q] + [str(hit) for hit in results]))

author:
('/en/william_hope_hodgson', 0.6058804392814636)
('/en/tanith_lee', 0.6036504507064819)
('/en/fritz_leiber', 0.6027674078941345)
('/en/the_king_in_yellow', 0.6027647852897644)
('/en/lisa_goldstein', 0.5980621576309204)
('/en/the_big_time', 0.5964199900627136)
('/en/the_book_of_lost_tales', 0.5920685529708862)
('/en/a_merritt', 0.5906498432159424)
('/en/g_w_dahlquist', 0.5890225172042847)
('/en/dorothy_dunnett', 0.5878594517707825)
