In [1]:
import pandas as pd
import numpy as np
import pickle
import pdb

In [2]:
def transductor(ot_data):
    """Extend a table of (h, t) pairs by one composition step.

    For every input pair (h, t) the pair itself is kept, and whenever ``t``
    also occurs as a head somewhere in the table, a pair (h, tt) is added for
    each tail ``tt`` recorded under that head.

    Parameters
    ----------
    ot_data : pandas.DataFrame
        Two-column table with columns 'h' and 't' (each row is unpacked into
        exactly two values).

    Returns
    -------
    pandas.DataFrame
        Columns 'h' and 't', duplicates removed. The index is whatever
        ``drop_duplicates`` leaves behind; it is not reset here.
    """
    heads = set(ot_data['h'].unique().tolist())
    tails = set(ot_data['t'].unique().tolist())
    # Values that appear on both sides can be "stepped through".
    bridges = heads.intersection(tails)
    successors = ot_data.groupby('h')['t'].apply(lambda x: x.tolist()).to_dict()

    expanded = []
    for head, tail in ot_data.values.tolist():
        expanded.append([head, tail])
        if tail not in bridges:
            continue
        expanded.extend([head, nxt] for nxt in successors[tail])

    pairs = pd.DataFrame(expanded, columns=['h', 't'])
    return pairs.drop_duplicates()

def transductor_master(ot_data, max_iter=10):
    """Apply ``transductor`` repeatedly until the table stops growing.

    Parameters
    ----------
    ot_data :
        Table accepted by ``transductor``.
    max_iter : int, default 10
        Maximum number of additional passes after the initial one.

    Returns
    -------
    The result of the first pass whose length equals the length of the pass
    before it. If the length is still changing after ``max_iter`` passes, the
    last result is returned and a ``RuntimeWarning`` is emitted. The previous
    implementation fell off the end of the loop in that case and returned
    ``None``, which made callers fail on the next attribute access.
    """
    import warnings  # only needed on the non-converged path

    mid = transductor(ot_data)
    length = len(mid)
    for _ in range(max_iter):
        mid = transductor(mid)
        if len(mid) == length:
            # No new pairs were produced by this pass: fixed point reached.
            return mid
        length = len(mid)

    warnings.warn(
        'transductor_master did not converge after %d passes; '
        'returning the last (possibly incomplete) result' % max_iter,
        RuntimeWarning,
        stacklevel=2,
    )
    return mid
        
def read_ot(path='../../data/DBpedia/raw/dbpedia_2016-10.nt'):
    """Read subClassOf pairs from a triple file and expand them.

    Each line is split on single spaces and the first three tokens are taken
    as (subject, predicate, object). Lines whose predicate token contains
    'subClassOf' contribute a (subject, object) pair.

    Parameters
    ----------
    path : str
        File to read. Defaults to the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns 'h' and 't': the collected pairs after being passed through
        ``transductor_master``, with a fresh 0..n-1 index.
    """
    ot_data = []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.split(' ')[:3]
            if len(parts) < 3:
                # Blank or truncated line: used to raise IndexError below.
                continue
            if 'subClassOf' in parts[1]:
                ot_data.append([parts[0], parts[2]])
    ot_data = pd.DataFrame(ot_data, columns=['h', 't'])
    ot_data = transductor_master(ot_data).reset_index(drop=True)
    return ot_data

In [3]:
def read_is(path='../../data/DBpedia/raw/instance_types_transitive_wkd_uris_en.ttl'):
    """Read triples whose predicate token contains 'type'.

    Each line is split on single spaces and the first three tokens are taken
    as (h, r, t).

    Parameters
    ----------
    path : str
        File to read. Defaults to the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns 'h', 'r', 't' with duplicate rows removed (index not reset).
    """
    is_data = []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.split(' ')[:3]
            if len(parts) < 3:
                # Blank or truncated line: used to raise IndexError or
                # produce a row with fewer than three fields.
                continue
            if 'type' in parts[1]:
                is_data.append(parts)
    is_data = pd.DataFrame(is_data, columns=['h', 'r', 't'])
    return is_data.drop_duplicates()

In [4]:
def read_kg(path='../../data/DBpedia/raw/mappingbased_objects_wkd_uris_en.ttl'):
    """Read relation triples, dropping 'type' and 'homepage' predicates.

    Each line is split on single spaces and the first three tokens are taken
    as (h, r, t). Comment lines (starting with '#') are ignored.

    Parameters
    ----------
    path : str
        File to read. Defaults to the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns 'h', 'r', 't' with duplicate rows removed (index not reset).
    """
    kg_data = []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Check for comments first; the old test (first token == '#')
            # ran last and missed comments written without a space after '#'.
            if line.startswith('#'):
                continue
            parts = line.split(' ')[:3]
            if len(parts) < 3:
                # Blank or truncated line: used to raise IndexError.
                continue
            if 'type' not in parts[1] and 'homepage' not in parts[1]:
                kg_data.append(parts)
    kg_data = pd.DataFrame(kg_data, columns=['h', 'r', 't'])
    return kg_data.drop_duplicates()

In [5]:
# Load the three raw tables: relation triples (h, r, t), 'type' triples
# (h, r, t), and the expanded subClassOf pairs (h, t).
kg_data = read_kg()
is_data = read_is()
ot_data = read_ot()

In [6]:
# es: every value seen as head or tail of a kg triple, or as head of an is
#     triple (presumably the entity set -- confirm).
# cs: every value seen in a subClassOf pair, or as tail of an is triple
#     (presumably the concept/class set -- confirm).
# Both are built from the tables as loaded, before the filtering cells below.
es = set(kg_data['h'].unique()) | set(kg_data['t'].unique()) | set(is_data['h'].unique())
cs = set(ot_data['h'].unique()) | set(ot_data['t'].unique()) | set(is_data['t'].unique())

In [12]:
def entity_filter(kg_data, is_data, k):
    """Restrict both tables to entities that occur at least ``k`` times.

    Occurrences are counted over the 'h' and 't' columns of ``kg_data``
    together. The count is taken once, on the unfiltered table.

    Parameters
    ----------
    kg_data : pandas.DataFrame
        Table with at least columns 'h' and 't'.
    is_data : pandas.DataFrame
        Table with at least column 'h'.
    k : int
        Minimum number of occurrences for an entity to be kept.

    Returns
    -------
    valid_entities : pandas.Series
        The kept entity values (Series named 'e').
    kg_data : pandas.DataFrame
        Rows whose 'h' and 't' are both kept entities, index reset.
    is_data : pandas.DataFrame
        Rows whose 'h' is a kept entity; the original index is retained.
    """
    occurrences = pd.concat([kg_data['h'], kg_data['t']]).value_counts()
    freq_table = occurrences.reset_index()
    freq_table.columns = ['e', 'count']
    valid_entities = freq_table.loc[freq_table['count'] >= k, 'e']

    head_ok = kg_data['h'].isin(valid_entities)
    tail_ok = kg_data['t'].isin(valid_entities)
    kg_kept = kg_data.loc[head_ok & tail_ok].reset_index(drop=True)

    is_kept = is_data.loc[is_data['h'].isin(valid_entities)]
    return valid_entities, kg_kept, is_kept

In [13]:
# Keep only entities that occur at least 50 times in kg_data. This overwrites
# kg_data and is_data with their filtered versions, so re-running the cell
# filters the already-filtered tables again.
valid_entities, kg_data, is_data = entity_filter(kg_data, is_data, k=50)

In [18]:
def is_transductor(is_data, ot_data):
    """Expand (h, r, t) rows with the extra tails recorded for ``t``.

    For every row (h, r, t) the row itself is kept, and for each ``tt`` that
    ``ot_data`` lists under head ``t`` a row (h, r, tt) is added.

    Parameters
    ----------
    is_data : pandas.DataFrame
        Three-column table; each row is unpacked as (h, r, t).
    ot_data : pandas.DataFrame
        Table with columns 'h' and 't'.

    Returns
    -------
    pandas.DataFrame
        Columns 'h' and 't' with a fresh 0..n-1 index.
    """
    ot_dict = ot_data.groupby('h')['t'].apply(lambda x: x.tolist()).to_dict()
    ret = []
    for h, r, t in is_data.values.tolist():
        ret.append([h, r, t])
        # dict.get replaces a bare try/except that hid every exception
        # (including KeyboardInterrupt) raised inside the lookup.
        for tt in ot_dict.get(t, ()):
            ret.append([h, r, tt])
    ret = pd.DataFrame(ret, columns=['h', 'r', 't'])
    # NOTE(review): duplicates are removed on (h, r, t) before 'r' is
    # dropped, so identical (h, t) pairs can remain if 'r' varies -- confirm
    # whether that is intended.
    return ret.drop_duplicates()[['h', 't']].reset_index(drop=True)

In [19]:
# Expand is_data using the pairs in ot_data. The result has only the 'h' and
# 't' columns, so this cell cannot be re-run on its own output
# (is_transductor unpacks three values per row).
is_data = is_transductor(is_data, ot_data)

In [26]:
def save_obj(obj, path):
    """Pickle ``obj`` to ``path`` using the highest available protocol.

    An existing file at ``path`` is overwritten.
    """
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    """Return the object unpickled from the file at ``path``.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [23]:
# Write the processed tables as CSV files under save_root (the directory must
# already exist). to_csv is called with its defaults, so the index is written
# as the first column of every file.
save_root = '../../data/DBpedia/mid/'
kg_data.to_csv(save_root + 'kg_data_all.csv')
ot_data.to_csv(save_root + 'ot.csv')
is_data.to_csv(save_root + 'is_data_all.csv')
valid_entities.to_csv(save_root + 'e.csv')

In [24]:
def split(data, train_ratio=0.95, seed=None):
    """Return a random subset of the rows of ``data`` for training.

    Every row is kept independently with probability ``train_ratio``, so the
    size of the result varies from call to call. Rows that are not selected
    are not returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to sample from.
    train_ratio : float, default 0.95
        Probability of keeping each row.
    seed : int or None, default None
        When given, the draw uses a private ``numpy.random.RandomState(seed)``
        so the split is reproducible and the global NumPy random state is
        left untouched. When None, the global ``np.random`` state is used,
        as before.

    Returns
    -------
    pandas.DataFrame
        The selected rows with a fresh 0..n-1 index.
    """
    if seed is None:
        draws = np.random.rand(len(data))
    else:
        draws = np.random.RandomState(seed).rand(len(data))
    mask = draws < train_ratio
    train = data[mask].reset_index(drop=True)
    return train
# Aliases for the fully processed tables.
kg_data_all = kg_data
is_data_all = is_data
ot_data = ot_data  # no-op self-assignment
print('Done reading data.')
# Draw a random training subset of each table (split's default ratio is used)
# and write it to CSV. The rows left out are not stored separately.
kg_data_train = split(kg_data_all)
is_data_train = split(is_data_all)
kg_data_train.to_csv(save_root + 'kg_data_train.csv')
is_data_train.to_csv(save_root + 'is_data_train.csv')
print('Done splitting data.')
# Lookup dicts: (h, r) -> list of t for the kg tables, h -> list of t for the
# is tables, built for both the full and the training versions.
kg_dict_all = kg_data_all.groupby(['h', 'r'])['t'].apply(lambda x: x.tolist()).to_dict()
is_dict_all = is_data_all.groupby('h')['t'].apply(lambda x: x.tolist()).to_dict()
kg_dict_train = kg_data_train.groupby(['h', 'r'])['t'].apply(lambda x: x.tolist()).to_dict()
is_dict_train = is_data_train.groupby('h')['t'].apply(lambda x: x.tolist()).to_dict()
print('Done getting mapper.')

Done reading data.
Done splitting data.
Done getting mapper.


In [27]:
# Pickle the four lookup dicts into the same directory as the CSV files.
save_obj(kg_dict_all, save_root + 'kg_dict_all.pkl')
save_obj(is_dict_all, save_root + 'is_dict_all.pkl')
save_obj(kg_dict_train, save_root + 'kg_dict_train.pkl')
save_obj(is_dict_train, save_root + 'is_dict_train.pkl')