In [1]:
import json
import copy
import gzip
import glob
import random
import rdflib as rdf
import os
from tqdm import tqdm_notebook
from rdflib import Graph
from collections import Counter

In [12]:
class RDFReader:
    __graph = None
    __freq = {}

    def __init__(self, file_name):

        self.__graph = rdf.Graph()

        file_path = os.path.join("../dataset/raw_dataset", file_name)
        self.__graph.parse(file_path, format="ttl")    
        self.__freq = Counter(self.__graph.predicates())

        print("Graph loaded, frequencies counted.")
    
    def triples(self, relation=None):
        for s, p, o in self.__graph.triples((None, relation, None)):
            yield s, p, o

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.__graph.destroy("store")
        self.__graph.close(True)

    def subjectSet(self):
        return set(self.__graph.subjects())

    def objectSet(self):
        return set(self.__graph.objects())

    def relationList(self):
        """
        Returns a list of relations, ordered descending by frequenecy
        :return:
        """
        res = list(set(self.__graph.predicates()))
        res.sort(key=lambda rel: -self.freq(rel))
        return res

    def __len__(self):
        return len(self.__graph)

    def freq(self, relation):
        """zzz
        The frequency of this relation (how many distinct triples does it occur in?)
        :param relation:
        :return:
        """
        if relation not in self.__freq:
            return 0
        return self.__freq[relation]

In [6]:
file_dir = "../dataset/raw_dataset"
file_ext = ".ttl"
file_list = [os.path.join(file_dir, _) for _ in os.listdir(file_dir) if _.endswith(file_ext)]

In [10]:
# for item in file_list:
#     print(item)

In [20]:
with RDFReader("06_Food.ttl") as reader:
    relations = reader.relationList()
    subjects = reader.subjectSet()
    objects = reader.objectSet()

Graph loaded, frequencies counted.


In [33]:
triple_set = set()
entity_relation = set()
for rel in tqdm_notebook(relations):
#     if "relatedTerm" in rel:
#         continue
    thereis_entity = False
    for j, (s,p,o) in enumerate(reader.triples(relation=rel)):
        if 'http://kb.saltlux.ai/resource' in s and len(s.split(' ')) == 1:
            if 'http://kb.saltlux.ai/resource' in o and len(o.split(' ')) == 1:
                triple_set.add((s.strip(),p.strip(),o.strip()))
                thereis_entity = True
    if thereis_entity:
        entity_relation.add(rel)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for rel in tqdm_notebook(relations):


  0%|          | 0/97 [00:00<?, ?it/s]

In [34]:
entities = set()
for rel in tqdm_notebook(list(entity_relation)):
    for j, (s,p,o) in enumerate(reader.triples(relation=rel)):
        if 'http://kb.saltlux.ai/resource' in s:
            if len(s.split(' ')) == 1:
                entities.add(s.strip())
        if 'http://kb.saltlux.ai/resource' in o:
            if len(o.split(' ')) == 1:
                entities.add(o.strip())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for rel in tqdm_notebook(list(entity_relation)):


  0%|          | 0/57 [00:00<?, ?it/s]

In [35]:
split_ratio = 0.05
random.seed(1234)

triple_list = list(triple_set)
random.shuffle(triple_list)
test_list = triple_list[:int(len(triple_list)*split_ratio)]
valid_list = triple_list[int(len(triple_list)*split_ratio):2*int(len(triple_list)*split_ratio)]
train_list = triple_list[int(2*len(triple_list)*split_ratio):]

In [36]:
print(len(entities))
print(len(entity_relation))
print(len(train_list))
print(len(valid_list))
print(len(test_list))

5691
57
13526
751
751


In [37]:
name = "food"

with open(os.path.join(file_dir, name, 'entities.dict'), 'w', encoding="utf-8") as f_e:
    for i, item in enumerate(list(entities)):
        f_e.write(str(i) + '\t' + item + '\n')
with open(os.path.join(file_dir, name, 'relations.dict'), 'w', encoding="utf-8") as f_r:
    for i, item in enumerate(list(entity_relation)):
        f_r.write(str(i) + '\t' + item + '\n')
with open(os.path.join(file_dir, name, 'train.txt'), 'w', encoding="utf-8") as f_tr:
    for i, item in enumerate(tqdm_notebook(train_list)):
        f_tr.write(item[0] + '\t' + item[1] + '\t' + item[2] + '\n')
with open(os.path.join(file_dir, name, 'valid.txt'), 'w', encoding="utf-8") as f_v:
    for i, item in enumerate(tqdm_notebook(valid_list)):
        f_v.write(item[0] + '\t' + item[1] + '\t' + item[2] + '\n')
with open(os.path.join(file_dir, name, 'test.txt'), 'w', encoding="utf-8") as f_te:
    for i, item in enumerate(tqdm_notebook(test_list)):
        f_te.write(item[0] + '\t' + item[1] + '\t' + item[2] + '\n')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, item in enumerate(tqdm_notebook(train_list)):


  0%|          | 0/13526 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, item in enumerate(tqdm_notebook(valid_list)):


  0%|          | 0/751 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, item in enumerate(tqdm_notebook(test_list)):


  0%|          | 0/751 [00:00<?, ?it/s]