In [1]:
import os
import glob
import pandas as pd
import scipy
import scipy.io
import random
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# The project root is the parent of the current working directory;
# every dataset folder is expected under <ROOT>/datasets.
ROOT = os.path.abspath(os.pardir)
DS_DIR = os.path.join(ROOT, "datasets")

In [2]:
def read_all_data(dataset, ds_dir=None):
    """Load the test, train and validation splits of a triple dataset.

    Each split is read from a whitespace-delimited text file without a
    header row. The files are located with the glob patterns ``*test.txt``,
    ``*train.txt`` and ``*valid.txt`` inside ``<ds_dir>/<dataset>``; when
    several files match a pattern, the alphabetically first one is used.

    Parameters
    ----------
    dataset : str
        Name of the dataset sub-directory, e.g. ``"FB15k"``.
    ds_dir : str, optional
        Directory that contains the dataset folders. Defaults to the
        module-level ``DS_DIR``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(test, train, valid)`` -- in that order -- each with the columns
        ``heads``, ``relations`` and ``tails``.

    Raises
    ------
    FileNotFoundError
        If no file matches one of the split patterns.
    ValueError
        If a split file does not parse into exactly three columns (raised
        when the column names are assigned).
    """
    if ds_dir is None:
        ds_dir = DS_DIR
    dataset_dir = os.path.join(ds_dir, dataset)

    def _load_split(split):
        """Read the single split file whose name ends in ``<split>.txt``."""
        pattern = os.path.join(dataset_dir, "*" + split + ".txt")
        # Sort so the choice is deterministic if more than one file matches.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(
                "No file matching '{}' for dataset '{}'".format(pattern, dataset)
            )
        # sep=r"\s+" is the documented replacement for the deprecated
        # delim_whitespace=True.
        frame = pd.read_csv(matches[0], sep=r"\s+", header=None)
        # Assigning (rather than passing names=) keeps the length check:
        # a file with a different number of columns fails loudly here.
        frame.columns = ["heads", "relations", "tails"]
        return frame

    return (_load_split("test"), _load_split("train"), _load_split("valid"))

In [3]:
# Load the FB15k splits; read_all_data returns them as (test, train, valid).
(FB15k_test,
 FB15k_train,
 FB15k_valid) = read_all_data(dataset="FB15k")

In [68]:
# Work on the training split. `df` is bound to the same DataFrame object as
# FB15k_train; no copy is made.
df = FB15k_train
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,heads,relations,tails
0,/m/027rn,/location/country/form_of_government,/m/06cx9
1,/m/017dcd,/tv/tv_program/regular_cast./tv/regular_tv_app...,/m/06v8s0
2,/m/07s9rl0,/media_common/netflix_genre/titles,/m/0170z3
3,/m/01sl1q,/award/award_winner/awards_won./award/award_ho...,/m/044mz_
4,/m/0cnk2q,/soccer/football_team/current_roster./sports/s...,/m/02nzb8
...,...,...,...
483137,/m/0gpx6,/award/award_nominated_work/award_nominations....,/m/0gq6s3
483138,/m/020jqv,/award/award_nominee/award_nominations./award/...,/m/09d3b7
483139,/m/0524b41,/award/award_winning_work/awards_won./award/aw...,/m/0lp_cd3
483140,/m/0kvsb,/people/person/education./education/education/...,/m/050xpd


In [78]:
from py2neo import Graph
from py2neo.data import Node, Relationship
import neo4jupyter
neo4jupyter.init_notebook_mode()

# Prerequisite: Neo4j must be downloaded and started before running this cell.
# A newly created database asks for a password; the bolt URL is shown once the
# database has been started.
#
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into the notebook. The fallbacks are the
# values this notebook originally hard-coded and are meant for a local
# development database only.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "123456")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# To try deleting every node created so far, uncomment:
# graph.delete_all()

<IPython.core.display.Javascript object>

In [79]:
# Pull each component of the triples out of `df` as its own Series.
df_heads, df_relations, df_tails = (
    pd.Series(df[column]) for column in ("heads", "relations", "tails")
)

# Distinct values of each component.
heads, relations, tails = (
    series.unique() for series in (df_heads, df_relations, df_tails)
)

# One shape per line, in heads / relations / tails order.
print(heads.shape, relations.shape, tails.shape, sep="\n")

(14834,)
(1345,)
(14903,)


In [84]:
# Rebuild the graph from scratch: wipe the database, then load the first
# MAX_TRIPLES training triples inside a single transaction.
MAX_TRIPLES = 2000  # number of leading training rows to load

graph.delete_all()
tx = graph.begin()

# Select the columns by name instead of indexing each row with 0/1/2:
# integer keys on a string-labelled Series rely on a positional fallback
# that pandas has deprecated.
triples = FB15k_train[["heads", "relations", "tails"]].head(MAX_TRIPLES)
for head_name, relation_name, tail_name in triples.itertuples(index=False, name=None):
    # A new node is created for every occurrence, so an entity that appears
    # in several triples is created more than once here.
    head = Node("Entities", name=head_name.strip())
    tx.create(head)

    tail = Node("Entities", name=tail_name.strip())
    tx.create(tail)

    relation = Relationship(head, relation_name.strip(), tail)
    tx.create(relation)

# Last expression of the cell: the value returned by commit() is displayed.
tx.commit()

<Bookmark '9b31144f-ed75-4e9a-898c-989d4d7eb616:15409'>

#### Merge các node có cùng đỉnh lại với nhau

In [86]:
# Cypher query that groups all :Entities nodes by their lower-cased name and
# passes each group to apoc.refactor.mergeNodes, returning the name, the
# collected nodes and the node yielded by the procedure.
# NOTE(review): apoc.* procedures presumably require the APOC plugin to be
# installed on the Neo4j server -- confirm.
merge_query = """
MATCH (n:Entities)
WITH toLower(n.name) as name, collect(n) as nodes
CALL apoc.refactor.mergeNodes(nodes) yield node
RETURN *
"""
# Last expression of the cell, so the query result is rendered as its output.
graph.run(merge_query)

 name      | node                                | nodes                                                     
-----------|-------------------------------------|-----------------------------------------------------------
 /m/027rn  | (_0:Entities {name: '/m/027rn'})    | [(_0:Entities {name: '/m/027rn'}), (_467 {}), (_1226 {})] 
 /m/06cx9  | (_202:Entities {name: '/m/06cx9'})  | [(_202:Entities {name: '/m/06cx9'}), (_5116 {})]          
 /m/017dcd | (_203:Entities {name: '/m/017dcd'}) | [(_203:Entities {name: '/m/017dcd'})]                     

In [92]:
# Spot check: fetch a single node through graph.nodes using the key 1234.
# NOTE(review): 1234 looks like an internal node id, which would depend on
# the current contents of the database -- confirm.
graph.nodes[1234]

Node('Entities', name='/m/0168cl')

In [81]:
# Draw the graph inline with neo4jupyter.
# NOTE(review): the options dict presumably maps a node label to the property
# used as its caption -- confirm against neo4jupyter.
options = dict(Entities="name")
neo4jupyter.draw(graph, options)