# Libraries

In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import re
import random
import itertools
import warnings
warnings.simplefilter("ignore")

from sklearn import preprocessing

import dgl
from dgl.data import DGLDataset
import dgl.nn as dglnn
from dgl.nn import SAGEConv
from dgl.nn import GraphConv
import dgl.function as fn

import torch
import torch.nn as nn
import torch.nn.functional as F

import pygraphviz as pgv

import scipy.sparse as sp

In [88]:
n_input_feat = 50
n_hidden_feat = 100
n_output_feat = 50

n_epochs = 150

k = 10 # for negative graph: each edge gets k negative examples

# 1) Create DGL Heterograph

## a. Get KG

In [89]:
f = open('../Input Data/statements.nq', 'r')
text = f.readlines()

In [90]:
rows = []
for line in text:
  split = line.split()
  s = split[0]
  s = s.replace('<', '')
  s = s.replace('>', '')
  p = split[1]
  p = p.replace('<', '')
  p = p.replace('>', '')
  o = split[2]
  o = o.replace('<', '')
  o = o.replace('>', '')
  rows.append([s, p, o])

In [91]:
triples = pd.DataFrame(rows, columns=['subject', 'predicate', 'object'])
triples.head()

Unnamed: 0,subject,predicate,object
0,http://idea.rpi.edu/heals/kb/usda#01003,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://idea.rpi.edu/heals/kb/usda
1,http://idea.rpi.edu/heals/kb/usda#01004,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://idea.rpi.edu/heals/kb/usda
2,http://idea.rpi.edu/heals/kb/usda#01005,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://idea.rpi.edu/heals/kb/usda
3,http://idea.rpi.edu/heals/kb/usda#01006,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://idea.rpi.edu/heals/kb/usda
4,http://idea.rpi.edu/heals/kb/usda#01007,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://idea.rpi.edu/heals/kb/usda


In [92]:
print(f'# Triples: {len(triples)}')

# Triples: 3022948


## b. Convert KG to pd.DataFrame

In [93]:
all_foods = triples[(triples['subject'].str.startswith('http://idea.rpi.edu/heals/kb/usda#'))]
le_food = preprocessing.LabelEncoder()
le_food.fit(all_foods['subject'].tolist())

LabelEncoder()

In [94]:
def map_value_2_id(df, value, name):
    le = preprocessing.LabelEncoder()
    le.fit(df[value].tolist())
    df[name] = le.transform(df[value].tolist())
    return df, le

**(Food, containsNutrient, Nutrient)**

In [95]:
df_food_nutrient = triples[(triples['subject'].str.startswith('http://idea.rpi.edu/heals/kb/usda#')) &
                        (triples['predicate'] == 'http://www.w3id.org/foodkg/property#contains') &
                        (triples['object'].str.startswith('http://www.w3id.org/foodkg/usda#'))]   

df_food_nutrient['source_id'] = le_food.transform(df_food_nutrient['subject'].tolist())
df_food_nutrient, le_nutrient = map_value_2_id(df_food_nutrient, 'object', 'destination_id')

**(Food, hasTag, Tag)**

In [96]:
df_food_tag = triples[(triples['subject'].str.startswith('http://idea.rpi.edu/heals/kb/usda#')) &
                      (triples['predicate'] == 'http://www.w3id.org/foodb/property/hasQuality') &
                      (triples['object'].str.startswith('https://w3id.org/foodkg/quality/high_'))]   

df_food_tag['source_id'] = le_food.transform(df_food_tag['subject'].tolist())
df_food_tag, le_tag = map_value_2_id(df_food_tag, 'object', 'destination_id')

**(Food, isInCategory, Category)**

In [97]:
df_food_cat = triples[(triples['subject'].str.startswith('http://idea.rpi.edu/heals/kb/usda#')) &
                      (triples['predicate'] == 'http://www.w3id.org/foodb/property/hasCategory') &
                      (triples['object'].str.startswith('http://idea.rpi.edu/heals/kb/usda#'))]   

df_food_cat['source_id'] = le_food.transform(df_food_cat['subject'].tolist())
df_food_cat, le_category = map_value_2_id(df_food_cat, 'object', 'destination_id')

**(Food, hasFlavor, Flavor)**

In [98]:
df_food_flavor = triples[(triples['subject'].str.startswith('http://idea.rpi.edu/heals/kb/usda#')) &
                      (triples['predicate'] == 'http://www.w3id.org/foodb/property/hasFlavor') &
                      (triples['object'].str.startswith('https://w3id.org/foodkg/flavor/'))]   

df_food_flavor['source_id'] = le_food.transform(df_food_flavor['subject'].tolist())
df_food_flavor, le_flavor = map_value_2_id(df_food_flavor, 'object', 'destination_id')

**(Product, containsIngredient, Ingredient)**

In [99]:
df_product_ingredient = triples[(triples['subject'].str.startswith('https://w3id.org/um/ken4256/product/')) &
                       (triples['predicate'] == 'https://www.bbc.co.uk/ontologies/fo/ingredients') &
                       (triples['object'].str.startswith('https://w3id.org/um/ken4256/ingredient/'))]   

df_product_ingredient, le_product = map_value_2_id(df_product_ingredient, 'subject', 'source_id')
df_product_ingredient, le_ingredient = map_value_2_id(df_product_ingredient, 'object', 'destination_id')

**(Food, isSimilarTo, Ingredient) - (Ingredient, isSimilarTo, Food)**

In [100]:
df_food_ingredient = triples[(triples['subject'].str.startswith('http://idea.rpi.edu/heals/kb/usda#')) &
                      (triples['predicate'] == 'https://schema.org/isSimilarTo') &
                      (triples['object'].str.startswith('https://w3id.org/um/ken4256/ingredient/'))]   

df_food_ingredient['source_id'] = le_food.transform(df_food_ingredient['subject'].tolist())
df_food_ingredient['destination_id'] = le_ingredient.transform(df_food_ingredient['object'].tolist())

## c. Get Ground Truth and convert to pd.DataFrame

**(Food, isSubstitutedBy, Food Subs) - (Food Subs, canSubstitute, Food)**

In [101]:
df_food_subs = pd.read_csv('../Input Data/final_substitution.csv', sep=';')

df_food_subs['source_id'] = le_food.transform(df_food_subs['Food id'].tolist())
df_food_subs['destination_id'] = le_food.transform(df_food_subs['Substitution id'].tolist())

## d. Create DGL HeteroGraph

In [102]:
g = dgl.heterograph({('Food', 'isSubstitutedBy', 'Food Subs'): (torch.tensor(df_food_subs['source_id'].tolist()), torch.tensor(df_food_subs['destination_id'].tolist())),
                     ('Food Subs', 'substitutes', 'Food'): (torch.tensor(df_food_subs['destination_id'].tolist()), torch.tensor(df_food_subs['source_id'].tolist())),
                     ('Food', 'containsNutrient', 'Nutrient'): (torch.tensor(df_food_nutrient['source_id'].tolist()), torch.tensor(df_food_nutrient['destination_id'].tolist())),
                     ('Food', 'hasTag', 'Tag'): (torch.tensor(df_food_tag['source_id'].tolist()), torch.tensor(df_food_tag['destination_id'].tolist())),
                     ('Food', 'isInCategory', 'Category'): (torch.tensor(df_food_cat['source_id'].tolist()), torch.tensor(df_food_cat['destination_id'].tolist())),
                     ('Food', 'hasFlavor', 'Flavor'): (torch.tensor(df_food_flavor['source_id'].tolist()), torch.tensor(df_food_flavor['destination_id'].tolist())),
                     ('Product', 'containsIngredient', 'Ingredient'): (torch.tensor(df_product_ingredient['source_id'].tolist()), torch.tensor(df_product_ingredient['destination_id'].tolist())),
                     ('Food', 'isSimilarTo', 'Ingredient'): (torch.tensor(df_food_ingredient['source_id'].tolist()), torch.tensor(df_food_ingredient['destination_id'].tolist())),
                     ('Ingredient', 'isSimilarTo', 'Food'): (torch.tensor(df_food_ingredient['destination_id'].tolist()), torch.tensor(df_food_ingredient['source_id'].tolist()))
                    })

In [103]:
for node in g.ntypes:
    g.nodes[node].data['h'] = torch.randn(g.number_of_nodes(node), n_input_feat)

In [104]:
g

Graph(num_nodes={'Category': 13, 'Flavor': 272, 'Food': 9372, 'Food Subs': 6732, 'Ingredient': 125130, 'Nutrient': 63883, 'Product': 71777, 'Tag': 25},
      num_edges={('Food', 'containsNutrient', 'Nutrient'): 300523, ('Food', 'hasFlavor', 'Flavor'): 11167, ('Food', 'hasTag', 'Tag'): 17746, ('Food', 'isInCategory', 'Category'): 1667, ('Food', 'isSimilarTo', 'Ingredient'): 681, ('Food', 'isSubstitutedBy', 'Food Subs'): 1841, ('Food Subs', 'substitutes', 'Food'): 1841, ('Ingredient', 'isSimilarTo', 'Food'): 681, ('Product', 'containsIngredient', 'Ingredient'): 890789},
      metagraph=[('Food', 'Nutrient', 'containsNutrient'), ('Food', 'Flavor', 'hasFlavor'), ('Food', 'Tag', 'hasTag'), ('Food', 'Category', 'isInCategory'), ('Food', 'Ingredient', 'isSimilarTo'), ('Food', 'Food Subs', 'isSubstitutedBy'), ('Ingredient', 'Food', 'isSimilarTo'), ('Food Subs', 'Food', 'substitutes'), ('Product', 'Ingredient', 'containsIngredient')])

In [105]:
g.number_of_nodes('Food')

9372

In [106]:
g.number_of_edges('isSubstitutedBy')

1841

In [107]:
g.ntypes

['Category',
 'Flavor',
 'Food',
 'Food Subs',
 'Ingredient',
 'Nutrient',
 'Product',
 'Tag']

In [108]:
g.etypes

['containsNutrient',
 'hasFlavor',
 'hasTag',
 'isInCategory',
 'isSimilarTo',
 'isSubstitutedBy',
 'substitutes',
 'isSimilarTo',
 'containsIngredient']

In [109]:
g.canonical_etypes

[('Food', 'containsNutrient', 'Nutrient'),
 ('Food', 'hasFlavor', 'Flavor'),
 ('Food', 'hasTag', 'Tag'),
 ('Food', 'isInCategory', 'Category'),
 ('Food', 'isSimilarTo', 'Ingredient'),
 ('Food', 'isSubstitutedBy', 'Food Subs'),
 ('Food Subs', 'substitutes', 'Food'),
 ('Ingredient', 'isSimilarTo', 'Food'),
 ('Product', 'containsIngredient', 'Ingredient')]

In [110]:
def plot_graph(nxg):
    ag = pgv.AGraph(strict = False, directed = True)
    for u, v, k in nxg.edges(keys=True):
        ag.add_edge(u, v, label=k)
    ag.layout('dot')
    ag.draw('../Output/graph.png')

In [111]:
plot_graph(g.metagraph())

## e. Split into train and test sets

In [112]:
#randomly generate training masks on 'isSubstitutedBy' edges
g.edges['isSubstitutedBy'].data['train_mask'] = torch.zeros(g.number_of_edges('isSubstitutedBy'), dtype=torch.bool).bernoulli(0.75)

In [113]:
g.edges['isSubstitutedBy']

EdgeSpace(data={'train_mask': tensor([False,  True, False,  ...,  True,  True,  True])})

In [114]:
train_eid_dict = {'eid_2_train': (g.edges['isSubstitutedBy'].data['train_mask'] == True).nonzero(as_tuple=True)[0]}
test_eid_dict = {'eid_2_test': (g.edges['isSubstitutedBy'].data['train_mask'] == False).nonzero(as_tuple=True)[0]}

In [115]:
train_eid_dict['eid_2_train']

tensor([   1,    3,    5,  ..., 1838, 1839, 1840])

In [116]:
test_eid_dict['eid_2_test']

tensor([   0,    2,    4,    8,   11,   13,   15,   17,   18,   23,   24,   31,
          47,   51,   56,   59,   60,   61,   64,   69,   70,   82,   84,   86,
          88,   89,   90,   94,   97,  105,  106,  107,  111,  113,  124,  130,
         133,  138,  139,  140,  160,  167,  172,  176,  178,  188,  192,  193,
         196,  203,  204,  212,  213,  218,  220,  222,  223,  230,  232,  246,
         248,  249,  250,  253,  256,  261,  262,  263,  267,  278,  279,  280,
         285,  288,  301,  302,  307,  313,  316,  317,  326,  328,  332,  333,
         335,  337,  339,  346,  347,  349,  366,  370,  375,  378,  387,  391,
         393,  400,  401,  403,  405,  406,  407,  408,  410,  411,  418,  421,
         423,  429,  438,  442,  443,  444,  449,  452,  458,  459,  468,  470,
         484,  485,  486,  487,  492,  493,  494,  498,  510,  524,  526,  541,
         544,  547,  554,  558,  577,  580,  583,  584,  585,  586,  587,  591,
         592,  604,  607,  613,  614,  6

In [117]:
g.number_of_edges('isSubstitutedBy')

1841

In [118]:
len(train_eid_dict['eid_2_train'])

1381

In [119]:
len(test_eid_dict['eid_2_test'])

460

In [120]:
g_train = dgl.remove_edges(g, test_eid_dict['eid_2_test'], 'isSubstitutedBy')
g_train = dgl.remove_edges(g_train, test_eid_dict['eid_2_test'], 'substitutes')

In [121]:
g_train

Graph(num_nodes={'Category': 13, 'Flavor': 272, 'Food': 9372, 'Food Subs': 6732, 'Ingredient': 125130, 'Nutrient': 63883, 'Product': 71777, 'Tag': 25},
      num_edges={('Food', 'containsNutrient', 'Nutrient'): 300523, ('Food', 'hasFlavor', 'Flavor'): 11167, ('Food', 'hasTag', 'Tag'): 17746, ('Food', 'isInCategory', 'Category'): 1667, ('Food', 'isSimilarTo', 'Ingredient'): 681, ('Food', 'isSubstitutedBy', 'Food Subs'): 1381, ('Food Subs', 'substitutes', 'Food'): 1381, ('Ingredient', 'isSimilarTo', 'Food'): 681, ('Product', 'containsIngredient', 'Ingredient'): 890789},
      metagraph=[('Food', 'Nutrient', 'containsNutrient'), ('Food', 'Flavor', 'hasFlavor'), ('Food', 'Tag', 'hasTag'), ('Food', 'Category', 'isInCategory'), ('Food', 'Ingredient', 'isSimilarTo'), ('Food', 'Food Subs', 'isSubstitutedBy'), ('Ingredient', 'Food', 'isSimilarTo'), ('Food Subs', 'Food', 'substitutes'), ('Product', 'Ingredient', 'containsIngredient')])

In [122]:
g_train.number_of_edges('isSubstitutedBy')

1381

In [123]:
g_train.number_of_edges('substitutes')

1381

In [124]:
g_test = dgl.remove_edges(g, train_eid_dict['eid_2_train'], 'isSubstitutedBy')
g_test = dgl.remove_edges(g_test, train_eid_dict['eid_2_train'], 'substitutes')

In [125]:
g_test

Graph(num_nodes={'Category': 13, 'Flavor': 272, 'Food': 9372, 'Food Subs': 6732, 'Ingredient': 125130, 'Nutrient': 63883, 'Product': 71777, 'Tag': 25},
      num_edges={('Food', 'containsNutrient', 'Nutrient'): 300523, ('Food', 'hasFlavor', 'Flavor'): 11167, ('Food', 'hasTag', 'Tag'): 17746, ('Food', 'isInCategory', 'Category'): 1667, ('Food', 'isSimilarTo', 'Ingredient'): 681, ('Food', 'isSubstitutedBy', 'Food Subs'): 460, ('Food Subs', 'substitutes', 'Food'): 460, ('Ingredient', 'isSimilarTo', 'Food'): 681, ('Product', 'containsIngredient', 'Ingredient'): 890789},
      metagraph=[('Food', 'Nutrient', 'containsNutrient'), ('Food', 'Flavor', 'hasFlavor'), ('Food', 'Tag', 'hasTag'), ('Food', 'Category', 'isInCategory'), ('Food', 'Ingredient', 'isSimilarTo'), ('Food', 'Food Subs', 'isSubstitutedBy'), ('Ingredient', 'Food', 'isSimilarTo'), ('Food Subs', 'Food', 'substitutes'), ('Product', 'Ingredient', 'containsIngredient')])

In [126]:
g_test.number_of_edges('isSubstitutedBy')

460

In [127]:
g_test.number_of_edges('substitutes')

460

# 2) Train Model

In [128]:
class RGCN(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, rel_names):
        super().__init__()

        self.conv1 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(in_feats, hid_feats)
            for rel in rel_names}, aggregate='sum')
        self.conv2 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(hid_feats, out_feats)
            for rel in rel_names}, aggregate='sum')

    def forward(self, graph, inputs):
        # inputs are features/embeddings of nodes
        h = self.conv1(graph, inputs)
        h = {k: F.relu(v) for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

In [129]:
class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, rel_names):
        super().__init__()
        self.sage = RGCN(in_features, hidden_features, out_features, rel_names)
        self.pred = HeteroDotProductPredictor()
    def forward(self, g, neg_g, x, etype):
        h = self.sage(g, x)
        return h, self.pred(g, h, etype), self.pred(neg_g, h, etype)

In [130]:
def construct_negative_graph(graph, k, etype):
    utype, _, vtype = etype
    src, dst = graph.edges(etype=etype)
    neg_src = src.repeat_interleave(k)
    neg_dst = torch.randint(0, graph.number_of_nodes(vtype), (len(src) * k,))
    return neg_src, neg_dst, dgl.heterograph({etype: (neg_src, neg_dst)}, num_nodes_dict={ntype: graph.number_of_nodes(ntype) for ntype in graph.ntypes})

In [131]:
class HeteroDotProductPredictor(nn.Module):
    def forward(self, graph, h, etype):
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype=etype)
            return graph.edges[etype].data['score']

In [132]:
def compute_loss(pos_score, neg_score):
    n_edges = pos_score.shape[0]
    return (1 - neg_score.view(n_edges, -1) + pos_score.unsqueeze(1)).clamp(min=0).mean()

In [133]:
node_features = {}
for node in g.ntypes:
    node_features[node] = g.nodes[node].data['h'] 

In [134]:
model = Model(n_input_feat, n_hidden_feat, n_output_feat, g.etypes)
opt = torch.optim.Adam(model.parameters())
for epoch in range(n_epochs):
    #forward
    neg_src_train, neg_dst_train, negative_graph_train = construct_negative_graph(g_train, k, ('Food', 'isSubstitutedBy', 'Food Subs'))
    h_train, pos_score_train, neg_score_train = model(g_train, negative_graph_train, node_features, ('Food', 'isSubstitutedBy', 'Food Subs'))
    
    #loss
    loss = compute_loss(pos_score_train, neg_score_train)
    
    #backward
    opt.zero_grad()
    loss.backward()
    opt.step()
    
    if epoch % 10 == 0:
        print(f'In epoch {epoch}, loss = {loss.item():.4f}')

In epoch 0, loss = 0.8170
In epoch 10, loss = 0.1408
In epoch 20, loss = 0.1156
In epoch 30, loss = 0.0860
In epoch 40, loss = 0.0505
In epoch 50, loss = 0.0416
In epoch 60, loss = 0.0309
In epoch 70, loss = 0.0244
In epoch 80, loss = 0.0203
In epoch 90, loss = 0.0170
In epoch 100, loss = 0.0166
In epoch 110, loss = 0.0133
In epoch 120, loss = 0.0118
In epoch 130, loss = 0.0110
In epoch 140, loss = 0.0086


# 3) Evaluate Model

https://github.com/dglai/WWW20-Hands-on-Tutorial/blob/master/basic_tasks/3_link_predict.ipynb

In [64]:
with torch.no_grad():
    neg_src_test, neg_dst_test, negative_graph_test = construct_negative_graph(g_test, k, ('Food', 'isSubstitutedBy', 'Food Subs'))
    h_test, pos_score_test, neg_score_test = model(g_test, negative_graph_test, node_features, ('Food', 'isSubstitutedBy', 'Food Subs'))
    loss = compute_loss(pos_score_test, neg_score_test)

In [65]:
# get postive test edges
df_food_subs_test = df_food_subs[df_food_subs.index.isin(test_eid_dict['eid_2_test'].tolist())]
test_pos_u = df_food_subs_test['source_id'].tolist()
test_pos_v = df_food_subs_test['destination_id'].tolist()

In [66]:
# get negative test edges
test_neg_u = neg_src_test
test_neg_v = neg_dst_test

In [67]:
# get all test edges
test_u = torch.cat([torch.as_tensor(test_pos_u), torch.as_tensor(test_neg_u)])
test_v = torch.cat([torch.as_tensor(test_pos_v), torch.as_tensor(test_neg_v)])

In [68]:
# get y_true 
y_true = torch.cat([torch.zeros(len(pos_score_test)), torch.ones(len(neg_score_test))])

In [69]:
# get predictions
predictions = torch.cat([torch.as_tensor(pos_score_test.reshape(pos_score_test.shape[0])), torch.as_tensor(neg_score_test.reshape(neg_score_test.shape[0]))])
predictions = torch.sigmoid(predictions)
print(f'Accuracy = {(((predictions >= 0.5) == y_true).sum().item() / len(predictions)):.4f}')

Accuracy = 0.9880


In [70]:
# get y_pred
y_pred = []
for pred in predictions:
    if pred >= 0.5:
        y_pred.append(1)
    else:
        y_pred.append(0)

In [71]:
results = pd.DataFrame()
results['u'] = test_u
results['v'] = test_v
results['Food id'] = le_food.inverse_transform(results['u'])
results['Substitution id'] = le_food.inverse_transform(results['v'])
results['predictions'] = predictions
results['label_pred'] = y_pred
results['label_true'] = y_true

In [139]:
results['check'] = results['label_pred'] - results['label_true']

In [73]:
results.to_csv('../Output/results.csv')

In [74]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_true, y_pred, average=None)

(array([0.93298969, 1.        ]),
 array([1.        , 0.98563536]),
 array([0.96533333, 0.99276572]),
 array([181, 905], dtype=int64))

# 4) Save Embeddings to RDF2Vec Format

In [135]:
foods = le_food.inverse_transform(g_test.nodes('Food'))

In [136]:
food_embeddings = dict(zip(foods, h_test['Food']))

In [138]:
fw = open('../Output/food_embeddings.txt','w')
fw.write(str(len(foods))+' '+str(len(h_test['Food'][0]))+'\n')
for food in foods:
    p
    fw.write(food+' ')
    for i in range(len(h_test['Food'][0])):
        value = str(food_embeddings[food][i].item()).strip()
        fw.write(value+' ')
    fw.write('\n')