## Freebase Data Format

### Entities
```csv
entity_id,name,type
/m/01g3,Barack Obama,Person
/m/02hrh,Google,Organization
/m/07s9,The Great Wall of China,Location
...

```
### Relations
```csv
relation_id,entity_id1,entity_id2
/people/person/spouse,/m/01g3,/m/0c1v
/organization/organization/founders,/m/02hrh,/m/0c1v
/location/location/contains,/m/07s9,/m/0c1v
...

```
### Node Properties (Features)
```csv
entity_id,property,value
/m/01g3,/people/person/birth_date,1961-08-04
/m/01g3,/people/person/education,/m/0c1v
/m/02hrh,/organization/organization/founding_date,1998-09-04
...

```


### Indexing path-query datasets

In [None]:
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix  #sparse matrix for adj_mat

class Vocab(object):
  """Two-way mapping between tokens and contiguous integer ids.

  Ids are handed out in insertion order, starting at 0.
  """

  def __init__(self):
    self.word2id = {}  # token -> id
    self.id2word = []  # id -> token (the list index is the id)

  def add(self, word):
    """Register `word` under the next free id; do nothing if it is already known."""
    if word not in self.word2id:
      self.word2id[word] = len(self.id2word)
      self.id2word.append(word)

  def __len__(self):
    """Return the number of distinct tokens stored."""
    return len(self.id2word)

  def __getitem__(self, word):
    """Return the id of `word`, i.e. ``vocab['word']``.

    Raises:
      KeyError: if `word` was never added.
    """
    return self.word2id[word]

  @classmethod
  def load(cls, vocab_path):
    """Alternative constructor: read a vocabulary from a text file.

    Each line of the file is one token; surrounding whitespace (including the
    trailing newline) is stripped before the token is added. Repeated tokens
    keep the id of their first occurrence.

    Args:
      vocab_path: path of the text file to read.

    Returns:
      A new instance of `cls` (so subclasses get an instance of themselves).
    """
    v = cls()  # cls(), not Vocab(), so that subclasses are honoured
    with open(vocab_path, 'r') as f:
      for word in f:
        v.add(word.strip())
    return v


class Dataset(object):
  """Array-backed collection of samples with mini-batch iteration.

  Samples are stored as a numpy array, so indexing accepts anything numpy
  accepts (ints, slices, index arrays).
  """

  def __init__(self, samples):
    """Store `samples`, converting a list to a numpy array.

    Args:
      samples: a list or a numpy array of samples.

    Raises:
      TypeError: if `samples` is neither a list nor a numpy array.
    """
    # Explicit raise instead of assert: asserts vanish under `python -O`.
    if not isinstance(samples, (list, np.ndarray)):
      raise TypeError(
          "samples must be a list or numpy.ndarray, got %s" % type(samples).__name__)
    self._samples = samples if isinstance(samples, np.ndarray) else np.array(samples)

  def __getitem__(self, item):
    """Return the sample(s) selected by `item` from the underlying array."""
    return self._samples[item]

  def __len__(self):
    """Return the number of samples."""
    return len(self._samples)

  def batch_iter(self, batchsize, rand_flg=True):
    """Yield consecutive batches of at most `batchsize` samples.

    Args:
      batchsize: number of samples per batch; the last batch may be smaller.
      rand_flg: if True, visit the samples in a fresh random permutation;
        otherwise visit them in stored order.

    Yields:
      The samples of one batch, obtained by indexing with an index array.
    """
    indices = np.random.permutation(len(self)) if rand_flg else np.arange(len(self))
    for start in range(0, len(self), batchsize):
      # A generator keeps only the current batch materialised.
      yield self[indices[start:start+batchsize]]

  @classmethod
  def load(cls, data_path, ent_vocab, rel_vocab):
    """Build a dataset from a file; concrete subclasses must override this."""
    raise NotImplementedError


class TripleDataset(Dataset):
  """Dataset whose samples are (subject id, relation id, object id) triples."""

  def __init__(self, samples):
    super().__init__(samples=samples)

  @classmethod
  def load(cls, data_path, ent_vocab, rel_vocab):
    """Read tab-separated `subject<TAB>relation<TAB>object` lines into id triples.

    Args:
      data_path: path of the text file, one triple per line.
      ent_vocab: mapping indexed with ``[]`` from entity string to id.
      rel_vocab: mapping indexed with ``[]`` from relation string to id.

    Returns:
      A new instance of `cls` holding one (sub_id, rel_id, obj_id) row per line.

    Raises:
      ValueError: if a non-blank line does not have exactly three fields.
      KeyError: if an entity or relation is missing from its vocabulary
        (when the vocabularies raise KeyError on unknown keys).
    """
    samples = []
    with open(data_path, "r") as f:
      for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
          # Blank lines (typically a trailing newline at EOF) carry no triple.
          continue
        fields = line.split("\t")
        if len(fields) != 3:
          raise ValueError(
              "%s:%d: expected 3 tab-separated fields, got %d"
              % (data_path, line_no, len(fields)))
        sub, rel, obj = fields
        samples.append((ent_vocab[sub], rel_vocab[rel], ent_vocab[obj]))
    return cls(samples)


class TensorTypeGraph(object):
  """Graph stored as one sparse entity-by-entity adjacency matrix per relation.

  `rel2mat[r][s, o]` is set to 1.0 for every triple (s, r, o) in the input.
  """

  def __init__(self, triple_dat, n_ent, n_rel):
    """Fill the per-relation adjacency matrices from a triple dataset.

    Args:
      triple_dat: object exposing ``batch_iter(batchsize, rand_flg=...)`` that
        yields batches of (sub, rel, obj) integer id rows.
      n_ent: number of entities (each matrix is n_ent x n_ent).
      n_rel: number of relations (number of matrices).
    """
    self.rel2mat = [lil_matrix((n_ent, n_ent)) for _ in range(n_rel)]
    # Batches of size 1 in stored order: each batch holds exactly one triple.
    for triple in triple_dat.batch_iter(1, rand_flg=False):
      sub, rel, obj = triple[0]
      self.rel2mat[rel][sub, obj] = 1.0

  def search_obj_id(self, sub, rel):
    """Return the ids of all objects o such that (sub, rel, o) is in the graph."""
    # Row `sub` of the relation matrix; the column indices are the object ids.
    return np.where(self.rel2mat[rel][sub].toarray() == 1)[1]

  def search_sub_id(self, obj, rel):
    """Return the ids of all subjects s such that (s, rel, obj) is in the graph."""
    # Column `obj` of the relation matrix; the row indices are the subject ids
    # (the matrix is not symmetric, so this differs from search_obj_id).
    return np.where(self.rel2mat[rel][:, obj].toarray() == 1)[0]

  @classmethod
  def load_from_raw(cls, data_path, ent_v, rel_v):
    """Alternative constructor: load triples from `data_path` and build the graph.

    Args:
      data_path: path of a tab-separated triple file.
      ent_v: entity vocabulary; its length is used as the entity count.
      rel_v: relation vocabulary; its length is used as the relation count.
    """
    triples = TripleDataset.load(data_path, ent_v, rel_v)
    return cls(triples, len(ent_v), len(rel_v))










#### Loading a dataset of the following form and training with PyTorch
```python
data = {
    'head': ['A', 'B', 'C', 'A', 'C'],
    'relation': ['r1', 'r2', 'r3', 'r1', 'r2'],
    'tail': ['B', 'C', 'A', 'C', 'B']
}
```

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from collections import defaultdict
import random

In [None]:
#load data
def load_data(file_path):
  """Read a header-less, tab-separated triple file into a DataFrame.

  The three columns are labelled 'head', 'relation' and 'tail', in that order.
  """
  triples = pd.read_csv(file_path, header=None, sep="\t")
  column_names = ['head', 'relation', 'tail']
  triples.columns = column_names
  return triples

def create_mappings(data):
  """Assign a contiguous integer id to every entity and relation in `data`.

  Ids are assigned in order of first appearance (all heads are scanned before
  the tails), so the same input always produces the same mapping. Iterating a
  set of strings would not guarantee that across interpreter runs.

  Args:
    data: table indexable by the column names "head", "relation" and "tail",
      each yielding an iterable of hashable labels.

  Returns:
    (entity_to_id, relation_to_id): two dicts mapping label -> id, with ids
    running from 0 to len(dict) - 1.
  """
  # dict.fromkeys de-duplicates while preserving insertion order.
  entities = dict.fromkeys([*data["head"], *data["tail"]])
  relations = dict.fromkeys(data["relation"])

  entity_to_id = {entity: i for i, entity in enumerate(entities)}
  relation_to_id = {relation: i for i, relation in enumerate(relations)}

  return entity_to_id, relation_to_id

def encode_triplets(data, entity_to_id, relation_to_id):
  """Translate the label columns of `data` into arrays of integer ids.

  Returns a (head_ids, relation_ids, tail_ids) tuple; each element is the
  `.values` array of the corresponding column after mapping it through the
  entity or relation dictionary.
  """
  def as_ids(column, lookup):
    # Series.map applies the dictionary element-wise.
    return data[column].map(lookup).values

  return (
      as_ids("head", entity_to_id),
      as_ids("relation", relation_to_id),
      as_ids("tail", entity_to_id),
  )


# Read the tab-separated triples from FB15k.txt (path relative to the working
# directory), build the entity/relation id dictionaries, and encode every
# triple as integer ids.
data = load_data("FB15k.txt")
entity_to_id, relation_to_id = create_mappings(data)
head_ids, relation_ids, tail_ids = encode_triplets(data, entity_to_id, relation_to_id)

In [None]:
#TransE model
class TransE(nn.Module):
  """TransE scorer: a triple is scored by the L1 norm of head + relation - tail.

  Entities and relations each get an `embedding_dim`-sized embedding,
  initialised with Xavier-uniform values.
  """

  def __init__(self, num_entities, num_relations, embedding_dim, margin=1.0):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.margin = margin

    self.entity_embeddings = nn.Embedding(num_entities, embedding_dim)
    self.relation_embeddings = nn.Embedding(num_relations, embedding_dim)

    # Entities first, then relations, both Xavier-uniform.
    for table in (self.entity_embeddings, self.relation_embeddings):
      nn.init.xavier_uniform_(table.weight.data)

  def forward(self, heads, relations, tails, negative_heads, negative_tails):
    """Return (pos_distance, neg_distance), one value per row of the batch.

    The positive distance uses (heads, relations, tails); the negative distance
    uses (negative_heads, relations, negative_tails), i.e. the same relation.
    """
    entity = self.entity_embeddings
    translation = self.relation_embeddings(relations)

    def l1_residual(src, dst):
      # Norm over dim=1 collapses the embedding axis to one score per row.
      return torch.linalg.norm(entity(src) + translation - entity(dst), ord=1, dim=1)

    return l1_residual(heads, tails), l1_residual(negative_heads, negative_tails)

  def loss(self, pos_distance, neg_distance):
    """Mean margin ranking loss.

    Zero once every positive distance is smaller than its negative distance
    by at least `margin`; positive otherwise.
    """
    violation = self.margin + pos_distance - neg_distance
    return torch.relu(violation).mean()





In [7]:
#generate negative samples
def generate_negative_samples(heads, tails, relation, num_entities=None):
  """Draw one corrupted (head, tail) pair per input triple by rejection sampling.

  For the i-th triple a random head and a random tail are drawn uniformly from
  ``range(num_entities)`` until (neg_head, relation[i], neg_tail) is not one of
  the triples passed in. Only the triples given to this call are treated as
  positives; triples elsewhere in the dataset are not known here.

  Args:
    heads: iterable of integer head ids (list, numpy array or 1-D tensor).
    tails: iterable of integer tail ids, same length as `heads`.
    relation: iterable of integer relation ids, same length as `heads`.
    num_entities: size of the entity id range to sample from. Defaults to
      ``max(id in heads or tails) + 1``, i.e. only ids seen in this call.

  Returns:
    (neg_heads, neg_tails): two int64 numpy arrays with one entry per input
    triple. Both are empty when the input is empty.

  Raises:
    ValueError: if the inputs differ in length, `num_entities` is not positive,
      or every (head, tail) pair is already a positive for some relation (no
      negative exists, so sampling would never terminate).
  """
  # Plain ints make the triples hashable and comparable regardless of whether
  # the caller passed lists, numpy arrays or tensors.
  head_idx = [int(h) for h in heads]
  tail_idx = [int(t) for t in tails]
  relation_idx = [int(r) for r in relation]
  if not (len(head_idx) == len(tail_idx) == len(relation_idx)):
    raise ValueError("heads, tails and relation must have the same length")

  if not head_idx:
    empty = np.array([], dtype=np.int64)
    return empty, empty.copy()

  if num_entities is None:
    num_entities = max(max(head_idx), max(tail_idx)) + 1
  num_entities = int(num_entities)
  if num_entities <= 0:
    raise ValueError("num_entities must be positive")

  # Built once: O(1) membership instead of rescanning the batch per attempt.
  positives = set(zip(head_idx, relation_idx, tail_idx))

  # Refuse to start if some relation has no free (head, tail) pair left.
  taken_per_relation = defaultdict(int)
  for h, r, t in positives:
    if 0 <= h < num_entities and 0 <= t < num_entities:
      taken_per_relation[r] += 1
  capacity = num_entities * num_entities
  for r in set(relation_idx):
    if taken_per_relation[r] >= capacity:
      raise ValueError(
          "no negative sample exists for relation %d with %d entities" % (r, num_entities))

  neg_heads = []
  neg_tails = []
  for rel in relation_idx:
    while True:
      neg_head = random.randrange(num_entities)
      neg_tail = random.randrange(num_entities)
      if (neg_head, rel, neg_tail) not in positives:
        break
    neg_heads.append(neg_head)
    neg_tails.append(neg_tail)

  return np.array(neg_heads, dtype=np.int64), np.array(neg_tails, dtype=np.int64)

def train_model(model, data, entity_to_id, relation_to_id, epochs=100, batch_size=128, lr=0.001):
  """Train `model` with Adam on the triples in `data`, printing the loss per epoch.

  Triples are visited in stored order in mini-batches of `batch_size`; the
  final, possibly smaller, batch is included. Negative samples are redrawn for
  every batch with `generate_negative_samples`.

  Args:
    model: module whose call returns (pos_distance, neg_distance) and that
      exposes ``loss(pos_distance, neg_distance)``.
    data: table with "head", "relation" and "tail" columns.
    entity_to_id: dict mapping entity label -> integer id.
    relation_to_id: dict mapping relation label -> integer id.
    epochs: number of passes over the data.
    batch_size: number of triples per optimisation step.
    lr: Adam learning rate.

  Returns:
    List with the average batch loss of each epoch, in order.

  Raises:
    ValueError: if `batch_size` is not positive or `data` holds no triples.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be positive")

  optimizer = optim.Adam(model.parameters(), lr=lr)
  head_ids, relation_ids, tail_ids = encode_triplets(data, entity_to_id, relation_to_id)
  num_triples = len(head_ids)
  if num_triples == 0:
    raise ValueError("no triples to train on")
  # Ceiling division: a trailing partial batch still counts as a batch.
  num_batches = (num_triples + batch_size - 1) // batch_size

  epoch_losses = []
  for epoch in range(epochs):
    total_loss = 0.0
    for batch_start in range(0, num_triples, batch_size):
      batch_end = batch_start + batch_size  # slicing clamps at the array end
      heads = torch.tensor(head_ids[batch_start:batch_end])
      relations = torch.tensor(relation_ids[batch_start:batch_end])
      tails = torch.tensor(tail_ids[batch_start:batch_end])

      negative_heads, negative_tails = generate_negative_samples(heads, tails, relations)
      negative_heads = torch.tensor(negative_heads)
      negative_tails = torch.tensor(negative_tails)

      optimizer.zero_grad()
      pos_distance, neg_distance = model(heads, relations, tails, negative_heads, negative_tails)
      loss = model.loss(pos_distance, neg_distance)

      loss.backward()
      optimizer.step()

      total_loss += loss.item()
    avg_loss = total_loss / num_batches
    epoch_losses.append(avg_loss)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

  return epoch_losses

tensor([3., 7.])

### Random Graph Generation

In [None]:
#erdos renyi random graph model
import torch
from torch_geometric.utils import erdos_renyi_graph
import torch_geometric
from torch_geometric.data import Data
import networkx as nx
import matplotlib.pyplot as plt

#Erdos params
num_nodes = 100   # number of vertices in the random graph
edge_prob = 0.1   # edge probability handed to erdos_renyi_graph

#generated edge index
edge_index = erdos_renyi_graph(num_nodes, edge_prob)

#creating a PyG Data Object
# num_nodes is passed explicitly: with only edge_index, the node count has to
# be guessed from the largest index that appears in an edge, which
# under-counts whenever the highest-numbered nodes happen to be isolated.
data = Data(edge_index=edge_index, num_nodes=num_nodes)

def plot_graph(data):
  """Draw `data` as an undirected NetworkX graph with a fixed-seed spring layout.

  Args:
    data: object with an `edge_index` tensor of shape (2, num_edges) and,
      optionally, a `num_nodes` attribute.
  """
  G = nx.Graph()
  # Register every node first so isolated nodes (no incident edge) are drawn
  # too; building the graph from the edge list alone would silently drop them.
  num_nodes = getattr(data, "num_nodes", None)
  if num_nodes is not None:
    G.add_nodes_from(range(num_nodes))
  # edge_index holds sources in row 0 and targets in row 1; transpose to pairs.
  G.add_edges_from(data.edge_index.t().tolist())
  pos = nx.spring_layout(G, seed=1)  # fixed seed -> reproducible layout
  nx.draw(G, pos, with_labels=True, node_color="skyblue", node_size=500,edge_color="gray")
  plt.show()

# Render the PyG Data object currently bound to `data`.
plot_graph(data)