In [1]:
import torch
import numpy as np
import pandas as pd
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import random
import os
import math
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
# Colab-only step: mount Google Drive at /content/gdrive. The dataset files
# read later in this notebook are addressed by paths under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Load the training triples for a quick look: a headerless, tab-separated file
# whose three columns are named head / relation / tail. keep_default_na=False
# stops pandas from turning strings such as "NA" into NaN.
raw_data = pd.read_csv(
    '/content/gdrive/MyDrive/colab notebook/fb15k/freebase_mtr100_mte100-train.txt',
    sep='\t',
    header=None,
    names=['head', 'relation', 'tail'],
    keep_default_na=False,
    encoding='utf-8',
)

In [3]:
# Remove stray leading/trailing whitespace from every field.
# DataFrame.applymap is deprecated (pandas >= 2.1); stripping column by column
# with the vectorised .str accessor gives the same result on string columns
# and works across pandas versions.
raw_data = raw_data.apply(lambda column: column.str.strip())
raw_data

Unnamed: 0,head,relation,tail
0,/m/027rn,/location/country/form_of_government,/m/06cx9
1,/m/017dcd,/tv/tv_program/regular_cast./tv/regular_tv_app...,/m/06v8s0
2,/m/07s9rl0,/media_common/netflix_genre/titles,/m/0170z3
3,/m/01sl1q,/award/award_winner/awards_won./award/award_ho...,/m/044mz_
4,/m/0cnk2q,/soccer/football_team/current_roster./sports/s...,/m/02nzb8
...,...,...,...
483137,/m/0gpx6,/award/award_nominated_work/award_nominations....,/m/0gq6s3
483138,/m/020jqv,/award/award_nominee/award_nominations./award/...,/m/09d3b7
483139,/m/0524b41,/award/award_winning_work/awards_won./award/aw...,/m/0lp_cd3
483140,/m/0kvsb,/people/person/education./education/education/...,/m/050xpd


In [4]:
# (number of triples, 3) -- a DataFrame's shape equals the shape of its .values array.
raw_data.shape

(483142, 3)

In [5]:
# Scratch check of the vocabulary idiom used by the dataset classes below:
# Counter keeps keys in first-seen order, so enumerating them yields a stable
# token -> index mapping.
A = list('abcdec')
a = Counter(A)
print(a)
b = [key for key in a]
c = {word: idx for idx, word in enumerate(b)}
c['a']

Counter({'c': 2, 'a': 1, 'b': 1, 'd': 1, 'e': 1})


0

In [6]:
# Preprocessing Data: Reference: https://github.com/toooooodo/pytorch-TransE
class TrainSet(Dataset):
    """Training triples plus one pre-sampled corrupted (negative) triple per positive.

    The triple file is tab-separated with three columns (head, relation, tail)
    and no header row. Entities and relations are mapped to contiguous integer
    ids in first-seen order (all heads first, then tails that never occur as a
    head).

    Attributes set by ``__init__``:
        raw_data: ``(n, 3)`` array of the stripped string triples.
        entity_to_index / relation_to_index: name -> integer id.
        entity_num / relation_num / triple_num: sizes of the above.
        pos_data: ``(n, 3)`` int64 array of ``[head, relation, tail]`` ids.
        related_dic: entity id -> set of entity ids it shares a triple with.
        neg_data: ``(n, 3)`` int64 array; row ``i`` corrupts ``pos_data[i]``.

    Note: negatives are sampled once, at construction time, and reused for
    every epoch.
    """

    # Location of the FB15k training split used by this notebook.
    DEFAULT_PATH = '/content/gdrive/MyDrive/colab notebook/fb15k/freebase_mtr100_mte100-train.txt'

    def __init__(self, path=DEFAULT_PATH):
        """Load ``path``, build the vocabularies and pre-sample the negatives.

        :param path: triple file to read; defaults to the notebook's FB15k
            training file so ``TrainSet()`` keeps working unchanged.
        """
        super(TrainSet, self).__init__()
        self.path = path
        self.raw_data, self.entity_to_index, self.relation_to_index = self.load_text()
        self.entity_num, self.relation_num = len(self.entity_to_index), len(self.relation_to_index)
        self.triple_num = self.raw_data.shape[0]
        print(f'Train set: {self.entity_num} entities, {self.relation_num} relations, {self.triple_num} triplets.')
        self.pos_data = self.convert_word_to_index(self.raw_data)
        self.related_dic = self.get_related_entity()
        self.neg_data = self.generate_neg()

    def __len__(self):
        """Number of positive triples."""
        return self.triple_num

    def __getitem__(self, item):
        """Return ``[positive_triple, negative_triple]`` for row ``item``."""
        return [self.pos_data[item], self.neg_data[item]]

    def load_text(self, path=None):
        """Read the triple file and build the entity / relation vocabularies.

        :param path: file to read; defaults to the path given to ``__init__``.
        :return: ``(raw triples as an (n, 3) array of str, entity_dic, relation_dic)``
        """
        if path is None:
            path = getattr(self, 'path', self.DEFAULT_PATH)
        # dtype=str keeps every field textual so the .str accessor is always valid.
        raw_data = pd.read_csv(path, sep='\t', header=None,
                               names=['head', 'relation', 'tail'],
                               dtype=str, keep_default_na=False, encoding='utf-8')
        # Vectorised strip; replaces the deprecated DataFrame.applymap.
        raw_data = raw_data.apply(lambda column: column.str.strip())
        # dict.fromkeys de-duplicates while keeping first-seen order: heads
        # first, then tails that never appeared as a head.
        entity_names = dict.fromkeys(raw_data['head'].tolist() + raw_data['tail'].tolist())
        relation_names = dict.fromkeys(raw_data['relation'].tolist())
        entity_dic = {name: idx for idx, name in enumerate(entity_names)}
        relation_dic = {name: idx for idx, name in enumerate(relation_names)}
        return raw_data.values, entity_dic, relation_dic

    def convert_word_to_index(self, data):
        """Map string triples to integer ids.

        :param data: iterable of ``(head, relation, tail)`` names.
        :return: ``(n, 3)`` int64 array (shape ``(0, 3)`` for empty input).
        :raises KeyError: if a name is missing from the vocabularies.
        """
        index_list = np.array(
            [[self.entity_to_index[head], self.relation_to_index[relation], self.entity_to_index[tail]]
             for head, relation, tail in data],
            dtype=np.int64)
        return index_list.reshape(-1, 3)

    def generate_neg(self):
        """
        generate negative sampling

        For every positive triple, either the head or the tail (chosen by a
        coin flip on each attempt) is replaced with a random entity. A
        candidate is rejected when it already shares a training triple with
        the entity that is kept, and sampling is retried until one is accepted.

        :return: same shape as positive sampling
        """
        neg_data = []
        population = range(self.entity_num)
        # Random entity ids are drawn in batches of 10k to amortise the cost
        # of calling random.choices.
        neg_candidates, cursor = [], 0
        for head, relation, tail in self.pos_data.tolist():
            while True:
                if cursor == len(neg_candidates):
                    neg_candidates = random.choices(population=population, k=int(1e4))
                    cursor = 0
                neg = neg_candidates[cursor]
                cursor += 1
                if random.randint(0, 1) == 0:
                    # replace head
                    if neg not in self.related_dic[tail]:
                        neg_data.append([neg, relation, tail])
                        break
                else:
                    # replace tail
                    if neg not in self.related_dic[head]:
                        neg_data.append([head, relation, neg])
                        break

        return np.array(neg_data, dtype=np.int64).reshape(-1, 3)

    def get_related_entity(self):
        """
        get related entities

        Two entities are related when they occur together in at least one
        positive triple, in either direction and under any relation.

        :return: {entity_id: {related_entity_id_1, related_entity_id_2...}}
        """
        related_dic = dict()
        for head, _, tail in self.pos_data.tolist():
            related_dic.setdefault(head, set()).add(tail)
            related_dic.setdefault(tail, set()).add(head)
        return related_dic
        
class TestSet(Dataset):
    """Test triples read from a tab-separated (head, relation, tail) file.

    After construction ``data`` holds the raw string triples. Call
    ``convert_word_to_index`` with the training vocabularies to replace it
    with integer ids before wrapping the dataset in a ``DataLoader``.
    """

    # Location of the FB15k test split used by this notebook.
    DEFAULT_PATH = '/content/gdrive/MyDrive/colab notebook/fb15k/freebase_mtr100_mte100-test.txt'

    def __init__(self, path=DEFAULT_PATH):
        """Load the triples from ``path``.

        :param path: triple file to read; defaults to the notebook's FB15k
            test file so ``TestSet()`` keeps working unchanged.
        """
        super(TestSet, self).__init__()
        self.path = path
        self.raw_data = self.load_text()
        # Still strings at this point; replaced by convert_word_to_index().
        self.data = self.raw_data
        print(f"Test set: {self.raw_data.shape[0]} triplets")

    def __getitem__(self, item):
        """Return row ``item`` of ``data``."""
        return self.data[item]

    def __len__(self):
        """Number of test triples."""
        return self.data.shape[0]

    def load_text(self, path=None):
        """Read the triple file.

        :param path: file to read; defaults to the path given to ``__init__``.
        :return: ``(n, 3)`` array of whitespace-stripped strings.
        """
        if path is None:
            path = getattr(self, 'path', self.DEFAULT_PATH)
        # dtype=str keeps every field textual so the .str accessor is always valid.
        raw_data = pd.read_csv(path, sep='\t', header=None,
                               names=['head', 'relation', 'tail'],
                               dtype=str, keep_default_na=False, encoding='utf-8')
        # Vectorised strip; replaces the deprecated DataFrame.applymap.
        raw_data = raw_data.apply(lambda column: column.str.strip())
        return raw_data.values

    def convert_word_to_index(self, entity_to_index, relation_to_index, data):
        """Replace ``self.data`` with the integer-id version of ``data``.

        :param entity_to_index: entity name -> id mapping.
        :param relation_to_index: relation name -> id mapping.
        :param data: iterable of ``(head, relation, tail)`` names.
        :raises KeyError: if a name is missing from the supplied mappings.
        """
        index_list = np.array(
            [[entity_to_index[head], relation_to_index[relation], entity_to_index[tail]]
             for head, relation, tail in data],
            dtype=np.int64)
        self.data = index_list.reshape(-1, 3)

In [7]:
# Scratch check: expand() repeats a tensor along its size-1 dimensions
# (dims 0 and 2 of this (1, 3, 1) tensor).
A = torch.tensor([1, 2, 3]).reshape(1, 3, 1)
A.expand(4, 3, 1)
A.expand(4, 3, 4)

tensor([[[1, 1, 1, 1],
         [2, 2, 2, 2],
         [3, 3, 3, 3]],

        [[1, 1, 1, 1],
         [2, 2, 2, 2],
         [3, 3, 3, 3]],

        [[1, 1, 1, 1],
         [2, 2, 2, 2],
         [3, 3, 3, 3]],

        [[1, 1, 1, 1],
         [2, 2, 2, 2],
         [3, 3, 3, 3]]])

In [8]:
# Scratch check: insert a new axis, repeat each row four times along it, then
# take the two largest entries along that repeated axis.
B = torch.tensor([[1, 2, 3], [4, 3, 6], [1, 3, 7]])
B = B[:, None, :].expand(3, 4, 3)
print(B)
v, i = B.topk(k=2, dim=1)
print(v)
print(i)

tensor([[[1, 2, 3],
         [1, 2, 3],
         [1, 2, 3],
         [1, 2, 3]],

        [[4, 3, 6],
         [4, 3, 6],
         [4, 3, 6],
         [4, 3, 6]],

        [[1, 3, 7],
         [1, 3, 7],
         [1, 3, 7],
         [1, 3, 7]]])
tensor([[[1, 2, 3],
         [1, 2, 3]],

        [[4, 3, 6],
         [4, 3, 6]],

        [[1, 3, 7],
         [1, 3, 7]]])
tensor([[[2, 2, 2],
         [3, 3, 3]],

        [[2, 2, 2],
         [3, 3, 3]],

        [[2, 2, 2],
         [3, 3, 3]]])


In [9]:
#Transe code
class Transe(nn.Module):
  """TransE-style translation model: a triple (h, r, t) is scored by ||h + r - t||_2.

  Args:
    entity_num: number of entities (rows of the entity embedding table).
    relation_num: number of relations (rows of the relation embedding table).
    emb_dim: embedding dimension shared by entities and relations.
    gamma: margin of the ranking loss; a float or a tensor that broadcasts
      against a per-triple distance vector.
  """

  def __init__(self, entity_num, relation_num, emb_dim, gamma ):
    super().__init__()
    self.entity_num = entity_num
    self.relation_num = relation_num
    self.emb_dim = emb_dim
    self.gamma = gamma
    self.entity_embedding = nn.Embedding(entity_num, emb_dim)
    self.relation_embedding = nn.Embedding(relation_num, emb_dim)
    init_std = 6 / math.sqrt(self.emb_dim)
    nn.init.normal_(self.entity_embedding.weight, std=init_std)
    nn.init.normal_(self.relation_embedding.weight, std=init_std)
    # Relation vectors are scaled to unit L2 norm once, at construction.
    # Done in place under no_grad instead of re-assigning `.data`.
    with torch.no_grad():
      relation_weight = self.relation_embedding.weight
      relation_weight.div_(torch.norm(relation_weight, dim=1, keepdim=True))

  def forward(self, ph, pr, pt, nh, nr, nt):
    """Return the translation residuals ``h + r - t`` for positives and negatives.

    Args:
      ph, pr, pt: head / relation / tail index tensors of the positive triples.
      nh, nr, nt: head / relation / tail index tensors of the negative triples.

    Returns:
      ``(positive_residual, negative_residual)``; each has the shape of its
      index tensors with a trailing ``emb_dim`` axis.
    """
    positive = self.entity_embedding(ph) + self.relation_embedding(pr) - self.entity_embedding(pt)
    negative = self.entity_embedding(nh) + self.relation_embedding(nr) - self.entity_embedding(nt)
    return positive, negative

  def loss(self, pd, nd):
    """Summed margin ranking loss ``sum(relu(gamma + ||pd||_2 - ||nd||_2))``.

    Args:
      pd: residuals of the positive triples, norm taken over dim 1.
      nd: residuals of the matching negative triples, same shape as ``pd``.
    """
    distance_diff = self.gamma + torch.norm(pd, p=2, dim=1) - torch.norm(nd, p=2 ,dim=1)
    return torch.sum(F.relu(distance_diff))

  def top_k(self, h, r, t, k=10):
    """Count triples whose true tail is among the ``k`` entities nearest to ``h + r``.

    Args:
      h, r, t: 1-D index tensors (head, relation, true tail), one entry per triple.
      k: number of nearest candidates kept per triple; values larger than
        ``entity_num`` are clamped to ``entity_num``.

    Returns:
      int: number of hits in the batch (not a ratio).
    """
    k = min(k, self.entity_num)
    # Pure evaluation: run without autograd so no graph is recorded.
    with torch.no_grad():
      hr = self.entity_embedding(h) + self.relation_embedding(r)
      # [batch, entity_num] Euclidean distances computed directly, rather than
      # by subtracting two [batch, entity_num, emb_dim] tensors. The exact
      # (non-matmul) mode keeps the values in line with an explicit norm.
      distances = torch.cdist(hr, self.entity_embedding.weight, p=2,
                              compute_mode='donot_use_mm_for_euclid_dist')
      _, indices = torch.topk(distances, k, dim=1, largest=False)
      return torch.sum(torch.eq(indices, t.view(-1, 1))).item()
 


In [10]:
# Seed the Python and torch RNGs so negative sampling, weight initialisation
# and batch shuffling are repeatable from one run to the next.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is present; fall back to the CPU instead of crashing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
emb_dim=50  # embedding dimension for entities and relations
gamma=1     # margin of the ranking loss
# Keep the margin as a one-element float tensor that lives on `device`.
gamma = torch.tensor([gamma], dtype=torch.float32, device=device)
lr = 1e-2   # SGD learning rate

In [11]:
# Build the datasets; the test triples are indexed with the *training*
# vocabularies so both splits share entity / relation ids.
train_dataset = TrainSet()
test_dataset = TestSet()
test_dataset.convert_word_to_index(train_dataset.entity_to_index, train_dataset.relation_to_index,
                                  test_dataset.raw_data)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# The hit count is a sum over all test triples, so their order is irrelevant
# and shuffling is unnecessary.
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)
transe = Transe(train_dataset.entity_num, train_dataset.relation_num, emb_dim=emb_dim,
              gamma=gamma).to(device)
optimizer = optim.SGD(transe.parameters(), lr=lr, momentum=0)
for epoch in range(50):
    # e <= e / ||e||  (entity vectors are rescaled to unit L2 norm once per epoch)
    with torch.no_grad():
        entity_weight = transe.entity_embedding.weight
        entity_weight.div_(torch.norm(entity_weight, dim=1, keepdim=True))
    total_loss = 0
    for pos, neg in train_loader:
        # pos / neg: [batch_size, 3] -> three [batch_size] index vectors each
        pos_head, pos_relation, pos_tail = pos.to(device).unbind(dim=1)
        neg_head, neg_relation, neg_tail = neg.to(device).unbind(dim=1)

        pod, nd = transe(pos_head, pos_relation, pos_tail, neg_head, neg_relation, neg_tail)
        loss = transe.loss(pod, nd)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"epoch {epoch+1}, loss = {total_loss/len(train_dataset)}")
    # The reported "test accuracy" is the fraction of test triples whose true
    # tail is among the 10 entities closest to head + relation.
    correct_test = 0
    with torch.no_grad():  # evaluation only: do not record an autograd graph
        for data in test_loader:
            # data: [batch_size, 3] -> head, relation, tail index vectors
            head, relation, tail = data.to(device).unbind(dim=1)
            correct_test += transe.top_k(head, relation, tail, k=10)
    print(f"===>epoch {epoch+1}, test accuracy {correct_test/len(test_dataset)}")

Train set: 14951 entities, 1345 relations, 483142 triplets.
Test set: 59071 triplets
epoch 1, loss = 0.8066828239219257
===>epoch 1, test accuracy 0.2117451879941088
epoch 2, loss = 0.623106359329608
===>epoch 2, test accuracy 0.24220006432936636
epoch 3, loss = 0.4832910496918213
===>epoch 3, test accuracy 0.26332718254304144
epoch 4, loss = 0.3673590011021406
===>epoch 4, test accuracy 0.2849790929559344
epoch 5, loss = 0.2882146504155964
===>epoch 5, test accuracy 0.29809889793638167
epoch 6, loss = 0.23809900558151126
===>epoch 6, test accuracy 0.3081376648440013
epoch 7, loss = 0.2051352477770555
===>epoch 7, test accuracy 0.3165681975927274
epoch 8, loss = 0.18222817386360943
===>epoch 8, test accuracy 0.32247634202908365
epoch 9, loss = 0.16523460143927204
===>epoch 9, test accuracy 0.32831677134296017
epoch 10, loss = 0.1521537794412969
===>epoch 10, test accuracy 0.33036515379797193
epoch 11, loss = 0.1417125325221378
===>epoch 11, test accuracy 0.3354268592033316
epoch 12, lo

KeyboardInterrupt: ignored

In [None]:
import torch
# Scratch check: row-wise L2 norm with the reduced axis kept, as used when
# normalising embedding tables.
A = torch.tensor([[1, 2, 3], [2, 3, 4]]).double()
A.norm(dim=1, keepdim=True)

tensor([[3.7417],
        [5.3852]], dtype=torch.float64)

In [None]:
# Scratch check: re-initialise a small embedding table from N(0, 3^2);
# normal_ fills the weight in place and returns the same Parameter.
A = nn.Embedding(num_embeddings=4, embedding_dim=10)
nn.init.normal_(A.weight, mean=0.0, std=3)

Parameter containing:
tensor([[ 1.0900, -0.7703,  6.1650,  1.1643,  5.0615, -1.0738,  0.1473,  2.8213,
          2.4788,  3.3891],
        [-1.1225, -2.8458, -2.5840, -3.0610, -2.7979,  2.6933, -1.8898, -6.2910,
         -2.1104, -1.6318],
        [ 2.9061,  1.2026, -3.0733, -1.2712, -2.9923, -3.0825,  3.9839,  1.6100,
          3.4436, -3.6691],
        [-7.2106, -1.0447, -3.5175, -2.6805,  2.6255,  2.0468, -0.8045, -3.1000,
          1.7648,  1.9586]], requires_grad=True)