In [1]:
import math
import torch
import pandas as pd
from operator import itemgetter
import random
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import linalg as LA

In [6]:
# maybe use named_tuples to represent the (h,r,t,h',t')
if torch.cuda.is_available():  
  device = "cuda:0" 
else:  
  device = "cpu"
# device

In [7]:
def read_file(path, header, seperator):
  return pd.read_csv(path, header= header, sep=seperator)

In [8]:
# chose the dimension to be d=1000
def convert_to_dict(text_data):
  '''
  convert read file from csv to dict, where key is the data read from the csv file and the value is a tensor of size 1000 (maybe later make size as a function parameter)
  '''
  output_dict = dict()
#     torch.split(temp1, 1, dim=1)    
  matrix=torch.rand(1000, len(text_data))
  vectors = torch.split(matrix, 1, dim=1)
    
  for i in range(len(text_data)):
#         temp = torch.reshape(vectors[i][0], (-1, 1))
      output_dict[text_data[i]] = vectors[i]/torch.linalg.vector_norm(vectors[i])
      output_dict[text_data[i]].requires_grad=True
  return output_dict

In [9]:
def read_training_data(path, header, seperator):
    '''
    returns a list of sorted (according to relation) training instances (triplets) 
    i decided to return a sorted list, because i believe it improves the effeciency while generating the false samples
    '''
    data = pd.read_csv(path, header=header, sep=seperator)
    training_data = data.values.tolist()
    training_data = sorted(training_data, key=itemgetter(1)) 
    return training_data

In [10]:
def is_relation_false(training_data, sample):
  '''
  assuming a closed world world assumption
  check if the false sample (sample) exist in the training data (meaning it is a true relation)
  '''
  for i in range(len(training_data)):
      unique = True
      if sample[0] == training_data[i][0] and sample[1] == training_data[i][2]:
          unique = False
          break
                    
      if training_data[i][1] != training_data[0][1]:
          break
  return unique

In [11]:
def is_relation_in_samples(generated_samples, sample_to_test):
  '''
  check if generated false sample was already generated in a pervious iteration (to avoid repetitions of false samples)
  '''
  generated = False
  for sample in generated_samples:
      if sample_to_test[0] == sample[0] and sample_to_test[1] == sample[1]:
          generated = True
  return generated

In [12]:
def generate_false_samples(entities_dict, positive_sample, training_data, index_start_of_relation, num_needed_samples):
  '''
  positive_sample: represents the sample we want to generate the false samples for,  form: [h, r, t]
  
  training_data: all the training data sorted according to the relation (h, r, t)
  
  index_start_of_relation: represents the index in the training_data, where the samples in the training set has the same relation (r) as the relation in the positive sample

  num_needed_samples: how many false smaples to generate
  '''
  counter = 0  # to avoid running in an infinite loop, if there are not enough false relations between the entities to generate the false samples needed
  generated_samples = []  # generated false samples

  while len(generated_samples) < num_needed_samples and counter < 30000:

      head_or_tail = random.randint(0, 1)  # do decide whether to replace the head or the tail in the false sample
      random_entity = random.choice(entities_dict)  # choose a random entity
      counter += 1
        
        # 1 for head
      if(head_or_tail):
          negative_sample = [random_entity, positive_sample[2]]    
      else:
          negative_sample = [positive_sample[0], random_entity]

      relation_is_false = is_relation_false(training_data[index_start_of_relation:], negative_sample)
      relation_already_generated = is_relation_in_samples(generated_samples, negative_sample)
            
      if is_relation_false and not(relation_already_generated):
          generated_samples.append(negative_sample)
    
  positive_sample.extend(generated_samples)  # [h, r, t, [generated_sample_1], [generated_sample_2], ... [generated_sample_n]]
  return positive_sample  
        

In [13]:
def is_false_samples_generated():
  '''
  checks if training data with false samples inclded were generated and saved in a previous run of the program
  '''
  extended_data = Path("/content/extended_training_data.txt")
  return extended_data.is_file()

In [14]:
# def encoding_training_data(entity_dict, relation_dict, training_data):
#     for i in range(len(training_data)):
#         for j in range(len(training_data[i])):
#             if j == 1:
#                 training_data[i][j] = relation_dict[training_data[i][j]]
#             elif j < 3:
#                 training_data[i][j] = entity_dict[training_data[i][j]]
#             else:
#                 training_data[i][j] = training_data[i][j].strip("[]").split(",")
#                 training_data[i][j] = [entity_dict[int(training_data[i][j][0])], entity_dict[int(training_data[i][j][1])]]
#     return training_data

In [15]:
def read_extended_training_data(training_data):
  '''
  gets text file of extended training data (training data that includes false samples)
  and returns a list of training samples in the form [h, r, t, [generated_sample_1], [generated_sample_2], ... [generated_sample_n]]
  '''
  for i in range(len(training_data)):
      for j in range(3, len(training_data[i])):
          training_data[i][j] = training_data[i][j].strip("[]").split(",")
          training_data[i][j][0] = int(training_data[i][j][0]) 
          training_data[i][j][1] = int(training_data[i][j][1])
  return training_data

In [16]:
def reshape_training_data(training_data):
  '''
  reshapes training data samples to the form
  [h, r, t, [false_heads], [false_tails]]
  '''
  for i in range(len(training_data)):
    sample = []
    new_training_ex_1 =[]
    new_training_ex_2 =[]
    sample.extend(training_data[i][0:3])
    for j in range(3, len(training_data[i])):
      new_training_ex_1.append(training_data[i][j][0])
      new_training_ex_2.append(training_data[i][j][1])
    sample.append(new_training_ex_1)
    sample.append(new_training_ex_2)
    training_data[i] = sample
  return training_data
# print(sample)

In [19]:
entities_temp_list = read_file('/content/entities.dict', header=None, seperator='\t')[1]
entities = convert_to_dict(entities_temp_list)

In [20]:
relation_temp_list = read_file('/content/relations.dict', header=None, seperator='\t')[1]
relations = convert_to_dict(relation_temp_list)
# relations

In [129]:
# # training_data = read_training_data('./data/wn18rr/train.txt', header=None, seperator='\t')
# training_data = read_training_data('./content/train.txt', header=None, seperator='\t')
# training_data[0:10]

In [22]:
# oprimaly false_samples_count should be 20, so num_needed_samples= 20
if is_false_samples_generated():
    print("found file")
    read_training_data = pd.read_csv('/content/extended_training_data.txt', header=None, sep='\t')
    training_data = read_training_data.values.tolist()
    training_data = read_extended_training_data(training_data)

    # should add an if statement to check shape
    training_data = reshape_training_data(training_data)
else:

  # training_data = read_training_data('./data/wn18rr/train.txt', header=None, seperator='\t')
  training_data = read_training_data('./content/train.txt', header=None, seperator='\t')
  training_data[0:10]
  for i  in range(len(training_data)):
      print(i)
      if not (training_data[i][1] == training_data[i-1][1]):
          index_start_of_relation = i
          print("new index: ", index_start_of_relation)
          print(training_data[i][1])
                        
      training_data[i] = generate_false_samples(entities_temp_list, 
                                              training_data[i], 
                                              training_data, 
                                              index_start_of_relation,
                                              num_needed_samples=10)
    
  textfile = open("/content/extended_training_data.txt", "w")
  for row in training_data:
      for i in range(len(row)):
          if i != len(row)-1:
              textfile.write(str(row[i]) + "\t")
          else:
              textfile.write(str(row[i]))
      textfile.write("\n")
  textfile.close()

found file


In [23]:
# check if any of training_data samples has a different len 
# ex. = 13 [h, r, t, [generated_sample_1], [generated_sample_2], ... [generated_sample_10]] 

# counter = 0
# for i in training_data:
#     if len(i) != 13:
#         counter +=1
# print(counter)

In [24]:
# difference = []
# ex_1 = [torch.tensor([2,2,3,4]), torch.tensor([1,2,3,4])]
# ex_2 = [torch.tensor([3,2,3,4]), torch.tensor([1,2,3,4])]
# zip_object = zip(sample[4], sample[5])
# for head, tail in zip_object:
#     difference.append(head-tail)
# print(difference)
# # print([tensy for tensy in ex_2])
# # print([(torch.subtract(tensy, ex_1)) for tensy in ex_2])


In [25]:
class KGEDataset(Dataset):
  def __init__(self, training_data):
        # Initialize data, download, etc.
        self.training_data = training_data
        self.n_samples = len(training_data)


  # support indexing such that dataset[i] can be used to get i-th sample
  def __getitem__(self, index):
    return training_data[index]

  def __len__(self):
    return self.n_samples

In [26]:
class KGEmodel(nn.Module):
  def __init__(self, entities, relations):
    super(KGEmodel, self).__init__()
    self.w = entities.copy()   # start with keys and values of entities
    self.w.update(relations)    # modifies w with keys and values of relations

    # ParameterDict didn't work for now (possibly try again later) (looks more elegant)
    # self.entities = nn.ParameterDict(ent)
    # self.relations = nn.ParameterDict(rel)
  
  def forward(self, mini_batch):
    rearranged_samples = []
    for i in range(len(mini_batch[0])):
      sample=[]
      for j in range(3):
        # sample.append(mini_batch[j][0][i])
        sample.append(mini_batch[j][i])

      false_head=[]
      false_tail=[]
      for j in range(len(mini_batch[3])):
        false_head.append(mini_batch[3][j][i])
        false_tail.append(mini_batch[4][j][i])

      sample.append(false_head)
      sample.append(false_tail)
      rearranged_samples.append(sample)
    result = []
    for sample in rearranged_samples:
      encoded_sample = []
      encoded_sample.append(self.w[int(sample[0])].to(device))
      encoded_sample.append(self.w[str(sample[1])].to(device))
      encoded_sample.append(self.w[int(sample[2])].to(device))
      false_h_encoded = []
      false_t_encoded = []
      for head in sample[3]:
        false_h_encoded.append(self.w[int(head)].to(device))
    
      for tail in sample[4]:
        false_t_encoded.append(self.w[int(tail)].to(device))
    
      encoded_sample.append(false_h_encoded)
      encoded_sample.append(false_t_encoded)

      result.append(encoded_sample)
    return result 
      

In [27]:
def loss_func_new(margin, mini_batch):
  loss = 0
  for sample in mini_batch:
    positive_score = LA.norm(sample[0] + sample[1] - sample[2])
    negative_scores = [LA.norm(head + sample[1] - tail) for head, tail in zip(sample[3], sample[4])]
    
    loss += sum([max(0, margin - positive_score + value) for value in negative_scores])

  return loss

In [43]:
batch_size = 256

model = KGEmodel(entities, relations)
model= model.to(device)

optimizer = torch.optim.SGD(list(model.w.values()), lr=0.01)

dataset = KGEDataset(training_data)

# margin = 1, dimension d = 20000
train_loader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0)


In [44]:
# data = iter(train_loader).next()
# result = model(data)

In [49]:
# Training loop
f = open("/content/training_loss.txt", "w")


num_epochs = 6
total_samples = len(training_data)
n_iterations = math.ceil(total_samples/batch_size)
total_loss=0
# print(total_samples, n_iterations)
for epoch in range(num_epochs):
    for i, data in enumerate(train_loader):
      optimizer.zero_grad()

      encoding = model(data)
      loss = loss_func_new(0.1, encoding)

      total_loss += loss


      loss.backward()
      optimizer.step()
      
      if (i+1) % 10 == 0:
        f.write(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_iterations}], Total Loss: {total_loss:.4f}, Loss: {loss:.4f} \n')

        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_iterations}], Total Loss: {total_loss:.4f}, Loss: {loss:.4f}')
        total_loss = 0
f.close()

Epoch [1/6], Step [10/340], Total Loss: 22.8005, Loss: 1.7635
Epoch [1/6], Step [20/340], Total Loss: 24.3342, Loss: 1.6415
Epoch [1/6], Step [30/340], Total Loss: 25.1003, Loss: 2.9208
Epoch [1/6], Step [40/340], Total Loss: 22.1082, Loss: 2.0500
Epoch [1/6], Step [50/340], Total Loss: 22.8062, Loss: 2.6120
Epoch [1/6], Step [60/340], Total Loss: 22.1108, Loss: 1.9210
Epoch [1/6], Step [70/340], Total Loss: 26.0997, Loss: 3.0515
Epoch [1/6], Step [80/340], Total Loss: 22.2122, Loss: 1.8348
Epoch [1/6], Step [90/340], Total Loss: 31.4027, Loss: 3.0062
Epoch [1/6], Step [100/340], Total Loss: 31.4812, Loss: 2.7533
Epoch [1/6], Step [110/340], Total Loss: 35.5111, Loss: 2.0464
Epoch [1/6], Step [120/340], Total Loss: 27.2865, Loss: 2.4893
Epoch [1/6], Step [130/340], Total Loss: 23.6646, Loss: 2.3906
Epoch [1/6], Step [140/340], Total Loss: 26.5127, Loss: 2.2918
Epoch [1/6], Step [150/340], Total Loss: 24.7136, Loss: 1.9691
Epoch [1/6], Step [160/340], Total Loss: 25.3134, Loss: 2.1863
E

In [50]:
torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss
            }, "KGE_model_transE")
torch.save(model.state_dict(), "KGE_model_transE_state")