In [None]:
# !pip install wandb
# !pip install transformers
# !pip install sentencepiece
import os

# Select which GPUs this process may see. The variable is assigned before
# `import torch` below, so it is already in the environment when torch loads.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3" 
import pandas as pd
import numpy as np
import re
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import math
import torch.optim as optim
import wandb
import requests
import time

# Authenticate with Weights & Biases; the training arguments further down use report_to="wandb".
wandb.login()
# IPython magic that sets the WANDB_PROJECT environment variable for this kernel.
# Do not append an inline comment to the magic line: it would become part of the value.
# NOTE(review): the value reads "Relation_Lining" - possibly meant "Relation_Linking"; confirm before renaming.
%env WANDB_PROJECT= Relation_Lining
# os.environ["WANDB_DISABLED"] = "true"



# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# device = torch.device('cuda:3')

In [None]:
def get_relation_label(rel_id, timeout=10):
    """Look up the English label of a Wikidata id via the ``wbgetentities`` API.

    The lookup is best-effort: any network failure, timeout, malformed reply
    or missing English label yields an empty string instead of an exception.

    The function deliberately depends on nothing but the ``requests`` module,
    because its source text is copied into a stand-alone module when it is run
    through the process pool helper in this notebook.

    Args:
        rel_id: Wikidata identifier (for example ``"P31"``); converted with ``str``.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            stalled connection would block the calling worker forever.

    Returns:
        str: The English label, or ``''`` when it could not be retrieved.
    """
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    entity_id = str(rel_id)
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'labels',
        'ids': entity_id,
    }

    try:
        response = requests.get(API_ENDPOINT, params=params, timeout=timeout)
        entities = response.json()['entities']
        return entities[entity_id]['labels']['en']['value']
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # RequestException: connection problems and timeouts.
        # ValueError: the body was not valid JSON.
        # KeyError / TypeError: the reply has no English label for this id.
        # KeyboardInterrupt and SystemExit are intentionally NOT caught.
        return ''

In [None]:
from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests

def parallal_task(func, iterable, *params, processes=15):
    """Apply ``func`` to every item of ``iterable`` using a pool of processes.

    The source code of ``func`` is copied into a temporary module
    ``./tmp_func.py`` (prefixed with ``import requests``) and that copy is what
    the pool executes. ``func`` must therefore be a named, top-level function
    whose body needs no global other than ``requests``. The current working
    directory must be importable (it must be on ``sys.path``).

    Args:
        func: Function to run on each item; called with a single argument.
        iterable: Items to process.
        *params: Accepted for backward compatibility; currently unused.
        processes: Number of worker processes (keyword-only, default 15).

    Returns:
        list: Results of ``func`` in the same order as ``iterable``.
    """
    import importlib
    import sys

    module_name = 'tmp_func'
    module_path = './tmp_func.py'

    with open(module_path, 'w') as file:
        file.write("import requests \n")
        file.write(inspect.getsource(func))
        # Expose the function as `task` through an alias instead of renaming it
        # textually: a blind str.replace would also rewrite any other text that
        # happens to contain the function's name (e.g. `get` in `requests.get`).
        file.write(f"\ntask = {func.__name__}\n")

    try:
        # A previous call leaves `tmp_func` cached in sys.modules; without
        # dropping it, the import below would return the OLD function.
        sys.modules.pop(module_name, None)
        importlib.invalidate_caches()
        task = importlib.import_module(module_name).task

        # The context manager tears the pool down even if map() raises.
        with Pool(processes=processes) as pool:
            res = pool.map(task, iterable)
    finally:
        # Always remove the temporary module file, also on failure.
        if os.path.exists(module_path):
            os.remove(module_path)
    return res

In [None]:
# Create torch dataset
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output for seq2seq fine-tuning.

    Note: this class reuses the name ``Dataset`` and therefore shadows the
    ``Dataset`` imported from ``torch.utils.data`` at the top of the notebook.

    Args:
        encodings: Mapping of feature name to a list of per-example sequences;
            must contain ``"input_ids"``.
        labels: Optional mapping with the tokenized targets under
            ``"input_ids"`` and, optionally, their ``"attention_mask"``.
        ignore_index: Value written into label positions whose label
            ``attention_mask`` is 0 (i.e. padding) so that the loss skips them.
            Pass ``None`` to return the label ids unchanged.
    """

    def __init__(self, encodings, labels=None, ignore_index=-100):
        self.encodings = encodings
        self.labels = labels
        self.ignore_index = ignore_index

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``"labels"`` if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            label_ids = self.labels["input_ids"][idx]
            if self.ignore_index is not None and "attention_mask" in self.labels:
                # Padding positions must not contribute to the loss: replace
                # them with the ignore index instead of the padding token id.
                label_mask = self.labels["attention_mask"][idx]
                label_ids = [
                    token if keep else self.ignore_index
                    for token, keep in zip(label_ids, label_mask)
                ]
            item["labels"] = torch.tensor(label_ids)
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

In [None]:
# Length caps passed as `max_length` to the tokenizer calls below:
# one for the input (source) sequences and one for the target sequences.
max_source_length = 4048
max_target_length = 128

In [None]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration 
from transformers import EarlyStoppingCallback

# Load the pretrained "t5-base" tokenizer and model; the model is moved to `device`.
tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

In [None]:
# Load the training split, drop rows containing missing values, and print a summary.
training_data = pd.read_csv('./3-Relation_Linking_Data/1-csv_format/training_data.csv')
training_data = training_data.dropna()
training_data.info()

In [None]:
# start = time.time()
# training_data['target relation label'] = parallal_task(get_relation_label, list(training_data['target relation id']))
# print(time.time() - start)
# training_data.to_csv('./3-Relation_Linking_Data/1-csv_format/training_data.csv', index=False)

In [None]:
# Model input string: the 'question' and 'relation labels' columns, each wrapped in "**" and joined by a comma.
training_data['input_text'] = '**' + training_data['question'] + '**,**' + training_data['relation labels'] + '**'

In [None]:
# Histogram (20 bins) of the input_text length, measured in characters.
import matplotlib.pyplot as plt
plt.hist(training_data['input_text'].str.len(), bins = 20)
plt.show()

In [None]:
# Percentage of training rows whose input_text is longer than 1800 characters.
len(training_data['input_text'][training_data['input_text'].str.len() > 1800])*100/len(training_data['input_text'])

In [None]:
# Fraction of training rows whose input_text is at most 4048 characters.
# NOTE(review): 4048 equals max_source_length, which is passed to the tokenizer as max_length,
# while .str.len() counts characters - confirm which unit is intended here.
len(training_data[training_data['input_text'].str.len() <= 4048])/len(training_data)

In [None]:
# Keep only rows whose input_text is at most 2024 characters, then shuffle
# all rows with a fixed seed (random_state=1) and display the result.
training_data = training_data[training_data['input_text'].str.len() <= 2024]
training_data = training_data.sample(frac=1, random_state=1)
training_data

In [None]:
# Plain list of the training input strings; show the first one as a sanity check.
input_text = list(training_data['input_text'])
input_text[0]

In [None]:
# Plain list of the training targets (cast to str); show the first one as a sanity check.
target_text = list(training_data['target relation label'].astype(str))
target_text[0]

In [None]:
# Tokenize the training inputs, each prefixed with 'relation_linking: ',
# with padding and truncation enabled and max_length=max_source_length.
X_train_tokenized = tokenizer(['relation_linking: ' + sequence for sequence in input_text], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

# Tokenize the training targets the same way, with max_length=max_target_length.
y_train_tokenized = tokenizer(target_text, 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

# Number of training rows that were tokenized.
print(len(training_data))
# print(len(training_sample))

In [None]:
# Load the validation split, report the raw row count, then drop rows
# containing missing values. (A bare `validation_data` expression that used to
# sit in the middle of this cell was removed: only the last expression of a
# cell is displayed, so it had no effect.)
validation_data = pd.read_csv('./3-Relation_Linking_Data/1-csv_format/validation_data.csv')
print(len(validation_data))
validation_data = validation_data.dropna()

In [None]:
# start = time.time()
# validation_data['target relation label'] = parallal_task(get_relation_label, list(validation_data['target relation id']))
# print(time.time() - start)
# validation_data.to_csv('./3-Relation_Linking_Data/1-csv_format/validation_data.csv', index=False)

In [None]:
# Build input_text exactly as for the training data, keep rows of at most
# 1700 characters, then shuffle with a fixed seed (random_state=1).
# NOTE(review): the training data is filtered at 2024 characters, not 1700 - confirm the difference is intended.
validation_data['input_text'] = '**' + validation_data['question'] + '**,**' + validation_data['relation labels'] + '**'
validation_data = validation_data[validation_data['input_text'].str.len() <= 1700]
validation_data = validation_data.sample(frac=1, random_state=1)

In [None]:
# Plain list of the validation input strings, matching how the training inputs
# are prepared. As a list, positional access such as input_text_val[0] works;
# on the shuffled, filtered Series, [0] would be a lookup by index label.
input_text_val = list(validation_data['input_text'])

In [None]:
# Plain list of the validation targets (cast to str); show the first one as a sanity check.
target_text_val = list(validation_data['target relation label'].astype(str))
target_text_val[0]

In [None]:
# Tokenize the validation inputs, each prefixed with 'relation_linking: ',
# with padding and truncation enabled and max_length=max_source_length.
X_val_tokenized = tokenizer(['relation_linking: ' + sequence for sequence in input_text_val], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

# Tokenize the validation targets the same way, with max_length=max_target_length.
y_val_tokenized = tokenizer(target_text_val, 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

# Number of validation rows that were tokenized.
print(len(validation_data))
# print(len(training_sample))

In [None]:
# Wrap the tokenized training inputs and targets in the Dataset class defined above.
train_dataset = Dataset(X_train_tokenized, y_train_tokenized)

In [None]:
# Wrap the tokenized validation inputs and targets in the Dataset class defined above.
val_dataset = Dataset(X_val_tokenized, y_val_tokenized)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. Evaluation, logging and checkpoint saving all use the
# same 100-step interval; the first positional argument names the run/output directory.
training_args = Seq2SeqTrainingArguments(
    "Relation_linking_without_entity_higher_patience",
    evaluation_strategy ='steps',
    eval_steps = 100, # Evaluation and save happen every 100 steps
    logging_steps = 100,
    save_steps = 100,
    save_total_limit = 5, # Keep at most 5 checkpoints; older ones are deleted.
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    learning_rate = 1e-3,
    adam_epsilon = 1e-8,
    num_train_epochs = 5,
    report_to="wandb",
#     metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

In [None]:
from transformers import Seq2SeqTrainer

# Build the trainer from the model, the arguments above and both datasets.
# Early stopping is registered with a patience of 6 (a previous setting of 3 is kept commented out).
trainer = Seq2SeqTrainer(
    model=model, 
    args=training_args, 
    train_dataset= train_dataset,
    eval_dataset = val_dataset,
#     callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    callbacks = [EarlyStoppingCallback(early_stopping_patience=6)]
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Visible marker that the cells above ran to completion.
print('finished')