In [1]:
# !pip install wandb
# !pip install transformers
# !pip install sentencepiece
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3" 
import pandas as pd
import numpy as np
import re
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import math
import torch.optim as optim
import wandb


wandb.login()

# os.environ["WANDB_DISABLED"] = "true"



# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device = torch.device("cuda:2") if torch.cuda.is_available() else torch.device("cpu")


[34m[1mwandb[0m: Currently logged in as: [33mhodz199[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Create torch dataset
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [3]:
max_source_length = 1024
max_target_length = 128

In [4]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration 
from transformers import EarlyStoppingCallback

tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

In [5]:
training_data = pd.read_csv('./2-NEL_Data/2-csv_format_2/training_data.csv')
training_data = training_data.sample(frac=1, random_state=1)
training_data

Unnamed: 0,qid,question,entity,wikidata_reply
27012,Q336181,**what program is a variety show?**,**variety show**,"**[[Q336181, variety show, entertainment made ..."
28822,Q1343008,**What genre of program is hakushaku to yōsei?**,**hakushaku to yōsei**,"**[[Q1343008, Hakushaku to Yōsei, media franch..."
33361,Q48892,**what team does drogba play for 2013?**,**drogba**,"**[[Q48892, Didier Drogba, Ivorian association..."
27625,Q5271417,**what is diane chamberlain's birthplace?**,**diane chamberlain**,"**[[Q5271417, Diane Chamberlain, American writ..."
17627,Q8093,**What's a game published by nintendo**,**nintendo**,"**[[Q8093, Nintendo, Japanese multinational vi..."
...,...,...,...,...
7813,Q7696995,**which program is in the tv genre television ...,**television comedy**,"**[[Q7696995, television comedy, television ge..."
32511,Q670376,**where did the arizona diamondbacks play?**,**arizona diamondbacks**,"**[[Q670376, Arizona Diamondbacks, baseball te..."
5192,Q6176201,**Where was jeffrey p. buzen born**,**jeffrey p. buzen**,"**[[Q6176201, Jeffrey P. Buzen, American compu..."
12172,Q2702756,**who is a publisher of the computer game supe...,**super bomberman**,"**[[Q2702756, Super Bomberman, 1993 Super NES ..."


In [6]:
input_text = list(training_data['question'] + ',' + training_data['entity'] + ',' + training_data['wikidata_reply'])
input_text[0]

'**what program is a variety show?**,**variety show**,**[[Q336181, variety show, entertainment made up of a variety of acts], [Q107020026, Quyi, journal], [Q6022366, Theatre of Odeon, Theatre building that opened in 1875, at Pera, Beyoglu, İstanbul.], [Q66323848, Variety Show and Benefit Performance in Vietnam (NAID 102035872), "item in the National Archives and Records Administration\'s holdings"], [Q79312544, Variety show syndrome: making a diagnosis, scientific article published on 01 November 2003]]**'

In [7]:
target_text = list(training_data['qid'])
target_text[0]

'Q336181'

In [8]:
X_train_tokenized = tokenizer(['nel: ' + sequence for sequence in input_text], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

y_train_tokenized = tokenizer(target_text, 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

print(len(training_data))
# print(len(training_sample))

34241


In [9]:
val_data = pd.read_csv('./2-NEL_Data/2-csv_format_2/val_data.csv')
val_data = val_data.sample(frac=1, random_state=1)
val_data

Unnamed: 0,qid,question,entity,wikidata_reply
2192,Q4931117,**What country is bo lacy from?**,**bo lacy**,"**[[Q4931117, Bo Lacy, American football playe..."
1030,Q47526,**which position does football player zico play**,**zico**,"**[[Q47526, Zico, Brazilian association footba..."
447,Q4707240,**what is alan marks's gender?**,**alan marks**,"**[[Q4707240, Alan Marks, English artist and i..."
4418,Q9458,**What is one of muhammad's children's names?**,**muhammad**,"**[[Q9458, Muhammad, Arabian religious leader ..."
600,Q3181381,**where did johnny doyle die**,**johnny doyle**,"**[[Q3181381, Johnny Doyle, Scottish footballe..."
...,...,...,...,...
2895,Q206,**What is the name of an organization that was...,**stephen harper**,"**[[Q206, Stephen Harper, Canadian politician]..."
2763,Q16554,"**what female actress was born in denver, colo...",**denver**,"**[[Q16554, Denver, capital city of the state ..."
905,Q5205415,**what type of music is dj quixotic**,**dj quixotic**,"**[[Q5205415, DJ Quixotic, American DJ]]**"
3980,Q84466,**where did otto nückel die**,**otto nückel**,"**[[Q84466, Otto Nückel, painter, graphic desi..."


In [10]:
input_text_val = list(val_data['question'] + ',' + val_data['entity'] + ',' + val_data['wikidata_reply'])
input_text_val[0]

'**What country is bo lacy from?**,**bo lacy**,**[[Q4931117, Bo Lacy, American football player]]**'

In [11]:
target_text_val = list(val_data['qid'])
target_text_val[0]

'Q4931117'

In [12]:
X_val_tokenized = tokenizer(['nel: ' + sequence for sequence in input_text_val], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

y_val_tokenized = tokenizer(target_text_val, 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

print(len(val_data))
# print(len(training_sample))

4837


In [13]:
train_dataset = Dataset(X_train_tokenized, y_train_tokenized)

In [14]:
val_dataset = Dataset(X_val_tokenized, y_val_tokenized)

In [15]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    "NEL_model_normal_2",
    evaluation_strategy ='steps',
    eval_steps = 500, # Evaluation and Save happens every 50 steps
    logging_steps = 500,
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    learning_rate = 1e-3,
    adam_epsilon = 1e-8,
    num_train_epochs = 5,
    report_to="wandb",
#     metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

In [16]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model, 
    args=training_args, 
    train_dataset= train_dataset,
    eval_dataset = val_dataset,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

In [17]:
trainer.train()

***** Running training *****
  Num examples = 34241
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 14270
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"




Step,Training Loss,Validation Loss
500,0.6622,0.408311
1000,0.4287,0.340432
1500,0.366,0.300719
2000,0.337,0.27435
2500,0.2825,0.251899
3000,0.2921,0.253575
3500,0.2492,0.240483
4000,0.2345,0.246655
4500,0.2384,0.22605
5000,0.2227,0.216427


***** Running Evaluation *****
  Num examples = 4837
  Batch size = 12
Saving model checkpoint to NEL_model_normal_2/checkpoint-500
Configuration saved in NEL_model_normal_2/checkpoint-500/config.json
Model weights saved in NEL_model_normal_2/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 12
Saving model checkpoint to NEL_model_normal_2/checkpoint-1000
Configuration saved in NEL_model_normal_2/checkpoint-1000/config.json
Model weights saved in NEL_model_normal_2/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 12
Saving model checkpoint to NEL_model_normal_2/checkpoint-1500
Configuration saved in NEL_model_normal_2/checkpoint-1500/config.json
Model weights saved in NEL_model_normal_2/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 12
Saving model checkpoint to NEL_model_normal_2/checkpoint-2000
Configuration saved in NEL_model_

Saving model checkpoint to NEL_model_normal_2/checkpoint-7000
Configuration saved in NEL_model_normal_2/checkpoint-7000/config.json
Model weights saved in NEL_model_normal_2/checkpoint-7000/pytorch_model.bin
Deleting older checkpoint [NEL_model_normal_2/checkpoint-4500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 12
Saving model checkpoint to NEL_model_normal_2/checkpoint-7500
Configuration saved in NEL_model_normal_2/checkpoint-7500/config.json
Model weights saved in NEL_model_normal_2/checkpoint-7500/pytorch_model.bin
Deleting older checkpoint [NEL_model_normal_2/checkpoint-5000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 4837
  Batch size = 12
Saving model checkpoint to NEL_model_normal_2/checkpoint-8000
Configuration saved in NEL_model_normal_2/checkpoint-8000/config.json
Model weights saved in NEL_model_normal_2/checkpoint-8000/pytorch_model.bin
Deleting older checkpoint [NEL_model_normal_2/chec

TrainOutput(global_step=10000, training_loss=0.2533380584716797, metrics={'train_runtime': 8206.5496, 'train_samples_per_second': 20.862, 'train_steps_per_second': 1.739, 'total_flos': 5.579551576530432e+16, 'train_loss': 0.2533380584716797, 'epoch': 3.5})