In [1]:
# !pip install wandb
# !pip install transformers
# !pip install sentencepiece

import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 
# Expose only GPUs 0, 1 and 3 to this process. The variable is assigned before
# `import torch` further down, so it is already in the environment when CUDA
# is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,3" 

import pandas as pd
import numpy as np
import re
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import math
import torch.optim as optim
import wandb


wandb.login()
%env WANDB_PROJECT= NER_v2

# os.environ["WANDB_DISABLED"] = "true"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

[34m[1mwandb[0m: Currently logged in as: [33mhodz199[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=NER_v2


In [2]:
# Create torch dataset
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output for seq2seq fine-tuning.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values. It must contain an ``"input_ids"`` entry, which also
            determines the dataset length.
        labels: Optional mapping for the target side. Its ``"input_ids"`` entry
            becomes the ``"labels"`` feature of every item. When it also has an
            ``"attention_mask"`` entry, positions where that mask is 0 (i.e.
            padding) are replaced by ``IGNORE_INDEX`` so they do not contribute
            to the loss. Without an ``"attention_mask"`` the ids are passed
            through unchanged.
    """

    # Label value that is skipped by the loss: -100 is the default
    # ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if not self.labels:
            # No targets supplied (e.g. inference): features only.
            return item

        label_ids = torch.tensor(self.labels['input_ids'][idx])
        if 'attention_mask' in self.labels:
            # Targets were padded to a common length; mask out the padded
            # positions so the model is not trained to predict padding.
            is_real_token = torch.tensor(self.labels['attention_mask'][idx]).bool()
            label_ids = label_ids.masked_fill(~is_real_token, self.IGNORE_INDEX)
        item["labels"] = label_ids
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` encoding."""
        return len(self.encodings["input_ids"])

In [3]:
# Token limits handed to the tokenizer as `max_length`: the first applies to the
# input sentences, the second to the target strings.
max_source_length, max_target_length = 512, 256

In [4]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration 
from transformers import EarlyStoppingCallback

# Load the pretrained "t5-base" tokenizer and model, then move the model to `device`.
tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

In [5]:
# training_sample = training_data.sample(frac=0.4, random_state=1)

# Read the training split, shuffle it with a fixed seed and renumber the rows.
# training_data = pd.read_csv('./1-NER_Data/1-csv_format/train/lower_normal_training.csv')
training_data = (
    pd.read_csv('./1-NER_Data/1-csv_format/train/training_data.csv')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Length of each input sentence in characters (not tokens).
training_data['input_length'] = training_data['input_text'].map(len)
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,He served primarily as a utility infielder dur...,"*Major League Baseball*,*St. Louis Cardinals*",25,6,0.2,155
1,He left alongside top scorer and former captai...,"*Fernando Cavenaghi*,*Daniel Passarella*",19,4,0.2,150
2,"In middle age, only if you swim against the cu...",**,23,0,0.0,137
3,The SecuROM software also caused some virus sc...,*SecuROM*,17,1,0.1,113
4,"From January 2020 , the omnibus moved to ITV3 .",*ITV3*,8,1,0.1,47
...,...,...,...,...,...,...
157319,The annual Grand National horse race takes pla...,"*Grand National*,*Aintree Racecourse*",11,4,0.4,72
157320,"Mr. Azoff resigned as head of MCA Records, a u...","*September*,*MCA Records*,*MCA Inc.*,*Warner*,...",27,3,0.1,142
157321,Other disease-causing bacteria in this family ...,"*Enterobacter*,*Citrobacter*",10,2,0.2,92
157322,Cercle Brugge 4 0 3 1 4 5 3,*Cercle Brugge*,9,2,0.2,27


In [6]:
# Remove every row whose input text is longer than 512 characters.
train_too_long = training_data['input_length'] > 512
training_data = training_data.drop(index=training_data.loc[train_too_long].index)
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,He served primarily as a utility infielder dur...,"*Major League Baseball*,*St. Louis Cardinals*",25,6,0.2,155
1,He left alongside top scorer and former captai...,"*Fernando Cavenaghi*,*Daniel Passarella*",19,4,0.2,150
2,"In middle age, only if you swim against the cu...",**,23,0,0.0,137
3,The SecuROM software also caused some virus sc...,*SecuROM*,17,1,0.1,113
4,"From January 2020 , the omnibus moved to ITV3 .",*ITV3*,8,1,0.1,47
...,...,...,...,...,...,...
157319,The annual Grand National horse race takes pla...,"*Grand National*,*Aintree Racecourse*",11,4,0.4,72
157320,"Mr. Azoff resigned as head of MCA Records, a u...","*September*,*MCA Records*,*MCA Inc.*,*Warner*,...",27,3,0.1,142
157321,Other disease-causing bacteria in this family ...,"*Enterobacter*,*Citrobacter*",10,2,0.2,92
157322,Cercle Brugge 4 0 3 1 4 5 3,*Cercle Brugge*,9,2,0.2,27


In [7]:
# Build the model inputs (each sentence prefixed with the "ner: " task tag) and
# the target strings, then tokenize each side with padding and truncation to
# its own maximum length.
train_sources = ['ner: ' + sequence for sequence in training_data["input_text"]]
train_targets = training_data["target_text"].tolist()

X_train_tokenized = tokenizer(
    train_sources, padding=True, truncation=True, max_length=max_source_length
)
y_train_tokenized = tokenizer(
    train_targets, padding=True, truncation=True, max_length=max_target_length
)

print(len(training_data))
# print(len(training_sample))

157213


In [8]:
# Read the validation split, shuffle it with a fixed seed and renumber the rows.
validation_data = (
    pd.read_csv('./1-NER_Data/1-csv_format/val/val_data.csv')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Length of each input sentence in characters (not tokens).
validation_data['input_length'] = validation_data['input_text'].map(len)
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,"I was always curious what they, the mother and...",**,17,0,0.0,90
1,"In prehistoric times , the region that was to ...","*Assyria*,*Subartu*,*Neanderthal*,*Shanidar Cave*",29,5,0.2,165
2,But Jesus knew what happened.,**,5,0,0.0,29
3,He would only appear in four games in two seas...,"*AD Ceuta*,*Lorca Deportiva CF*",25,5,0.2,150
4,Cable television is provided by Spectrum .,*Spectrum*,6,1,0.2,42
...,...,...,...,...,...,...
25917,"Previously , WJZ-TV carried the team from thei...","*WJZ-TV*,*Baltimore*",14,2,0.1,91
25918,Israeli defense officials accused Fatah of car...,"*Israeli*,*Fatah*",11,2,0.2,71
25919,Hindenburg refused the powers but agreed to th...,*Hindenburg*,9,1,0.1,58
25920,He finished fourth in the Olympics that year .,*Olympics*,8,1,0.1,46


In [9]:
# Remove every row whose input text is longer than 512 characters.
val_too_long = validation_data['input_length'] > 512
validation_data = validation_data.drop(index=validation_data.loc[val_too_long].index)
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,"I was always curious what they, the mother and...",**,17,0,0.0,90
1,"In prehistoric times , the region that was to ...","*Assyria*,*Subartu*,*Neanderthal*,*Shanidar Cave*",29,5,0.2,165
2,But Jesus knew what happened.,**,5,0,0.0,29
3,He would only appear in four games in two seas...,"*AD Ceuta*,*Lorca Deportiva CF*",25,5,0.2,150
4,Cable television is provided by Spectrum .,*Spectrum*,6,1,0.2,42
...,...,...,...,...,...,...
25917,"Previously , WJZ-TV carried the team from thei...","*WJZ-TV*,*Baltimore*",14,2,0.1,91
25918,Israeli defense officials accused Fatah of car...,"*Israeli*,*Fatah*",11,2,0.2,71
25919,Hindenburg refused the powers but agreed to th...,*Hindenburg*,9,1,0.1,58
25920,He finished fourth in the Olympics that year .,*Olympics*,8,1,0.1,46


In [10]:
# Build the model inputs (each sentence prefixed with the "ner: " task tag) and
# the target strings, then tokenize each side with padding and truncation to
# its own maximum length.
val_sources = ['ner: ' + sequence for sequence in validation_data["input_text"]]
val_targets = validation_data["target_text"].tolist()

X_val_tokenized = tokenizer(
    val_sources, padding=True, truncation=True, max_length=max_source_length
)
y_val_tokenized = tokenizer(
    val_targets, padding=True, truncation=True, max_length=max_target_length
)

print(len(validation_data))
# print(len(training_sample))

25913


In [11]:
# Pair the tokenized training inputs with their tokenized targets.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train_tokenized)

In [12]:
# Pair the tokenized validation inputs with their tokenized targets.
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val_tokenized)

In [13]:
from transformers import Seq2SeqTrainingArguments

# training_args = Seq2SeqTrainingArguments(
#     "NER_lower",
#     evaluation_strategy ='steps',
#     eval_steps = 500, # Evaluation and Save happens every 500 steps
#     save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
#     per_device_train_batch_size = 8,
#     per_device_eval_batch_size = 8,
#     learning_rate = 1e-3,
#     adam_epsilon = 1e-8,
#     num_train_epochs = 6,
#     report_to="wandb",
# #     metric_for_best_model = 'f1',
#     load_best_model_at_end=True
# )

# Training configuration for the "NER_normal" run; checkpoints are written to
# the output directory of the same name.
training_args = Seq2SeqTrainingArguments(
    "NER_normal",
    evaluation_strategy='steps',
    eval_steps=5000,  # Evaluate every 5000 optimisation steps.
    save_steps=5000,  # Save on the same schedule as evaluation.
    save_total_limit=5,  # Only last 5 models are saved. Older ones are deleted.
    per_device_train_batch_size=7,
    per_device_eval_batch_size=7,
    learning_rate=1e-3,
    adam_epsilon=1e-8,
    num_train_epochs=12,
    report_to="wandb",
    # The trainer in this notebook is created without a `compute_metrics`
    # function, so evaluation only reports the loss. Tracking 'f1' would make
    # best-model selection look up a missing `eval_f1` entry, so the best
    # checkpoint (and early stopping) is driven by the evaluation loss instead.
    metric_for_best_model='eval_loss',
    greater_is_better=False,  # Lower loss is better.
    load_best_model_at_end=True,
)

In [14]:
from transformers import Seq2SeqTrainer

# Early stopping with a patience of 2 evaluations on the tracked metric.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Wire the model, its training arguments and both datasets into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning with the trainer built in the previous cell.
trainer.train()

***** Running training *****
  Num examples = 157213
  Num Epochs = 12
  Instantaneous batch size per device = 7
  Total train batch size (w. parallel, distributed & accumulation) = 21
  Gradient Accumulation steps = 1
  Total optimization steps = 89844
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


In [None]:
# Marker cell: prints a message once execution reaches this point.
print('finished')

In [None]:
# Num examples = 317864
# Num Epochs = 3
# Instantaneous batch size per device = 8
# Total train batch size (w. parallel, distributed & accumulation) = 32
# Gradient Accumulation steps = 1
# Total optimization steps = 29802

In [None]:
### ner normal ###
# number of steps -> 29802
# epochs -> 6
# patience -> 5
# eval_steps = 1000
# save_steps = 1000