In [1]:
# !pip install wandb
# !pip install transformers
# !pip install sentencepiece

import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,3" 

import pandas as pd
import numpy as np
import re
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import math
import torch.optim as optim
import wandb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


wandb.login()
%env WANDB_PROJECT= NER_full_sentence

# os.environ["WANDB_DISABLED"] = "true"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

[34m[1mwandb[0m: Currently logged in as: [33mhodz199[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=NER_full_sentence


In [2]:
# Create torch dataset
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [3]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
#     _, _, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [4]:
max_source_length = 512
max_target_length = 512

In [5]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration 
from transformers import EarlyStoppingCallback

tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

In [6]:
# training_sample = training_data.sample(frac=0.4, random_state=1)

training_data = pd.read_csv('./1-NER_Data/1-csv_format/train/training_data_with_pos.csv')
# training_data = pd.read_csv('./1-NER_Data/1-csv_format/train/lower_normal_training.csv')

training_data = training_data.sample(frac=1, random_state=1).reset_index(drop=True)
training_data['input_length'] = training_data['input_text'].apply(lambda x: len(x))
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,A top official of export development bank Banc...,"*s* Bancomext | ORG *e* , *s* Mexican | MISC *e*",25,2,0.1,152
1,"In the early 1980 s , he met Héctor Elizondo w...",*s* Héctor Elizondo | PER *e*,17,2,0.1,103
2,The 316 comes from the number of one of the so...,*s* BBC | ORG *e*,28,1,0.0,136
3,Some 71 % had bought some stock in the past ye...,*s**e*,18,0,0.0,74
4,"Hence, NBC might be able to take, say, a 5 % s...","*s* NBC | ORG *e* , *s* MGM/UA | ORG *e*",18,2,0.1,79
...,...,...,...,...,...,...
152418,The annual Grand National horse race takes pla...,"*s* Grand National | MISC *e* , *s* Aintree Ra...",11,4,0.4,72
152419,I think we have to step back take a hard look ...,*s**e*,14,0,0.0,63
152420,Other disease-causing bacteria in this family ...,"*s* Enterobacter | MISC *e* , *s* Citrobacter ...",10,2,0.2,92
152421,Cercle Brugge 4 0 3 1 4 5 3,*s* Cercle Brugge | ORG *e*,9,2,0.2,27


In [7]:
training_data = training_data.drop(training_data[training_data['input_length']> 512].index)
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,A top official of export development bank Banc...,"*s* Bancomext | ORG *e* , *s* Mexican | MISC *e*",25,2,0.1,152
1,"In the early 1980 s , he met Héctor Elizondo w...",*s* Héctor Elizondo | PER *e*,17,2,0.1,103
2,The 316 comes from the number of one of the so...,*s* BBC | ORG *e*,28,1,0.0,136
3,Some 71 % had bought some stock in the past ye...,*s**e*,18,0,0.0,74
4,"Hence, NBC might be able to take, say, a 5 % s...","*s* NBC | ORG *e* , *s* MGM/UA | ORG *e*",18,2,0.1,79
...,...,...,...,...,...,...
152418,The annual Grand National horse race takes pla...,"*s* Grand National | MISC *e* , *s* Aintree Ra...",11,4,0.4,72
152419,I think we have to step back take a hard look ...,*s**e*,14,0,0.0,63
152420,Other disease-causing bacteria in this family ...,"*s* Enterobacter | MISC *e* , *s* Citrobacter ...",10,2,0.2,92
152421,Cercle Brugge 4 0 3 1 4 5 3,*s* Cercle Brugge | ORG *e*,9,2,0.2,27


In [8]:
X_train_tokenized = tokenizer(['ner: ' + sequence for sequence in training_data["input_text"]], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

y_train_tokenized = tokenizer(list(training_data["target_text"]), 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

print(len(training_data))
# print(len(training_sample))

152295


In [9]:
validation_data = pd.read_csv('./1-NER_Data/1-csv_format/val/val_data_with_pos.csv')
validation_data = validation_data.sample(frac=1, random_state=1).reset_index(drop=True)
validation_data['input_length'] = validation_data['input_text'].apply(lambda x: len(x))
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,Taiwan firms are now the most important source...,"*s* Taiwan | MISC *e* , *s* Dongguan | MISC *e*",13,2,0.2,81
1,"Results of English , Scottish and","*s* English | MISC *e* , *s* Scottish | MISC *e*",5,2,0.4,33
2,"However , because of the intervention of World...",*s* World War II | MISC *e*,29,3,0.1,183
3,"In ensuing seasons , his stock continued to ri...",*s* Finland | LOC *e*,19,1,0.1,105
4,"TREASURY BILLS : Results of the Monday, Octobe...","*s* U.S. | MISC *e* , *s* TREASURY | ORG *e*",42,2,0.0,210
...,...,...,...,...,...,...
24871,"In March 2012 , he was fined for parking illeg...",*s* Olympiacos | ORG *e*,14,1,0.1,80
24872,They want a winner.,*s**e*,4,0,0.0,19
24873,"None of them would , so he became another pres...",*s* Nazis | ORG *e*,21,1,0.0,149
24874,The three American jumpers had been easily the...,*s* American | MISC *e*,13,1,0.1,80


In [10]:
validation_data = validation_data.drop(validation_data[validation_data['input_length']> 512].index)
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,Taiwan firms are now the most important source...,"*s* Taiwan | MISC *e* , *s* Dongguan | MISC *e*",13,2,0.2,81
1,"Results of English , Scottish and","*s* English | MISC *e* , *s* Scottish | MISC *e*",5,2,0.4,33
2,"However , because of the intervention of World...",*s* World War II | MISC *e*,29,3,0.1,183
3,"In ensuing seasons , his stock continued to ri...",*s* Finland | LOC *e*,19,1,0.1,105
4,"TREASURY BILLS : Results of the Monday, Octobe...","*s* U.S. | MISC *e* , *s* TREASURY | ORG *e*",42,2,0.0,210
...,...,...,...,...,...,...
24871,"In March 2012 , he was fined for parking illeg...",*s* Olympiacos | ORG *e*,14,1,0.1,80
24872,They want a winner.,*s**e*,4,0,0.0,19
24873,"None of them would , so he became another pres...",*s* Nazis | ORG *e*,21,1,0.0,149
24874,The three American jumpers had been easily the...,*s* American | MISC *e*,13,1,0.1,80


In [11]:
X_val_tokenized = tokenizer(['ner: ' + sequence for sequence in validation_data["input_text"]], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

y_val_tokenized = tokenizer(list(validation_data["target_text"]), 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

print(len(validation_data))
# print(len(training_sample))

24862


In [12]:
train_dataset = Dataset(X_train_tokenized, y_train_tokenized)

In [13]:
val_dataset = Dataset(X_val_tokenized, y_val_tokenized)

In [14]:
from transformers import Seq2SeqTrainingArguments

# training_args = Seq2SeqTrainingArguments(
#     "NER_lower",
#     evaluation_strategy ='steps',
#     eval_steps = 500, # Evaluation and Save happens every 500 steps
#     save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
#     per_device_train_batch_size = 8,
#     per_device_eval_batch_size = 8,
#     learning_rate = 1e-3,
#     adam_epsilon = 1e-8,
#     num_train_epochs = 6,
#     report_to="wandb",
# #     metric_for_best_model = 'f1',
#     load_best_model_at_end=True
# )


training_args = Seq2SeqTrainingArguments(
    "NER_normal_with_pos_v3",
#     evaluation_strategy ='epoch',
    evaluation_strategy ='steps',
    eval_steps = 2380, # Evaluation and Save happens every 3743 steps (steps/(num of epochs*2))
#     logging_steps = 500,
    save_steps = 2380,
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    
    gradient_accumulation_steps =2,
    
    
    learning_rate = 1e-3,
    adam_epsilon = 1e-8,
    num_train_epochs = 10,
    report_to="wandb",
#     metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

In [15]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model, 
    args=training_args, 
#     compute_metrics=compute_metrics,
    train_dataset= train_dataset,
    eval_dataset = val_dataset,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

In [16]:
trainer.train()

***** Running training *****
  Num examples = 152295
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 47590
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"




Step,Training Loss,Validation Loss
2380,0.0038,0.003862
4760,0.0031,0.003991
7140,0.0024,0.003431
9520,0.0025,0.003293
11900,0.0019,0.003166
14280,0.0019,0.00293
16660,0.0014,0.002946
19040,0.0014,0.002783


***** Running Evaluation *****
  Num examples = 24862
  Batch size = 16
Saving model checkpoint to NER_normal_with_pos_v3/checkpoint-2380
Configuration saved in NER_normal_with_pos_v3/checkpoint-2380/config.json
Model weights saved in NER_normal_with_pos_v3/checkpoint-2380/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 24862
  Batch size = 16
Saving model checkpoint to NER_normal_with_pos_v3/checkpoint-4760
Configuration saved in NER_normal_with_pos_v3/checkpoint-4760/config.json
Model weights saved in NER_normal_with_pos_v3/checkpoint-4760/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 24862
  Batch size = 16
Saving model checkpoint to NER_normal_with_pos_v3/checkpoint-7140
Configuration saved in NER_normal_with_pos_v3/checkpoint-7140/config.json
Model weights saved in NER_normal_with_pos_v3/checkpoint-7140/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 24862
  Batch size = 16
Saving model checkpoint to NER_normal_with_pos_v3/ch

In [17]:
print('finished')

finished


In [18]:
# Num examples = 317864
# Num Epochs = 3
# Instantaneous batch size per device = 8
# Total train batch size (w. parallel, distributed & accumulation) = 32
# Gradient Accumulation steps = 1
# Total optimization steps = 29802

In [19]:
### ner normal ###
# number of steps -> 29802
# epochs -> 6
# patience -> 5
# eval_steps = 1000
# save_steps = 1000