In [1]:
# !pip install wandb
# !pip install transformers
# !pip install sentencepiece

import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "2,3" 

import pandas as pd
import numpy as np
import re
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import math
import torch.optim as optim
import wandb


wandb.login()

# os.environ["WANDB_DISABLED"] = "true"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

[34m[1mwandb[0m: Currently logged in as: [33mhodz199[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
# Create torch dataset
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [3]:
max_source_length = 512
max_target_length = 128

In [4]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration 
from transformers import EarlyStoppingCallback

tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

In [5]:
# training_sample = training_data.sample(frac=0.4, random_state=1)

# training_data = pd.read_csv('./1-NER_Data/1-csv_format/train/training_data.csv')
training_data = pd.read_csv('./1-NER_Data/1-csv_format/train/lower_normal_training.csv')

training_data = training_data.sample(frac=1, random_state=1).reset_index(drop=True)
training_data['input_length'] = training_data['input_text'].apply(lambda x: len(x))
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,"in february 2013 , beyoncé said that madonna i...","*beyoncé*,*madonna*",,,,93
1,"The album made its debut at # 8 on the "" Billb...","*Billboard*,*Jewel*,*United States*",31.0,4.0,0.1,165
2,"he was a politician , one of the first french ...","*french*,*british*",,,,100
3,andrea collinelli ( italy ) 4:16.141 beat fran...,"*andrea collinelli*,*italy*,*francis moreau*,*...",,,,76
4,concerned experts from the reform and developm...,"*hainan-rrb-*,*hainan*",,,,205
...,...,...,...,...,...,...
318105,He was fired halfway through the 2005 season a...,*Jerry Narron*,13.0,2.0,0.2,75
318106,Palmeiras 5 2 3 0 8 1 9,*Palmeiras*,8.0,1.0,0.1,23
318107,In 1991 the IFAB made an addition which deemed...,*IFAB*,31.0,1.0,0.0,184
318108,he attended the fenway park 100th anniversary ...,*fenway park*,,,,78


In [6]:
training_data = training_data.drop(training_data[training_data['input_length']> 512].index)
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,"in february 2013 , beyoncé said that madonna i...","*beyoncé*,*madonna*",,,,93
1,"The album made its debut at # 8 on the "" Billb...","*Billboard*,*Jewel*,*United States*",31.0,4.0,0.1,165
2,"he was a politician , one of the first french ...","*french*,*british*",,,,100
3,andrea collinelli ( italy ) 4:16.141 beat fran...,"*andrea collinelli*,*italy*,*francis moreau*,*...",,,,76
4,concerned experts from the reform and developm...,"*hainan-rrb-*,*hainan*",,,,205
...,...,...,...,...,...,...
318105,He was fired halfway through the 2005 season a...,*Jerry Narron*,13.0,2.0,0.2,75
318106,Palmeiras 5 2 3 0 8 1 9,*Palmeiras*,8.0,1.0,0.1,23
318107,In 1991 the IFAB made an addition which deemed...,*IFAB*,31.0,1.0,0.0,184
318108,he attended the fenway park 100th anniversary ...,*fenway park*,,,,78


In [7]:
X_train_tokenized = tokenizer(['ner: ' + sequence for sequence in training_data["input_text"]], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

y_train_tokenized = tokenizer(list(training_data["target_text"]), 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

print(len(training_data))
# print(len(training_sample))

317864


In [8]:
validation_data = pd.read_csv('./1-NER_Data/1-csv_format/val/val_data.csv')
validation_data = validation_data.sample(frac=1, random_state=1).reset_index(drop=True)
validation_data['input_length'] = validation_data['input_text'].apply(lambda x: len(x))
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,"The average yield on six-month CDs of $ 50,000...","*six-month*,*the week ended Tuesday*,*New York...",32,5,0.2,177
1,"Pilate said, `` So you are a king. ''",**,9,0,0.0,37
2,Mr. Rosen said the quake will revive consumer ...,"*1972*,*Rosen*",32,2,0.1,189
3,To recover from the trauma of killing another ...,*Amazon*,19,1,0.1,111
4,"Add Women 's singles , third round Lisa Raymon...","*Lisa Raymond*,*U.S.*,*Kimberly Po*,*U.S.*",15,6,0.4,90
...,...,...,...,...,...,...
26546,"Previously , WJZ-TV carried the team from thei...","*WJZ-TV*,*Baltimore*",14,2,0.1,91
26547,"According to presentations, the quality of Chi...",*China*,17,1,0.1,111
26548,Hindenburg refused the powers but agreed to th...,*Hindenburg*,9,1,0.1,58
26549,He finished fourth in the Olympics that year .,*Olympics*,8,1,0.1,46


In [9]:
validation_data = validation_data.drop(validation_data[validation_data['input_length']> 512].index)
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,input_length
0,"The average yield on six-month CDs of $ 50,000...","*six-month*,*the week ended Tuesday*,*New York...",32,5,0.2,177
1,"Pilate said, `` So you are a king. ''",**,9,0,0.0,37
2,Mr. Rosen said the quake will revive consumer ...,"*1972*,*Rosen*",32,2,0.1,189
3,To recover from the trauma of killing another ...,*Amazon*,19,1,0.1,111
4,"Add Women 's singles , third round Lisa Raymon...","*Lisa Raymond*,*U.S.*,*Kimberly Po*,*U.S.*",15,6,0.4,90
...,...,...,...,...,...,...
26546,"Previously , WJZ-TV carried the team from thei...","*WJZ-TV*,*Baltimore*",14,2,0.1,91
26547,"According to presentations, the quality of Chi...",*China*,17,1,0.1,111
26548,Hindenburg refused the powers but agreed to th...,*Hindenburg*,9,1,0.1,58
26549,He finished fourth in the Olympics that year .,*Olympics*,8,1,0.1,46


In [10]:
X_val_tokenized = tokenizer(['ner: ' + sequence for sequence in validation_data["input_text"]], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

y_val_tokenized = tokenizer(list(validation_data["target_text"]), 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

print(len(validation_data))
# print(len(training_sample))

26540


In [11]:
train_dataset = Dataset(X_train_tokenized, y_train_tokenized)

In [12]:
val_dataset = Dataset(X_val_tokenized, y_val_tokenized)

In [13]:
from transformers import Seq2SeqTrainingArguments

# training_args = Seq2SeqTrainingArguments(
#     "NER_lower",
#     evaluation_strategy ='steps',
#     eval_steps = 500, # Evaluation and Save happens every 500 steps
#     save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
#     per_device_train_batch_size = 8,
#     per_device_eval_batch_size = 8,
#     learning_rate = 1e-3,
#     adam_epsilon = 1e-8,
#     num_train_epochs = 6,
#     report_to="wandb",
# #     metric_for_best_model = 'f1',
#     load_best_model_at_end=True
# )

training_args = Seq2SeqTrainingArguments(
    "NER_lower_normal_2",
    evaluation_strategy ='steps',
    eval_steps = 1000, # Evaluation and Save happens every 500 steps
    save_steps = 1000,
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    learning_rate = 1e-3,
    adam_epsilon = 1e-8,
    num_train_epochs = 3,
    report_to="wandb",
#     metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

In [14]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model, 
    args=training_args, 
    train_dataset= train_dataset,
    eval_dataset = val_dataset,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

In [15]:
trainer.train()

***** Running training *****
  Num examples = 317864
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 29802
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"




Step,Training Loss,Validation Loss
1000,0.0278,0.019054
2000,0.0228,0.016653
3000,0.0204,0.01411
4000,0.0183,0.01279
5000,0.0171,0.011849
6000,0.0161,0.010672
7000,0.0155,0.010231
8000,0.0144,0.009883
9000,0.0135,0.00949
10000,0.013,0.008934


***** Running Evaluation *****
  Num examples = 26540
  Batch size = 32
Saving model checkpoint to NER_lower_normal_2/checkpoint-1000
Configuration saved in NER_lower_normal_2/checkpoint-1000/config.json
Model weights saved in NER_lower_normal_2/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 26540
  Batch size = 32
Saving model checkpoint to NER_lower_normal_2/checkpoint-2000
Configuration saved in NER_lower_normal_2/checkpoint-2000/config.json
Model weights saved in NER_lower_normal_2/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 26540
  Batch size = 32
Saving model checkpoint to NER_lower_normal_2/checkpoint-3000
Configuration saved in NER_lower_normal_2/checkpoint-3000/config.json
Model weights saved in NER_lower_normal_2/checkpoint-3000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 26540
  Batch size = 32
Saving model checkpoint to NER_lower_normal_2/checkpoint-4000
Configuration saved in NER

  Batch size = 32
Saving model checkpoint to NER_lower_normal_2/checkpoint-14000
Configuration saved in NER_lower_normal_2/checkpoint-14000/config.json
Model weights saved in NER_lower_normal_2/checkpoint-14000/pytorch_model.bin
Deleting older checkpoint [NER_lower_normal_2/checkpoint-9000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 26540
  Batch size = 32
Saving model checkpoint to NER_lower_normal_2/checkpoint-15000
Configuration saved in NER_lower_normal_2/checkpoint-15000/config.json
Model weights saved in NER_lower_normal_2/checkpoint-15000/pytorch_model.bin
Deleting older checkpoint [NER_lower_normal_2/checkpoint-10000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 26540
  Batch size = 32
Saving model checkpoint to NER_lower_normal_2/checkpoint-16000
Configuration saved in NER_lower_normal_2/checkpoint-16000/config.json
Model weights saved in NER_lower_normal_2/checkpoint-16000/pytorch_model.bin
Deleting older check

Deleting older checkpoint [NER_lower_normal_2/checkpoint-21000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 26540
  Batch size = 32
Saving model checkpoint to NER_lower_normal_2/checkpoint-27000
Configuration saved in NER_lower_normal_2/checkpoint-27000/config.json
Model weights saved in NER_lower_normal_2/checkpoint-27000/pytorch_model.bin
Deleting older checkpoint [NER_lower_normal_2/checkpoint-22000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 26540
  Batch size = 32
Saving model checkpoint to NER_lower_normal_2/checkpoint-28000
Configuration saved in NER_lower_normal_2/checkpoint-28000/config.json
Model weights saved in NER_lower_normal_2/checkpoint-28000/pytorch_model.bin
Deleting older checkpoint [NER_lower_normal_2/checkpoint-23000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 26540
  Batch size = 32
Saving model checkpoint to NER_lower_normal_2/checkpoint-29000
Configuration saved 

TrainOutput(global_step=29802, training_loss=0.011344199708288736, metrics={'train_runtime': 50123.3263, 'train_samples_per_second': 19.025, 'train_steps_per_second': 0.595, 'total_flos': 2.5745762425761792e+17, 'train_loss': 0.011344199708288736, 'epoch': 3.0})

In [16]:
print('finished')

finished


In [None]:
# Num examples = 317864
# Num Epochs = 3
# Instantaneous batch size per device = 8
# Total train batch size (w. parallel, distributed & accumulation) = 32
# Gradient Accumulation steps = 1
# Total optimization steps = 29802

In [17]:
### ner normal ###
# number of steps -> 29802
# epochs -> 6
# patience -> 5
# eval_steps = 1000
# save_steps = 1000