In [1]:
# !pip install wandb
# !pip install transformers
# !pip install sentencepiece

import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 
# os.environ["CUDA_VISIBLE_DEVICES"] = "2,3" 

import pandas as pd
import numpy as np
import re
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import math
import torch.optim as optim
import wandb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


wandb.login()
%env WANDB_PROJECT= NER_full_sentence

# os.environ["WANDB_DISABLED"] = "true"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

[34m[1mwandb[0m: Currently logged in as: [33mhodz199[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=NER_full_sentence


In [2]:
# Create torch dataset
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized source encodings and optional targets.

    Note: this class intentionally keeps the name ``Dataset`` used by the rest
    of the notebook, which shadows ``torch.utils.data.Dataset`` imported earlier.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values. Must contain ``"input_ids"`` (used for ``__len__``).
        labels: Optional mapping for the targets. Its ``"input_ids"`` become the
            ``"labels"`` entry of every item. When it also contains an
            ``"attention_mask"``, every position whose mask value is 0
            (i.e. padding) is replaced by ``ignore_index`` so that padded
            positions do not contribute to the training/validation loss.
    """

    # Label value that the cross-entropy loss skips (PyTorch's default
    # ``ignore_index``); the Hugging Face T5 docs recommend replacing padded
    # label positions with it.
    ignore_index = -100

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the tensors for example ``idx`` as a ``dict``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            label_ids = torch.tensor(self.labels['input_ids'][idx])
            if 'attention_mask' in self.labels:
                # Mask out padding so the loss is computed on real target tokens only.
                is_real_token = torch.tensor(self.labels['attention_mask'][idx]).bool()
                label_ids = label_ids.masked_fill(~is_real_token, self.ignore_index)
            item["labels"] = label_ids
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` encodings."""
        return len(self.encodings["input_ids"])

In [3]:
def compute_metrics(pred):
    """Compute token-level micro-averaged precision, recall and F1.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``.
            ``predictions`` is either an array of per-token scores whose last
            axis is the vocabulary, or a tuple/list whose first element is that
            array (some models return extra outputs after the logits).

    Returns:
        dict with the keys ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    labels = np.asarray(pred.label_ids)
    scores = pred.predictions
    if isinstance(scores, (tuple, list)):
        # Keep only the logits when the model returned additional outputs.
        scores = scores[0]
    preds = np.asarray(scores).argmax(-1)

    # Boolean indexing both drops ignored (-100) label positions and flattens
    # the (example, position) arrays to 1-D; scikit-learn rejects 2-D
    # multiclass targets ("multiclass-multioutput is not supported").
    keep = labels != -100
    labels = labels[keep]
    preds = preds[keep]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
    return {
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [4]:
# Upper bounds handed to the tokenizer as `max_length` for the source
# sentences and for the target sentences, respectively.
max_source_length, max_target_length = 512, 512

In [5]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration 
from transformers import EarlyStoppingCallback

# Load the pretrained "t5-base" tokenizer and seq2seq model, then move the
# model onto the selected device.
tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

In [6]:
# Read the training CSV, shuffle the rows reproducibly (random_state=1) and
# renumber the index from 0.
# Alternative dataset: './1-NER_Data/1-csv_format/train/lower_normal_training.csv'
# To experiment on a subset: training_data.sample(frac=0.4, random_state=1)
training_data = (
    pd.read_csv('./1-NER_Data/1-csv_format/train/training_data_full_sentence.csv')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Length of each target text as returned by len() (characters, not tokens).
training_data['target_text_length'] = training_data['target_text'].apply(len)
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,He made his Millwall debut on 2 May 1998 again...,He made his <Millwall> debut on 2 May 1998 aga...,19,5,0.3,108
1,It was revised to suit adult preferences and w...,It was revised to suit adult preferences and w...,22,1,0.0,144
2,The efforts to get to the people who are addic...,The efforts to get to the people who are addic...,60,0,0.0,332
3,"In the wild , their range has contracted due t...","In the wild , their range has contracted due t...",20,1,0.0,116
4,Second day,Second day,2,0,0.0,10
...,...,...,...,...,...,...
152498,The annual Grand National horse race takes pla...,The annual <Gran National> horse race takes pl...,11,4,0.4,74
152499,"His servants said to him, `` We heard that the...","His servants said to him, `` We heard that the...",15,0,0.0,76
152500,Other disease-causing bacteria in this family ...,Other disease-causing bacteria in this family ...,10,2,0.2,96
152501,Cercle Brugge 4 0 3 1 4 5 3,<Cercl Brugge> 4 0 3 1 4 5 3,9,2,0.2,28


In [7]:
# Number of training rows whose target text length exceeds 512.
training_data.loc[training_data['target_text_length'] > 512].shape[0]

150

In [8]:
# Same count expressed as a percentage of all training rows.
training_data.loc[training_data['target_text_length'] > 512].shape[0] * 100 / training_data.shape[0]

0.0983587208120496

In [9]:
# Keep only the rows whose target text length does not exceed 512.
# NOTE(review): `target_text_length` is len() of the text (characters), while
# max_target_length is applied to tokens at tokenization time - confirm that a
# character-based cut-off is intended.
training_data = training_data.loc[~(training_data['target_text_length'] > 512)]
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,He made his Millwall debut on 2 May 1998 again...,He made his <Millwall> debut on 2 May 1998 aga...,19,5,0.3,108
1,It was revised to suit adult preferences and w...,It was revised to suit adult preferences and w...,22,1,0.0,144
2,The efforts to get to the people who are addic...,The efforts to get to the people who are addic...,60,0,0.0,332
3,"In the wild , their range has contracted due t...","In the wild , their range has contracted due t...",20,1,0.0,116
4,Second day,Second day,2,0,0.0,10
...,...,...,...,...,...,...
152498,The annual Grand National horse race takes pla...,The annual <Gran National> horse race takes pl...,11,4,0.4,74
152499,"His servants said to him, `` We heard that the...","His servants said to him, `` We heard that the...",15,0,0.0,76
152500,Other disease-causing bacteria in this family ...,Other disease-causing bacteria in this family ...,10,2,0.2,96
152501,Cercle Brugge 4 0 3 1 4 5 3,<Cercl Brugge> 4 0 3 1 4 5 3,9,2,0.2,28


In [10]:
# Tokenize the source sentences (each prefixed with the task tag 'ner: ') and
# the target sentences. padding=True pads every row of a call to the longest
# sequence in that call; truncation caps sequences at max_length.
X_train_tokenized = tokenizer(
    ['ner: ' + sentence for sentence in training_data["input_text"]],
    max_length=max_source_length,
    truncation=True,
    padding=True,
)

y_train_tokenized = tokenizer(
    training_data["target_text"].tolist(),
    max_length=max_target_length,
    truncation=True,
    padding=True,
)

# Sanity check: number of training examples that were tokenized.
print(training_data.shape[0])

152353


In [11]:
# Read the validation CSV, shuffle the rows reproducibly (random_state=1) and
# renumber the index from 0.
validation_data = (
    pd.read_csv('./1-NER_Data/1-csv_format/val/val_data_full_sentence.csv')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Length of each target text as returned by len() (characters, not tokens).
validation_data['target_text_length'] = validation_data['target_text'].apply(len)
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,There is evidence of post holes from the woode...,There is evidence of post holes from the woode...,34,2,0.1,196
1,Bank of Montreal said it added 850 million Can...,<Bank of Montreal> said it added 850 million C...,33,1,0.0,203
2,"Once again, we 're trying to get ahold of our ...","Once again, we 're trying to get ahold of our ...",20,0,0.0,92
3,"In New South Wales , private ownership of an i...","In <Ne Sout Wales> , private ownership of an i...",20,3,0.2,122
4,It was a huge fear of mine that it would n't w...,It was a huge fear of mine that it would n't w...,26,0,0.0,106
...,...,...,...,...,...,...
24845,"Previously , WJZ-TV carried the team from thei...","Previously , <WJZ-TV> carried the team from th...",14,2,0.1,95
24846,"Robert Stovall, a veteran New York money manag...","<Robert Stovall>, a veteran <New York> money m...",21,3,0.1,147
24847,Hindenburg refused the powers but agreed to th...,<Hindenburg> refused the powers but agreed to ...,9,1,0.1,60
24848,He finished fourth in the Olympics that year .,He finished fourth in the <Olympics> that year .,8,1,0.1,48


In [12]:
# Keep only the validation rows whose target text length does not exceed 512
# (same cut-off as applied to the training data).
validation_data = validation_data.loc[~(validation_data['target_text_length'] > 512)]
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,There is evidence of post holes from the woode...,There is evidence of post holes from the woode...,34,2,0.1,196
1,Bank of Montreal said it added 850 million Can...,<Bank of Montreal> said it added 850 million C...,33,1,0.0,203
2,"Once again, we 're trying to get ahold of our ...","Once again, we 're trying to get ahold of our ...",20,0,0.0,92
3,"In New South Wales , private ownership of an i...","In <Ne Sout Wales> , private ownership of an i...",20,3,0.2,122
4,It was a huge fear of mine that it would n't w...,It was a huge fear of mine that it would n't w...,26,0,0.0,106
...,...,...,...,...,...,...
24845,"Previously , WJZ-TV carried the team from thei...","Previously , <WJZ-TV> carried the team from th...",14,2,0.1,95
24846,"Robert Stovall, a veteran New York money manag...","<Robert Stovall>, a veteran <New York> money m...",21,3,0.1,147
24847,Hindenburg refused the powers but agreed to th...,<Hindenburg> refused the powers but agreed to ...,9,1,0.1,60
24848,He finished fourth in the Olympics that year .,He finished fourth in the <Olympics> that year .,8,1,0.1,48


In [13]:
# Tokenize the validation sources (prefixed with 'ner: ') and targets with the
# same settings as the training data.
X_val_tokenized = tokenizer(
    ['ner: ' + sentence for sentence in validation_data["input_text"]],
    max_length=max_source_length,
    truncation=True,
    padding=True,
)

y_val_tokenized = tokenizer(
    validation_data["target_text"].tolist(),
    max_length=max_target_length,
    truncation=True,
    padding=True,
)

# Sanity check: number of validation examples that were tokenized.
print(validation_data.shape[0])

24832


In [14]:
# Pair the tokenized training sources with their tokenized targets.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train_tokenized)

In [15]:
# Pair the tokenized validation sources with their tokenized targets.
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val_tokenized)

In [16]:
from transformers import Seq2SeqTrainingArguments

# training_args = Seq2SeqTrainingArguments(
#     "NER_lower",
#     evaluation_strategy ='steps',
#     eval_steps = 500, # Evaluation and Save happens every 500 steps
#     save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
#     per_device_train_batch_size = 8,
#     per_device_eval_batch_size = 8,
#     learning_rate = 1e-3,
#     adam_epsilon = 1e-8,
#     num_train_epochs = 6,
#     report_to="wandb",
# #     metric_for_best_model = 'f1',
#     load_best_model_at_end=True
# )


training_args = Seq2SeqTrainingArguments(
    output_dir="NER_full_sent",
    # Evaluate and checkpoint on the same step interval. 2380 is roughly
    # total_steps / (num_epochs * 2) for a 47610-step run, i.e. about twice per
    # epoch. (An 'epoch' evaluation strategy was tried as an alternative.)
    evaluation_strategy="steps",
    eval_steps=2380,
    save_steps=2380,
    # Only the last 5 checkpoints are kept; older ones are deleted.
    save_total_limit=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-3,
    adam_epsilon=1e-8,
    num_train_epochs=10,
    report_to="wandb",
    # metric_for_best_model is left unset ('f1' was considered).
    load_best_model_at_end=True,
)

In [17]:
from transformers import Seq2SeqTrainer

# Build the trainer around the T5 model and the two datasets.
# `compute_metrics` is deliberately not passed at the moment.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Early stopping with a patience of 5.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

In [18]:
# Start fine-tuning. NOTE(review): the output recorded below ends in a CUDA
# out-of-memory RuntimeError, and its logged batch size (1) and gradient
# accumulation steps (16) differ from the values set in `training_args` above,
# so the saved output appears to come from a different configuration - re-run
# before relying on it.
trainer.train()

***** Running training *****
  Num examples = 152353
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 16
  Total optimization steps = 47610
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"




Step,Training Loss,Validation Loss


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 11.91 GiB total capacity; 4.48 GiB already allocated; 6.00 MiB free; 4.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Marker printed once every cell above has run.
print("finished")