In [1]:
# !pip install wandb
# !pip install transformers
# !pip install sentencepiece

import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 
# os.environ["CUDA_VISIBLE_DEVICES"] = "2,3" 

import pandas as pd
import numpy as np
import re
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import math
import torch.optim as optim
import wandb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


wandb.login()
%env WANDB_PROJECT= NER_full_sentence

# os.environ["WANDB_DISABLED"] = "true"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

[34m[1mwandb[0m: Currently logged in as: [33mhodz199[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=NER_full_sentence


In [2]:
# Create torch dataset
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source texts with tokenized target texts.

    Note: defining this class rebinds the module-level name ``Dataset``, which
    shadows the ``Dataset`` imported from ``torch.utils.data`` earlier in the file.

    Args:
        encodings: Mapping from field name (it must contain ``"input_ids"``) to a
            per-example sequence of values; every field is returned as a tensor.
        labels: Optional mapping for the targets. Its ``"input_ids"`` become the
            ``"labels"`` entry of each item. When it also carries an
            ``"attention_mask"``, positions where that mask is 0 (padding) are
            replaced with ``IGNORE_INDEX`` so that padding does not contribute
            to the training/evaluation loss.
    """

    # Label value that the cross-entropy loss of Hugging Face seq2seq models skips.
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            label_ids = torch.tensor(self.labels['input_ids'][idx])
            if 'attention_mask' in self.labels:
                # The targets were padded to a common length; without this the
                # model would be trained (and scored) on predicting pad tokens.
                target_mask = torch.tensor(self.labels['attention_mask'][idx])
                label_ids = label_ids.masked_fill(target_mask == 0, self.IGNORE_INDEX)
            item["labels"] = label_ids
        return item

    def __len__(self):
        """Number of examples, taken from the source ``input_ids``."""
        return len(self.encodings["input_ids"])

In [3]:
def compute_metrics(pred):
    """Compute token-level, micro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``. ``predictions``
            may be logits (one more axis than the labels), already-decoded
            token ids (same rank as the labels), or a tuple whose first element
            is one of those.

    Returns:
        dict with the keys ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    labels = np.asarray(pred.label_ids)
    predictions = pred.predictions
    # Guard for models that return several outputs: the scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    # Only reduce over the vocabulary axis when there is one.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(-1)
    # Boolean indexing both drops ignored label positions (-100) and flattens
    # (batch, seq) arrays into the 1-D form the sklearn metric expects.
    keep = labels != -100
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels[keep], predictions[keep], average='micro'
    )
#     _, _, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [4]:
# Token budget used when truncating both the model inputs and the targets.
max_source_length = max_target_length = 512

In [5]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration 
from transformers import EarlyStoppingCallback

# Pretrained "t5-base" tokenizer and weights; the model is moved to the selected device.
tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

In [6]:
# Optional subsample for quick experiments:
# training_sample = training_data.sample(frac=0.4, random_state=1)

# Read the training CSV and shuffle it reproducibly (fixed random_state), renumbering rows.
# Alternative input: './1-NER_Data/1-csv_format/train/lower_normal_training.csv'
training_data = (
    pd.read_csv('./1-NER_Data/1-csv_format/train/training_data_full_sentence.csv')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
# Length of each target in characters (not tokens).
training_data['target_text_length'] = training_data['target_text'].map(len)
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,LOS ANGELES 67 60 .528 2,*s* LOS ANGELES *e* 67 60 .528 2,6,2,0.3,32
1,The Japanese retort that the first round was t...,The *s* Japanese *e* retort that the first rou...,13,1,0.1,83
2,More than 250 ethnic groups have been identifi...,More than 250 ethnic groups have been identifi...,16,1,0.1,100
3,"When cAMP binds , the domain dissociates and e...","When cAMP binds , the domain dissociates and e...",23,3,0.1,152
4,He followed that performance with a record 148...,He followed that performance with a record 148...,24,1,0.0,146
...,...,...,...,...,...,...
152438,The annual Grand National horse race takes pla...,The annual *s* Grand National *e* horse race t...,11,4,0.4,88
152439,"Miami-Dade County, for one, is heavily Democra...","*s* Miami-Dade County *e*, for one, is heavily...",31,4,0.1,222
152440,Other disease-causing bacteria in this family ...,Other disease-causing bacteria in this family ...,10,2,0.2,108
152441,Cercle Brugge 4 0 3 1 4 5 3,*s* Cercle Brugge *e* 4 0 3 1 4 5 3,9,2,0.2,35


In [7]:
# Spot-check: all rows whose source sentence mentions 'Cercle Brugge'.
training_data.loc[training_data['input_text'].str.contains('Cercle Brugge')]

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
54179,Cup Winners ' Cup x-Cercle Brugge ( Belgium ) ...,*s* Cup Winners ' Cup *e* *s* x-Cercle Brugge ...,10,9,0.9,111
143050,Cercle Brugge 2 Mouscron 2,*s* Cercle Brugge *e* 2 *s* Mouscron *e* 2,5,3,0.6,42
152441,Cercle Brugge 4 0 3 1 4 5 3,*s* Cercle Brugge *e* 4 0 3 1 4 5 3,9,2,0.2,35


In [8]:
# Number of training rows whose target text is longer than 512 characters.
int((training_data['target_text_length'] > 512).sum())

209

In [9]:
# Same count expressed as a percentage of all training rows.
int((training_data['target_text_length'] > 512).sum()) * 100 / len(training_data)

0.1371004244209311

In [10]:
# Remove the rows whose target text exceeds 512 characters (index labels are kept as-is).
training_data = training_data.drop(
    index=training_data.index[training_data['target_text_length'] > 512]
)
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,LOS ANGELES 67 60 .528 2,*s* LOS ANGELES *e* 67 60 .528 2,6,2,0.3,32
1,The Japanese retort that the first round was t...,The *s* Japanese *e* retort that the first rou...,13,1,0.1,83
2,More than 250 ethnic groups have been identifi...,More than 250 ethnic groups have been identifi...,16,1,0.1,100
3,"When cAMP binds , the domain dissociates and e...","When cAMP binds , the domain dissociates and e...",23,3,0.1,152
4,He followed that performance with a record 148...,He followed that performance with a record 148...,24,1,0.0,146
...,...,...,...,...,...,...
152438,The annual Grand National horse race takes pla...,The annual *s* Grand National *e* horse race t...,11,4,0.4,88
152439,"Miami-Dade County, for one, is heavily Democra...","*s* Miami-Dade County *e*, for one, is heavily...",31,4,0.1,222
152440,Other disease-causing bacteria in this family ...,Other disease-causing bacteria in this family ...,10,2,0.2,108
152441,Cercle Brugge 4 0 3 1 4 5 3,*s* Cercle Brugge *e* 4 0 3 1 4 5 3,9,2,0.2,35


In [11]:
# Sources get the 'ner: ' task prefix; both sides are padded to the longest
# sequence in the set and truncated at their respective maximum lengths.
X_train_tokenized = tokenizer(
    ['ner: ' + sentence for sentence in training_data["input_text"]],
    max_length=max_source_length,
    truncation=True,
    padding=True,
)

y_train_tokenized = tokenizer(
    training_data["target_text"].tolist(),
    max_length=max_target_length,
    truncation=True,
    padding=True,
)

# Row count after filtering.
print(training_data.shape[0])
# print(len(training_sample))

152234


In [12]:
# Read the validation CSV and shuffle it reproducibly (fixed random_state), renumbering rows.
validation_data = (
    pd.read_csv('./1-NER_Data/1-csv_format/val/val_data_full_sentence.csv')
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
# Length of each target in characters (not tokens).
validation_data['target_text_length'] = validation_data['target_text'].map(len)
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,%mm.,%mm.,1,0,0.0,4
1,Welsh rugby union matches on Saturday :,*s* Welsh *e* rugby union matches on Saturday :,6,1,0.2,47
2,Whitlam 's speechwriter Fraser rejected this i...,*s* Whitlam *e* 's speechwriter *s* Fraser *e*...,22,2,0.1,158
3,"In November 1986 , he recorded three hat trick...","In November 1986 , he recorded three hat trick...",22,6,0.3,128
4,But ever since the Supreme Court's Webster vs....,But ever since *s* the Supreme Court's *e* *s*...,40,3,0.1,278
...,...,...,...,...,...,...
24875,"Previously , WJZ-TV carried the team from thei...","Previously , *s* WJZ-TV *e* carried the team f...",14,2,0.1,107
24876,"But when Comdek's product came out, Michael Ku...","But when *s* Comdek *e*'s product came out, *s...",42,3,0.1,249
24877,Hindenburg refused the powers but agreed to th...,*s* Hindenburg *e* refused the powers but agre...,9,1,0.1,66
24878,He finished fourth in the Olympics that year .,He finished fourth in the *s* Olympics *e* tha...,8,1,0.1,54


In [13]:
# Remove the rows whose target text exceeds 512 characters (index labels are kept as-is).
validation_data = validation_data.drop(
    index=validation_data.index[validation_data['target_text_length'] > 512]
)
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,%mm.,%mm.,1,0,0.0,4
1,Welsh rugby union matches on Saturday :,*s* Welsh *e* rugby union matches on Saturday :,6,1,0.2,47
2,Whitlam 's speechwriter Fraser rejected this i...,*s* Whitlam *e* 's speechwriter *s* Fraser *e*...,22,2,0.1,158
3,"In November 1986 , he recorded three hat trick...","In November 1986 , he recorded three hat trick...",22,6,0.3,128
4,But ever since the Supreme Court's Webster vs....,But ever since *s* the Supreme Court's *e* *s*...,40,3,0.1,278
...,...,...,...,...,...,...
24875,"Previously , WJZ-TV carried the team from thei...","Previously , *s* WJZ-TV *e* carried the team f...",14,2,0.1,107
24876,"But when Comdek's product came out, Michael Ku...","But when *s* Comdek *e*'s product came out, *s...",42,3,0.1,249
24877,Hindenburg refused the powers but agreed to th...,*s* Hindenburg *e* refused the powers but agre...,9,1,0.1,66
24878,He finished fourth in the Olympics that year .,He finished fourth in the *s* Olympics *e* tha...,8,1,0.1,54


In [14]:
# Sources get the 'ner: ' task prefix; both sides are padded to the longest
# sequence in the set and truncated at their respective maximum lengths.
X_val_tokenized = tokenizer(
    ['ner: ' + sentence for sentence in validation_data["input_text"]],
    max_length=max_source_length,
    truncation=True,
    padding=True,
)

y_val_tokenized = tokenizer(
    validation_data["target_text"].tolist(),
    max_length=max_target_length,
    truncation=True,
    padding=True,
)

# Row count after filtering.
print(validation_data.shape[0])
# print(len(training_sample))

24836


In [15]:
# Wrap the tokenized training sources and targets for the trainer.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train_tokenized)

In [16]:
# Wrap the tokenized validation sources and targets for the trainer.
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val_tokenized)

In [17]:
from transformers import Seq2SeqTrainingArguments

# training_args = Seq2SeqTrainingArguments(
#     "NER_lower",
#     evaluation_strategy ='steps',
#     eval_steps = 500, # Evaluation and Save happens every 500 steps
#     save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
#     per_device_train_batch_size = 8,
#     per_device_eval_batch_size = 8,
#     learning_rate = 1e-3,
#     adam_epsilon = 1e-8,
#     num_train_epochs = 6,
#     report_to="wandb",
# #     metric_for_best_model = 'f1',
#     load_best_model_at_end=True
# )


training_args = Seq2SeqTrainingArguments(
    output_dir="NER_full_sent_add_space_before_bracket_diff_seperators_v3",

    # Evaluate and checkpoint on the same interval of 2380 steps, i.e. roughly
    # total optimization steps / (num epochs * 2) for the run logged below.
    evaluation_strategy="steps",  # alternative: 'epoch'
    eval_steps=2380,
    save_steps=2380,
    save_total_limit=5,  # Only last 5 models are saved. Older ones are deleted.
    load_best_model_at_end=True,
    # metric_for_best_model='f1',

    # Batching: 4 examples per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,

    # Optimisation settings.
    learning_rate=1e-3,
    adam_epsilon=1e-8,
    num_train_epochs=10,

    # Logging.
    # logging_steps=500,
    report_to="wandb",
)

In [18]:
from transformers import Seq2SeqTrainer

# Assemble the trainer from the model, arguments and datasets built above.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    # compute_metrics=compute_metrics,
    # Early stopping with a patience of 5 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

In [None]:
# Start fine-tuning with the trainer configured above; the call's return value
# is the cell's displayed result.
trainer.train()

***** Running training *****
  Num examples = 152234
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 47570
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"




Step,Training Loss,Validation Loss


In [None]:
# Marker printed once every cell above has completed.
print('finished')