In [6]:
# !pip install wandb
# !pip install transformers
# !pip install sentencepiece

import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,3" 

import pandas as pd
import numpy as np
import re
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import math
import torch.optim as optim
import wandb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


wandb.login()
%env WANDB_PROJECT= NER_full_sentence

# os.environ["WANDB_DISABLED"] = "true"

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

env: WANDB_PROJECT=NER_full_sentence


In [7]:
# Create torch dataset
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [8]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='micro')
#     _, _, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [9]:
max_source_length = 512
max_target_length = 512

In [10]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration 
from transformers import EarlyStoppingCallback

tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

In [11]:
# training_sample = training_data.sample(frac=0.4, random_state=1)

training_data = pd.read_csv('./1-NER_Data/1-csv_format/train/training_data_full_sentence_with_pos.csv')
# training_data = pd.read_csv('./1-NER_Data/1-csv_format/train/lower_normal_training.csv')

training_data = training_data.sample(frac=1, random_state=1).reset_index(drop=True)
training_data['target_text_length'] = training_data['target_text'].apply(lambda x: len(x))
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,"Often the Lao will refer to themselves as "" lu...",Often the *s* Lao | MISC *e* will refer to the...,22,1,0.0,148
1,Discrepancies go unexplained in `` Confidentia...,Discrepancies go unexplained in `` *s* Confide...,57,3,0.1,395
2,Since 1987 an illustration of the castle has b...,Since 1987 an illustration of the castle has b...,24,4,0.2,146
3,The Cibola National Forest conserves large nat...,The *s* Cibola National Forest | LOC *e* conse...,17,3,0.2,132
4,At the 1984 Summer Olympics in Los Angeles he ...,At the *s* 1984 Summer Olympics | MISC *e* in ...,19,5,0.3,127
...,...,...,...,...,...,...
152545,The annual Grand National horse race takes pla...,The annual *s* Grand National | MISC *e* horse...,11,4,0.4,101
152546,[bingladen] Poster : bingladen-LRB-sssssr-RRB-...,[bingladen] Poster : bingladen-LRB-sssssr-RRB-...,7,0,0.0,65
152547,Other disease-causing bacteria in this family ...,Other disease-causing bacteria in this family ...,10,2,0.2,122
152548,Cercle Brugge 4 0 3 1 4 5 3,*s* Cercle Brugge | ORG *e* 4 0 3 1 4 5 3,9,2,0.2,41


In [12]:
training_data[training_data['input_text'].str.contains('Cercle Brugge')]

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
54325,Cup Winners ' Cup x-Cercle Brugge ( Belgium ) ...,*s* Cup Winners ' Cup | MISC *e* *s* x-Cercle ...,10,9,0.9,142
143147,Cercle Brugge 2 Mouscron 2,*s* Cercle Brugge | ORG *e* 2 *s* Mouscron | O...,5,3,0.6,54
152548,Cercle Brugge 4 0 3 1 4 5 3,*s* Cercle Brugge | ORG *e* 4 0 3 1 4 5 3,9,2,0.2,41


In [13]:
len(training_data[training_data['target_text_length']> 512])

300

In [14]:
len(training_data[training_data['target_text_length']> 512])*100/len(training_data)

0.19665683382497542

In [15]:
training_data = training_data.drop(training_data[training_data['target_text_length']> 512].index)
training_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,"Often the Lao will refer to themselves as "" lu...",Often the *s* Lao | MISC *e* will refer to the...,22,1,0.0,148
1,Discrepancies go unexplained in `` Confidentia...,Discrepancies go unexplained in `` *s* Confide...,57,3,0.1,395
2,Since 1987 an illustration of the castle has b...,Since 1987 an illustration of the castle has b...,24,4,0.2,146
3,The Cibola National Forest conserves large nat...,The *s* Cibola National Forest | LOC *e* conse...,17,3,0.2,132
4,At the 1984 Summer Olympics in Los Angeles he ...,At the *s* 1984 Summer Olympics | MISC *e* in ...,19,5,0.3,127
...,...,...,...,...,...,...
152545,The annual Grand National horse race takes pla...,The annual *s* Grand National | MISC *e* horse...,11,4,0.4,101
152546,[bingladen] Poster : bingladen-LRB-sssssr-RRB-...,[bingladen] Poster : bingladen-LRB-sssssr-RRB-...,7,0,0.0,65
152547,Other disease-causing bacteria in this family ...,Other disease-causing bacteria in this family ...,10,2,0.2,122
152548,Cercle Brugge 4 0 3 1 4 5 3,*s* Cercle Brugge | ORG *e* 4 0 3 1 4 5 3,9,2,0.2,41


In [16]:
X_train_tokenized = tokenizer(['ner: ' + sequence for sequence in training_data["input_text"]], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

y_train_tokenized = tokenizer(list(training_data["target_text"]), 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

print(len(training_data))
# print(len(training_sample))

152250


In [17]:
validation_data = pd.read_csv('./1-NER_Data/1-csv_format/val/val_data_full_sentence_with_pos.csv')
validation_data = validation_data.sample(frac=1, random_state=1).reset_index(drop=True)
validation_data['target_text_length'] = validation_data['target_text'].apply(lambda x: len(x))
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,In about 1837 the orangery was replaced by a l...,In about 1837 the orangery was replaced by a l...,15,2,0.1,105
1,It has been also noted for its visibly irrever...,It has been also noted for its visibly irrever...,26,1,0.0,172
2,"For example , Nikolai Durov has introduced com...","For example , *s* Nikolai Durov | PER *e* has ...",20,2,0.1,160
3,"Its wildlife includes Persian leopards , brown...",Its wildlife includes *s* Persian | MISC *e* l...,15,1,0.1,128
4,I would view it as a net positive.,I would view it as a net positive.,8,0,0.0,34
...,...,...,...,...,...,...
24822,"In March 2012 , he was fined for parking illeg...","In March 2012 , he was fined for parking illeg...",14,1,0.1,94
24823,This is a potential problem for the developmen...,This is a potential problem for the developmen...,14,1,0.1,104
24824,"None of them would , so he became another pres...","None of them would , so he became another pres...",21,1,0.0,163
24825,The three American jumpers had been easily the...,The three *s* American | MISC *e* jumpers had ...,13,1,0.1,95


In [18]:
validation_data = validation_data.drop(validation_data[validation_data['target_text_length']> 512].index)
validation_data

Unnamed: 0,input_text,target_text,word_count,NE_count,%_NE_in_sentence,target_text_length
0,In about 1837 the orangery was replaced by a l...,In about 1837 the orangery was replaced by a l...,15,2,0.1,105
1,It has been also noted for its visibly irrever...,It has been also noted for its visibly irrever...,26,1,0.0,172
2,"For example , Nikolai Durov has introduced com...","For example , *s* Nikolai Durov | PER *e* has ...",20,2,0.1,160
3,"Its wildlife includes Persian leopards , brown...",Its wildlife includes *s* Persian | MISC *e* l...,15,1,0.1,128
4,I would view it as a net positive.,I would view it as a net positive.,8,0,0.0,34
...,...,...,...,...,...,...
24822,"In March 2012 , he was fined for parking illeg...","In March 2012 , he was fined for parking illeg...",14,1,0.1,94
24823,This is a potential problem for the developmen...,This is a potential problem for the developmen...,14,1,0.1,104
24824,"None of them would , so he became another pres...","None of them would , so he became another pres...",21,1,0.0,163
24825,The three American jumpers had been easily the...,The three *s* American | MISC *e* jumpers had ...,13,1,0.1,95


In [19]:
X_val_tokenized = tokenizer(['ner: ' + sequence for sequence in validation_data["input_text"]], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

y_val_tokenized = tokenizer(list(validation_data["target_text"]), 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

print(len(validation_data))
# print(len(training_sample))

24779


In [20]:
train_dataset = Dataset(X_train_tokenized, y_train_tokenized)

In [21]:
val_dataset = Dataset(X_val_tokenized, y_val_tokenized)

In [23]:
from transformers import Seq2SeqTrainingArguments

# training_args = Seq2SeqTrainingArguments(
#     "NER_lower",
#     evaluation_strategy ='steps',
#     eval_steps = 500, # Evaluation and Save happens every 500 steps
#     save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
#     per_device_train_batch_size = 8,
#     per_device_eval_batch_size = 8,
#     learning_rate = 1e-3,
#     adam_epsilon = 1e-8,
#     num_train_epochs = 6,
#     report_to="wandb",
# #     metric_for_best_model = 'f1',
#     load_best_model_at_end=True
# )


training_args = Seq2SeqTrainingArguments(
    "NER_full_sent_with_pos_add_space_before_bracket_diff_seperators_v3",
#     evaluation_strategy ='epoch',
    evaluation_strategy ='steps',
    eval_steps = 2380, # Evaluation and Save happens every 3743 steps (steps/(num of epochs*2))
    logging_steps = 500,
    save_steps = 2380,
    save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    
    gradient_accumulation_steps =2,
    
    learning_rate = 1e-3,
    adam_epsilon = 1e-8,
    num_train_epochs = 10,
    report_to="wandb",
#     metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

In [24]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model, 
    args=training_args, 
#     compute_metrics=compute_metrics,
    train_dataset= train_dataset,
    eval_dataset = val_dataset,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

In [25]:
trainer.train()

***** Running training *****
  Num examples = 152250
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 47580
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"




Step,Training Loss,Validation Loss
2380,0.0061,0.00433
4760,0.0052,0.003689
7140,0.0041,0.003873


***** Running Evaluation *****
  Num examples = 24779
  Batch size = 16
Saving model checkpoint to NER_full_sent_with_pos_add_space_before_bracket_diff_seperators_v3/checkpoint-2380
Configuration saved in NER_full_sent_with_pos_add_space_before_bracket_diff_seperators_v3/checkpoint-2380/config.json
Model weights saved in NER_full_sent_with_pos_add_space_before_bracket_diff_seperators_v3/checkpoint-2380/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 24779
  Batch size = 16
Saving model checkpoint to NER_full_sent_with_pos_add_space_before_bracket_diff_seperators_v3/checkpoint-4760
Configuration saved in NER_full_sent_with_pos_add_space_before_bracket_diff_seperators_v3/checkpoint-4760/config.json
Model weights saved in NER_full_sent_with_pos_add_space_before_bracket_diff_seperators_v3/checkpoint-4760/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 24779
  Batch size = 16
Saving model checkpoint to NER_full_sent_with_pos_add_space_before_bracket_diff_

RuntimeError: [enforce fail at inline_container.cc:300] . unexpected pos 1058835008 vs 1058834896

In [None]:
print('finished')