In [2]:
# !pip install wandb
# !pip install transformers
# !pip install sentencepiece

import pandas as pd
import numpy as np
import re
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import math
import torch.optim as optim
import os
import wandb

# wandb.login()

# Set the WANDB_DISABLED flag so Weights & Biases logging stays off in this session.
os.environ["WANDB_DISABLED"] = "true"

# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Collecting wandb
  Downloading wandb-0.12.17-py2.py3-none-any.whl (1.8 MB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.12-py2.py3-none-any.whl (145 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting setproctitle
  Downloading setproctitle-1.2.3-cp39-cp39-win_amd64.whl (10 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.9-py3-none-any.whl (9.4 kB)
Collecting docker-pycreds>=0.4.0
  Using cached docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting promise<3,>=2.0
  Using cached promise-2.3.tar.gz (19 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
Collecting gitdb<5,>=4.0.1
  Using cached gitdb-4.0.9-py3-none-any.whl (63 kB)
Collecting smmap<6,>=3.0.1
  Using cached smmap-5.0.0-py3-none-any.whl (24 kB)
Building wheels for collected packages: promise, pathtools
  Building wheel for promise (setup.py): started
  Building wheel for promise (setup.py): finished with status 'done'
  Creat

In [3]:
# Create torch dataset
# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenized encodings with optional tokenized labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence. It must contain an ``"input_ids"`` entry, which is
            what determines the dataset length.
        labels: Optional mapping with an ``"input_ids"`` entry holding the
            tokenized targets. When it is falsy (``None`` or empty), items
            carry no ``"labels"`` key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding feature."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])

        # No labels supplied: return the input features only.
        if not self.labels:
            return sample

        sample["labels"] = torch.tensor(self.labels['input_ids'][idx])
        return sample

In [4]:
# Maximum token lengths passed to the tokenizer: 512 for inputs, 128 for targets.
max_source_length, max_target_length = 512, 128

In [5]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# Load the pretrained "t5-base" tokenizer.
tokenizer = T5TokenizerFast.from_pretrained("t5-base")

# Load the matching "t5-base" model, then move it onto the selected device
# as a separate step.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

In [7]:
# training_sample = training_data.sample(frac=0.4, random_state=1)

# Read the training CSV (path is relative to the notebook's working directory).
training_data = pd.read_csv(
    filepath_or_buffer='./1-NER_Data/1-csv_format/train/training_data.csv'
)
# Bare last expression so the notebook renders the frame.
training_data

Unnamed: 0,input_text,target_text
0,EU rejects German call to boycott British lamb .,"*EU*,*German*,*British*"
1,Peter Blackburn,*Peter Blackburn*
2,BRUSSELS 1996-08-22,*BRUSSELS*
3,The European Commission said on Thursday it di...,"*European Commission*,*German*,*British*"
4,Germany 's representative to the European Unio...,"*Germany*,*European Union*,*Werner Zwingmann*,..."
...,...,...
159090,""" @Mackenzie_71 : Super Bowl Sunday """,**
159091,Watch Barrow - Boston United Live December 20 ...,"*Barrow*,*Boston United*"
159092,"On January 22nd , from 11- 1.30 , TVPS will be...",*TVPS*
159093,"Conditioning plan all made up for tomorrow , f...",**


In [10]:
# Tokenize the model inputs. Every input sentence is prefixed with the task
# tag 'ner: ' and is padded/truncated to at most max_source_length tokens.
X_train_tokenized = tokenizer(['ner: ' + sequence for sequence in training_data["input_text"]], 
                              padding=True, 
                              truncation=True, 
                              max_length=max_source_length)

# Tokenize the target strings, padded/truncated to at most max_target_length tokens.
y_train_tokenized = tokenizer(list(training_data["target_text"]), 
                              padding=True, 
                              truncation=True, 
                              max_length=max_target_length)

# Replace padding positions in the labels with -100. The cross-entropy loss
# ignores that index, so the model is no longer trained to predict padding.
# T5 rebuilds decoder inputs from the labels and maps -100 back to the pad id
# when shifting them right, so nothing else has to change.
label_pad_id = tokenizer.pad_token_id
y_train_tokenized["input_ids"] = [
    [-100 if token_id == label_pad_id else token_id for token_id in label_ids]
    for label_ids in y_train_tokenized["input_ids"]
]

print(len(training_data))
# print(len(training_sample))

159095


In [11]:
# Pair the tokenized inputs with the tokenized targets as one training dataset.
train_dataset = Dataset(
    encodings=X_train_tokenized,
    labels=y_train_tokenized,
)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters for the fine-tuning run. Checkpoints are written to the
# "test_trainer" directory.
training_args = Seq2SeqTrainingArguments(
    "test_trainer",
    per_device_train_batch_size = 1,
    per_device_eval_batch_size = 1,
    learning_rate = 1e-3,
    adam_epsilon = 1e-8,
    num_train_epochs = 5,
    # The setup cell sets WANDB_DISABLED="true". Requesting the "wandb"
    # reporter in that state makes the W&B callback raise when the trainer is
    # built, so disable reporting integrations explicitly instead.
    report_to="none",
    save_total_limit = 3 # Only last 3 models are saved. Older ones are deleted.
)

In [None]:
from transformers import Seq2SeqTrainer

# Bundle the model, the training arguments and the training dataset into one
# trainer object. No evaluation dataset is passed.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [None]:
# Start training. As the last expression in the cell, the value returned by
# train() is displayed as the cell output.
trainer.train()