In [2]:
%pip install torch==1.11.0+cu113 torchdata==0.3.0 torchtext==0.12.0 -f https://download.pytorch.org/whl/cu113/torch_stable.html
%pip install spacy tqdm
!python -m spacy download en_core_web_sm

Looking in links: https://download.pytorch.org/whl/cu113/torch_stable.html
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import torch
import torchtext

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  1.11.0+cu113
torchtext Version:  0.12.0
Using GPU.


In [4]:
!pip install datasets



In [5]:
from datasets import load_dataset, load_metric
dataset = load_dataset("surrey-nlp/PLOD-CW")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
train = dataset["train"]
valid = dataset["validation"]
test = dataset["test"]


train_tokens = train["tokens"]
train_labels = train["ner_tags"]
valid_tokens = valid["tokens"]
valid_labels = valid["ner_tags"]
test_tokens = test["tokens"]
test_labels = test["ner_tags"]


In [7]:
print(train_tokens[:2])
print(train_labels[:2])
print(len(train_tokens))
print(len(train_labels))

[['For', 'this', 'purpose', 'the', 'Gothenburg', 'Young', 'Persons', 'Empowerment', 'Scale', '(', 'GYPES', ')', 'was', 'developed', '.'], ['The', 'following', 'physiological', 'traits', 'were', 'measured', ':', 'stomatal', 'conductance', '(', 'gs', ',', 'mol', 'H2O', 'm-2', 's-1', ')', ',', 'transpiration', 'rate', '(', 'E', ',', 'mmol', 'H2O', 'm-2', 's-1', ')', ',', 'net', 'photosynthetic', 'rate', '(', 'PN', ',', 'μmol', 'm-2', 's-1', ')', 'and', 'intercellular', 'CO2', 'concentration', 'CO2', '(', 'Ci', ',', 'μmol', 'm-2', 's-1', ')', '.']]
[['B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF', 'I-LF', 'I-LF', 'I-LF', 'B-O', 'B-AC', 'B-O', 'B-O', 'B-O', 'B-O'], ['B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF', 'B-O', 'B-AC', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF', 'B-O', 'B-AC', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF', 'I-LF', 'B-O', 'B-AC', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF', 'I-LF', 'B-AC', 'B-O', 'B-AC

In [8]:
from torchtext.data.utils import get_tokenizer

class SpacyTokenizer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

    def forward(self, input):
        if isinstance(input, list):
            tokens = []
            for text in input:
                tokens.append(self.tokenizer(text))
            return tokens
        elif isinstance(input, str):
            return self.tokenizer(input)
        raise ValueError(f"Type {type(input)} is not supported.")

In [9]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator, vocab

from collections import OrderedDict
MAX_VOCAB_SIZE = 25_000

tokenizer = SpacyTokenizer()

def _process_texts_for_vocab(data):
	for line in data:
		yield tokenizer(" ".join(line))



text_vocab = build_vocab_from_iterator(_process_texts_for_vocab(train_tokens), specials=('<unk>', '<pad>'), max_tokens=MAX_VOCAB_SIZE)
label_vocab = vocab(OrderedDict([("B-O", 1), ("B-AC", 2), ("B-LF", 3), ("I-LF", 4)]))

text_vocab.set_default_index(text_vocab["<unk>"])

In [10]:
label_vocab.get_stoi()

{'I-LF': 3, 'B-LF': 2, 'B-AC': 1, 'B-O': 0}

In [11]:
print(f"Unique tokens in text vocabulary: {len(text_vocab)}")
print(f"Unique tokens in label vocabulary: {len(label_vocab)}")

Unique tokens in text vocabulary: 9121
Unique tokens in label vocabulary: 4


In [12]:
text_vocab.get_itos()[10:30]

[';',
 'in',
 'to',
 ']',
 '[',
 'a',
 'with',
 'for',
 'were',
 'was',
 ':',
 'by',
 'is',
 '%',
 'that',
 'as',
 'The',
 'from',
 '1',
 'are']

In [13]:
import torch
from torchtext.transforms import ToTensor, VocabTransform, Truncate
import torchtext.transforms as T
from torch.nn.utils.rnn import pad_sequence
from torch import nn

In [14]:
def transform_dataset(tokens, labels):
  flattened_tokens = [item for sublist in tokens for item in sublist]
  flattened_labels = [item for sublist in labels for item in sublist]
  return flattened_tokens, flattened_labels


In [15]:
train_tokens, train_labels = transform_dataset(train_tokens, train_labels)
valid_tokens, valid_labels = transform_dataset(valid_tokens, valid_labels)
test_tokens, test_labels = transform_dataset(test_tokens, test_labels)

len(train_tokens)

40000

In [16]:
class ToLengths(torch.nn.Module):
    def forward(self, input):
        if isinstance(input[0], list):
            lengths = []
            for text in input:
                lengths.append(len(text))
            return lengths
        elif isinstance(input, list):
            return len(input)
        elif isinstance(input, str):
          return len(input)
        raise ValueError(f"Type {type(input)} is not supported.")

lengths_transform = T.Sequential(
    SpacyTokenizer(),
    ToLengths(),
    T.ToTensor(),
)



In [17]:
import torchtext.transforms as T

text_transform = T.Sequential(
    #SpacyTokenizer(),  # Tokenize
    T.VocabTransform(text_vocab),  # Conver to vocab IDs
    T.ToTensor(padding_value=text_vocab["<pad>"]),  # Convert to tensor and pad
)

label_transform = T.Sequential(
    T.LabelToIndex(label_vocab.get_itos()),  # Convert to integer
    T.ToTensor(),  # Convert to tensor

)



In [34]:
sample_label, sample_text = train_labels[:10], train_tokens[:10]

print(f"Text before any processing: {sample_text}")
print(f"Label before any processing: {sample_label}\n")

#Text processing pipeline

vocab_transform = T.VocabTransform(text_vocab)
sample_text = vocab_transform(sample_text)

print(f"Text after Vocab Transform: {sample_text}\n")

tensor_transform = T.ToTensor(padding_value=text_vocab["<pad>"])
sample_text = tensor_transform(sample_text)
print(f"Text after Tensor Transform: {sample_text}\n")
print(type(sample_text))
# Label Processing Pipeline
print(f"Label after label transform: {label_transform(sample_label)}\n")

# # Length Processing Pipeline
print(f"Text after length transform: {lengths_transform(train_tokens[:10])}")

Text before any processing: ['For', 'this', 'purpose', 'the', 'Gothenburg', 'Young', 'Persons', 'Empowerment', 'Scale', '(']
Label before any processing: ['B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF', 'I-LF', 'I-LF', 'I-LF', 'B-O']

Text after Vocab Transform: [254, 60, 8411, 5, 4886, 6323, 5689, 4680, 1621, 3]

Text after Tensor Transform: tensor([ 254,   60, 8411,    5, 4886, 6323, 5689, 4680, 1621,    3])

<class 'torch.Tensor'>
Label after label transform: tensor([0, 0, 0, 0, 2, 3, 3, 3, 3, 0])

Text after length transform: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [143]:
unsqueezed= sample_text.unsqueeze(1)
unsqueezed

tensor([[ 254],
        [  60],
        [8411],
        [   5],
        [4886],
        [6323],
        [5689],
        [4680],
        [1621],
        [   3]])

In [35]:
sample_labels, sample_texts = train_labels[:6], train_tokens[:2]
print(sample_labels)
print(sample_texts)
#vocab convert the texts
vocab_transform = T.VocabTransform(text_vocab)
sample_text = vocab_transform(sample_texts)
print(sample_text)
processed_sample_texts = text_transform(list(sample_texts))
print(processed_sample_texts)

# diff = abs(lengths[0] - lengths[1]) + 5|

# print(f"Padding vocabulary index: {text_vocab['<pad>']}")

# print("Respective text lengths after tokenization: ", lengths)
# print("Tensor shape after text processing: ", processed_sample_texts.shape)
# print(f"Last {diff} characters of text 0 after processing:\n", processed_sample_texts[0][-diff:])
# print(f"Last {diff} characters of text 1 after processing:\n", processed_sample_texts[1][-diff:])

['B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF']
['For', 'this']
[254, 60]
tensor([254,  60])


In [36]:
from torch.utils.data import DataLoader

BATCH_SIZE = 64

def collate_batch(batch):
    labels, texts = zip(*batch)
    lengths = torch.ones(BATCH_SIZE)
    texts = text_transform(list(texts))
    labels = label_transform(list(labels))
    labels = labels.long()
    texts = texts.unsqueeze(1)

    return labels.float().to(DEVICE), texts.to(DEVICE), lengths.cpu()

def _get_dataloader(data):
    return DataLoader(data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch, drop_last = True)

train_data = list(zip(train_labels, train_tokens))
valid_data = list(zip(valid_labels, valid_tokens))
test_data = list(zip(test_labels, test_tokens))

train_dataloader = _get_dataloader(train_data)
valid_dataloader = _get_dataloader(valid_data)
test_dataloader = _get_dataloader(test_data)

In [37]:
for batch in train_dataloader:
    labels, texts, lengths = batch
    print(texts.shape)  # Should be (BATCH_SIZE, sequence_length) or similar
    print(labels.shape)  # Should be (BATCH_SIZE, )
    break  # Remove this to loop through all batches

torch.Size([64, 1])
torch.Size([64])


In [38]:
examplelabs = train_labels[:20]
translabs = label_transform(examplelabs)
exampletokens= train_tokens[:20]
transtokens = text_transform(exampletokens)
print(translabs)
print(transtokens)

tensor([0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([ 254,   60, 8411,    5, 4886, 6323, 5689, 4680, 1621,    3, 4855,    4,
          19,  312,    6,   26,  235, 3255, 8903,   18])


In [39]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, texts, lengths):
        embedded = self.embedding(texts)                          # VV note that lengths need to be on the CPU
        embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)

        output, hidden = self.rnn(embedded)

        return self.fc(hidden.squeeze(0))

In [40]:
INPUT_DIM = len(text_vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 4

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [41]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,004,776 trainable parameters


In [42]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr = 2e-5)
criterion = nn.CrossEntropyLoss()
model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [43]:
import torch

def categorical_accuracy(preds, y):

    # Get the predictions for each class using softmax
    softmax_preds = torch.softmax(preds, dim=1)

    # Get the index of the highest probability class for each example
    top_pred = torch.argmax(softmax_preds, dim=1)

    # Check how many predictions match the true labels
    correct = (top_pred == y).float()  # convert into float for division

    # Calculate accuracy
    acc = correct.sum() / len(correct)
    return acc


In [54]:
from tqdm import tqdm

def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    for batch in tqdm(iterator, desc="\tTraining"):
        optimizer.zero_grad()
        labels, texts, lengths = batch
        labels = labels.to(DEVICE).long()
        predictions = model(texts, lengths)
        predictions = predictions.to(DEVICE).float()
        loss = criterion(predictions, labels)
        acc = categorical_accuracy(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print('finished batches')
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [61]:
from sklearn.metrics import f1_score

In [62]:
def calc_f1score(labels, predictions):
  y_true = labels.numpy()
  y_pred = labels.numpy()
  score = f1_score(y_true, y_pred, average = 'weighted')
  return score

In [58]:
from tqdm import tqdm


def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in tqdm(iterator, desc="\tEvaluation"):
            labels, texts, lengths = batch
            labels = labels.to(DEVICE).long()
            predictions = model(texts, lengths).squeeze(1)
            predictions = predictions.to(DEVICE).float()
            loss = criterion(predictions, labels)
            acc = categorical_accuracy(predictions, labels)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

            batch_predictions = predictions.argmax(dim=1)

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [56]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [59]:
N_EPOCHS = 5

best_valid_loss = float('inf')
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'} for training.")

for epoch in range(N_EPOCHS):
    print(f'Epoch: {epoch+1:02}')
    start_time = time.time()

    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

    valid_loss, valid_acc = evaluate(model, valid_dataloader, criterion)
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

Using GPU for training.
Epoch: 01


	Training: 100%|██████████| 625/625 [00:02<00:00, 306.59it/s]


finished batches
	Train Loss: 0.620 | Train Acc: 82.00%


	Evaluation: 100%|██████████| 78/78 [00:00<00:00, 892.16it/s]


	 Val. Loss: 0.556 |  Val. Acc: 85.06%
Epoch: 02


	Training: 100%|██████████| 625/625 [00:01<00:00, 349.41it/s]


finished batches
	Train Loss: 0.601 | Train Acc: 82.33%


	Evaluation: 100%|██████████| 78/78 [00:00<00:00, 881.98it/s]


	 Val. Loss: 0.539 |  Val. Acc: 85.26%
Epoch: 03


	Training: 100%|██████████| 625/625 [00:01<00:00, 339.81it/s]


finished batches
	Train Loss: 0.594 | Train Acc: 82.39%


	Evaluation: 100%|██████████| 78/78 [00:00<00:00, 900.69it/s]


	 Val. Loss: 0.531 |  Val. Acc: 85.26%
Epoch: 04


	Training: 100%|██████████| 625/625 [00:01<00:00, 341.62it/s]


finished batches
	Train Loss: 0.590 | Train Acc: 82.40%


	Evaluation: 100%|██████████| 78/78 [00:00<00:00, 830.19it/s]


	 Val. Loss: 0.530 |  Val. Acc: 85.24%
Epoch: 05


	Training: 100%|██████████| 625/625 [00:01<00:00, 312.69it/s]


finished batches
	Train Loss: 0.587 | Train Acc: 82.41%


	Evaluation: 100%|██████████| 78/78 [00:00<00:00, 668.54it/s]

	 Val. Loss: 0.532 |  Val. Acc: 85.20%





In [60]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss, test_acc = evaluate(model, test_dataloader, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

	Evaluation: 100%|██████████| 78/78 [00:00<00:00, 831.52it/s]

Test Loss: 0.520 | Test Acc: 85.76%



