In [1]:
# Installing transformers library

!pip install transformers



In [2]:
# Installing PyTorch

!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cac

In [3]:
# Importing necessary libraries

import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD

In [4]:
# Reading csv data
# NOTE(review): '/content/ner.csv' is a hard-coded absolute path (looks like a Colab
# upload location - confirm); the file must exist there before this cell runs.
# Later cells read the 'text' and 'labels' columns of this frame.

df = pd.read_csv('/content/ner.csv')
df.tail()

Unnamed: 0,text,labels
47954,Opposition leader Mir Hossein Mousavi has said...,O O O B-per I-per O O O O O O O O O O O O O O ...
47955,"On Thursday , Iranian state media published a ...",O B-tim O B-gpe O O O O O O O O B-org I-org O ...
47956,"Following Iran 's disputed June 12 elections ,...",O B-geo O O B-tim I-tim O O O O O O O O O O O ...
47957,"Since then , authorities have held public tria...",O O O O O O O O O O O O O O O O O O O O O
47958,The United Nations is praising the use of mili...,O B-org I-org O O O O O O O O O O O O O O B-ti...


In [5]:
# Creating the tokenizer instance from the pretrained 'bert-base-cased' checkpoint.
# It is kept as a module-level global because the alignment helpers and the
# dataset class defined in later cells call `tokenizer(...)` directly.

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Importing the required module from PyTorch library for handling datasets.
import torch.utils.data

# This variable controls whether to label all subtokens of a word or just the first one.
label_all_tokens = False


def align_label(texts, labels):
    """Build one label id per tokenizer position for a single sentence.

    Args:
        texts: The sentence to tokenize (passed straight to the global ``tokenizer``).
        labels: List of tag strings for the sentence, indexed by word position.

    Returns:
        A list of ints with one entry per token position (the tokenizer pads and
        truncates to 512 positions). Positions that must be ignored by the loss
        carry -100: special/padding tokens, continuation sub-tokens when
        ``label_all_tokens`` is False, and words whose tag cannot be resolved.

    Relies on the module-level ``tokenizer``, ``labels_to_ids`` and
    ``label_all_tokens``. If ``labels_to_ids`` has not been defined yet this now
    raises ``NameError`` instead of silently returning all -100.

    NOTE(review): ``word_ids()`` numbers the words as the tokenizer splits them,
    which may differ from the whitespace-split positions in ``labels`` when a
    word contains punctuation - confirm against the data.
    """
    # Tokenizes the text input with specified parameters such as padding and max length.
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    # One entry per token: the index of the word it came from, or None for special tokens.
    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        if word_idx is None:
            # Special or padding token: excluded from the loss.
            label_ids.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            # First sub-token of a word (or any sub-token when label_all_tokens is set).
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except (IndexError, KeyError):
                # No tag at this word position, or a tag missing from labels_to_ids.
                # Only these two lookups are tolerated; anything else should surface.
                label_ids.append(-100)
        else:
            # Continuation sub-token of the same word.
            label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

# This class defines a custom dataset by extending the Dataset class from PyTorch.
class DataSequence(torch.utils.data.Dataset):
    """Dataset of tokenized sentences and their aligned label ids.

    Built from a DataFrame with a 'text' column (the sentence) and a 'labels'
    column (space-separated tags). Each item is a pair
    ``(tokenizer_output, label_tensor)``.

    Relies on the module-level ``tokenizer`` and ``align_label``.
    """

    def __init__(self, df):
        """Tokenize every row of ``df`` and align its labels up front."""
        # Splits the 'labels' column into one list of tags per row.
        lb = [i.split() for i in df['labels'].values.tolist()]
        # Coerce every text to str once, so the tokenizer call and align_label
        # both receive exactly the same input (previously only the former did).
        txt = [str(i) for i in df['text'].values.tolist()]
        # Tokenizes each text and returns PyTorch tensors.
        self.texts = [tokenizer(i, padding='max_length', max_length=512, truncation=True, return_tensors="pt") for i in txt]
        # Aligns each set of text and labels.
        self.labels = [align_label(i, j) for i, j in zip(txt, lb)]

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.labels)

    def get_batch_data(self, idx):
        """Return the stored tokenizer output for item ``idx``."""
        return self.texts[idx]

    def get_batch_labels(self, idx):
        """Return the aligned label ids for item ``idx`` as a LongTensor."""
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        """Return ``(tokenizer_output, label_tensor)`` for item ``idx``."""
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)
        return batch_data, batch_labels


In [7]:
# Importing the required libraries.
import numpy as np
import pandas as pd

# Truncating the DataFrame to the first 2000 entries for processing.
df = df[0:2000]

# Extracting and tokenizing labels from the DataFrame (one list of tag strings per row).
labels = [i.split() for i in df['labels'].values.tolist()]

# Collecting every distinct tag that occurs in the truncated data.
unique_labels = set()
for lb in labels:
    unique_labels.update(lb)

# Iterating a set of strings directly gives an order that can change between
# kernel restarts (string hashing is randomised per process), which would
# silently renumber the classes from one run to the next. Sorting pins the
# label <-> id assignment so it is the same on every run.
sorted_labels = sorted(unique_labels)
# Mapping from each label to its integer ID.
labels_to_ids = {label: idx for idx, label in enumerate(sorted_labels)}
# Inverse mapping from integer ID back to its label.
ids_to_labels = {idx: label for idx, label in enumerate(sorted_labels)}

# Randomly shuffling the DataFrame and splitting it into training, validation, and testing datasets.
# Here, 80% of data is used for training, 10% for validation, and 10% for testing.
# Positional slicing yields the same three frames as np.split(shuffled_df, [train_end, val_end])
# without passing a DataFrame through NumPy, which triggers a DataFrame.swapaxes
# deprecation warning on recent pandas versions.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))
df_train = shuffled_df.iloc[:train_end]
df_val = shuffled_df.iloc[train_end:val_end]
df_test = shuffled_df.iloc[val_end:]


Explanation of the key steps and methods used above:

1. **Data Reduction**: The code first limits the data to the first 2000 rows, simplifying the handling of large datasets for demonstration or initial testing.

2. **Label Processing**:
   - Extract labels from the DataFrame, where each entry under 'labels' is expected to be a space-separated string of labels.
   - Convert each string of labels into a list of individual labels.

3. **Set for Unique Labels**:
   - Create an empty set to ensure all labels stored are unique (sets automatically discard duplicates).
   - Populate the set by iterating over each list of labels and adding each label only if it's not already included.

4. **Label-Integer Mappings**:
   - Generate a dictionary mapping from labels to unique integer IDs (`labels_to_ids`). This is useful for encoding categorical labels into integers which are easier to handle in machine learning models.
   - Generate the inverse mapping (`ids_to_labels`) from integers back to labels, which can be useful for interpretation of model predictions.

5. **Data Splitting**:
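   - Shuffle the DataFrame using `sample(frac=1, random_state=42)` to randomize the order of rows, so that each split is a random sample of the data rather than a contiguous slice of the file; the fixed `random_state` makes the shuffle reproducible.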
   - Split the data into three parts using `np.split()`. The split points are calculated to ensure 80% of the data goes to training, 10% to validation, and the remaining 10% to testing. This split ratio is a common practice in machine learning to ensure both robust training and accurate model evaluation.



In [8]:
# Importing necessary modules from the PyTorch library.
import torch

# Importing the BERT model for token classification from the transformers library.
from transformers import BertForTokenClassification

# Definition of the 'BertModel' class that extends the PyTorch Module class.
class BertModel(torch.nn.Module):
    """Thin wrapper around a pretrained BERT token-classification model.

    The wrapped model is created with one output class per entry of the
    module-level ``unique_labels`` collection.
    """

    def __init__(self):
        """Load 'bert-base-cased' with a classification head of ``len(unique_labels)`` classes."""
        super().__init__()

        # The model is instantiated with a pretrained 'bert-base-cased' model.
        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label=None):
        """Run the wrapped model and return its raw tuple output.

        Args:
            input_id: Token ids, forwarded as ``input_ids``.
            mask: Attention mask, forwarded as ``attention_mask``.
            label: Optional label ids, forwarded as ``labels``. Defaults to
                ``None`` so the model can be called for inference without it.

        Returns:
            The tuple produced by the wrapped model (``return_dict=False``).
            Callers in this notebook unpack it as ``(loss, logits)`` when
            labels are given and read the logits from index 0 when ``label``
            is ``None``.
        """
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output


### Detailed Explanation of Key Concepts and Code Components:

1. **Class Definition**:
   - `class BertModel(torch.nn.Module)`: This line defines a new class `BertModel` that inherits from `torch.nn.Module`. In PyTorch, any custom model should extend `torch.nn.Module` to leverage built-in functionalities like parameter management, gradients, etc.

2. **Constructor `__init__(self)`**:
   - `super(BertModel, self).__init__()`: This initializes the base class (`torch.nn.Module`). It's necessary to correctly set up the network in PyTorch.
   - `BertForTokenClassification.from_pretrained(...)`: This loads the pre-trained 'bert-base-cased' checkpoint (a set of model weights, not a dataset). The model is configured for token classification by specifying the number of unique labels it must predict. The encoder weights come from the checkpoint, while the final classification layer is newly initialized with `len(unique_labels)` outputs and must be trained on the task; this is why the training cell prints a warning that `classifier.weight` and `classifier.bias` were not initialized from the checkpoint.

3. **Forward Method `forward(self, input_id, mask, label)`**:
   - Defines the data flow through the model, which is crucial for PyTorch models. The `forward` method is where the actual model computation happens.
   - Takes three parameters: `input_id` (the tokenized text inputs), `mask` (the attention mask that indicates to the model which parts of the input are actual data vs padding), and `label` (the true labels for computing the loss during training).
   - `self.bert(...)`: The pre-loaded BERT model processes the inputs. Because `return_dict=False` is passed, it returns a plain tuple rather than an output object.
   - `return output`: The method returns that tuple unchanged. When labels are provided the tuple is unpacked as `(loss, logits)`, which is how the training loop uses it; when `label` is `None` no loss is computed and the logits are the first element of the tuple.

This class encapsulates the BERT model for token classification, making it ready to be trained or used for predictions in a PyTorch pipeline, typically involving text data where each token in the input text needs a corresponding label prediction (like in Named Entity Recognition tasks).

In [9]:
# Importing necessary PyTorch modules.
import torch
from torch.utils.data import DataLoader
from torch.optim import SGD
from tqdm import tqdm

# Constants for learning parameters.
LEARNING_RATE = 5e-3  # step size passed to the SGD optimizer
EPOCHS = 15           # number of full passes over the training data
BATCH_SIZE = 2        # sentences per batch for the training and validation loaders


def _batch_accuracy_sum(logits, labels):
    """Return the sum of per-sentence token accuracies for one batch.

    Positions whose label is -100 are excluded before comparing the argmax of
    the logits with the labels. The result is a plain Python float.
    """
    total = 0.0
    for i in range(logits.shape[0]):
        keep = labels[i] != -100
        predictions = logits[i][keep].argmax(dim=1)
        total += (predictions == labels[i][keep]).float().mean().item()
    return total


def train_loop(model, df_train, df_val):
    """Fine-tune ``model`` on ``df_train`` and print metrics for every epoch.

    Args:
        model: Module called as ``model(input_id, mask, label)`` and returning
            ``(loss, logits)``. It is moved to the GPU when one is available.
        df_train: DataFrame used to build the training ``DataSequence``.
        df_val: DataFrame used to build the validation ``DataSequence``.

    Returns:
        None. One summary line per epoch is printed (average loss and average
        per-sentence accuracy for both splits).

    Uses the module-level ``LEARNING_RATE``, ``EPOCHS`` and ``BATCH_SIZE``.
    """
    # Initializing datasets and dataloaders for both training and validation data.
    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)
    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    # Move the model to its device first, then hand its parameters to the optimizer.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    # Main training loop over the specified number of epochs.
    for epoch_num in range(EPOCHS):
        total_acc_train = 0.0
        total_loss_train = 0.0

        # Training mode (enables dropout).
        model.train()

        for train_data, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            # Each dataset item was tokenized with return_tensors="pt", so the
            # batched tensors carry an extra singleton dimension at position 1.
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            total_acc_train += _batch_accuracy_sum(logits, train_label)
            # The batch loss is counted once per sentence in the batch, so that
            # dividing by len(df_train) below gives a per-sentence average.
            total_loss_train += loss.item() * logits.shape[0]

            loss.backward()
            optimizer.step()

        # Evaluation mode (disables dropout).
        model.eval()

        total_acc_val = 0.0
        total_loss_val = 0.0

        # No gradients are needed for validation; skipping them saves memory and time.
        with torch.no_grad():
            for val_data, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_data['attention_mask'].squeeze(1).to(device)
                input_id = val_data['input_ids'].squeeze(1).to(device)

                loss, logits = model(input_id, mask, val_label)

                total_acc_val += _batch_accuracy_sum(logits, val_label)
                total_loss_val += loss.item() * logits.shape[0]

        # Calculating average accuracy and loss over all validation data.
        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        # Printing epoch-wise metrics.
        print(f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {val_loss: .3f} | Val_Accuracy: {val_accuracy: .3f}')

# Initializing the model and starting the training process.
model = BertModel()
train_loop(model, df_train, df_val)


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()
  self.pid = os.fork()
100%|██████████| 800/800 [02:19<00:00,  5.74it/s]


Epochs: 1 | Loss:  0.533 | Accuracy:  0.868 | Val_Loss:  0.397 | Accuracy:  0.887


100%|██████████| 800/800 [02:26<00:00,  5.46it/s]


Epochs: 2 | Loss:  0.380 | Accuracy:  0.895 | Val_Loss:  0.334 | Accuracy:  0.905


100%|██████████| 800/800 [02:26<00:00,  5.44it/s]


Epochs: 3 | Loss:  0.324 | Accuracy:  0.909 | Val_Loss:  0.313 | Accuracy:  0.912


100%|██████████| 800/800 [02:26<00:00,  5.45it/s]


Epochs: 4 | Loss:  0.289 | Accuracy:  0.917 | Val_Loss:  0.311 | Accuracy:  0.914


100%|██████████| 800/800 [02:27<00:00,  5.44it/s]


Epochs: 5 | Loss:  0.256 | Accuracy:  0.926 | Val_Loss:  0.260 | Accuracy:  0.925


100%|██████████| 800/800 [02:28<00:00,  5.40it/s]


Epochs: 6 | Loss:  0.218 | Accuracy:  0.935 | Val_Loss:  0.248 | Accuracy:  0.928


100%|██████████| 800/800 [02:27<00:00,  5.42it/s]


Epochs: 7 | Loss:  0.198 | Accuracy:  0.940 | Val_Loss:  0.242 | Accuracy:  0.930


100%|██████████| 800/800 [02:27<00:00,  5.43it/s]


Epochs: 8 | Loss:  0.181 | Accuracy:  0.944 | Val_Loss:  0.234 | Accuracy:  0.931


100%|██████████| 800/800 [02:27<00:00,  5.43it/s]


Epochs: 9 | Loss:  0.159 | Accuracy:  0.951 | Val_Loss:  0.238 | Accuracy:  0.929


100%|██████████| 800/800 [02:27<00:00,  5.44it/s]


Epochs: 10 | Loss:  0.147 | Accuracy:  0.954 | Val_Loss:  0.234 | Accuracy:  0.931


100%|██████████| 800/800 [02:26<00:00,  5.45it/s]


Epochs: 11 | Loss:  0.131 | Accuracy:  0.959 | Val_Loss:  0.236 | Accuracy:  0.931


100%|██████████| 800/800 [02:27<00:00,  5.44it/s]


Epochs: 12 | Loss:  0.121 | Accuracy:  0.962 | Val_Loss:  0.242 | Accuracy:  0.933


100%|██████████| 800/800 [02:26<00:00,  5.45it/s]


Epochs: 13 | Loss:  0.111 | Accuracy:  0.963 | Val_Loss:  0.243 | Accuracy:  0.938


100%|██████████| 800/800 [02:26<00:00,  5.45it/s]


Epochs: 14 | Loss:  0.096 | Accuracy:  0.969 | Val_Loss:  0.246 | Accuracy:  0.937


100%|██████████| 800/800 [02:26<00:00,  5.44it/s]


Epochs: 15 | Loss:  0.090 | Accuracy:  0.970 | Val_Loss:  0.239 | Accuracy:  0.935


### Key Concepts and Code Components:

1. **Data Loading and Dataloaders**:
   - Datasets are created for both training and validation using the `DataSequence` class defined previously. This class handles data preprocessing like tokenization.
   - Dataloaders manage batching, shuffling (for training), and parallel data loading using multiple worker processes.

2. **Model and Device Setup**:
   - The device is set to use CUDA (GPU) if available, which significantly speeds up training by allowing tensor computations on the GPU.
   - The model is also moved to the appropriate device.

3. **Training Process**:
   - The training loop includes both forward and backward passes. Gradients are computed and used to update the model parameters.
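   - Accuracy and loss are tracked throughout training and validation. Training involves a backward pass and an optimizer step, whereas validation only runs the forward pass. Note that `eval` mode changes layer behaviour (for example, it disables dropout); it does not by itself switch off gradient tracking, which is what `torch.no_grad()` is for.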

4. **Epoch and Batch Processing**:
   - Each epoch involves going through the entire training dataset once. Performance metrics are calculated at the end of each epoch to monitor progress and make adjustments if needed.

This setup is typical for fine-tuning BERT for tasks like token classification, leveraging transfer learning to adapt a pre-trained model to a more specific task or dataset.

In [10]:
# Evaluate model

def evaluate(model, df_test):
    """Print the average per-sentence token accuracy of ``model`` on ``df_test``.

    Args:
        model: Module called as ``model(input_id, mask, label)`` and returning
            ``(loss, logits)``. It is moved to the GPU when one is available
            and switched to evaluation mode.
        df_test: DataFrame used to build the test ``DataSequence``.

    Returns:
        None. The accuracy is printed.
    """
    test_dataset = DataSequence(df_test)
    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Set evaluation mode explicitly instead of relying on whatever mode the
    # last caller left the model in.
    model.eval()

    total_acc_test = 0.0

    # Inference only: no gradients are required.
    with torch.no_grad():
        for test_data, test_label in test_dataloader:
            test_label = test_label.to(device)
            # Each dataset item was tokenized with return_tensors="pt", so the
            # batched tensors carry an extra singleton dimension at position 1.
            mask = test_data['attention_mask'].squeeze(1).to(device)
            input_id = test_data['input_ids'].squeeze(1).to(device)

            _, logits = model(input_id, mask, test_label)

            for i in range(logits.shape[0]):
                # Score only positions with a real label (-100 marks ignored positions).
                keep = test_label[i] != -100
                predictions = logits[i][keep].argmax(dim=1)
                total_acc_test += (predictions == test_label[i][keep]).float().mean().item()

    print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')


evaluate(model, df_test)

Test Accuracy:  0.945


In [11]:
# Predicting a sentence

def align_word_ids(texts):
    """Build a per-token selection mask for a sentence at inference time.

    Args:
        texts: The sentence to tokenize (passed to the global ``tokenizer``).

    Returns:
        A list with one entry per token position (the tokenizer pads and
        truncates to 512 positions): 1 where a prediction should be read,
        -100 elsewhere. Special/padding tokens are always -100; continuation
        sub-tokens are -100 unless the global ``label_all_tokens`` is True.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    # One entry per token: the index of the word it came from, or None for special tokens.
    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            # First sub-token of a word (or any sub-token when label_all_tokens is set).
            label_ids.append(1)
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


def evaluate_one_text(model, sentence):
    """Print the predicted tag for each selected token of ``sentence``.

    Args:
        model: Module called as ``model(input_id, mask, None)`` whose output
            holds the logits at index 0. It is moved to the GPU when one is
            available and switched to evaluation mode.
        sentence: The text to tag.

    Returns:
        None. The sentence and the list of predicted labels are printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Set evaluation mode explicitly instead of relying on the caller.
    model.eval()

    text = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    # Integer tensor of shape (1, sequence length) holding 1 / -100 markers.
    label_ids = torch.tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    # Inference only: no gradients are required.
    with torch.no_grad():
        logits = model(input_id, mask, None)[0]

    # Keep only the positions selected by align_word_ids.
    logits_clean = logits[label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_label = [ids_to_labels[i] for i in predictions]
    print(sentence)
    print(prediction_label)

evaluate_one_text(model, 'Bill Gates is the founder of Microsoft')

Bill Gates is the founder of Microsoft
['B-per', 'I-per', 'O', 'O', 'O', 'O', 'B-org']
