# Data Pre-Processing techniques


## Install Dependencies


In [None]:
%pip install datasets
%pip install transformers
%pip install spacy


BELOW TAKES FOREVER!

In [None]:
%pip install spacy-transformers
%pip install transformers[torch]
%pip install seqeval

In [3]:
import torch
import torchtext

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


In [4]:
from datasets import load_dataset, load_metric
dataset = load_dataset("surrey-nlp/PLOD-CW")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.37k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1072 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/153 [00:00<?, ? examples/s]

## Load BERT Model from transformers library

In [62]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
training = dataset["train"]
valid = dataset["validation"]
test = dataset["test"]

training_tokens = training["tokens"]
training_labels = training["ner_tags"]
valid_tokens = valid["tokens"]
valid_labels = valid["ner_tags"]
test_tokens = test["tokens"]
test_labels = test["ner_tags"]



Function for forming ngrams out of tokens and labels


In [64]:
def n_grams(tokens, labels, n):
  all_ngrams = []
  all_labels = []
  for i in range(0,len(tokens)):
    for j in range(len(tokens[i]) - n +1):
      all_ngrams.append(tokens[i][j:j+n])
      all_labels.append(labels[i][j:j+n])
  return all_ngrams, all_labels



Form n-grams for the different dataset splits

In [65]:
training_ngrams, training_ner = n_grams(training_tokens, training_labels, 4)
valid_ngrams, valid_ner = n_grams(valid_tokens, valid_labels, 4)
test_ngrams, test_ner = n_grams(test_tokens, test_labels, 4)
len(training_ngrams), len(valid_ngrams), len(test_ngrams)



(36789, 4622, 4542)

# Need to convert labels to class indexes so that the model can understand them

In [66]:
def conv_label_indexes(training, valid, test ):
	label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}

	label_list = []
	for sample in training:
		label_list.append([label_encoding[tag] for tag in sample])

	val_label_list = []
	for sample in valid:
		val_label_list.append([label_encoding[tag] for tag in sample])

	test_label_list = []
	for sample in test:
		test_label_list.append([label_encoding[tag] for tag in sample])
	return label_list, val_label_list, test_label_list





In [67]:
label_list, val_label_list, test_label_list = conv_label_indexes(training_ner, valid_ner, test_ner)

In [68]:
def tokenize_and_align_labels(ngrams, list_name):
    tokenized_inputs = tokenizer(ngrams, truncation=True, is_split_into_words=True) ## For some models, you may need to set max_length to approximately 500.

    labels = []
    for i, label in enumerate(list_name):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [69]:
tokenized_datasets = tokenize_and_align_labels(training_ngrams, label_list)
tokenized_val_datasets = tokenize_and_align_labels(valid_ngrams, val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test_ngrams, test_label_list)
# print(tokenized_datasets)

In [70]:
# BERT's tokenizer returns the dataset in the form of a dictionary of lists (sentences).
# we have to convert it into a list of dictionaries for training.
def turn_dict_to_list_of_dict(d):
    new_list = []

    for labels, inputs in zip(d["labels"], d["input_ids"]):
        entry = {"input_ids": inputs, "labels": labels}
        new_list.append(entry)

    return new_list

In [71]:
tokenised_train = turn_dict_to_list_of_dict(tokenized_datasets)
tokenised_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenised_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

In [72]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [73]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [74]:
# Prepare the test data for evaluation in the same format as the training data
def calculate_results(trainer, data):
	predictions, labels, _ = trainer.predict(data)
	predictions = np.argmax(predictions, axis=2)

	# Remove the predictions for the [CLS] and [SEP] tokens
	true_predictions = [
			[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
			for prediction, label in zip(predictions, labels)
	]
	true_labels = [
			[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
			for prediction, label in zip(predictions, labels)
	]

	# Compute multiple metrics on the test restuls
	results = metric.compute(predictions=true_predictions, references=true_labels)
	return results

In [77]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 32
learning_rate = 2e-5

args = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 7000,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [78]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=6900, training_loss=0.12059809021327807, metrics={'train_runtime': 771.1839, 'train_samples_per_second': 286.227, 'train_steps_per_second': 8.947, 'total_flos': 1506753654197352.0, 'train_loss': 0.12059809021327807, 'epoch': 6.0})

![image.png](attachment:image.png)

In [82]:
n_gram_results = calculate_results(trainer, tokenised_test)
n_gram_results



{'0, 0, 0, 0]': {'precision': 0.7668170229456757,
  'recall': 0.7807507987220448,
  'f1': 0.7737211833382804,
  'number': 5008},
 '0, 0, 0, 2]': {'precision': 0.6972222222222222,
  'recall': 0.7707267144319345,
  'f1': 0.7321341759844433,
  'number': 977},
 '0, 0, 2, 3]': {'precision': 0.44101633393829404,
  'recall': 0.4576271186440678,
  'f1': 0.4491682070240296,
  'number': 531},
 '0, 2, 3, 3]': {'precision': 0.48233695652173914,
  'recall': 0.5652866242038217,
  'f1': 0.5205278592375367,
  'number': 628},
 'overall_precision': 0.7046611304580767,
 'overall_recall': 0.7364221724524076,
 'overall_f1': 0.7201916495550993,
 'overall_accuracy': 0.8671331549701584}

# Results of 2-gram converted dataset
From the above results we can see that the model performs decently, but not better than just using BERTs regular tokenizer on the regular dataset.
Additionally, by applying n-grams to the dataset, we increase the number of tokens in our dataset by multiple factors. This makes the training process much more computationally expensive. So worse performance for more computation is not a good idea

![alt text](test1.png)

# Analysis of use of n-grams for BERT model
Above we can see the results of using n-grams on the dataset before tokenizing and training the BERT model.
BERT uses the encoder architecture meaning that it looks at tokens in both directions (good at capturing context). For this reason, n-grams introduce a lot of unnecessary / redundant and repeated data.\
It is also likely that the n-grams introduced a lot of repeated data during BERT tokenization process.
The main observation to make is the amount of tokens we end up with as a result of n-grams (much larger than without). This makes model training expensive without actually providing any improvement.



# Data Pre-Processing techniques #2: Removing stopwords
The 2nd approach I am going to take is to remove stopwords from the dataset (both tokens and their associated labels)

In [43]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [44]:
import nltk
from nltk.corpus import stopwords

def remove_stopwords(tokens, labels):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = []
    filtered_labels = []

    for sublist_tokens, sublist_labels in zip(tokens, labels):
        filtered_tokens_sublist = []
        filtered_labels_sublist = []

        for token, label in zip(sublist_tokens, sublist_labels):
            if token.lower() not in stop_words:
                filtered_tokens_sublist.append(token)
                filtered_labels_sublist.append(label)

        filtered_tokens.append(filtered_tokens_sublist)
        filtered_labels.append(filtered_labels_sublist)

    return filtered_tokens, filtered_labels


In [45]:
tokencount = sum(len(subarray) for subarray in training_tokens)
labelcount = sum(len(subarray) for subarray in training_labels)
tokencount, labelcount


(40000, 40000)

In [46]:
filtered_train_tokens, filtered_train_labels = remove_stopwords(training_tokens, training_labels)
filtered_valid_tokens, filtered_valid_labels = remove_stopwords(valid_tokens, valid_labels)

print(sum(len(subarray)for subarray in filtered_train_tokens))
print(sum(len(subarray) for subarray in filtered_train_labels))


print(sum(len(subarray) for subarray in filtered_valid_tokens))
print(sum(len(subarray) for subarray in filtered_valid_labels))

31497
31497
3834
3834


In [47]:

label_list, val_label_list, test_label_list = conv_label_indexes(filtered_train_labels, filtered_valid_labels, test_labels)

tokenized_datasets = tokenize_and_align_labels(filtered_train_tokens, label_list)

tokenized_val_datasets = tokenize_and_align_labels(filtered_valid_tokens, filtered_valid_labels)
tokenized_test_datasets = tokenize_and_align_labels(test_tokens, test_label_list)

tokenised_train = turn_dict_to_list_of_dict(tokenized_datasets)
tokenised_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenised_test = turn_dict_to_list_of_dict(tokenized_test_datasets)



In [48]:
print(label_list[5])
print(filtered_train_tokens[5])


[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]
['importance', 'formation', 'PFN1', '-', 'actin', 'complexes', 'regulation', 'PKC', 'corroborated', 'overexpression', '-θPFN1-', 'actin', '-', 'binding', 'defective', 'mutants', 'β', '-', 'actin', '(', 'C374S', ')', 'PFN1', '(', 'H119E', ')', ',', 'respectively', ',', 'reduced', 'coalescence', 'PKC', '-θc-SMAC', '.']


# Fine tune BERT with these tokens and labels, then evaluate against the test set

In [49]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model2 = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args2 = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 7000,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer2 = Trainer(
    model2,
    args2,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [51]:
trainer2.train()


Step,Training Loss,Validation Loss


TrainOutput(global_step=1608, training_loss=0.19892782892160749, metrics={'train_runtime': 154.3124, 'train_samples_per_second': 41.682, 'train_steps_per_second': 10.42, 'total_flos': 241438486044480.0, 'train_loss': 0.19892782892160749, 'epoch': 6.0})

In [52]:
results = calculate_results(trainer2, tokenised_test)
results



{'0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.7157894736842105,
  'recall': 0.7640449438202247,
  'f1': 0.7391304347826086,
  'number': 267},
 '0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.6645962732919255,
  'recall': 0.7181208053691275,
  'f1': 0.6903225806451613,
  'number': 149},
 '0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0]': {'precision': 0.655536028119508,
  'recall': 0.6958955223880597,
  'f1': 0.6751131221719456,
  'number': 536},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.6597222222222222,
  'recall': 0.7364341085271318,
  'f1': 0.6959706959706959,
  'number': 129},
 'overall_precision': 0.6721311475409836,
 'overall_recall': 0.7206290471785384,
 'overall_f1': 0.6955357142857144,
 'overall_accuracy': 0.9275296947827394}

# Results of stop-word removal
With removal of stopwords, the time to train has not changed much, but the F1 score has gotten considerably worse with a score of 0.7.
The main reason for this is likely a combination of two things


1.   Stop words make up a considerable amount of many texts, so  being able to correctly 'tag' them is important. By removing them from our training data we have essentially stopped our model from 'learning' about stop words which is quite important given the task at hand.
2.   By removing stop words, we remove some of the context from the text. This will have an effect on our models ability to 'fine-tune' itself accurately to our training data



# Lemmatization / Stemming
The final pre-processing technique I wish to apply is lemmatization

In [53]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [54]:
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("rocks"))

rock


# Lemmatize the tokens


In [55]:
def lemmatize_tokens(tokens):
  filtered_tokens = []
  for token in tokens:
    sublist = []
    for i in token:
      sublist.append(lemmatizer.lemmatize(i))

    filtered_tokens.append(sublist)
  return filtered_tokens



In [56]:
training = dataset["train"]
valid = dataset["validation"]
test = dataset["test"]

training_tokens = training["tokens"]
training_labels = training["ner_tags"]
valid_tokens = valid["tokens"]
valid_labels = valid["ner_tags"]
test_tokens = test["tokens"]
test_labels = test["ner_tags"]




In [57]:
lemmatized_train = lemmatize_tokens(training_tokens)
lemmatized_valid = lemmatize_tokens(valid_tokens)

label_list, val_label_list, test_label_list = conv_label_indexes(training_labels, valid_labels, test_labels)


tokenized_datasets = tokenize_and_align_labels(lemmatized_train, label_list)
tokenized_val_datasets = tokenize_and_align_labels(lemmatized_valid, val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test_tokens, test_label_list)

tokenised_train = turn_dict_to_list_of_dict(tokenized_datasets)
tokenised_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenised_test = turn_dict_to_list_of_dict(tokenized_test_datasets)



In [58]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model3 = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
model_name = "bert-base-uncased"
epochs = 6
batch_size = 4
learning_rate = 2e-5

args3 = TrainingArguments(
    f"BERT-finetuned-NER",
    # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
    evaluation_strategy ='steps',
    eval_steps = 7000,
    save_total_limit = 3,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.001,
    save_steps=35000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True
)

trainer3 = Trainer(
    model3,
    args3,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    data_collator = data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [60]:
trainer3.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=1608, training_loss=0.17135502568524869, metrics={'train_runtime': 165.3449, 'train_samples_per_second': 38.901, 'train_steps_per_second': 9.725, 'total_flos': 277751226297792.0, 'train_loss': 0.17135502568524869, 'epoch': 6.0})

In [61]:
results = calculate_results(trainer3, tokenised_test)
results



{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.6756756756756757,
  'recall': 0.7490636704119851,
  'f1': 0.7104795737122558,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.6523972602739726,
  'recall': 0.710820895522388,
  'f1': 0.6803571428571429,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.6748466257668712,
  'recall': 0.738255033557047,
  'f1': 0.7051282051282051,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.6554054054054054,
  'recall': 0.751937984496124,
  'f1': 0.700361010830325,
  'number': 129},
 'overall_precision': 0.6616288832913518,
 'overall_recall': 0.72895467160037,
 'overall_f1': 0.6936619718309859,
 'overall_accuracy': 0.9236205081942565}