# Finetuning BLOOM for NER: Preprocess Corpus
 

## Imports

In [1]:
from transformers import (BloomTokenizerFast,
                          BloomForTokenClassification,
                          DataCollatorForTokenClassification, 
                          AutoModelForTokenClassification, 
                          TrainingArguments, Trainer)
from datasets import load_dataset, concatenate_datasets, DatasetDict
import pickle
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


## Load Tokenizer

The available BLOOM checkpoints are listed in the Transformers documentation: https://huggingface.co/docs/transformers/model_doc/bloom

In [2]:
# Checkpoint to use; it is resolved under the "bigscience/" namespace below.
model_name = "bloom-560m"

# add_prefix_space=True is presumably needed because the samples are fed in
# pre-split into words (is_split_into_words=True) further down -- TODO confirm.
# The model itself (BloomForTokenClassification) is not loaded in this cell.
tokenizer = BloomTokenizerFast.from_pretrained(
    f"bigscience/{model_name}",
    add_prefix_space=True,
)

## Load Dataset

In [3]:
# Read the pickled corpus from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
data_path = "./data/dataset_multilingual.pkl"
with open(data_path, mode="rb") as fh:
    dataset = pickle.load(fh)

## Tokenize Dataset

### Tokenize a Single Sample:

In [4]:
# Pick one training sample and look at the sub-word tokens the tokenizer
# produces for its already word-split text.
example = dataset["train"][50]
tokenized_input = tokenizer(
    example["tokens"],
    is_split_into_words=True,
)
tokens = tokenizer.convert_ids_to_tokens(
    tokenized_input["input_ids"]
)
print(tokens)

['List', 'of', 'Fly', 'wheel', ',', 'Sh', 'yster', ',', 'and', 'Fly', 'wheel', '(', '1990', 'radio', 'series', ')', 'epis', 'odes']


Sample after Tokenization:

In [5]:
# Display the encoding returned by the tokenizer for the sample above
# (the input ids and the attention mask).
tokenized_input

{'input_ids': [4378, 3825, 141473, 212546, 15, 8027, 182848, 15, 392, 141473, 212546, 11, 50539, 57113, 79266, 12, 132129, 8694], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Word IDs:

In [6]:
# For every sub-word token, show the index of the original word it came from;
# repeated indices mean a word was split into several tokens.
tokenized_input.word_ids()

[0, 1, 2, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 11, 12, 13, 13]

### Tokenize Whole Dataset

In [7]:
def tokenizeInputs(inputs, max_length=2048, ignore_index=-100):
    """Tokenize one pre-split sample and align its NER tags with the sub-word tokens.

    The function works on a single sample (not a batch) and relies on the
    notebook-level ``tokenizer``.

    Args:
        inputs: Mapping with a ``"tokens"`` entry (list of words) and a
            ``"ner_tags"`` entry (one tag id per word).
        max_length: Maximum number of sub-word tokens to keep; longer samples
            are truncated.
        ignore_index: Label given to tokens that do not belong to any word
            (``word_ids()`` reports ``None`` for them, e.g. special tokens).
            -100 is the value ignored by PyTorch's cross-entropy loss and used
            as padding label by ``DataCollatorForTokenClassification``.

    Returns:
        The tokenizer output, extended with a ``"labels"`` entry that holds one
        label per sub-word token.
    """
    tokenized_inputs = tokenizer(
        inputs["tokens"],
        max_length=max_length,
        truncation=True,
        is_split_into_words=True,
    )
    ner_tags = inputs["ner_tags"]

    # Every sub-word token inherits the tag of the word it was split from.
    # Tokens without a source word must not index into ner_tags (that would
    # raise a TypeError), so they receive the ignore label instead.
    tokenized_inputs["labels"] = [
        ignore_index if word_id is None else ner_tags[word_id]
        for word_id in tokenized_inputs.word_ids()
    ]

    return tokenized_inputs

In [8]:
# Try the helper on a single training sample before mapping it over the
# whole corpus; the cell output shows the resulting ids, mask and labels.
train_split = dataset["train"]
example = train_split[100]
tokenizeInputs(example)

{'input_ids': [4378, 3825, 152605, 265, 177941], 'attention_mask': [1, 1, 1, 1, 1], 'labels': [3, 4, 4, 4, 4]}

In [9]:
# Run the helper over the dataset. No batching option is passed, so the
# helper is called with one sample at a time.
tokenized_dataset = dataset.map(function=tokenizeInputs)

Loading cached processed dataset at /home/julian/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e/cache-1dd4ec44a00403f4.arrow
Loading cached processed dataset at /home/julian/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e/cache-383eff4844864629.arrow
Loading cached processed dataset at /home/julian/.cache/huggingface/datasets/wikiann/en/1.1.0/4bfd4fe4468ab78bb6e096968f61fab7a888f44f9d3371c2f3fea7e74a5a354e/cache-939314c6a18cfc0a.arrow


**Shuffle Dataset:**

In [10]:
# Shuffle with a fixed seed so that a fresh "Restart & Run All" yields the
# same sample order (an unseeded shuffle differs on every run).
# Note: re-running this cell alone shuffles the already shuffled data again.
tokenized_dataset = tokenized_dataset.shuffle(seed=42)

**Count of Tokens in the Training Set:**

In [11]:
# Each sample carries one label per sub-word token, so summing the label
# list lengths gives the number of tokens in the training split.
token_count = sum(
    len(sample["labels"]) for sample in tokenized_dataset["train"]
)

print("Tokens in Training Set:", token_count)

Tokens in Training Set: 1486538


**Remove unnecessary columns:**

In [12]:
# Drop the listed columns from the tokenized dataset.
# Re-running this cell fails because the columns are already gone.
tokenized_dataset = tokenized_dataset.remove_columns(
    column_names=["tokens", "ner_tags", "langs", "spans"],
)

**Save processed Dataset:**

In [13]:
# Persist the processed corpus. The object written must be `tokenized_dataset`
# (tokenized, shuffled, columns removed); dumping `dataset` would store the
# untouched raw corpus under the "processed" file name.
data_path = "./data/dataset_processed.pkl"
with open(data_path, 'wb') as pickle_file:
    pickle.dump(obj=tokenized_dataset, file=pickle_file)