In [9]:
!nvidia-smi

Mon Mar 18 19:17:47 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10                     Off |   00000000:00:04.0 Off |                    0 |
|  0%   30C    P8             15W /  150W |       0MiB /  23028MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# 1 - Create a Hugging Face dataset from exported OCI Data Labeling CoNLL Dataset

To create a Hugging Face dataset with the format you described, you'll need to go through several steps, assuming you already have your dataset in the CoNLL format as shown. Here's a high-level overview of what needs to be done, including code snippets to guide you through the process:

Parse the CoNLL File: Extract the tokens and their corresponding NER tags from your CoNLL file.

Extract Unique NER Tags: Dynamically create a list of unique NER tags from your dataset, ensuring 'O' is the first tag in the list.

Prepare the Data: Organize your data into a structure that can be consumed by the Hugging Face datasets library, with each instance having an 'id', 'tokens', and 'ner_tags'.

Create a Hugging Face Dataset: Use the datasets library to create a DatasetDict with your data.

Here's how you can do it:

## Step 1: Parse the CoNLL File
You need to read your .conll file and extract the tokens and their NER tags. This function reads your .conll file and extracts sentences and their NER tags.



In [1]:
import re

def split_token(token, tag):
    """
    Splits tokens if they end with specific punctuation characters (.,;!?) and assigns
    'O' to the punctuation, leaving other tokens intact.
    """
    # Define the punctuations to split
    punctuations = ".,;!?"
    # Check if the token ends with a punctuation that should be split
    if token[-1] in punctuations:
        # Return the token without the last character and the punctuation as separate tokens
        return [(token[:-1], tag), (token[-1], 'O')]
    else:
        # Return the token as is if it doesn't end with specified punctuation
        return [(token, tag)]


def parse_conll_file(file_path):
    sentences = []
    current_sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith("-DOCSTART-") or line.strip() == "":
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
            else:
                parts = line.strip().split()
                token = parts[0]
                tag = parts[-1] if len(parts) > 1 else 'O'  # Default to 'O' if no tag is present
                # Split token if it contains punctuation
                current_sentence.extend(split_token(token, tag))
        if current_sentence:  # Add the last sentence if it exists
            sentences.append(current_sentence)
    return sentences


## Step 2: Extract Unique NER Tags and Prepare Data
This function extracts unique NER tags ensuring 'O' is first, and prepares the data for the dataset creation.


In [2]:
def prepare_dataset(sentences):
    """
    excluded_tags = {'B-BITCOINADDRESS', 'B-IP', 'B-MAC', 'B-PHONEIMEI', 'B-URL',
                     'I-BITCOINADDRESS', 'I-IP', 'I-MAC', 'I-PHONEIMEI', 'I-URL','B-CITY', 'B-COMPANYNAME',
                     'B-CURRENCYNAME', 'B-JOBAREA', 'B-JOBTYPE',
                      'B-PREFIX', 'B-SECONDARYADDRESS', 'B-STATE',  'B-TIME', 'B-USERNAME'
                     , 'I-COMPANYNAME', 'I-CURRENCYNAME', 'I-GENDER', 'I-SECONDARYADDRESS', 'I-STATE', 'I-TIME'}
    """

    unique_tags = set()
    for sentence in sentences:
        for _, tag in sentence:
            #if tag not in excluded_tags:
            unique_tags.add(tag)

    # Ensure 'O' is first, then sort the rest of the tags
    unique_tags.discard('O')  # Remove 'O' to avoid duplication
    unique_tags = ['O'] + sorted(unique_tags)  # Prepend 'O' and sort the rest

    tag_to_id = {tag: id for id, tag in enumerate(unique_tags)}

    # Prepare data for Hugging Face Dataset
    data = {'id': [], 'tokens': [], 'ner_tags': []}
    for i, sentence in enumerate(sentences):
        tokens, tags = zip(*sentence)
        data['id'].append(str(i))
        data['tokens'].append(list(tokens))
        data['ner_tags'].append([tag_to_id.get(tag, tag_to_id['O']) for tag in tags ]) #if tag not in excluded_tags

    return data, unique_tags


## Step 3: Create a Hugging Face Dataset
This function creates the dataset using the prepared data and unique NER tags.



In [3]:
from datasets import Dataset, DatasetDict, Features, ClassLabel, Sequence, Value

def create_hf_dataset(data, unique_tags):
    features= Features({
                'id': Value(dtype='string', id=None),
                'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
                'ner_tags': Sequence(feature=ClassLabel(num_classes=len(unique_tags), names=unique_tags))
            })

    dataset = Dataset.from_dict(data, features=features)
    dataset_dict = DatasetDict({'train': dataset})
    return dataset_dict


## Putting It All Together
Finally, use these functions together to parse your .conll file, prepare the data, and create the dataset.
To put it all together, you'd call these functions in sequence, providing the path to your CoNLL file:



In [4]:
import ocifs
import tempfile
import os

# Initialize OCI File System
fs = ocifs.OCIFileSystem()

# Specify the object storage directory path
object_storage_path = "book_oci_nlp_labeling_bucket@yz2wwgkgt8eh/taln_pii_cs_ds_060324_3000_export/book_oci_nlp_labeling_ds_060324_3000_exportbook_oci_nlp_labeling_ds_060324_3000_1709776877546.conll"

# List files in the object storage directory
#files = fs.glob(object_storage_path)

# Create a temporary directory
temp_dir = tempfile.mkdtemp()
print(f"Temporary directory created at: {temp_dir}")

# Read from OCI Object Storage and write to the local temp directory

file_name = "taln_pii_case_study_ds_Conll_export_3000.conll" #os.path.basename(file_path)
local_file_path = os.path.join(temp_dir, file_name)
    
# Open the remote file and write its contents to a local file
with fs.open(object_storage_path, 'rb') as remote_file, open(local_file_path, 'wb') as local_file:
    local_file.write(remote_file.read())
    print(f"Written {file_name} to the temporary directory: {local_file_path}")


Temporary directory created at: /tmp/tmpzkffxlns
Written taln_pii_case_study_ds_Conll_export_3000.conll to the temporary directory: /tmp/tmpzkffxlns/taln_pii_case_study_ds_Conll_export_3000.conll


In [6]:
!ls /tmp/tmpzkffxlns

taln_pii_case_study_ds_Conll_export_3000.conll


In [7]:
# Update this to your file's path
file_path = local_file_path #"./datasets/taln_pii_case_study_ds_Conll_export_3000.conll"

# Parse the .conll file
sentences = parse_conll_file(file_path)
# Prepare the dataset and extract unique NER tags
data, unique_tags = prepare_dataset(sentences)
data
# Create the Hugging Face dataset
dataset_dict = create_hf_dataset(data, unique_tags)
dataset_dict

# Optionally, save the dataset to disk
dataset_dict.save_to_disk(temp_dir+"/datasets/from_oci_2_huggingface_test")

print("Dataset created successfully!")


Saving the dataset (0/1 shards):   0%|          | 0/1838 [00:00<?, ? examples/s]

Dataset created successfully!


In [8]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1838
    })
})

In [10]:
from datasets import DatasetDict

ds_train_devtest = dataset_dict['train'].train_test_split(test_size=0.25, seed=42)
ds_devtest = ds_train_devtest['test'].train_test_split(test_size=0.25, seed=42)


raw_datasets = DatasetDict({
    'train': ds_train_devtest['train'],
    'validation': ds_devtest['train'],
    'test': ds_devtest['test']
})

print("Before:\n", dataset_dict)
print("\nAfter\n", raw_datasets)

Before:
 DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1838
    })
})

After
 DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1378
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 345
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 115
    })
})


In [11]:
ner_feature = dataset_dict["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-AGE', 'B-CREDITCARDNUMBER', 'B-EMAIL', 'B-FIRSTNAME', 'B-LASTNAME', 'B-MIDDLENAME', 'B-PHONENUMBER', 'B-STREET', 'B-ZIPCODE', 'I-AGE', 'I-PHONENUMBER', 'I-STREET'], id=None), length=-1, id=None)

In [None]:
#from datasets import load_dataset
#raw_datasets2 = load_dataset("conll2003")


In [12]:
raw_datasets["train"][4]["tokens"]

['Cher',
 'Stanley',
 ',',
 'veuillez',
 'mettre',
 'à',
 'jour',
 'votre',
 'iPad',
 'pour',
 'continuer',
 'à',
 'utiliser',
 'nos',
 'ressources',
 'éducatives',
 '.',
 'Les',
 'meilleures',
 'performances',
 'sont',
 'garanties',
 'avec',
 'Mozilla/5.0',
 '(X11',
 ';',
 'Linux',
 'x86_64',
 'AppleWebKit/536.2.1',
 '(KHTML',
 ',',
 'like',
 'Gecko)',
 'Chrome/25.0.849.0',
 'Safari/536.2.1',
 'et',
 'versions',
 'ultérieures',
 '.']

In [13]:
raw_datasets["train"][4]["ner_tags"]

[0,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [14]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-AGE', 'B-CREDITCARDNUMBER', 'B-EMAIL', 'B-FIRSTNAME', 'B-LASTNAME', 'B-MIDDLENAME', 'B-PHONENUMBER', 'B-STREET', 'B-ZIPCODE', 'I-AGE', 'I-PHONENUMBER', 'I-STREET'], id=None), length=-1, id=None)

# Training NER Model
Script based on # Token classification (PyTorch)

https://huggingface.co/learn/nlp-course/en/chapter7/2

https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter7/section2_pt.ipynb#scrollTo=Ff9u5jFitaJQ


Preparing the data
First things first, we need a dataset suitable for token classification. In this section we will use the CoNLL-2003 dataset, which contains news stories from Reuters.

💡 As long as your dataset consists of texts split into words with their corresponding labels, you will be able to adapt the data processing procedures described here to your own dataset. Refer back to Chapter 5 if you need a refresher on how to load your own custom data in a Dataset.

The CoNLL-2003 dataset
To load the CoNLL-2003 dataset, we use the load_dataset() method from the 🤗 Datasets library:

In [15]:
label_names = ner_feature.feature.names
label_names

['O',
 'B-AGE',
 'B-CREDITCARDNUMBER',
 'B-EMAIL',
 'B-FIRSTNAME',
 'B-LASTNAME',
 'B-MIDDLENAME',
 'B-PHONENUMBER',
 'B-STREET',
 'B-ZIPCODE',
 'I-AGE',
 'I-PHONENUMBER',
 'I-STREET']

In [16]:
words = raw_datasets["train"][2]["tokens"]
labels = raw_datasets["train"][2]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(len(words))
print(line2)
print(len(labels))


Bonjour Mr . Wolff      . N'oubliez pas de confirmer votre présence à notre événement annuel de la journée des carrières . Nous sommes ravis d'accueillir Future Division Supervisor de Keeling , Huel and Auer qui s'adressera aux étudiants en Program . 
41
O       O  O B-LASTNAME O O         O   O  O         O     O        O O     O         O      O  O  O       O   O         O O    O      O     O            O      O        O          O  O       O O    O   O    O   O           O   O         O  O       O 
41


In [17]:
from transformers import AutoTokenizer

model_checkpoint = "almanach/camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

In [18]:
tokenizer.is_fast

True

In [19]:
inputs = tokenizer(raw_datasets["train"][4]["tokens"], is_split_into_words=True)
inputs.tokens()

['<s>',
 '▁Cher',
 '▁Stanley',
 '▁',
 ',',
 '▁veuillez',
 '▁mettre',
 '▁à',
 '▁jour',
 '▁votre',
 '▁iPad',
 '▁pour',
 '▁continuer',
 '▁à',
 '▁utiliser',
 '▁nos',
 '▁ressources',
 '▁éducative',
 's',
 '▁',
 '.',
 '▁Les',
 '▁meilleures',
 '▁performances',
 '▁sont',
 '▁garanties',
 '▁avec',
 '▁Mozilla',
 '/5',
 '.0',
 '▁(',
 'X',
 '11',
 '▁;',
 '▁Linux',
 '▁x',
 '86',
 '_',
 '64',
 '▁Apple',
 'Web',
 'K',
 'it',
 '/5',
 '36',
 '.',
 '2.',
 '1',
 '▁(',
 'K',
 'H',
 'TM',
 'L',
 '▁',
 ',',
 '▁like',
 '▁G',
 'eck',
 'o',
 ')',
 '▁Chrome',
 '/',
 '25',
 '.0',
 '.',
 '8',
 '49',
 '.0',
 '▁Safari',
 '/5',
 '36',
 '.',
 '2.',
 '1',
 '▁et',
 '▁versions',
 '▁ultérieure',
 's',
 '▁',
 '.',
 '</s>']

In [20]:
inputs.word_ids()

[None,
 0,
 1,
 2,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 15,
 16,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 23,
 23,
 24,
 24,
 24,
 25,
 26,
 27,
 27,
 27,
 27,
 28,
 28,
 28,
 28,
 28,
 28,
 28,
 28,
 28,
 29,
 29,
 29,
 29,
 29,
 30,
 30,
 31,
 32,
 32,
 32,
 32,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 33,
 34,
 34,
 34,
 34,
 34,
 34,
 35,
 36,
 37,
 37,
 38,
 38,
 None]

In [21]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [22]:
labels = raw_datasets["train"][4]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-100, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [23]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [24]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/1378 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Map:   0%|          | 0/115 [00:00<?, ? examples/s]

In [25]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [26]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    4,    4,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    4,    4,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0, 

In [27]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]
[-100, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]


In [28]:
%%capture
pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [29]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [30]:
labels = raw_datasets["train"][4]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['O',
 'B-FIRSTNAME',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [31]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'FIRSTNAME': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [32]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [33]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [34]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
 #   num_labels=len(label_names),
)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at almanach/camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
model.config.num_labels

13

### Important note for the book : 
we need to add this param : load_best_model_at_end
in order to have trainer.state.best_model_checkpoint

In [41]:
from transformers import TrainingArguments

args = TrainingArguments(
    temp_dir+"/models/taln_pii_ner2",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [42]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Detected kernel version 5.4.17, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.270172,0.748596,0.733048,0.740741,0.957309
2,No log,0.195411,0.770616,0.810629,0.790116,0.972032
3,0.252300,0.17126,0.784131,0.821014,0.802149,0.973838
4,0.252300,0.154127,0.814201,0.840562,0.827172,0.977396
5,0.252300,0.145721,0.846814,0.844227,0.845519,0.980516


Checkpoint destination directory /tmp/tmpzkffxlns/models/taln_pii_ner2/checkpoint-173 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /tmp/tmpzkffxlns/models/taln_pii_ner2/checkpoint-346 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /tmp/tmpzkffxlns/models/taln_pii_ner2/checkpoint-519 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /tmp/tmpzkffxlns/models/taln_pii_ner2/checkpoint-692 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory /tmp/tmpzkffxlns/models/taln_pii_ner2/checkpoint-865 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=865, training_loss=0.2052276082121568, metrics={'train_runtime': 211.9013, 'train_samples_per_second': 32.515, 'train_steps_per_second': 4.082, 'total_flos': 314452922926272.0, 'train_loss': 0.2052276082121568, 'epoch': 5.0})

In [44]:
best_ner_model_checkpoint = trainer.state.best_model_checkpoint
best_ner_model_checkpoint

'/tmp/tmpzkffxlns/models/taln_pii_ner2/checkpoint-865'

In [45]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = best_ner_model_checkpoint #"/models/taln_pii_ner2/checkpoint-865"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="first" #"simple"
)
token_classifier("Mon nom est Hicham, et pour l'anniversaire de 50ans et je vais payer avec une carte visa 4442223314488.")

[{'entity_group': 'FIRSTNAME',
  'score': 0.690727,
  'word': 'Hicham,',
  'start': 12,
  'end': 19},
 {'entity_group': 'CREDITCARDNUMBER',
  'score': 0.48762363,
  'word': '4442223314488.',
  'start': 89,
  'end': 103}]

#### After training copy NER dataset and model to Object Storage 
(share model between GPU notebook and CPU notebook)

In [47]:
import oci
import os

# Initialize OCI File System
#fs = ocifs.OCIFileSystem()

# Initialize OCI Object Storage Client with notebook session's resource principal
signer = oci.auth.signers.get_resource_principals_signer()
object_storage_client = oci.object_storage.ObjectStorageClient(config={}, signer=signer)


# Local directory containing the model files
local_model_dir = best_ner_model_checkpoint #"./taln_pii_cs_models_local/taln_pii_cs_model"

# OCI Object Storage path where the model will be copied
#object_storage_path = "taln_pii_cs_models@yz2wwgkgt8eh/taln_pii_cs_model_new/"
# Initialize Object Storage bucket infos
namespace = object_storage_client.get_namespace().data #"yz2wwgkgt8eh"
bucket_name = "book_oci_nlp_training_bucket" #"ner_ar_iob_bucket"
# Base folder in the bucket
object_storage_folder = "taln_pii_cs_model_trained-on-oci"


# List all files in the local directory
local_files = os.listdir(local_model_dir)

# Copy each file from the local directory to the OCI Object Storage
for file_name in local_files:
    local_file_path = os.path.join(local_model_dir, file_name)
    # Construct the full OCI Object Storage path for the file
    object_storage_path =  object_storage_folder +'/'+ file_name

    # Open the local file and read its content
    with open(local_file_path, 'rb') as f:
        #file_content = local_file.read()

        object_storage_client.put_object(namespace, 
                                         bucket_name, 
                                         object_storage_path, 
                                         f,
                                         #content_type='text/plain'
                                        )
        print(f"Copied {file_name} to OCI Object Storage: {object_storage_path}")
            
            
    # Write the content to the new location in OCI Object Storage
    #object_storage_client.put(oci_file_path, file_content)
    #print(f"Copied {file_name} to OCI Object Storage: {oci_file_path}")


Copied special_tokens_map.json to OCI Object Storage: taln_pii_cs_model_trained-on-oci/special_tokens_map.json
Copied tokenizer.json to OCI Object Storage: taln_pii_cs_model_trained-on-oci/tokenizer.json
Copied config.json to OCI Object Storage: taln_pii_cs_model_trained-on-oci/config.json
Copied trainer_state.json to OCI Object Storage: taln_pii_cs_model_trained-on-oci/trainer_state.json
Copied training_args.bin to OCI Object Storage: taln_pii_cs_model_trained-on-oci/training_args.bin
Copied model.safetensors to OCI Object Storage: taln_pii_cs_model_trained-on-oci/model.safetensors
Copied optimizer.pt to OCI Object Storage: taln_pii_cs_model_trained-on-oci/optimizer.pt
Copied scheduler.pt to OCI Object Storage: taln_pii_cs_model_trained-on-oci/scheduler.pt
Copied rng_state.pth to OCI Object Storage: taln_pii_cs_model_trained-on-oci/rng_state.pth
Copied sentencepiece.bpe.model to OCI Object Storage: taln_pii_cs_model_trained-on-oci/sentencepiece.bpe.model
Copied added_tokens.json to OC

In [48]:
temp_dir

'/tmp/tmpzkffxlns'

In [50]:
!ls /tmp/tmpzkffxlns/datasets

from_oci_2_huggingface_test


In [52]:
import os

object_storage_dataset_folder = "datasets/from_oci_2_huggingface_test"
local_dataset_dir = os.path.join(temp_dir, "datasets/from_oci_2_huggingface_test")

# Traverse the directory tree and upload all files
for root, dirs, files in os.walk(local_dataset_dir):
    for file_name in files:
        # Construct the full local path
        local_file_path = os.path.join(root, file_name)
        
        # Construct the relative path within the local directory structure
        relative_path = os.path.relpath(local_file_path, local_dataset_dir)
        
        # Construct the full OCI Object Storage path for the file
        oci_object_path = os.path.join(object_storage_dataset_folder, relative_path)

        # Upload the file to OCI Object Storage
        with open(local_file_path, 'rb') as f:
            object_storage_client.put_object(namespace, bucket_name, oci_object_path, f)
            print(f"Copied {file_name} to OCI Object Storage: {oci_object_path}")


Copied dataset_dict.json to OCI Object Storage: datasets/from_oci_2_huggingface_test/dataset_dict.json
Copied dataset_info.json to OCI Object Storage: datasets/from_oci_2_huggingface_test/train/dataset_info.json
Copied state.json to OCI Object Storage: datasets/from_oci_2_huggingface_test/train/state.json
Copied data-00000-of-00001.arrow to OCI Object Storage: datasets/from_oci_2_huggingface_test/train/data-00000-of-00001.arrow
