In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [4]:
import torch
from transformers import BertForTokenClassification, BertTokenizer,  BertForMaskedLM, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import DataCollatorForTokenClassification
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix
import numpy as np
import os
from transformers import BertTokenizerFast
import evaluate
import pandas as pd
import random

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


  return torch._C._cuda_getDeviceCount() > 0


In [4]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [5]:
dataset = load_dataset("wnut_17", trust_remote_code = True)

In [6]:
dataset['train'][0]['tokens']

['@paulwalk',
 'It',
 "'s",
 'the',
 'view',
 'from',
 'where',
 'I',
 "'m",
 'living',
 'for',
 'two',
 'weeks',
 '.',
 'Empire',
 'State',
 'Building',
 '=',
 'ESB',
 '.',
 'Pretty',
 'bad',
 'storm',
 'here',
 'last',
 'evening',
 '.']

In [7]:
dataset['train'][0]['ner_tags']

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 8,
 8,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [14]:
label_list = dataset["train"].features[f"ner_tags"].feature.names
num_labels = len(label_list)
label_list



['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [9]:
sample_data = dataset["train"].select(range(10)) 

df = pd.DataFrame({
    "tokens": [" ".join(example["tokens"]) for example in sample_data],
    "ner_tags": [example["ner_tags"] for example in sample_data]
})

df

Unnamed: 0,tokens,ner_tags
0,@paulwalk It 's the view from where I 'm livin...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, ..."
1,From Green Newsfeed : AHFA extends deadline fo...,"[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Pxleyes Top 50 Photography Contest Pictures of...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,today is my last day at the office .,"[0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"4Dbling 's place til monday , party party part...","[9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,watching the VMA pre-show again lol it was n't...,"[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,27 followers ! 30 followers is my goal for tod...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,"This is the 2nd hospital ive been in today , b...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,Friday Night Eats http://twitpic.com/2pdvtr,"[0, 0, 0, 0]"
9,Gotta dress up for london fashion week and par...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


B- indicates the beginning of an entity.
I- indicates a token is contained inside the same entity (for example, the State token is a part of an entity like Empire State Building).
0 indicates the token doesn’t correspond to any entity.

In [10]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

In [11]:
# example = dataset["train"][0]
# tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
# tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# tokens

In [19]:


# Define a function to tokenize and align the labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128
    )
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize dataset and align labels
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

# Load the model for token classification
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=num_labels)

# Load the accuracy  and f1 metric
metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

# Define the compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Convert predictions and labels from label indices to integers
    true_predictions_flat = [int(item) for sublist in true_predictions for item in sublist]
    true_labels_flat = [int(item) for sublist in true_labels for item in sublist]

    incorrect_examples = []
    for idx, (pred, label) in enumerate(zip(true_predictions, true_labels)):
        if pred != label:
            tokens = tokenized_datasets["validation"]["tokens"][idx]  # Original tokens for the example
            incorrect_examples.append({
                "tokens": tokens,
                "true_labels": label,
                "predicted_labels": pred
            })

    # Save incorrect examples to a CSV file if any errors are present
    if incorrect_examples:
        df_incorrect = pd.DataFrame(incorrect_examples)
        df_incorrect.to_csv("incorrect_predictions_trained.csv", index=False)

    accuracy = metric.compute(predictions=true_predictions_flat, references=true_labels_flat)
    f1 = f1_metric.compute(predictions=true_predictions_flat, references=true_labels_flat, average="weighted")["f1"]

    return {"accuracy": accuracy, "f1": f1}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)




Map: 100%|██████████| 3394/3394 [00:00<00:00, 16280.79 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [53]:
tokenized_datasets["train"]

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 3394
})

In [13]:
# Evaluate the model
trainer.evaluate()

100%|██████████| 127/127 [01:03<00:00,  1.99it/s]


{'eval_loss': 2.6555144786834717,
 'eval_model_preparation_time': 0.0015,
 'eval_accuracy': {'accuracy': 0.020275853301976735},
 'eval_f1': 0.02978602208907957,
 'eval_runtime': 64.2228,
 'eval_samples_per_second': 15.711,
 'eval_steps_per_second': 1.977}

In [14]:
trainer.train()

 33%|███▎      | 425/1275 [09:58<16:04,  1.14s/it]
 33%|███▎      | 425/1275 [10:50<16:04,  1.14s/it]

{'eval_loss': 0.22069907188415527, 'eval_model_preparation_time': 0.0015, 'eval_accuracy': {'accuracy': 0.9497235110913367}, 'eval_f1': 0.9369062659629562, 'eval_runtime': 51.7912, 'eval_samples_per_second': 19.482, 'eval_steps_per_second': 2.452, 'epoch': 1.0}


 39%|███▉      | 500/1275 [12:35<17:52,  1.38s/it]  

{'loss': 0.1921, 'grad_norm': 3.9897754192352295, 'learning_rate': 1.215686274509804e-05, 'epoch': 1.18}


 67%|██████▋   | 850/1275 [20:13<07:44,  1.09s/it]
 67%|██████▋   | 850/1275 [21:04<07:44,  1.09s/it]

{'eval_loss': 0.22599846124649048, 'eval_model_preparation_time': 0.0015, 'eval_accuracy': {'accuracy': 0.9529015445242484}, 'eval_f1': 0.9445694066365404, 'eval_runtime': 50.8077, 'eval_samples_per_second': 19.859, 'eval_steps_per_second': 2.5, 'epoch': 2.0}


 78%|███████▊  | 1000/1275 [24:32<06:14,  1.36s/it] 

{'loss': 0.0676, 'grad_norm': 2.1176671981811523, 'learning_rate': 4.313725490196079e-06, 'epoch': 2.35}


100%|██████████| 1275/1275 [30:52<00:00,  1.12s/it]
100%|██████████| 1275/1275 [31:45<00:00,  1.49s/it]

{'eval_loss': 0.24843436479568481, 'eval_model_preparation_time': 0.0015, 'eval_accuracy': {'accuracy': 0.954427000572046}, 'eval_f1': 0.9474851760387836, 'eval_runtime': 52.0289, 'eval_samples_per_second': 19.393, 'eval_steps_per_second': 2.441, 'epoch': 3.0}
{'train_runtime': 1905.083, 'train_samples_per_second': 5.345, 'train_steps_per_second': 0.669, 'train_loss': 0.11149864720363242, 'epoch': 3.0}





TrainOutput(global_step=1275, training_loss=0.11149864720363242, metrics={'train_runtime': 1905.083, 'train_samples_per_second': 5.345, 'train_steps_per_second': 0.669, 'total_flos': 665197041756672.0, 'train_loss': 0.11149864720363242, 'epoch': 3.0})

# AUG

# Augmentation

In [1]:
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
import random
import torch
from datasets import DatasetDict

# Load the tokenizer and Masked Language Model
tokenizer_mlm = BertTokenizer.from_pretrained("bert-base-cased")
mlm_model = BertForMaskedLM.from_pretrained("bert-base-cased")

# Define a data collator for language modeling to automatically mask tokens
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_mlm,
    mlm=True,
    mlm_probability=0.15
)

# Define a function to perform token masking and augmentation using MLM
def augment_data_with_mlm(examples):
    # Tokenize input tokens
    inputs = tokenizer_mlm(
        examples["tokens"],
        is_split_into_words=True,
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Prepare inputs for data collator
    batch_for_collator = [{"input_ids": input_ids} for input_ids in inputs["input_ids"]]
    
    # Mask tokens using the data collator
    collated_batch = data_collator(batch_for_collator)
    
    with torch.no_grad():
        # Get MLM predictions for masked tokens
        outputs = mlm_model(**collated_batch)
        predictions = torch.argmax(outputs.logits, dim=-1)

    # Replace masked tokens with predictions in input tokens
    augmented_tokens = []
    for idx, token_ids in enumerate(collated_batch["input_ids"]):
        tokens = tokenizer_mlm.convert_ids_to_tokens(token_ids)
        predicted_tokens = tokenizer_mlm.convert_ids_to_tokens(predictions[idx])
        augmented_example = [
            predicted_tokens[i] if token == tokenizer_mlm.mask_token else token
            for i, token in enumerate(tokens)
        ]
        augmented_tokens.append(augmented_example)

    # Reformat the augmented data to align with the original dataset structure
    augmented_examples = {"tokens": augmented_tokens, "ner_tags": examples["ner_tags"]}
    return augmented_examples

  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model

In [None]:
from datasets import concatenate_datasets

# Apply augmentation to the training dataset
augmented_dataset = dataset["train"].map(augment_data_with_mlm, batched=True)

# Combine original and augmented data using concatenate_datasets
train_dataset_combined = concatenate_datasets([dataset["train"], augmented_dataset])

# Wrap it back into a DatasetDict for compatibility with Trainer
combined_datasets = DatasetDict({"train": train_dataset_combined, "validation": dataset["validation"]})



In [15]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Define a function to tokenize and align the labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128
    )
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Ensure word_idx is within bounds of the label list
                if word_idx < len(label):
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [21]:
# Tokenize the combined dataset for NER training
tokenized_combined_datasets = combined_datasets.map(tokenize_and_align_labels, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Reinitialize the Trainer with the augmented dataset
trainer_augmented = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_combined_datasets["train"],
    eval_dataset=tokenized_combined_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the NER model on augmented data
trainer_augmented.train()


  trainer_augmented = Trainer(
 20%|█▉        | 500/2547 [11:01<45:56,  1.35s/it]

{'loss': 0.268, 'grad_norm': 2.4735350608825684, 'learning_rate': 1.607381232822929e-05, 'epoch': 0.59}


                                                  
 33%|███▎      | 849/2547 [20:12<33:17,  1.18s/it]

{'eval_loss': 0.2131391167640686, 'eval_accuracy': {'accuracy': 0.9525837411809572}, 'eval_f1': 0.9421620495851264, 'eval_runtime': 53.2406, 'eval_samples_per_second': 18.952, 'eval_steps_per_second': 2.385, 'epoch': 1.0}


 39%|███▉      | 1000/2547 [23:40<34:42,  1.35s/it] 

{'loss': 0.1778, 'grad_norm': 1.8735758066177368, 'learning_rate': 1.2147624656458579e-05, 'epoch': 1.18}


 59%|█████▉    | 1500/2547 [35:37<23:36,  1.35s/it]

{'loss': 0.1464, 'grad_norm': 4.771763801574707, 'learning_rate': 8.221436984687869e-06, 'epoch': 1.77}


                                                   
 67%|██████▋   | 1698/2547 [41:04<16:56,  1.20s/it]

{'eval_loss': 0.2352842390537262, 'eval_accuracy': {'accuracy': 0.953600711879489}, 'eval_f1': 0.9460097377846362, 'eval_runtime': 50.1798, 'eval_samples_per_second': 20.108, 'eval_steps_per_second': 2.531, 'epoch': 2.0}


 79%|███████▊  | 2000/2547 [47:51<12:26,  1.36s/it]  

{'loss': 0.1212, 'grad_norm': 1.446900486946106, 'learning_rate': 4.295249312917158e-06, 'epoch': 2.36}


 98%|█████████▊| 2500/2547 [59:22<01:03,  1.36s/it]

{'loss': 0.1062, 'grad_norm': 2.0176374912261963, 'learning_rate': 3.6906164114644683e-07, 'epoch': 2.94}


                                                     
100%|██████████| 2547/2547 [1:01:17<00:00,  1.44s/it]

{'eval_loss': 0.24378447234630585, 'eval_accuracy': {'accuracy': 0.9548083645839954}, 'eval_f1': 0.9485204388055916, 'eval_runtime': 50.5788, 'eval_samples_per_second': 19.949, 'eval_steps_per_second': 2.511, 'epoch': 3.0}
{'train_runtime': 3677.8938, 'train_samples_per_second': 5.537, 'train_steps_per_second': 0.693, 'train_loss': 0.16278490535315224, 'epoch': 3.0}





TrainOutput(global_step=2547, training_loss=0.16278490535315224, metrics={'train_runtime': 3677.8938, 'train_samples_per_second': 5.537, 'train_steps_per_second': 0.693, 'total_flos': 1330394083513344.0, 'train_loss': 0.16278490535315224, 'epoch': 3.0})