In [53]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [54]:
import torch
from transformers import BertForTokenClassification, BertTokenizer,  BertForMaskedLM, Trainer, TrainingArguments
from datasets import load_dataset
from transformers import DataCollatorForTokenClassification
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix
import numpy as np
import os
from transformers import BertTokenizerFast
import evaluate
import pandas as pd
import random

In [55]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


In [56]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [57]:
dataset = load_dataset("wnut_17", trust_remote_code = True)

In [58]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [59]:
dataset['train'][0]['tokens']

['@paulwalk',
 'It',
 "'s",
 'the',
 'view',
 'from',
 'where',
 'I',
 "'m",
 'living',
 'for',
 'two',
 'weeks',
 '.',
 'Empire',
 'State',
 'Building',
 '=',
 'ESB',
 '.',
 'Pretty',
 'bad',
 'storm',
 'here',
 'last',
 'evening',
 '.']

In [60]:
dataset['train'][0]['ner_tags']

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 7,
 8,
 8,
 0,
 7,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [61]:
dataset['train'][0]['id']

'0'

In [62]:
label_list = dataset["train"].features[f"ner_tags"].feature.names
num_labels = len(label_list)
label_list



['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

In [63]:
sample_data = dataset["train"].select(range(10)) 

df = pd.DataFrame({
    "tokens": [" ".join(example["tokens"]) for example in sample_data],
    "ner_tags": [example["ner_tags"] for example in sample_data]
})

df

Unnamed: 0,tokens,ner_tags
0,@paulwalk It 's the view from where I 'm livin...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, ..."
1,From Green Newsfeed : AHFA extends deadline fo...,"[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Pxleyes Top 50 Photography Contest Pictures of...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,today is my last day at the office .,"[0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"4Dbling 's place til monday , party party part...","[9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,watching the VMA pre-show again lol it was n't...,"[0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,27 followers ! 30 followers is my goal for tod...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,"This is the 2nd hospital ive been in today , b...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,Friday Night Eats http://twitpic.com/2pdvtr,"[0, 0, 0, 0]"
9,Gotta dress up for london fashion week and par...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


B- indicates the beginning of an entity.
I- indicates a token is contained inside the same entity (for example, the State token is a part of an entity like Empire State Building).
0 indicates the token doesn’t correspond to any entity.

In [64]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

In [65]:
# example = dataset["train"][0]
# tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
# tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# tokens

In [66]:


# Define a function to tokenize and align the labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128
    )
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs





In [67]:
# Tokenize dataset and align labels
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

In [68]:
# Load the model for token classification
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=num_labels)

# Load the accuracy  and f1 metric
metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

# Define the compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Convert predictions and labels from label indices to integers
    true_predictions_flat = [int(item) for sublist in true_predictions for item in sublist]
    true_labels_flat = [int(item) for sublist in true_labels for item in sublist]

    incorrect_examples = []
    for idx, (pred, label) in enumerate(zip(true_predictions, true_labels)):
        if pred != label:
            tokens = tokenized_datasets["validation"]["tokens"][idx]  # Original tokens for the example
            incorrect_examples.append({
                "tokens": tokens,
                "true_labels": label,
                "predicted_labels": pred
            })

    # Save incorrect examples to a CSV file if any errors are present
    if incorrect_examples:
        df_incorrect = pd.DataFrame(incorrect_examples)
        df_incorrect.to_csv("incorrect_predictions_trained.csv", index=False)

    accuracy = metric.compute(predictions=true_predictions_flat, references=true_labels_flat)
    f1 = f1_metric.compute(predictions=true_predictions_flat, references=true_labels_flat, average="weighted")["f1"]

    return {"accuracy": accuracy, "f1": f1}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)




Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [69]:
# Evaluate the model
trainer.evaluate()

100%|██████████| 127/127 [01:19<00:00,  1.61it/s]


{'eval_loss': 2.286158561706543,
 'eval_model_preparation_time': 0.0058,
 'eval_accuracy': {'accuracy': 0.3842878027076845},
 'eval_f1': 0.5238294506471016,
 'eval_runtime': 79.6158,
 'eval_samples_per_second': 12.673,
 'eval_steps_per_second': 1.595}

In [70]:
trainer.train()

 33%|███▎      | 425/1275 [12:27<17:42,  1.25s/it]
 33%|███▎      | 425/1275 [13:27<17:42,  1.25s/it]

{'eval_loss': 0.2393532544374466, 'eval_model_preparation_time': 0.0058, 'eval_accuracy': {'accuracy': 0.9480073730375643}, 'eval_f1': 0.9338079697302328, 'eval_runtime': 59.8076, 'eval_samples_per_second': 16.871, 'eval_steps_per_second': 2.123, 'epoch': 1.0}


 39%|███▉      | 500/1275 [15:22<19:43,  1.53s/it]  

{'loss': 0.1944, 'grad_norm': 3.5252625942230225, 'learning_rate': 1.215686274509804e-05, 'epoch': 1.18}


 67%|██████▋   | 850/1275 [31:05<12:27,  1.76s/it]   
 67%|██████▋   | 850/1275 [32:24<12:27,  1.76s/it]

{'eval_loss': 0.23499077558517456, 'eval_model_preparation_time': 0.0058, 'eval_accuracy': {'accuracy': 0.9541091972287549}, 'eval_f1': 0.9455741537651237, 'eval_runtime': 79.2638, 'eval_samples_per_second': 12.73, 'eval_steps_per_second': 1.602, 'epoch': 2.0}


 78%|███████▊  | 1000/1275 [37:32<09:17,  2.03s/it] 

{'loss': 0.0729, 'grad_norm': 1.2228084802627563, 'learning_rate': 4.313725490196079e-06, 'epoch': 2.35}


                                                     
100%|██████████| 1275/1275 [1:04:52<00:00,  3.05s/it]

{'eval_loss': 0.25453439354896545, 'eval_model_preparation_time': 0.0058, 'eval_accuracy': {'accuracy': 0.9543634399033878}, 'eval_f1': 0.9475854398539034, 'eval_runtime': 61.064, 'eval_samples_per_second': 16.524, 'eval_steps_per_second': 2.08, 'epoch': 3.0}
{'train_runtime': 3892.6399, 'train_samples_per_second': 2.616, 'train_steps_per_second': 0.328, 'train_loss': 0.11507972044103286, 'epoch': 3.0}





TrainOutput(global_step=1275, training_loss=0.11507972044103286, metrics={'train_runtime': 3892.6399, 'train_samples_per_second': 2.616, 'train_steps_per_second': 0.328, 'total_flos': 665197041756672.0, 'train_loss': 0.11507972044103286, 'epoch': 3.0})

In [71]:
trainer.evaluate()

100%|██████████| 127/127 [01:00<00:00,  2.10it/s]


{'eval_loss': 0.25453439354896545,
 'eval_model_preparation_time': 0.0058,
 'eval_accuracy': {'accuracy': 0.9543634399033878},
 'eval_f1': 0.9475854398539034,
 'eval_runtime': 60.7897,
 'eval_samples_per_second': 16.598,
 'eval_steps_per_second': 2.089,
 'epoch': 3.0}

In [None]:
bgbgng

In [79]:
import torch
from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd
import numpy as np

# Get the test dataset
test_dataset = tokenized_datasets["test"]

# Define a function to get embeddings from the model
def get_embeddings(dataset, model, tokenizer):
    model.eval()
    embeddings = []
    labels = []
    tokens = []
    
    with torch.no_grad():
        for example in dataset:
            inputs = tokenizer(
                example["tokens"], return_tensors="pt", truncation=True, padding='max_length', max_length=128
            )
            outputs = model(**inputs, output_hidden_states=True)
            # Get the token embeddings (hidden states from the last layer)
            token_embeddings = outputs.hidden_states[-1][0]
            embeddings.append(token_embeddings.cpu().numpy())
            labels.extend(example["ner_tags"])
            tokens.extend(example["tokens"])

    # Flatten the embeddings, labels, and tokens for each token
    flat_embeddings = np.vstack([embedding[:len(label)] for embedding, label in zip(embeddings, dataset["ner_tags"])])
    flat_labels = [label for sublist in dataset["ner_tags"] for label in sublist]
    flat_tokens = [token for sublist in dataset["tokens"] for token in sublist]
    
    return flat_embeddings, flat_labels, flat_tokens

# Get embeddings, labels, and tokens
embeddings, labels, tokens = get_embeddings(test_dataset, model, tokenizer)

# Reduce dimensionality with t-SNE or UMAP
reducer = TSNE(n_components=2, random_state=42)
reduced_embeddings = reducer.fit_transform(embeddings)

# Create a DataFrame for plotting
df = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "label": labels,
    "token": tokens
})

# Plot with Plotly for interactivity
fig = px.scatter(
    df, x="x", y="y", color="label", hover_data=["token"],
    title="2D Vector Space of Test Examples",
    labels={"color": "NER Label"}
)
fig.show()


# AUG

# Augmentation

In [72]:
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
import random
import torch
from datasets import DatasetDict

# Load the tokenizer and Masked Language Model
tokenizer_mlm = BertTokenizer.from_pretrained("bert-base-cased")
mlm_model = BertForMaskedLM.from_pretrained("bert-base-cased")



Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [73]:
# Define a data collator for language modeling to automatically mask tokens
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer_mlm,
    mlm=True,
    mlm_probability=0.15
)

# Define a function to perform token masking and augmentation using MLM
def augment_data_with_mlm(examples):
    # Lista do przechowywania wyników dla zapisu do CSV
    results = []

    # Tokenizowanie danych wejściowych
    inputs = tokenizer_mlm(
        examples["tokens"],
        is_split_into_words=True,
        padding='max_length',
        truncation=True,
        max_length= 64,
        return_tensors="pt"
    )

    # Przygotowanie danych dla collatora
    batch_for_collator = [{"input_ids": input_ids} for input_ids in inputs["input_ids"]]
    
    # Maskowanie tokenów za pomocą data collator
    collated_batch = data_collator(batch_for_collator)
    
    with torch.no_grad():
        # Uzyskanie przewidywań modelu MLM dla zamaskowanych tokenów
        outputs = mlm_model(**collated_batch)
        predictions = torch.argmax(outputs.logits, dim=-1)


    # Zamiana zamaskowanych tokenów na przewidywane wartości
    augmented_tokens = []
    original_texts = []
    augmented_texts = []
    
    for idx, token_ids in enumerate(collated_batch["input_ids"]):
        tokens = tokenizer_mlm.convert_ids_to_tokens(token_ids)
        predicted_tokens = tokenizer_mlm.convert_ids_to_tokens(predictions[idx])

        # Generowanie zamaskowanych tokenów z przewidywaniami
        masked_with_predictions = [
            token if token != tokenizer_mlm.mask_token else f"[MASKED] ({predicted_tokens[i]})"
            for i, token in enumerate(tokens)
        ]
        
        # Dodanie danych do listy wyników
        results.append({
            "original_tokens": ' '.join(examples["tokens"][idx]),
            "masked_tokens_with_predictions": ' '.join(masked_with_predictions)
        })
        
        # Tworzenie nowego przykładu z zamaskowanymi tokenami zastąpionymi przewidywaniami
        augmented_example = [
            predicted_tokens[i] if token == tokenizer_mlm.mask_token else token
            for i, token in enumerate(tokens)
        ]
        augmented_tokens.append(augmented_example)
        
        # Zapisanie oryginalnych i zaugmentowanych tekstów jako ciągi tekstowe
        original_text = ' '.join(examples["tokens"][idx])
        augmented_text = ' '.join(augmented_example)
        
        original_texts.append(original_text)
        augmented_texts.append(augmented_text)

    # Zapisanie wyników do CSV
    df = pd.DataFrame(results)
    df.to_csv("masked_tokens_with_predictions.csv", index=False)
    print("Zapisano dane do masked_tokens_with_predictions.csv")
    
    # Tworzenie nowego DataFrame z oryginalnymi i zaugmentowanymi tekstami
    df_augmented = pd.DataFrame({
        "original_text": original_texts,
        "augmented_text": augmented_texts
    })
    
    # Zapisanie nowej ramki danych do pliku CSV
    df_augmented.to_csv("original_and_augmented_texts.csv", index=False)
    print("Zapisano dane do original_and_augmented_texts.csv")
    
    # Reformatujemy dane wyjściowe, aby były zgodne ze strukturą oryginalnego zestawu danych
    augmented_examples = {"tokens": augmented_tokens, "ner_tags": examples["ner_tags"]}
    return augmented_examples

In [74]:
# 

Map: 100%|██████████| 1/1 [00:00<00:00, 19.61 examples/s]

Zapisano dane do masked_tokens_with_predictions.csv
Zapisano dane do original_and_augmented_texts.csv





In [75]:
from datasets import concatenate_datasets

# Apply augmentation to the training dataset
augmented_dataset = dataset["train"].map(augment_data_with_mlm, batched=True)

# Combine original and augmented data using concatenate_datasets
train_dataset_combined = concatenate_datasets([dataset["train"], augmented_dataset])

# Wrap it back into a DatasetDict for compatibility with Trainer
combined_datasets = DatasetDict({"train": train_dataset_combined, "validation": dataset["validation"]})



Map:   0%|          | 0/3394 [00:09<?, ? examples/s]


KeyboardInterrupt: 

In [33]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
model = BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Define a function to tokenize and align the labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=128

    )
    labels = []
    for i, label in enumerate(examples['ner_tags']):
        print(i, label)
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # Ensure word_idx is within bounds of the label list
                if word_idx < len(label):
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [21]:
# Tokenize the combined dataset for NER training
tokenized_combined_datasets = combined_datasets.map(tokenize_and_align_labels, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Reinitialize the Trainer with the augmented dataset
trainer_augmented = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_combined_datasets["train"],
    eval_dataset=tokenized_combined_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the NER model on augmented data
trainer_augmented.train()


  trainer_augmented = Trainer(
 20%|█▉        | 500/2547 [11:01<45:56,  1.35s/it]

{'loss': 0.268, 'grad_norm': 2.4735350608825684, 'learning_rate': 1.607381232822929e-05, 'epoch': 0.59}


                                                  
 33%|███▎      | 849/2547 [20:12<33:17,  1.18s/it]

{'eval_loss': 0.2131391167640686, 'eval_accuracy': {'accuracy': 0.9525837411809572}, 'eval_f1': 0.9421620495851264, 'eval_runtime': 53.2406, 'eval_samples_per_second': 18.952, 'eval_steps_per_second': 2.385, 'epoch': 1.0}


 39%|███▉      | 1000/2547 [23:40<34:42,  1.35s/it] 

{'loss': 0.1778, 'grad_norm': 1.8735758066177368, 'learning_rate': 1.2147624656458579e-05, 'epoch': 1.18}


 59%|█████▉    | 1500/2547 [35:37<23:36,  1.35s/it]

{'loss': 0.1464, 'grad_norm': 4.771763801574707, 'learning_rate': 8.221436984687869e-06, 'epoch': 1.77}


                                                   
 67%|██████▋   | 1698/2547 [41:04<16:56,  1.20s/it]

{'eval_loss': 0.2352842390537262, 'eval_accuracy': {'accuracy': 0.953600711879489}, 'eval_f1': 0.9460097377846362, 'eval_runtime': 50.1798, 'eval_samples_per_second': 20.108, 'eval_steps_per_second': 2.531, 'epoch': 2.0}


 79%|███████▊  | 2000/2547 [47:51<12:26,  1.36s/it]  

{'loss': 0.1212, 'grad_norm': 1.446900486946106, 'learning_rate': 4.295249312917158e-06, 'epoch': 2.36}


 98%|█████████▊| 2500/2547 [59:22<01:03,  1.36s/it]

{'loss': 0.1062, 'grad_norm': 2.0176374912261963, 'learning_rate': 3.6906164114644683e-07, 'epoch': 2.94}


                                                     
100%|██████████| 2547/2547 [1:01:17<00:00,  1.44s/it]

{'eval_loss': 0.24378447234630585, 'eval_accuracy': {'accuracy': 0.9548083645839954}, 'eval_f1': 0.9485204388055916, 'eval_runtime': 50.5788, 'eval_samples_per_second': 19.949, 'eval_steps_per_second': 2.511, 'epoch': 3.0}
{'train_runtime': 3677.8938, 'train_samples_per_second': 5.537, 'train_steps_per_second': 0.693, 'train_loss': 0.16278490535315224, 'epoch': 3.0}





TrainOutput(global_step=2547, training_loss=0.16278490535315224, metrics={'train_runtime': 3677.8938, 'train_samples_per_second': 5.537, 'train_steps_per_second': 0.693, 'total_flos': 1330394083513344.0, 'train_loss': 0.16278490535315224, 'epoch': 3.0})

In [None]:
trainer_augmented.evaluate()