In [None]:
!pip install datasets
!pip install seqeval
!pip install -U accelerate
!pip install -U transformers
!pip install -U accelerate
!pip install -U transformers

In [5]:
import os
# Order CUDA devices by PCI bus id and expose only device "3" to this process.
# Both variables are set before `import torch` below.
# NOTE(review): the GPU index is hard-coded — confirm it matches the target machine.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"

import torch
import json
import numpy as np
import pandas as pd
import nltk
import random
import itertools
import collections
import datasets
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

In [6]:
# Load the two JSON-lines corpora (one record per line) into separate DataFrames.
df1 = pd.read_json("data/english_balanced_10k.jsonl", lines=True)
df2 = pd.read_json("data/PII43k_original.jsonl", lines=True)

In [7]:
# Stack both corpora, give the columns code-friendly names and discard rows
# with missing values. Duplicate rows are not removed here.
df = (
    pd.concat([df1, df2])
    .reset_index(drop=True)
    .rename(
        columns={
            "Masked text": "target_text",
            "Unmasked text": "source_text",
            "Tokenised Masked text": "tokenized_text",
            "Tokensised Unmasked text": "ner_tags",
        }
    )
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [9]:
# Count every tag occurrence across all rows; the distinct tags, in first-seen
# order, become the label vocabulary.
nt = collections.Counter(itertools.chain.from_iterable(df.ner_tags.tolist()))
all_labels = list(nt)

In [10]:
# Pull each working column out of the DataFrame as a plain Python list.
source_texts = df.source_text.tolist()
target_texts = df.target_text.tolist()
tokenized_texts = df.tokenized_text.tolist()
ner_tags = df.ner_tags.tolist()

In [None]:
# Number of distinct tag values found across all rows.
len(all_labels)

In [None]:
# Peek at the first source_text entry.
source_texts[0]

In [None]:
# Peek at the first target_text entry.
target_texts[0]

In [None]:
# Tokenizer belonging to the distilbert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") 

In [None]:
## checking if the tokens align
# randrange excludes its upper bound, so `i` is always a valid index
# (random.randint(0, n) could return n and raise IndexError).
i = random.randrange(len(source_texts))
x0 = tokenizer.convert_ids_to_tokens(tokenizer(source_texts[i])['input_ids'])
x0.pop(0)  # CLS is not present in the dataset
# Print the freshly tokenized pieces next to the dataset's own tokens.
for t in zip(x0, tokenized_texts[i]):
    print(t)

In [None]:
# Checking if the tokens align with tags
# Reuses the sample index `i` chosen in the token-alignment check cell.
for t in zip(ner_tags[i], tokenized_texts[i]):
    print(t)

In [None]:
# Lookup tables between tag strings and their integer ids
# (ids follow the order of `all_labels`).
label2id = {label: index for index, label in enumerate(all_labels)}
id2label = {index: label for label, index in label2id.items()}

label2id, id2label

In [None]:
# Replace every string tag with its integer id via a direct dict lookup
# instead of scanning the whole label2id mapping for each tag.
# Values that are not keys of label2id (e.g. ids from a previous run of this
# cell) are left untouched, so re-running the cell is harmless.
for tags in tqdm(ner_tags):
    # Slice assignment keeps the same list objects, as the original in-place loop did.
    tags[:] = [label2id.get(tag, tag) for tag in tags]
df.ner_tags = ner_tags

In [None]:
# Inspect the first row's tags after the id conversion above.
ner_tags[0]

In [20]:
# Split each source text on single spaces and store the word lists as a new column.
source_words = [sentence.split(" ") for sentence in source_texts]
df["source_words"] = source_words

In [21]:
# Drop every row whose token sequence and tag sequence differ in length,
# then renumber the remaining rows.
idx = [
    row
    for row, (tokens, tags) in enumerate(zip(tokenized_texts, ner_tags))
    if len(tokens) != len(tags)
]
df = df.drop(index=idx).reset_index(drop=True)

In [None]:
# Wrap the cleaned DataFrame in a datasets.Dataset; the bare name displays its summary.
dataset = datasets.Dataset.from_pandas(df)
dataset

In [23]:
def align_labels(example):
    """Tokenize one pre-split example and attach a label to every sub-token.

    Args:
        example: mapping with "tokenized_text" (list of words) and "ner_tags"
            (one label id per word).

    Returns:
        The tokenizer output with an added "labels" list. Special tokens
        (word id ``None``) get -100; every other sub-token gets the label of
        the word it came from.
    """
    # truncation=True caps the sequence at the tokenizer's maximum length;
    # labels are derived from the (truncated) word ids, so they stay aligned.
    tokenized_input = tokenizer(
        example["tokenized_text"], is_split_into_words=True, truncation=True
    )
    word_ids = tokenized_input.word_ids()
    tokenized_input['labels'] = [
        -100 if word_id is None else example["ner_tags"][word_id]
        for word_id in word_ids
    ]
    return tokenized_input

In [None]:
# Sanity check on the first example: print the lengths of input_ids,
# attention_mask and labels so they can be compared.
al = align_labels(dataset[0])
print(len(al['input_ids']), len(al['attention_mask']), len(al['labels']))

In [25]:
# When True every sub-token of a word carries the word's label; when False only
# the first sub-token is labelled and the remaining ones are set to -100.
label_all_tokens = True
def tokenize_and_align_labels(examples):
    """Batched variant: tokenize pre-split examples and align labels to sub-tokens.

    Args:
        examples: batch mapping with "tokenized_text" (list of word lists) and
            "ner_tags" (list of per-word label lists).

    Returns:
        The tokenizer output with an added "labels" entry holding one label
        list per example.
    """
    # truncation=True caps each sequence at the tokenizer's maximum length;
    # labels follow the (truncated) word ids, so they stay aligned.
    tokenized_inputs = tokenizer(
        examples["tokenized_text"], is_split_into_words=True, truncation=True
    )

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Apply align_labels to every example using 8 worker processes; the original
# columns are removed so only the function's outputs remain.
x = dataset.map(align_labels, num_proc=8, remove_columns=dataset.column_names)

In [27]:
# Hold out 20% of the examples; the fixed seed makes the split reproducible
# across kernel restarts.
tokenized_dataset = x.train_test_split(test_size=0.2, seed=42)

In [None]:
# Display the train/test split summary.
tokenized_dataset

In [29]:
# Collator for token classification, built from the tokenizer; passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# seqeval metric object used by compute_metrics below.
# NOTE(review): `datasets.load_metric` appears to be deprecated in recent
# `datasets` releases in favour of the `evaluate` package — confirm against the
# installed version.
metric = datasets.load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into seqeval scores.

    Args:
        p: pair ``(predictions, labels)``; predictions are reduced with an
            argmax over axis 2, labels hold label ids with -100 at ignored
            positions.

    Returns:
        Dict with the four ``overall_*`` scores plus one ``<name>_f1`` entry
        for every other key reported by the metric.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (-100 marks ignored tokens).
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([all_labels[pred] for pred, _ in kept])
        true_labels.append([all_labels[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    overall_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    flattened_results = {key: results[key] for key in overall_keys}

    # Every remaining entry is a per-type result; keep just its f1 score.
    for key, value in results.items():
        if key not in flattened_results:
            flattened_results[f"{key}_f1"] = value["f1"]

    return flattened_results

In [None]:
# Load distilbert-base-uncased with a token-classification head sized to the
# label vocabulary, and record the label<->id mappings in its config.
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(all_labels), label2id=label2id, id2label=id2label)
print(model.config)

In [None]:
import torch
# Show whether torch can see a CUDA device.
torch.cuda.is_available()

In [33]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [37]:
# Training configuration: one epoch, with evaluation and a checkpoint at the
# end of each epoch; load_best_model_at_end restores the best checkpoint when
# training finishes and save_total_limit keeps a single checkpoint on disk.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir="distilbert_finetuned_ai4privacy",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    overwrite_output_dir=True,
    warmup_ratio=0.2,
    weight_decay=0.01,
    save_strategy='epoch',
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
)

trainer = Trainer(
    model=model.to(device),  # move the model to the selected device
    args=args,
    # Fit on the training split; the held-out "test" split is only used for
    # evaluation (previously the test split was passed for both).
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model
train_result = trainer.train()

# Evaluate the model on the test set
test_result = trainer.evaluate(tokenized_dataset['test'])

# Save the model to a local directory
trainer.save_model("model_dir") # Specify the directory where you want to save the model

# Get the training and evaluation metrics.
# trainer.train() returns an object with a `.metrics` dict, whereas
# trainer.evaluate() already returns the metrics dict itself (it has no
# `.metrics` attribute).
train_metrics = train_result.metrics
test_metrics = test_result

# Calculate the number of training and evaluation samples
max_train_samples = len(tokenized_dataset['train'])
max_eval_samples = len(tokenized_dataset['test'])

# Add the number of samples to the metrics
train_metrics["train_samples"] = max_train_samples
test_metrics["eval_samples"] = max_eval_samples

# Print the training and evaluation metrics
print("Train metrics:", train_metrics)
print("Eval metrics:", test_metrics)

In [None]:
# NOTE(review): this cell starts another training run on the same trainer;
# confirm that repeating the training from the earlier cell is intended.
train_result = trainer.train()
test_result = trainer.evaluate(tokenized_dataset['test'])

# trainer.train() exposes `.metrics`; trainer.evaluate() returns the metrics
# dict directly, so it is used as-is.
train_metrics = train_result.metrics
test_metrics = test_result

max_train_samples = len(tokenized_dataset['train'])
max_eval_samples = len(tokenized_dataset['test'])

# Record the split sizes, then log and persist both metric sets.
train_metrics["train_samples"] = max_train_samples
trainer.log_metrics("train", train_metrics)

test_metrics["eval_samples"] = max_eval_samples
trainer.log_metrics("eval", test_metrics)

trainer.save_metrics("train", train_metrics)
trainer.save_metrics("eval", test_metrics)

# Persist the trainer state and the final model under args.output_dir.
trainer.save_state()
trainer.save_model(args.output_dir)

In [None]:
# Load the saved model
from transformers import AutoModelForSequenceClassification
# The checkpoint in args.output_dir was trained with a token-classification
# head, so it is reloaded with AutoModelForTokenClassification (imported in the
# notebook's import cell) rather than a sequence-classification class.
model = AutoModelForTokenClassification.from_pretrained(args.output_dir)

# Prepare the input data
input_data = tokenizer(
    "A students assessment was found on device bearing IMEI: 06-184755-866851-3.",
    return_tensors="pt",
    add_special_tokens=True,
    padding=True,
    truncation=True
)

# Set the pad token ID in the config object
model.config.pad_token_id = tokenizer.pad_token_id

# Perform inference. The tokenizer output is a mapping, so it is unpacked into
# keyword arguments (input_ids, attention_mask); no gradients are needed.
with torch.no_grad():
    outputs = model(**input_data)

# One predicted label id per token: shape (batch, sequence_length)
predictions = outputs.logits.argmax(dim=-1)

# Print every token of the single input sentence next to its predicted label
tokens = tokenizer.convert_ids_to_tokens(input_data["input_ids"][0])
for token, label_id in zip(tokens, predictions[0].tolist()):
    print(f"{token}\t{model.config.id2label[label_id]}")

In [None]:
from transformers import BertForMaskedLM, BertTokenizer

# Load the masked-LM model and its tokenizer under their own names so this
# demo does not overwrite the fine-tuned `model` / `tokenizer` that other
# cells rely on.
model_name = "bert-base-uncased"
mlm_model = BertForMaskedLM.from_pretrained(model_name)
mlm_tokenizer = BertTokenizer.from_pretrained(model_name)

# Input text with a masked token
input_text = "My name is [MASK]."

# Tokenize the input text
input_ids = mlm_tokenizer.encode(input_text, add_special_tokens=True, return_tensors="pt")

# Generate predictions
with torch.no_grad():
    outputs = mlm_model(input_ids)
    predictions = outputs.logits

# Locate the [MASK] position and turn its logits into a distribution over the vocabulary
mask_position = input_ids[0].tolist().index(mlm_tokenizer.mask_token_id)
predicted_probabilities = predictions[0, mask_position].softmax(dim=-1)
print("Predicted probabilities:", predicted_probabilities)

# Get the top predicted tokens
top_k = 5
top_k_tokens = torch.topk(predicted_probabilities, top_k).indices.tolist()

# Convert token IDs to tokens
predicted_tokens = mlm_tokenizer.convert_ids_to_tokens(top_k_tokens)

# Print the top predicted tokens
print("Top predicted tokens:", predicted_tokens)


In [39]:
# Use the evaluation result directly as the metrics mapping.
test_metrics = test_result

In [40]:
# Sizes of the train and test splits.
max_train_samples = len(tokenized_dataset['train'])
max_eval_samples = len(tokenized_dataset['test'])

In [None]:
# Record the split sizes in the metric dicts. Relies on train_metrics,
# test_metrics and the max_*_samples values defined in earlier cells.
train_metrics["train_samples"] = min(max_train_samples, len(tokenized_dataset['train']))
test_metrics["eval_samples"] = min(max_eval_samples, len(tokenized_dataset['test']))

# Print the training and evaluation metrics
print("Train metrics:", train_metrics)
print("Eval metrics:", test_metrics)


In [42]:
# Persist both metric sets through the trainer.
trainer.save_metrics("train", train_metrics)
trainer.save_metrics("eval", test_metrics)

# Persist the trainer state and the model under args.output_dir.
trainer.save_state()
trainer.save_model(args.output_dir)

In [None]:
# Define the output directory where you want to save the model
output_dir = "model_output/"

# Save through the trainer so the fine-tuned model it holds is written,
# regardless of what the global names `model` / `tokenizer` point to by now
# (the masked-LM demo cell rebinds both).
# Trainer.save_model also writes the tokenizer that was passed to the Trainer.
trainer.save_model(output_dir)


In [None]:
# Run training, then score the model on the held-out split.
train_result = trainer.train()
test_result = trainer.evaluate(tokenized_dataset['test'])

# Split sizes, reported alongside the metrics.
max_train_samples = len(tokenized_dataset['train'])
max_eval_samples = len(tokenized_dataset['test'])

# Training metrics live on `.metrics`; the evaluation result is used directly.
train_metrics = train_result.metrics
test_metrics = test_result
train_metrics["train_samples"] = max_train_samples
test_metrics["eval_samples"] = max_eval_samples

# Report both metric sets.
for heading, values in (("Train metrics:", train_metrics), ("Eval metrics:", test_metrics)):
    print(heading, values)


In [None]:
from transformers import pipeline
# Token-classification pipeline for the "Isotonic/distilbert_finetuned_ai4privacy"
# model id, pinned to device -1.
gen = pipeline("token-classification", "Isotonic/distilbert_finetuned_ai4privacy", device=-1)
text = "My name is Clara and I live in Berkeley, California."
# aggregation_strategy="simple" is requested so that results carry the
# "entity_group", "start" and "end" fields read by replace_entities below.
output = gen(text, aggregation_strategy="simple")
output
def replace_entities(output, text):
    """Replace each detected entity span in ``text`` with ``[ENTITY_GROUP]``.

    Spans are substituted by their character offsets, so repeated entity
    strings are each handled once and unrelated occurrences of the same
    characters elsewhere in the text are left alone.

    Args:
        output: iterable of dicts with "start", "end" (character offsets into
            ``text``) and "entity_group" (label to insert).
        text: the original string the offsets refer to.

    Returns:
        A new string with every span replaced by its bracketed group name.
    """
    pieces = []
    cursor = 0
    for token in sorted(output, key=lambda token: token["start"]):
        # Copy the untouched text before this span, then the placeholder.
        pieces.append(text[cursor:token["start"]])
        pieces.append(f"[{token['entity_group']}]")
        # max() keeps the cursor moving forward even if spans overlap.
        cursor = max(cursor, token["end"])
    pieces.append(text[cursor:])
    return "".join(pieces)
# Show the example sentence with its detected entities replaced.
replace_entities(output, text)

In [None]:
from transformers import BertForTokenClassification, BertTokenizer

# Load the custom model and tokenizer.
# The Auto classes pick the architecture recorded in the saved config; the
# fine-tuned checkpoint comes from distilbert-base-uncased, so the Bert*
# classes would not match it.
model_dir = "model_dir/"
model = AutoModelForTokenClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Define the function to replace entities
def replace_entities(output, text):
    """Replace each detected entity span in ``text`` with ``[ENTITY_GROUP]``.

    Spans are substituted by their character offsets, so repeated entity
    strings are each handled once and unrelated occurrences of the same
    characters elsewhere in the text are left alone.

    Args:
        output: iterable of dicts with "start", "end" (character offsets into
            ``text``) and "entity_group" (label to insert).
        text: the original string the offsets refer to.

    Returns:
        A new string with every span replaced by its bracketed group name.
    """
    pieces = []
    cursor = 0
    for token in sorted(output, key=lambda token: token["start"]):
        # Copy the untouched text before this span, then the placeholder.
        pieces.append(text[cursor:token["start"]])
        pieces.append(f"[{token['entity_group']}]")
        # max() keeps the cursor moving forward even if spans overlap.
        cursor = max(cursor, token["end"])
    pieces.append(text[cursor:])
    return "".join(pieces)

# Use the locally saved model to perform token classification.
# The pipeline is built from model_dir (previously this cell called `gen`,
# the pipeline for the Hub model, so the custom model was never exercised).
text = "My name is Clara and I live in Berkeley, California."
custom_pipe = pipeline("token-classification", model=model_dir, device=-1)
output = custom_pipe(text, aggregation_strategy="simple")

# Replace entities in the text using the custom model
replaced_text = replace_entities(output, text)
print(replaced_text)
