In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv
/kaggle/input/pii-detection-removal-from-educational-data/train.json
/kaggle/input/pii-detection-removal-from-educational-data/test.json


In [2]:
!pip install -U transformers
!pip install -U datasets
!pip install seqeval evaluate -q

Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.44.0
    Uninstalling transformers-4.44.0:
      Successfully uninstalled transformers-4.44.0
Successfully installed transformers-4.44.2
Collecting accelerate
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-0.34.2-py3-none-any.whl (324 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempti

In [3]:
import json
train=json.load(open('/kaggle/input/pii-detection-removal-from-educational-data/train.json'))
test=json.load(open('/kaggle/input/pii-detection-removal-from-educational-data/test.json'))

In [4]:
def process_data(data):
  all_texts=[]
  all_labels=[]

  for doc in data:
    tokens=doc['tokens']
    labels = doc.get('labels',['O']*len(tokens)) #Use 'O' for missing labels
    all_texts.append(tokens)
    all_labels.append(labels)

  return all_texts,all_labels

train_texts, train_labels = process_data(train)
test_texts, test_labels = process_data(test)

In [5]:
from datasets import Dataset

# Convert to huggingface dataset format
train_dataset = Dataset.from_dict({'tokens': train_texts, 'ner_tags': train_labels})
test_dataset = Dataset.from_dict({'tokens': test_texts})

In [6]:
print(train[0].keys())
for key in train[0].keys():
    value = train[0][key]
    # Check if the value is a list or a string
    if isinstance(value, (list, str)):
        print(f"{key}: {value[:5]}")

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])
full_text: Desig
tokens: ['Design', 'Thinking', 'for', 'innovation', 'reflexion']
trailing_whitespace: [True, True, True, True, False]
labels: ['O', 'O', 'O', 'O', 'O']


In [7]:
train[0]['full_text']

"Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla\n\nChallenge & selection\n\nThe tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map.\n\nWhat exactly is a mind map? According to the definition of Buzan T. and Buzan B. (1999, Dessine-moi  l'intelligence. Paris: Les Éditions d'Organisation.), the mind map (or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain's  potential to be released. Cf Annex1\n\nThis tool has many advantages:\n\n•  It is accessible to all and does not require significant material investment and can be done  quickly\n\n•  It is scalable\n\n•  It allows categorization and linking of information\n\n•  It can be applied to any type of situation: notetaking, problem solving, analysis, creation of  new ideas\n\n•  It is suitable for all people and is easy to learn\n\n•  It is fun and encourages exchanges\n\n•  It makes visi

In [8]:
from itertools import chain

all_labels = sorted(list(set(chain(*[x["labels"] for x in train]))))
label2id = {l: i for i,l in enumerate(all_labels)}
id2label = {v:k for k,v in label2id.items()}

target = [item for item in all_labels if item != 'O']

id2label

{0: 'B-EMAIL',
 1: 'B-ID_NUM',
 2: 'B-NAME_STUDENT',
 3: 'B-PHONE_NUM',
 4: 'B-STREET_ADDRESS',
 5: 'B-URL_PERSONAL',
 6: 'B-USERNAME',
 7: 'I-ID_NUM',
 8: 'I-NAME_STUDENT',
 9: 'I-PHONE_NUM',
 10: 'I-STREET_ADDRESS',
 11: 'I-URL_PERSONAL',
 12: 'O'}

In [9]:
def align_labels_with_tokens(labels, word_ids):
    aligned_labels = []
    previous_word_id = None
    for word_id in word_ids:
        if word_id is None:
            aligned_labels.append(-100)
        elif word_id != previous_word_id:
            aligned_labels.append(labels[word_id])
        else:
            aligned_labels.append(-100)
        previous_word_id = word_id
    return aligned_labels

In [10]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
model = AutoModelForTokenClassification.from_pretrained("microsoft/deberta-v3-large", num_labels=len(all_labels))


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
test_dataset.column_names

['tokens']

In [12]:
# Tokenize and align the labels
def tokenize_and_align_labels(examples, labels_available=True, max_length=256):
    # Tokenize the inputs with truncation, padding, and a max_length
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',  # Ensures padding to max_length
        max_length=max_length  # Set max_length to prevent overly long sequences
    )

    if labels_available:
        all_labels = examples['ner_tags']
        new_labels = []

        # Iterate over each set of labels
        for i, labels in enumerate(all_labels):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            aligned_labels = align_labels_with_tokens(labels, word_ids)

            # Print debug information to help troubleshoot length mismatches
            if len(aligned_labels) != len(word_ids):
                print(f"Length mismatch at index {i}: Tokens length = {len(word_ids)}, Labels length = {len(aligned_labels)}")

            new_labels.append(aligned_labels)

        # Map labels to their respective IDs (make sure to include all label types)
        label_to_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-LOC': 3, 'I-LOC': 4, -100: -100}  # Extend as needed
        new_labels = [[label_to_id.get(label, -100) for label in label_list] for label_list in new_labels]

        tokenized_inputs["labels"] = new_labels

    return tokenized_inputs

In [13]:
# Apply the tokenization and label alignment for the training set (with labels)
train_dataset = train_dataset.map(lambda x: tokenize_and_align_labels(x, labels_available=True), batched=True)

# Apply the tokenization for the test set (without labels)
test_dataset = test_dataset.map(lambda x: tokenize_and_align_labels(x, labels_available=False), batched=True)


Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [14]:
from seqeval.metrics import accuracy_score, f1_score, classification_report

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Remove ignored index (special tokens like -100)
    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    true_preds = [[id2label[p] for (p, l) in zip(pred, label) if l != -100] for pred, label in zip(preds, labels)]

    return {
        "accuracy": accuracy_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
        "classification_report": classification_report(true_labels, true_preds)
    }

In [15]:
from transformers import Trainer, TrainingArguments


# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision
    gradient_accumulation_steps=4
)



In [16]:
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [17]:
import torch
torch.cuda.empty_cache()
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
0,No log,No log


TrainOutput(global_step=212, training_loss=0.06464113829270848, metrics={'train_runtime': 795.6044, 'train_samples_per_second': 8.556, 'train_steps_per_second': 0.266, 'total_flos': 3150312945942528.0, 'train_loss': 0.06464113829270848, 'epoch': 0.9964747356051704})

In [18]:
from transformers import pipeline

# Load NER pipeline with trained model
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example of using the pipeline
text = "John Doe's email is john.doe@example.com and his phone number is 123-456-7890."
ner_results = ner_pipeline(text)

print(ner_results)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'LABEL_0', 'score': 0.99993885, 'word': "John Doe's email is john.doe@example.com and his phone number is 123-456-7890.", 'start': 0, 'end': 78}]
