In [2]:
from transformers import pipeline
gen = pipeline("token-classification", "lakshyakh93/deberta_finetuned_pii", device=-1)

In [3]:
text = "My name is John and I live in California."
output = gen(text, aggregation_strategy="first")

In [4]:
print(output)

[{'entity_group': 'FIRSTNAME', 'score': 0.9546856, 'word': ' John', 'start': 10, 'end': 15}, {'entity_group': 'STATE', 'score': 0.98806274, 'word': ' California.', 'start': 29, 'end': 41}]


In [5]:
text = '''
Alice Johnson, a French citizen, visited the Eiffel Tower. She works at Google and lives in Paris. 
Her email is alice.johnson@example.com. She was born on February 14, 1990, and earns $7500 per month. 
You can call her at 555-987-6543 or visit her website at www.alicejohnson.com. 
The company recently launched a new product, the Google Pixel 5, which has received positive reviews. 
In the past, Alice attended the World Cup and read War and Peace by Leo Tolstoy. 
She speaks fluent English and French. Last year, she traveled to Mount Everest in Nepal. 
The event was scheduled on July 20, 2021, at 10:00 AM. Her net worth is estimated to be $1,000,000, and she has a 25% stake in the company.
She also has several measurements like a 5 kg bag of rice, and she finished first in the marathon.
'''
output = gen(text, aggregation_strategy="first")



In [7]:
for entity in output:
    print(entity)

{'entity_group': 'MIDDLENAME', 'score': 0.70211166, 'word': ' Johnson,', 'start': 6, 'end': 15}
{'entity_group': 'EMAIL', 'score': 0.9976675, 'word': ' alice.johnson@example.com.', 'start': 113, 'end': 140}
{'entity_group': 'DATE', 'score': 0.9964449, 'word': ' February 14, 1990,', 'start': 156, 'end': 175}
{'entity_group': 'CURRENCYSYMBOL', 'score': 0.91656756, 'word': ' $7500', 'start': 185, 'end': 191}
{'entity_group': 'PHONE_NUMBER', 'score': 0.9962837, 'word': ' 555-987-6543', 'start': 223, 'end': 236}
{'entity_group': 'URL', 'score': 0.8163411, 'word': ' www.alicejohnson.com.', 'start': 260, 'end': 282}
{'entity_group': 'FIRSTNAME', 'score': 0.9630687, 'word': ' Alice', 'start': 399, 'end': 405}
{'entity_group': 'DATE', 'score': 0.9988918, 'word': ' July 20, 2021,', 'start': 585, 'end': 600}
{'entity_group': 'TIME', 'score': 0.9992529, 'word': ' 10:00 AM.', 'start': 603, 'end': 613}
{'entity_group': 'CURRENCYSYMBOL', 'score': 0.96038735, 'word': ' $1,000,000,', 'start': 646, 'end

In [None]:
!pip install torchvision

### FineTuned Approach

In [None]:
!pip install transformers datasets

In [27]:
from transformers import pipeline

# Load the fine-tuned model
pii_detector = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

def replace_pii(text):
    entities = pii_detector(text)
    replaced_text = text
    for entity in entities:
        if entity['entity_group'] in ["PER", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]:
            replaced_text = replaced_text.replace(entity['word'], f"[{entity['entity_group']}]")
    return replaced_text

# Example text
example_text = """
Alice Johnson, a French citizen, visited the Eiffel Tower. She works at Google and lives in Paris.
Her email is alice.johnson@example.com. She was born on February 14, 1990, and earns $7500 per month.
You can call her at 555-987-6543 or visit her website at www.alicejohnson.com.
The company recently launched a new product, the Google Pixel 5, which has received positive reviews.
In the past, Alice attended the World Cup and read War and Peace by Leo Tolstoy.
She speaks fluent English and French. Last year, she traveled to Mount Everest in Nepal.
The event was scheduled on July 20, 2021, at 10:00 AM. Her net worth is estimated to be $1,000,000, and she has a 25% stake in the company.
She also has several measurements like a 5 kg bag of rice, and she finished first in the marathon.
"""

# Replace PII in the example text
replaced_text = replace_pii(example_text)
print(replaced_text)


Alice Johnson, a French citizen, visited the Eiffel Tower. She works at Google and lives in Paris.
Her email is alice.johnson@example.com. She was born on[DATE], and earns $7500 per month.
You can call her at 555-987-6543 or visit her website at www.alicejohnson.com.
The company recently launched a new product, the Google Pixel 5, which has received positive reviews.
In the past, Alice attended the World Cup and read War and Peace by Leo Tolstoy.
She speaks fluent English and French. Last year, she traveled to Mount Everest in Nepal.
The event was scheduled on[DATE], at[TIME]. Her net worth is estimated to be $1,000,000, and she has a 25% stake in the company.
She also has several measurements like a 5 kg bag of rice, and she finished first in the marathon.



### Approach

In [14]:
# 1. Generate Synthetic Data

import csv
from faker import Faker
import random

# Initialize Faker
fake = Faker()

# Generate synthetic dataset
def generate_synthetic_data(num_samples=1000):
    data = []
    for _ in range(num_samples):
        text = f"{fake.name()} from {fake.country()} works at {fake.company()} and lives in {fake.city()}. " \
               f"They were born on {fake.date_of_birth()} and their email is {fake.email()}. " \
               f"They earn {fake.currency_symbol()}{fake.random_number(digits=5)} per year."
        labels = "O " * len(text.split())
        data.append((text, labels.strip()))
    return data

# Save dataset to CSV with UTF-8 encoding
def save_to_csv(data, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["text", "labels"])
        writer.writerows(data)

# Generate and save train and validation datasets
train_data = generate_synthetic_data(num_samples=800)
valid_data = generate_synthetic_data(num_samples=200)

save_to_csv(train_data, 'train.csv')
save_to_csv(valid_data, 'valid.csv')

print("Datasets generated and saved as 'train.csv' and 'valid.csv'.")

Datasets generated and saved as 'train.csv' and 'valid.csv'.


In [37]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
for data in ner_results:
    print(data)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'entity': 'B-PER', 'score': 0.9990139, 'index': 4, 'word': 'Wolfgang', 'start': 11, 'end': 19}
{'entity': 'B-LOC', 'score': 0.999645, 'index': 9, 'word': 'Berlin', 'start': 34, 'end': 40}


In [38]:
sample_text = """
Alice Johnson, a French citizen, visited the Eiffel Tower. She works at Google and lives in Paris. 
Her email is alice.johnson@example.com. She was born on February 14, 1990, and earns $7500 per month. 
You can call her at 555-987-6543 or visit her website at www.alicejohnson.com. 
The company recently launched a new product, the Google Pixel 5, which has received positive reviews. 
In the past, Alice attended the World Cup and read War and Peace by Leo Tolstoy. 
She speaks fluent English and French. Last year, she traveled to Mount Everest in Nepal. 
The event was scheduled on July 20, 2021, at 10:00 AM. Her net worth is estimated to be $1,000,000, and she has a 25% stake in the company.
She also has several measurements like a 5 kg bag of rice, and she finished first in the marathon.
"""

In [40]:
# dslim/bert-base-NER
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = sample_text

ner_results = nlp(example)
for data in ner_results:
    print(data)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'entity': 'B-PER', 'score': 0.99938476, 'index': 1, 'word': 'Alice', 'start': 1, 'end': 6}
{'entity': 'I-PER', 'score': 0.99947315, 'index': 2, 'word': 'Johnson', 'start': 7, 'end': 14}
{'entity': 'B-MISC', 'score': 0.9996563, 'index': 5, 'word': 'French', 'start': 18, 'end': 24}
{'entity': 'B-LOC', 'score': 0.987009, 'index': 10, 'word': 'E', 'start': 46, 'end': 47}
{'entity': 'I-LOC', 'score': 0.6012711, 'index': 11, 'word': '##iff', 'start': 47, 'end': 50}
{'entity': 'B-LOC', 'score': 0.85653263, 'index': 12, 'word': '##el', 'start': 50, 'end': 52}
{'entity': 'I-LOC', 'score': 0.9919612, 'index': 13, 'word': 'Tower', 'start': 53, 'end': 58}
{'entity': 'B-ORG', 'score': 0.9976587, 'index': 18, 'word': 'Google', 'start': 73, 'end': 79}
{'entity': 'B-LOC', 'score': 0.9995302, 'index': 22, 'word': 'Paris', 'start': 93, 'end': 98}
{'entity': 'B-MISC', 'score': 0.9975721, 'index': 93, 'word': 'Google', 'start': 333, 'end': 339}
{'entity': 'I-MISC', 'score': 0.9960896, 'index': 94, 'word'

In [None]:
# dslim/bert-base-NER
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = sample_text

ner_results = nlp(example)
for data in ner_results:
    print(data)

In [42]:
# lakshyakh93/deberta_finetuned_pii
from transformers import pipeline
gen = pipeline("token-classification", "lakshyakh93/deberta_finetuned_pii", device=-1)

text = sample_text
output = gen(text, aggregation_strategy="first")
for data in output:
    print(data)

{'entity_group': 'MIDDLENAME', 'score': 0.70211166, 'word': ' Johnson,', 'start': 6, 'end': 15}
{'entity_group': 'EMAIL', 'score': 0.9976675, 'word': ' alice.johnson@example.com.', 'start': 113, 'end': 140}
{'entity_group': 'DATE', 'score': 0.9964449, 'word': ' February 14, 1990,', 'start': 156, 'end': 175}
{'entity_group': 'CURRENCYSYMBOL', 'score': 0.91656756, 'word': ' $7500', 'start': 185, 'end': 191}
{'entity_group': 'PHONE_NUMBER', 'score': 0.9962837, 'word': ' 555-987-6543', 'start': 223, 'end': 236}
{'entity_group': 'URL', 'score': 0.8163411, 'word': ' www.alicejohnson.com.', 'start': 260, 'end': 282}
{'entity_group': 'FIRSTNAME', 'score': 0.9630687, 'word': ' Alice', 'start': 399, 'end': 405}
{'entity_group': 'DATE', 'score': 0.9988918, 'word': ' July 20, 2021,', 'start': 585, 'end': 600}
{'entity_group': 'TIME', 'score': 0.9992529, 'word': ' 10:00 AM.', 'start': 603, 'end': 613}
{'entity_group': 'CURRENCYSYMBOL', 'score': 0.96038735, 'word': ' $1,000,000,', 'start': 646, 'end



In [1]:
text = """ArithmeticErrorJohn Doe, born on January 1st, 1980, resides at 123 Main Street, Anytown, USA. He works at XYZ Corporation as a software engineer. His email address is john.doe@example.com and his phone number is +1 (555) 123-4567. 

Jane Smith, born on March 15th, 1995, lives at 456 Elm Street, Springfield, USA. She is a student at ABC University, majoring in computer science. Her email is jane.smith@example.com and her phone number is +1 (555) 987-6543.

Michael Johnson, born on November 10th, 1978, lives at 789 Oak Avenue, Smalltown, USA. He works as a doctor at City Hospital. His email address is michael.johnson@example.com and his phone number is +1 (555) 222-3333."""

In [2]:
# lakshyakh93/deberta_finetuned_pii
from transformers import pipeline
gen = pipeline("token-classification", "lakshyakh93/deberta_finetuned_pii", device=-1)
output = gen(text, aggregation_strategy="first")
for data in output:
    print(data)

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


{'entity_group': 'DATE', 'score': 0.9976619, 'word': ' January 1st, 1980,', 'start': 32, 'end': 51}
{'entity_group': 'STREETADDRESS', 'score': 0.98643786, 'word': ' 123 Main Street,', 'start': 62, 'end': 79}
{'entity_group': 'STATE', 'score': 0.8421379, 'word': ' Anytown,', 'start': 79, 'end': 88}
{'entity_group': 'COMPANY_NAME', 'score': 0.733382, 'word': ' XYZ', 'start': 105, 'end': 109}
{'entity_group': 'JOBTYPE', 'score': 0.5441987, 'word': ' engineer.', 'start': 135, 'end': 145}
{'entity_group': 'EMAIL', 'score': 0.9892954, 'word': ' john.doe@example.com', 'start': 166, 'end': 187}
{'entity_group': 'PHONE_NUMBER', 'score': 0.9046054, 'word': ' +1 (555) 123-4567.', 'start': 211, 'end': 230}
{'entity_group': 'MIDDLENAME', 'score': 0.55161256, 'word': ' Smith,', 'start': 237, 'end': 244}
{'entity_group': 'DATE', 'score': 0.9980107, 'word': ' March 15th, 1995,', 'start': 252, 'end': 270}
{'entity_group': 'STREETADDRESS', 'score': 0.9916169, 'word': ' 456 Elm Street,', 'start': 279, 'e



In [44]:
# lakshyakh93/deberta_finetuned_pii
from transformers import pipeline
gen = pipeline("token-classification", "lakshyakh93/deberta_finetuned_pii", device=-1)

text = sample_text
output = gen(text, aggregation_strategy="first")
for data in output:
    print(data['word'])

 Johnson,
 alice.johnson@example.com.
 February 14, 1990,
 $7500
 555-987-6543
 www.alicejohnson.com.
 Alice
 July 20, 2021,
 10:00 AM.
 $1,000,000,




In [48]:
from transformers import pipeline
gen = pipeline("token-classification", "lakshyakh93/deberta_finetuned_pii", device=-1)
dir(gen)

['__abstractmethods__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_args_parser',
 '_basic_tokenizer',
 '_batch_size',
 '_ensure_tensor_on_device',
 '_forward',
 '_forward_params',
 '_num_workers',
 '_postprocess_params',
 '_preprocess_params',
 '_sanitize_parameters',
 'aggregate',
 'aggregate_overlapping_entities',
 'aggregate_word',
 'aggregate_words',
 'binary_output',
 'call_count',
 'check_model_type',
 'default_input_names',
 'device',
 'device_placement',
 'ensure_tensor_on_device',
 'feature_extractor',
 'forward',
 'framework',
 'gather_pre_entities',
 'get_inference_context',
 'get_iterator',
 '

In [59]:
gen

<transformers.pipelines.token_classification.TokenClassificationPipeline at 0x23703eb3e90>

In [47]:
from transformers import DebertaTokenizerFast

# Load the DeBERTa tokenizer
tokenizer = DebertaTokenizerFast.from_pretrained("lakshyakh93/deberta_finetuned_pii")

# Get the special tokens
special_tokens = tokenizer.special_tokens_map_extended

# Filter out the entity types
entity_types = [token for token, token_type in special_tokens.items()]

# Print the entity types
print("Entity Types:")
for entity_type in entity_types:
    print(entity_type)

Entity Types:
bos_token
eos_token
unk_token
sep_token
pad_token
cls_token
mask_token


In [None]:
[
    [0, 5, 'John ', ' John'],
    [4, 9, ' Doe,', ' Doe,'],
    [17, 36, ' January 1st, 1980,', '[[DATE]]'],
    [47, 64, ' 123 Main Street,', '[[STREETADDRESS]]'],
    [64, 73, ' Anytown,', '[[STATE]]'],
    [151, 172, ' john.doe@example.com', '[[EMAIL]]'],
    [196, 215, ' +1 (555) 123-4567.', '[[PHONE_NUMBER]]']
]

[
    [1, 5, 'ohn ', 'John'],
    [5, 9, 'Doe,', 'Doe,'],
    [18, 36, 'January 1st, 1980,', '[[DATE]]'],
    [48, 64, '123 Main Street,', '[[STREETADDRESS]]'],
    [65, 73, 'Anytown,', '[[STATE]]'],
    [152, 172, 'john.doe@example.com', '[[EMAIL]]'],
    [197, 215, '+1 (555) 123-4567.', '[[PHONE_NUMBER]]']
]

In [None]:
[
    [
        '[[PERSON]], born on January 1st, 1980, resides at 123 Main Street, Anytown, USA. He works at XYZ Corporation as a software engineer. His email address is [[PERSON]] and his phone number is +1 (555) 123-4567. ',
        []
    ],
    ['John Doe, born on [[DATE]] resides at [[STREETADDRESS]] [[STATE]] USA. He works at XYZ Corporation as a software engineer. His email address is [[EMAIL]] and his phone number is [[PHONE_NUMBER]] ',
        [
            [0, 4, 'John', 'John'],
            [5, 9, 'Doe,', 'Doe,']
        ]
    ]
]

In [None]:
front space =  True
back space =  False
original = ' John'
entity = ' John'
front space =  True
back space =  False
original = ' Doe,'
entity = ' Doe,'
front space =  True
back space =  False
original = ' January 1st, 1980,'
entity = ' January 1st, 1980,'
front space =  True
back space =  False
original = ' 123 Main Street,'
entity = ' 123 Main Street,'
front space =  True
back space =  False
original = ' Anytown,'
entity = ' Anytown,'
front space =  True
back space =  False
original = ' john.doe@example.com'
entity = ' john.doe@example.com'
front space =  True
back space =  False
original = ' +1 (555) 123-4567.'
entity = ' +1 (555) 123-4567.'

Replacement
[
    [1, 6, 'ohn D', ' John'],
    [5, 10, 'Doe, ', ' Doe,'],
    [18, 37, 'January 1st, 1980, ', '[[DATE]]'],
    [48, 65, '123 Main Street, ', '[[STREETADDRESS]]'],
    [65, 74, 'Anytown, ', '[[STATE]]'],
    [152, 173, 'john.doe@example.com ', '[[EMAIL]]'],
    [197, 216, '+1 (555) 123-4567. ', '[[PHONE_NUMBER]]']
]