In [1]:
import json
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import random
import json
import nltk
from termcolor import colored
from datasets import load_dataset
from huggingface_hub import login
from generate_pii_dataset import generate_pii_dataset
from utils.pii_injection_utils import (
    get_pii_list,
    generate_data_transition,
    generate_text_transition,
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Model and Datasets

In [2]:
# Login to huggingface

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Load the model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B-Instruct" 

model = LLM(model=model_id, gpu_memory_utilization=0.9, tensor_parallel_size=1)
tokenizer = AutoTokenizer.from_pretrained(model_id)

INFO 06-01 20:55:43 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-01 20:55:44 weight_utils.py:207] Using model weights format ['*.safetensors']


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

INFO 06-01 20:55:50 model_runner.py:146] Loading model weights took 14.9595 GB
INFO 06-01 20:55:51 gpu_executor.py:83] # GPU blocks: 2191, # CPU blocks: 2048
INFO 06-01 20:55:51 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-01 20:55:51 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-01 20:55:54 model_runner.py:924] Graph capturing finished in 3 secs.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Generate or load PII samples
with open('grouped_pii_samples.json', 'r') as f:
    grouped_pii_samples = json.load(f)

grouped_pii_samples[0]

{'name': [],
 'email': ['ilpisug@zot.ci',
  'WAYNEvagjuv@ize.sl',
  'SKINNY@bibdivo.su',
  'Holt@ginsaufo.es',
  'Elvapattersonsahavi@ho.re'],
 'username': [],
 'id_number': ['6919978449503183',
  '019847530123598764',
  '13789',
  '78-47--2-2-5--32-96--5052836',
  '4797_4_350'],
 'phone_number': ['02 (320 537) 8935203', ' (823) 276 3 1564'],
 'street_address': [],
 'url': []}

In [5]:
# Load source texts
essay_dataset = load_dataset("qwedsacf/ivypanda-essays")
essay_dataset

DatasetDict({
    train: Dataset({
        features: ['TEXT', 'SOURCE', '__index_level_0__'],
        num_rows: 128293
    })
})

## Test PII Injection with a Sample

In [42]:
# This function inserts PII into the text
# Modify this function to change how the PII is inserted into the text
def generate_pii_text(model, tokenizer, sampling_params, text, pii_list, split_by_sentence=True, max_piis=None):
    splitted_text = nltk.sent_tokenize(text) if split_by_sentence else text.split(' ')

    for i, (pii, label) in enumerate(pii_list):
        if max_piis is not None and i >= max_piis:
            break

        pii_insert_index = random.randint(0, len(splitted_text))
        first_text = ' '.join(splitted_text[:pii_insert_index])
        second_text = ' '.join(splitted_text[pii_insert_index:])

        transition = generate_data_transition(model, tokenizer, sampling_params, first_text, data=pii, data_type=label)
        transition_before = transition
        first_text = f"{first_text} {colored(transition_before, 'green')} {pii}"

        transition = generate_text_transition(model, tokenizer, sampling_params, first_text=first_text, second_text=second_text)
        transition_after = transition

        splitted_text.insert(pii_insert_index, f"{colored(transition_before, 'green')} {colored(pii, 'blue')} {colored(transition_after, 'green')}")
    
    pii_text = ' '.join(splitted_text)

    return pii_text

In [31]:
# Select which sample and text to test
sample_index = 0
text_index = 0
max_piis = None # Set to None to inject all PIIs or set to a number to inject a maximum number of PIIs

In [32]:
# Set pii sample
pii_sample = grouped_pii_samples[sample_index]
pii_sample

{'name': [],
 'email': ['ilpisug@zot.ci',
  'WAYNEvagjuv@ize.sl',
  'SKINNY@bibdivo.su',
  'Holt@ginsaufo.es',
  'Elvapattersonsahavi@ho.re'],
 'username': [],
 'id_number': ['6919978449503183',
  '019847530123598764',
  '13789',
  '78-47--2-2-5--32-96--5052836',
  '4797_4_350'],
 'phone_number': ['02 (320 537) 8935203', ' (823) 276 3 1564'],
 'street_address': [],
 'url': []}

In [33]:
# Set text to be injected with PII
text = essay_dataset['train'][text_index]['TEXT']
print(text)

12 Years a Slave: An Analysis of the Film Essay

The 2013 film 12 Years a Slave proved that slavery is a worldwide issue. Indeed, the film made $150 million outside the United States and $57 million in the U.S., with a production budget of $20 million (Sharf, 2020). The movie was based on the memoir Twelve Years a Slave by Solomon Northup (Ntim, 2020). It tells the story of a free African American man who was kidnapped and sold into slavery. Solomon spent twelve years away from his family, being traded from one master to another. Fortunately, the protagonist met a person who helped him deliver a message to his family and friends, who came and rescued him. This movie accurately illustrates discriminatory relationships between white slaveholders and black slaves that stemmed from the dysfunctional system in the country and prejudices in people’s mindsets at that time.

The two main ethnic groups presented in this film are White and African Americans, and the three social groups are afflu

In [45]:
sampling_params = SamplingParams(
    temperature=0.6, 
    top_p=0.9, 
    max_tokens=2058, 
    skip_special_tokens=True,
    stop=[tokenizer.eos_token]
)

pii_list = get_pii_list(pii_sample)
print(pii_list)

pii_text = generate_pii_text(model, tokenizer, sampling_params, text, pii_list, split_by_sentence=False, max_piis=max_piis)
print()
print(pii_text)


[('ilpisug@zot.ci', 'email'), (' (823) 276 3 1564', 'phone_number'), ('Elvapattersonsahavi@ho.re', 'email'), ('WAYNEvagjuv@ize.sl', 'email'), ('78-47--2-2-5--32-96--5052836', 'id_number'), ('13789', 'id_number'), ('019847530123598764', 'id_number'), ('SKINNY@bibdivo.su', 'email'), ('6919978449503183', 'id_number'), ('02 (320 537) 8935203', 'phone_number'), ('Holt@ginsaufo.es', 'email'), ('4797_4_350', 'id_number')]


Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.48s/it, Generation Speed: 37.85 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:17<00:00, 17.23s/it, Generation Speed: 49.98 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.35s/it, Generation Speed: 48.71 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:11<00:00, 11.05s/it, Generation Speed: 49.32 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it, Generation Speed: 44.83 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:20<00:00, 20.48s/it, Generation Speed: 48.84 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.59s/it, Generation Speed: 48.76 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.41s/it, Generation Speed: 48.68 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it, Generation Speed: 47.88 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.92s/it, Generation Speed: 50.91 toks/s]
Processed prompts: 100%|██████

Unexpected Error: {"text": "the three social groups are affluent slaveholders, working for middle class, and enslaved people. The movie starts with the story of a free African American violinist Solomon Northup, living with his family in Saratoga, New York (McQueen, 2013). However, he was abducted by two white men, who tortured the man and sold him into slavery, changing his name to Plat. Before they met, Solomon and these two slave traders belonged to [32mwho had an id number[0m [34m78-47--2-2-5--32-96--5052836[0m [32mwho were from the same middle class, but their racial biases and prejudices led them to treat Northup as a slave, despite their shared socioeconomic background.


Processed prompts: 100%|██████████| 1/1 [00:09<00:00,  9.82s/it, Generation Speed: 50.31 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:11<00:00, 11.62s/it, Generation Speed: 49.84 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.24s/it, Generation Speed: 50.17 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it, Generation Speed: 49.65 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.45s/it, Generation Speed: 50.62 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.40s/it, Generation Speed: 49.41 toks/s]


Unexpected Error: {"text": "nd the three social groups are affluent slaveholders, working for middle class, and enslaved people. The movie starts with the story of a free African American violinist Solomon Northup, living with his family in Saratoga, New York (McQueen, 2013). However, he was abducted by two white men, who tortured the man and sold him into slavery, changing his name to Plat. Before they met, Solomon and these two slave traders belonged to [32mwho had an id number[0m [34m78-47--2-2-5--32-96--5052836[0m [32mwho were from the same middle class, but their racial biases and prejudices led them to treat Northup as a slave, despite their shared socioeconomic background.


Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.46s/it, Generation Speed: 49.89 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it, Generation Speed: 50.69 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:09<00:00,  9.32s/it, Generation Speed: 49.92 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it, Generation Speed: 48.80 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:10<00:00, 10.97s/it, Generation Speed: 50.16 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.63s/it, Generation Speed: 49.97 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:10<00:00, 10.05s/it, Generation Speed: 49.04 toks/s]


12 Years a Slave: An Analysis of the Film Essay

The 2013 film 12 Years a Slave proved that slavery is a worldwide issue. Indeed, the film made $150 million outside the United States and $57 million in the U.S., with a production budget of $20 million (Sharf, 2020). The movie was based on the memoir Twelve Years a Slave by Solomon Northup (Ntim, 2020). It tells the story of a free African American man who was kidnapped and sold into slavery. Solomon spent twelve years away from his family, being traded from one master to another. Fortunately, the protagonist met a person who helped him deliver a message to [32mwho helped him deliver a message to his wife, and she was able to arrange his rescue[0m [34mHolt@ginsaufo.es[0m [32mFurthermore, the film's portrayal of the brutal realities of slavery and the struggles of its protagonist serves as a stark reminder of the ongoing impact of slavery on individuals and society, and the need to continue to confront and address the lingering eff




## Test Generate Dataset

In [6]:
output_dataset_name_path = "pii_dataset"
max_dataset_size = 10

terminators = [tokenizer.eos_token]

sampling_params = SamplingParams(
    temperature=0.6, 
    top_p=0.9, 
    max_tokens=2058, 
    skip_special_tokens=True,
    stop=terminators
)

# Generate PII Dataset
pii_dataset = generate_pii_dataset(model, tokenizer, sampling_params, essay_dataset['train']['TEXT'], grouped_pii_samples, output_dataset_name_path, max_dataset_size = max_dataset_size)

print(f"\nFinished Generating PII Dataset with {len(pii_dataset)} samples\n")


Processed prompts: 100%|██████████| 10/10 [00:03<00:00,  3.26it/s]
Processed prompts: 100%|██████████| 10/10 [00:18<00:00,  1.84s/it]
Processed prompts: 100%|██████████| 10/10 [00:07<00:00,  1.40it/s]
Processed prompts: 100%|██████████| 10/10 [00:19<00:00,  1.91s/it]
Processed prompts: 100%|██████████| 10/10 [00:04<00:00,  2.27it/s]
Processed prompts: 100%|██████████| 10/10 [00:29<00:00,  2.95s/it]
Processed prompts: 100%|██████████| 10/10 [00:16<00:00,  1.65s/it]
Processed prompts: 100%|██████████| 10/10 [00:32<00:00,  3.27s/it]
Processed prompts: 100%|██████████| 10/10 [00:23<00:00,  2.35s/it]
Processed prompts: 100%|██████████| 10/10 [00:26<00:00,  2.64s/it]
Processed prompts: 100%|██████████| 10/10 [00:08<00:00,  1.12it/s]
Processed prompts: 100%|██████████| 10/10 [00:25<00:00,  2.54s/it]
Processed prompts: 100%|██████████| 9/9 [00:08<00:00,  1.08it/s]
Processed prompts: 100%|██████████| 9/9 [00:27<00:00,  3.10s/it]
Processed prompts: 100%|██████████| 9/9 [00:21<00:00,  2.42s/it]
P

In [12]:
pii_dataset[0]

{'source_text': '12 Years a Slave: An Analysis of the Film Essay\n\nThe 2013 film 12 Years a Slave proved that slavery is a worldwide issue. Indeed, the film made $150 million outside the United States and $57 million in the U.S., with a production budget of $20 million (Sharf, 2020). The movie was based on the memoir Twelve Years a Slave by Solomon Northup (Ntim, 2020). It tells the story of a free African American man who was kidnapped and sold into slavery. Solomon spent twelve years away from his family, being traded from one master to another. Fortunately, the protagonist met a person who helped him deliver a message to his family and friends, who came and rescued him. This movie accurately illustrates discriminatory relationships between white slaveholders and black slaves that stemmed from the dysfunctional system in the country and prejudices in people’s mindsets at that time. The two main ethnic groups presented in this film are White and African Americans, and the three socia

In [11]:
# Print the first 5 samples
print("First 5 samples:")
for i in range(5):
    if i >= len(pii_dataset):
        break
    print(f"Sample {i+1}:")
    print(f"\nSource Text: {pii_dataset[i]['source_text']}")
    print(f"\nPII Text: {pii_dataset[i]['pii_text']}")
    print(f"\nPII Data: {pii_dataset[i]['pii_data']}")
    print(f"\nPII Labels: {pii_dataset[i]['pii_labels']}")
    print("\n")

In [9]:
# Print the first 5 samples
print("First 5 samples:")
for i in range(5):
    if i >= len(pii_dataset):
        break
    print(f"Sample {i+1}:")
    print(f"\nSource Text: {pii_dataset[i]['source_text']}")
    print(f"\nPII Text: {pii_dataset[i]['pii_text']}")
    print(f"\nPII Data: {pii_dataset[i]['pii_data']}")
    print(f"\nPII Labels: {pii_dataset[i]['pii_labels']}")
    print("\n")