In [1]:
%pip install -q -U transformers datasets accelerate peft trl bitsandbytes
%pip install einops

[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
%pip install flash-attn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
import os
import re
import torch
from spacy.lang.en import English
from datasets import (load_dataset, Dataset)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
import pandas as pd
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Set the enviornment as Hugging Face Token
# os.environ["HF_TOKEN"] = ""

# Select Models
base_model = "meta-llama/Meta-Llama-3-8B"
new_model = "llama3-8b-4bit-pii"
final_model_name = new_model + "-merged"
tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token=tokenizer.eos_token


nlp = English()
english_tokenizer = nlp.tokenizer

pii_labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']
pii_labels_pattern = '|'.join(pii_labels)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def extract_personal_formation(text):
    names, emails, usernames, id_numbers, phone_numbers, urls, street_addresses = [], [], [], [], [], [], []
    outputs = re.split(r',?\n', text)
    outputs = [re.sub(r'((^\d+\.)|(\*))\s*', '', output) for output in outputs]
    for output in outputs:
        result = re.search(f"[^)(\s]+\s?\(({pii_labels_pattern})\)", output)
        if not result: continue

        data = output.replace(f"({result.group(1)})", "").strip()
        label = result.group(1)
        if label == "NAME_STUDENT":
            names.append(data)
        elif label == "EMAIL":
            emails.append(data)
        elif label == "USERNAME":
            usernames.append(data)
        elif label == "ID_NUM":
            id_numbers.append(data)
        elif label == "PHONE_NUM":
            phone_numbers.append(data)
        elif label == "URL_PERSONAL":
            urls.append(data)
        elif label == "STREET_ADDRESS":
            street_addresses.append(data)
    
    return (names, emails, usernames, id_numbers, phone_numbers, urls, street_addresses)

def format_pii_output(text):
    if not text: return ""
    
    names, emails, usernames, id_numbers, phone_numbers, urls, street_addresses = extract_personal_formation(text)
    names = "\n".join([f"* {name}" for name in names]) if names else "* None"
    emails = "\n".join([f"* {email}" for email in emails]) if emails else "* None"
    usernames = "\n".join([f"* {username}" for username in usernames]) if usernames else "* None"
    id_numbers = "\n".join([f"* {id_number}" for id_number in id_numbers]) if id_numbers else "* None"
    phone_numbers = "\n".join([f"* {phone_number}" for phone_number in phone_numbers]) if phone_numbers else "* None"
    urls = "\n".join([f"* {url}" for url in urls]) if urls else "* None"
    street_addresses = "\n".join([f"* {street_address}" for street_address in street_addresses]) if street_addresses else "* None"

    return f'''List of Names:
{names}

List of Emails:
{emails}

List of Usernames:
{usernames}

List of ID Numbers:
{id_numbers}

List of Phone Numbers:
{phone_numbers}

List of URLs:
{urls}

List of Street Addresses:
{street_addresses}'''

def format_prompt(text: str, answer: str = ''):
  if answer != '':
    answer = format_pii_output(answer)
    answer += "\n" + tokenizer.eos_token

  return f'''<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Your task is to identify and list any personal information in the input text. Please, list the following personal information.

List of Names:
* first full or partial name of a person

List of Emails:
* email address of a person

List of Usernames:
* username of a person

List of ID Numbers:
* number or sequence of characters that could be used to identify a person like the student ID or a social security number

List of Phone Numbers:
* phone number associated with a person

List of URLs:
* URL that might be used to identify a person

List of Street Addresses:
* full or partial street address that is associated with the person, such as a home address

### Input:
Text: {text}

### Response:
{answer}'''

## Loading Alpaca

In [4]:
alpaca_prompt = '''<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}'''

EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass


In [5]:
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
alapca_train_dataset = dataset.map(formatting_prompts_func, batched = True,)
alapca_train_dataset = pd.DataFrame(alapca_train_dataset)
alapca_train_dataset

Unnamed: 0,output,input,instruction,text
0,1. Eat a balanced and nutritious diet: Make su...,,Give three tips for staying healthy.,<|begin_of_text|>Below is an instruction that ...
1,"The three primary colors are red, blue, and ye...",,What are the three primary colors?,<|begin_of_text|>Below is an instruction that ...
2,An atom is the basic building block of all mat...,,Describe the structure of an atom.,<|begin_of_text|>Below is an instruction that ...
3,There are several ways to reduce air pollution...,,How can we reduce air pollution?,<|begin_of_text|>Below is an instruction that ...
4,I had to make a difficult decision when I was ...,,Pretend you are a project manager of a constru...,<|begin_of_text|>Below is an instruction that ...
...,...,...,...,...
51755,Yes,Text: John went out for a walk with his dog Ro...,You will be given a piece of text about an eve...,<|begin_of_text|>Below is an instruction that ...
51756,True,Text: Michael Jordan is an American former pro...,You will be given a paragraph of text with var...,<|begin_of_text|>Below is an instruction that ...
51757,True,Text: A tree fell over in the wind and caused ...,You will be given a piece of text about an eve...,<|begin_of_text|>Below is an instruction that ...
51758,Backwards,"Steps: ['She takes out her books', 'The teache...",I will give you a list of steps. You need to ...,<|begin_of_text|>Below is an instruction that ...


## Load Custom PII Dataset

In [6]:
def extract_training_data_from_documents(documents, window_size=10000):
    temp_label_buffer = ''
    labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']
    data_finetune_full = []
    data_finetune_no_answer = []

    for document in documents:
        tokens = document['tokens']
        labels_per_token = document['labels']

        token_chunks = [tokens[i:i + window_size] for i in range(0, len(tokens), window_size)]
        label_chunks = [labels_per_token[i:i + window_size] for i in range(0, len(labels_per_token), window_size)]

        for chunk_index in range(len(token_chunks)):
            text = ''
            output = ''
            answers = []

            for token_index in range(len(token_chunks[chunk_index])):
                token = token_chunks[chunk_index][token_index]
                label = label_chunks[chunk_index][token_index]

                if token in ['.', ',', '!', '?', "'", '(', ')', ' ']:
                    text += token
                elif '#' in token:
                    new_token = token.replace('#','')
                    text += new_token
                else:
                    text += ' ' + token

                for label_type in labels:
                    if label == 'B-' + label_type or label == 'I-' + label_type:
                        if len(temp_label_buffer) > 1:
                            temp_label_buffer += (' ' + token)
                        else:
                            temp_label_buffer += token
                    elif len(temp_label_buffer) > 0 and label == 'O' and (label_chunks[chunk_index][token_index - 1] == 'B-' + label_type or
                                            label_chunks[chunk_index][token_index - 1] == 'I-' + label_type):
                        temp_label_buffer += (' ' + f'({label_type})')
                        if temp_label_buffer not in answers:
                            answers.append(temp_label_buffer)

                if label == 'O':
                    temp_label_buffer = ''

            for answer in answers:
                output += answer + '\n'

            data_finetune_full.append({'text': format_prompt(text, output), 'output': output, 'full_text': text})
            data_finetune_no_answer.append({'text': format_prompt(text, ''), 'output': output, 'full_text': text})

    return data_finetune_full, data_finetune_no_answer

def retokenize(mod_30k_data):
  nlp = English()
  # Create a Tokenizer with the default settings for English
  # including punctuation rules and exceptions
  tokenizer = nlp.tokenizer

  for k in range(len(mod_30k_data)):
    new_tokens = []
    tokens = tokenizer(mod_30k_data[k]['source_text'])
    labels = ['O'] * len(tokens)

    for i in range(len(labels)):
      for label in mod_30k_data[k]['privacy_mask']:
        if str(tokens[i]) in label['value']:
          labels[i] = 'I-'+label['label']
      new_tokens.append(str(tokens[i]))

    mod_30k_data[k]['labels'] = labels
    mod_30k_data[k]['tokens'] = new_tokens

  return mod_30k_data

def replace_strings(lst, old_string, new_string):
    return [new_string if item == old_string else item for item in lst]

def label_replace(mod_30k_data):
  old_label = ['LASTNAME1','LASTNAME2','SOCIALNUMBER','TEL','DRIVERLICENSE','STREET','BUILDING','PASSPORT','GIVENNAME1','GIVENNAME2','LASTNAME3','STATE','POSTCODE','CITY','IDCARD','IP']
  new_label = ['NAME_STUDENT','NAME_STUDENT','ID_NUM','PHONE_NUM','ID_NUM','STREET_ADDRESS','STREET_ADDRESS','ID_NUM','NAME_STUDENT','NAME_STUDENT','NAME_STUDENT','STREET_ADDRESS','STREET_ADDRESS','STREET_ADDRESS','ID_NUM','URL_PERSONAL']

  address_builder = ''

  for i in range(len(mod_30k_data)):
    for k in range(len(mod_30k_data[i]['privacy_mask'])):
      for t in range(len(new_label)):
        if mod_30k_data[i]['privacy_mask'][k]['label'] == old_label[t]:
          if mod_30k_data[i]['privacy_mask'][k]['label'] == 'IP' or mod_30k_data[i]['privacy_mask'][k]['label']=='URL_PERSONAL':
            mod_30k_data[i]['privacy_mask'][k]['value'] = get_new_url()

          mod_30k_data[i]['privacy_mask'][k]['label'] = new_label[t]
          continue

  return mod_30k_data

def bert_300_gen(documents):
  labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']
  temp_output_arr = []
  temp_output = ''


  for t in range(len(documents)):
    for i in range(len(documents[t]['privacy_mask'])):
      if documents[t]['privacy_mask'][i]['label'] in labels:
        if documents[t]['privacy_mask'][i]['value']+' ('+documents[t]['privacy_mask'][i]['label']+')\n' not in temp_output_arr:
          temp_output_arr.append(documents[t]['privacy_mask'][i]['value']+' ('+documents[t]['privacy_mask'][i]['label']+')\n')

    for strings in temp_output_arr:
      temp_output+=strings

    documents[t]['output'] = temp_output
    documents[t]['text'] = format_prompt(documents[t]['source_text'],temp_output)
    documents[t]['testing_text'] = format_prompt(documents[t]['source_text'],'')
    documents[t]['full_text'] = documents[t]['source_text']


    temp_output_arr = []
    temp_output = ''

  return documents

In [7]:
import pandas as pd
data_path = "30k_english_instruction.json"
jsonObj = pd.read_json(path_or_buf=data_path, lines=True)

mod_30k_data = jsonObj.to_dict('records')
mod_30k_data = mod_30k_data[0]
mod_30k_data = label_replace(mod_30k_data)
data = bert_300_gen(mod_30k_data)

In [8]:
mod_30k_data = list(mod_30k_data.values())
custom_pii_dataset_train = pd.DataFrame(mod_30k_data)
custom_pii_dataset_train

Unnamed: 0,source_text,target_text,privacy_mask,span_labels,mbert_text_tokens,mbert_bio_labels,id,language,set,output,final_text,testing_text,text,full_text
0,Subject: Group Messaging for Admissions Proces...,Subject: Group Messaging for Admissions Proces...,"[{'value': 'wynqvrh053', 'start': 287, 'end': ...","[[440, 453, ""USERNAME""], [430, 437, ""TIME""], [...","[Sub, ##ject, :, Group, Mess, ##aging, for, Ad...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40767A,English,train,wynqvrh053 (USERNAME)\nluka.burg (USERNAME)\nq...,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|>Below is an instruction that ...,<|begin_of_text|>Below is an instruction that ...,Subject: Group Messaging for Admissions Proces...
1,- Meeting at 2:33 PM\n- N23 - Meeting at 11:29...,- Meeting at [TIME]\n- [USERNAME] - Meeting at...,"[{'value': '2:33 PM', 'start': 13, 'end': 20, ...","[[74, 81, ""TIME""], [50, 60, ""USERNAME""], [40, ...","[-, Meeting, at, 2, :, 33, PM, -, N, ##23, -, ...","[O, O, O, B-TIME, I-TIME, I-TIME, I-TIME, O, O...",40767B,English,train,N23 (USERNAME)\nwennmann27 (USERNAME)\n,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|>Below is an instruction that ...,<|begin_of_text|>Below is an instruction that ...,- Meeting at 2:33 PM\n- N23 - Meeting at 11:29...
2,Subject: Admission Notification - Great Britai...,Subject: Admission Notification - Great Britai...,"[{'value': '5:24am', 'start': 263, 'end': 269,...","[[395, 407, ""SOCIALNUMBER""], [358, 375, ""EMAIL...","[Sub, ##ject, :, Ad, ##mission, Not, ##ificati...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40768A,English,train,Balloi (NAME_STUDENT)\nEckrich (NAME_STUDENT)\...,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|>Below is an instruction that ...,<|begin_of_text|>Below is an instruction that ...,Subject: Admission Notification - Great Britai...
3,Card: KB90324ER\n Country: GB\n Building: ...,Card: [IDCARD]\n Country: [COUNTRY]\n Buil...,"[{'value': 'KB90324ER', 'start': 6, 'end': 15,...","[[390, 393, ""STATE""], [368, 378, ""CITY""], [346...","[Card, :, KB, ##90, ##32, ##4, ##ER, \, n, Cou...","[O, O, B-IDCARD, I-IDCARD, I-IDCARD, I-IDCARD,...",40768B,English,train,KB90324ER (ID_NUM)\n163 (STREET_ADDRESS)\nCony...,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|>Below is an instruction that ...,<|begin_of_text|>Below is an instruction that ...,Card: KB90324ER\n Country: GB\n Building: ...
4,"N, WA14 5RW\n Password: r]iD1#8\n\n...and so...","N, WA14 5RW\n Password: [PASS]\n\n...and so ...","[{'value': 'r]iD1#8', 'start': 26, 'end': 33, ...","[[336, 352, ""DATE""], [26, 33, ""PASS""]]","[N, ,, W, ##A, ##14, 5, ##R, ##W, \, n, Pass, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PASS...",40768C,English,train,,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|>Below is an instruction that ...,<|begin_of_text|>Below is an instruction that ...,"N, WA14 5RW\n Password: r]iD1#8\n\n...and so..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29903,e of Birth: [18/01/1962]\n - Passport Number...,e of Birth: [[BOD]]\n - Passport Number: [[P...,"[{'value': '18/01/1962', 'start': 13, 'end': 2...","[[379, 384, ""POSTCODE""], [360, 361, ""TIME""], [...","[e, of, Birth, :, [, 18, /, 01, /, 1962, ], -,...","[O, O, O, O, O, B-BOD, I-BOD, I-BOD, I-BOD, I-...",53616B,English,train,MI (ID_NUM)\n031412682 (ID_NUM)\n350804398 (ID...,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|>Below is an instruction that ...,<|begin_of_text|>Below is an instruction that ...,e of Birth: [18/01/1962]\n - Passport Number...
29904,"2022, in New York City. The assessment include...","2022, in New York City. The assessment include...",[],[],"[2022, ,, in, New, York, City, ., The, assessm...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",53616C,English,train,,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|>Below is an instruction that ...,<|begin_of_text|>Below is an instruction that ...,"2022, in New York City. The assessment include..."
29905,Art Therapy Code of Ethics:\n\nNotice to all a...,Art Therapy Code of Ethics:\n\nNotice to all a...,"[{'value': '281586425', 'start': 410, 'end': 4...","[[410, 419, ""IDCARD""]]","[Art, Therapy, Code, of, Ethics, :, Notice, to...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",53617A,English,train,281586425 (ID_NUM)\n,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|>Below is an instruction that ...,<|begin_of_text|>Below is an instruction that ...,Art Therapy Code of Ethics:\n\nNotice to all a...
29906,"72330716, 8015553273660, N60324048, 743010413,...","72330716, [IDCARD], [IDCARD], [IDCARD], [IDCAR...","[{'value': '8015553273660', 'start': 10, 'end'...","[[380, 397, ""TEL""], [363, 378, ""TEL""], [350, 3...","[723, ##30, ##71, ##6, ,, 801, ##55, ##53, ##2...","[O, O, O, O, O, B-IDCARD, I-IDCARD, I-IDCARD, ...",53617B,English,train,8015553273660 (ID_NUM)\nN60324048 (ID_NUM)\n74...,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|>Below is an instruction that ...,<|begin_of_text|>Below is an instruction that ...,"72330716, 8015553273660, N60324048, 743010413,..."


## Load PII Dataset

In [9]:
def extract_training_data_from_documents(documents, window_size=10000):
    temp_label_buffer = ''
    labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']
    data_finetune_full = []
    data_finetune_no_answer = []

    for document in documents:
        tokens = document['tokens']
        labels_per_token = document['labels']

        token_chunks = [tokens[i:i + window_size] for i in range(0, len(tokens), window_size)]
        label_chunks = [labels_per_token[i:i + window_size] for i in range(0, len(labels_per_token), window_size)]

        for chunk_index in range(len(token_chunks)):
            text = ''
            output = ''
            answers = []

            for token_index in range(len(token_chunks[chunk_index])):
                token = token_chunks[chunk_index][token_index]
                label = label_chunks[chunk_index][token_index]

                if token in ['.', ',', '!', '?', "'", '(', ')', ' ']:
                    text += token
                else:
                    text += ' ' + token

                for label_type in labels:
                    if label == 'B-' + label_type or label == 'I-' + label_type:
                        if len(temp_label_buffer) > 1:
                            temp_label_buffer += (' ' + token)
                        else:
                            temp_label_buffer += token
                    elif label == 'O' and (label_chunks[chunk_index][token_index - 1] == 'B-' + label_type or
                                            label_chunks[chunk_index][token_index - 1] == 'I-' + label_type):
                        temp_label_buffer += (' ' + f'({label_type})')
                        if temp_label_buffer not in answers:
                            answers.append(temp_label_buffer)

                if label == 'O':
                    temp_label_buffer = ''

            for answer in answers:
                output += answer + '\n'

            output = output.strip()
            data_finetune_full.append({'text': format_prompt(text, output), 'output': output, 'full_text': text})
            data_finetune_no_answer.append({'text': format_prompt(text, ''), 'output': output, 'full_text': text})

    return data_finetune_full, data_finetune_no_answer

In [10]:
train_data_path = "train.json"
test_data_path = "test.json"
num_test_data = 13
test_data_percentage = 0.002 # needs to be > 0

# Loading Dataset
with open(train_data_path) as file:
    train_data_json = json.load(file)

with open(test_data_path ) as file:
    test_data_json = json.load(file)

test_data_size = int(len(train_data_json) * test_data_percentage)
print("Training Data: ", len(train_data_json))
print("Test Data: ", len(test_data_json))

# TODO: Only using the first samples as test data, which are only for predicting names. Need to include the other types of information in our data as well, and also add data from the other PII dataset into the testing data
training_data, testing_data = extract_training_data_from_documents(train_data_json[test_data_size:], 1000)

# Convert the list of dictionaries into a DataFrame
pii_dataset_train = pd.DataFrame(training_data)
pii_dataset_train

Training Data:  6807
Test Data:  10


Unnamed: 0,text,output,full_text
0,<|begin_of_text|>Below is an instruction that ...,Nathalie Sylla (NAME_STUDENT),Design Thinking for innovation reflexion - Av...
1,<|begin_of_text|>Below is an instruction that ...,Diego Estrada (NAME_STUDENT),Diego Estrada \n\n Design Thinking Assignment...
2,<|begin_of_text|>Below is an instruction that ...,Gilberto Gamboa (NAME_STUDENT),Reporting process \n\n by Gilberto Gamboa \n\...
3,<|begin_of_text|>Below is an instruction that ...,Sindy Samaca (NAME_STUDENT),Design Thinking for Innovation \n\n Sindy Sam...
4,<|begin_of_text|>Below is an instruction that ...,,", I \n\n would like to use the learning launch..."
...,...,...,...
7952,<|begin_of_text|>Below is an instruction that ...,,EXAMPLE – JOURNEY MAP \n\n THE CHALLENGE ...
7953,<|begin_of_text|>Below is an instruction that ...,,Why Mind Mapping? \n\n Mind maps are graphica...
7954,<|begin_of_text|>Below is an instruction that ...,,"Challenge \n\n So, a few months back, I had c..."
7955,<|begin_of_text|>Below is an instruction that ...,,Brainstorming \n\n Challenge & Selection \n\n...


## Label Distribution

In [11]:
# Counting number of occurances of each label
pii_labels = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
num_labels = {label: 0 for label in pii_labels}

for label in pii_labels:
  for output in pii_dataset_train['output']:
    num_labels[label] += output.count(label)

print("PII Occurrances:\n")
num_labels

PII Occurrances:



{'NAME_STUDENT': 988,
 'EMAIL': 26,
 'USERNAME': 5,
 'ID_NUM': 67,
 'PHONE_NUM': 4,
 'URL_PERSONAL': 97,
 'STREET_ADDRESS': 2}

In [12]:
# Counting number of occurances of each label
pii_labels = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
num_labels = {label: 0 for label in pii_labels}

for label in pii_labels:
  for output in custom_pii_dataset_train['output']:
    num_labels[label] += output.count(label)

print("Custom PII Occurrances:\n")
num_labels

Custom PII Occurrances:



{'NAME_STUDENT': 21559,
 'EMAIL': 9716,
 'USERNAME': 10867,
 'ID_NUM': 36661,
 'PHONE_NUM': 7262,
 'URL_PERSONAL': 8145,
 'STREET_ADDRESS': 36159}

## Merge Datasets

In [13]:
dataset_train = pd.concat([alapca_train_dataset, pii_dataset_train[-1000:-1], pii_dataset_train[500:2000], pii_dataset_train[int(len(pii_dataset_train) * 0.5):int(len(pii_dataset_train) * 0.5) + 2000], custom_pii_dataset_train])
dataset_train = Dataset.from_pandas(dataset_train)
dataset_train

Dataset({
    features: ['output', 'input', 'instruction', 'text', 'full_text', 'source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set', 'final_text', 'testing_text', '__index_level_0__'],
    num_rows: 86167
})

In [14]:
print(dataset_train['text'][70000])

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Your task is to identify and list any personal information in the input text. Please, list the following personal information.

List of Names:
* first full or partial name of a person

List of Emails:
* email address of a person

List of Usernames:
* username of a person

List of ID Numbers:
* number or sequence of characters that could be used to identify a person like the student ID or a social security number

List of Phone Numbers:
* phone number associated with a person

List of URLs:
* URL that might be used to identify a person

List of Street Addresses:
* full or partial street address that is associated with the person, such as a home address

### Input:
Text: <html>
<head>
<title>Neurocognitive Testing Results</title>
</head>
<body>
<h1>Individual Test Results</h1>
<h2>Participant: Maâr

## Model Training

In [15]:
#To reduce the VRAM usage we will load the model in 4 bit precision and we will do quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    #Quant type
    #We will use the "nf4" format this was introduced in the QLoRA paper
    bnb_4bit_quant_type="nf4",
    #As the model weights are stored using 4 bits and when we want to compute its only going to use 16 bits so we have more accuracy
    bnb_4bit_compute_dtype=torch.float16,
    #Quantization parameters are quantized
    bnb_4bit_use_double_quant=False,
)

peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0.1,
    r=256,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"
                , "down_proj", "lm_head"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0},
    use_auth_token=True,
    use_flash_attention_2=True, # use flash attention 2
)


model.config.use_cache = True

# Cast the layernorm in fp32, make output embedding layer require grads, add the upcasting of the lmhead to fp32
#prepare_model_for_kbit_training---> This function basically helps to built the best model possible
model = prepare_model_for_kbit_training(model,use_gradient_checkpointing=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.05s/it]


In [16]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.988 GB.
9.709 GB of memory reserved.


In [17]:
# Set training arguments
training_arguments = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 20,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
)


# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    dataset_text_field = "text",
    peft_config=peft_config,
    dataset_num_proc = 2,
    max_seq_length=2048,# In dataset creation we put a threshold 2k for context length (input token limit) but we dont have enough VRAM unfortunately it will take a lot of VRAM to put everything into memory so we are just gonna stop at 512
    tokenizer=tokenizer,
    packing = False,
    args=training_arguments,
)

Map (num_proc=2): 100%|██████████| 86167/86167 [00:20<00:00, 4165.84 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [18]:
tokenizer.pad_token=tokenizer.eos_token
# tokenizer.padding_side="left"

In [19]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
1,2.1837
2,1.7537
3,1.8052
4,1.7815
5,1.4599
6,1.168
7,1.2808
8,0.9304
9,1.2683
10,0.8853




In [20]:
# Empty VRAM
del model
# del pipe
del trainer
import gc
gc.collect()

47997

In [21]:
###Merge the Base Model with the Trained Adapter
# Reload model in FP16 and merge it with LoRA weights
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)

#Reload the Base Model and load the QLoRA adapters
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.05it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
model.save_pretrained(final_model_name)
tokenizer.save_pretrained(final_model_name)

('llama3-8b-4bit-pii-merged/tokenizer_config.json',
 'llama3-8b-4bit-pii-merged/special_tokens_map.json',
 'llama3-8b-4bit-pii-merged/tokenizer.json')

In [23]:
del model

## vLLM Test Model

### Load Model

In [29]:
%pip install spacy
%pip install kaleido python-multipart typing-extensions
%pip install vllm
%pip install huggingface_hub
%pip install flash-attn
%pip install bitsandbytes

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.


In [24]:
from transformers import AutoTokenizer
from spacy.lang.en import English
from huggingface_hub import login
from vllm import LLM, SamplingParams
import transformers
import torch
import os
import json
import re

2024-05-09 03:46:33,095	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [25]:
llm = LLM(model=final_model_name, gpu_memory_utilization=0.95, max_context_len_to_capture=2048)

INFO 05-09 03:46:33 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='llama3-8b-4bit-pii-merged', tokenizer='llama3-8b-4bit-pii-merged', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-09 03:46:33 selector.py:16] Using FlashAttention backend.
INFO 05-09 03:46:35 model_runner.py:104] Loading model weights took 14.9595 GB
INFO 05-09 03:46:36 gpu_executor.py:94] # GPU blocks: 3140, # CPU blocks: 2048
INFO 05-09 03:46:36 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-09 03:46:36 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-09 03:46:47 model_runner.py:867] Graph capturing finished in 10 secs.


### Load Dataset

In [26]:
def extract_training_data_from_documents(documents, window_size=10000):
    temp_label_buffer = ''
    labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']
    data_finetune_full = []
    data_finetune_no_answer = []

    for document in documents:
        tokens = document['tokens']
        labels_per_token = document['labels']

        token_chunks = [tokens[i:i + window_size] for i in range(0, len(tokens), window_size)]
        label_chunks = [labels_per_token[i:i + window_size] for i in range(0, len(labels_per_token), window_size)]

        for chunk_index in range(len(token_chunks)):
            text = ''
            output = ''
            answers = []

            for token_index in range(len(token_chunks[chunk_index])):
                token = token_chunks[chunk_index][token_index]
                label = label_chunks[chunk_index][token_index]

                if token in ['.', ',', '!', '?', "'", '(', ')', ' ']:
                    text += token
                else:
                    text += ' ' + token

                for label_type in labels:
                    if label == 'B-' + label_type or label == 'I-' + label_type:
                        if len(temp_label_buffer) > 1:
                            temp_label_buffer += (' ' + token)
                        else:
                            temp_label_buffer += token
                    elif label == 'O' and (label_chunks[chunk_index][token_index - 1] == 'B-' + label_type or
                                            label_chunks[chunk_index][token_index - 1] == 'I-' + label_type):
                        temp_label_buffer += (' ' + f'({label_type})')
                        if temp_label_buffer not in answers:
                            answers.append(temp_label_buffer)

                if label == 'O':
                    temp_label_buffer = ''

            for answer in answers:
                output += answer + '\n'

            output = output.strip()
            data_finetune_full.append({'text': format_prompt(text, output), 'output': output, 'full_text': text})
            data_finetune_no_answer.append({'text': format_prompt(text, ''), 'output': output, 'full_text': text})

    return data_finetune_full, data_finetune_no_answer

In [27]:
test_data_path = "train.json"
test_data_percentage = 0.002
# Loading Dataset
with open(test_data_path) as file:
    test_data_json = json.load(file)
    print("Testing Data: ", len(test_data_json))

# Limiting the data for testing
test_data_size = int(len(test_data_json) * test_data_percentage)
print("Test Data Size: ", test_data_size)

test_data = test_data_json[:test_data_size]
training_data, testing_data = extract_training_data_from_documents(test_data, 400)

# Convert the list of dictionaries into a DataFrame
pii_dataset_test = pd.DataFrame(training_data)
pii_dataset_test


Testing Data:  6807
Test Data Size:  13


Unnamed: 0,text,output,full_text
0,<|begin_of_text|>Below is an instruction that ...,Nathalie Sylla (NAME_STUDENT),Design Thinking for innovation reflexion - Av...
1,<|begin_of_text|>Below is an instruction that ...,Nathalie Sylla (NAME_STUDENT),"of questions, we can use : who, what, when, ..."
2,<|begin_of_text|>Below is an instruction that ...,Diego Estrada (NAME_STUDENT),Diego Estrada \n\n Design Thinking Assignment...
3,<|begin_of_text|>Below is an instruction that ...,Diego Estrada (NAME_STUDENT),which eases the work of the team. \n\n Insig...
4,<|begin_of_text|>Below is an instruction that ...,Gilberto Gamboa (NAME_STUDENT),Reporting process \n\n by Gilberto Gamboa \n\...
5,<|begin_of_text|>Below is an instruction that ...,,ask for a post - meeting review of the new fo...
6,<|begin_of_text|>Below is an instruction that ...,Sindy Samaca (NAME_STUDENT),Design Thinking for Innovation \n\n Sindy Sam...
7,<|begin_of_text|>Below is an instruction that ...,,the two investors. The first thing I did with...
8,<|begin_of_text|>Below is an instruction that ...,,"small changes, which \n\n were being introduc..."
9,<|begin_of_text|>Below is an instruction that ...,Nadine Born (NAME_STUDENT),Assignment : Visualization Reflection ...


### Test One Sample

In [28]:
# Test Input
input_text = pii_dataset_test['full_text'][0]
# input_text = "Javier Rosa recently moved to 22233 Escalante Run and can be reached at 290-828-2909 for inquiries about the community book club."
print(input_text)

 Design Thinking for innovation reflexion - Avril 2021 - Nathalie Sylla 

 Challenge & selection 

 The tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map. 

 What exactly is a mind map? According to the definition of Buzan T. and Buzan B.( 1999, Dessine - moi  l'intelligence. Paris : Les Éditions d'Organisation.), the mind map( or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain 's  potential to be released. Cf Annex1 

 This tool has many advantages : 

 •  It is accessible to all and does not require significant material investment and can be done  quickly 

 •  It is scalable 

 •  It allows categorization and linking of information 

 •  It can be applied to any type of situation : notetaking, problem solving, analysis, creation of  new ideas 

 •  It is suitable for all people and is easy to learn 

 •  It is fun and encourages exchanges 

 •  It 

In [29]:
print(format_prompt(input_text))

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Your task is to identify and list any personal information in the input text. Please, list the following personal information.

List of Names:
* first full or partial name of a person

List of Emails:
* email address of a person

List of Usernames:
* username of a person

List of ID Numbers:
* number or sequence of characters that could be used to identify a person like the student ID or a social security number

List of Phone Numbers:
* phone number associated with a person

List of URLs:
* URL that might be used to identify a person

List of Street Addresses:
* full or partial street address that is associated with the person, such as a home address

### Input:
Text:  Design Thinking for innovation reflexion - Avril 2021 - Nathalie Sylla 

 Challenge & selection 

 The tool I use to help all st

In [30]:
# Setting Hyperparameters
sampling_params = SamplingParams(temperature=0, max_tokens=512)

# Testing Model
outputs = llm.generate(
    # [format_prompt(input_text[:200] + ". Javier Rosa recently moved to 22233 Escalante Run and can be reached at 290-828-2909 for inquiries about the community book club. Anna Smith and Jessica is here as well." + input_text[50:])],
    [format_prompt(input_text)],
    sampling_params
    )

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print()
    print("Generated text:\n", generated_text)

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.37s/it]


Generated text:
 List of Names:
* Nathalie

List of Emails:
* None

List of Usernames:
* None

List of ID Numbers:
* None

List of Phone Numbers:
* None

List of URLs:
* None

List of Street Addresses:
* None






##

### Test All Evaluation Samples

In [31]:
pii_categories = ['Names', 'Emails', 'Usernames', 'ID Numbers', 'Phone Numbers', 'URLs', 'Street Addresses']
pii_categories_pattern = '|'.join(pii_categories)
category_to_label = {'Names': 'NAME_STUDENT', 'Emails': 'EMAIL', 'Usernames': 'USERNAME', 'ID Numbers': 'ID_NUM', 'Phone Numbers': 'PHONE_NUM', 'URLs': 'URL_PERSONAL', 'Street Addresses': 'STREET_ADDRESS'}


In [32]:
# Convert new output format to previous output
def convert_output_format(text):
    outputs = []
    pii_outputs = re.findall(f"({pii_categories_pattern}):\n((\*.+?\n)+)", text, re.DOTALL)
    for category, data, _ in pii_outputs:
        data = data.replace("* ", "").strip().split("\n")

        for item in data:
            if item == 'None': continue
            outputs.append(f"{item} ({category_to_label[category]})")

    return outputs

In [33]:
nlp = English()
english_tokenizer = nlp.tokenizer

In [34]:
# Predict
model_outputs = llm.generate(
    [format_prompt(input) for input in pii_dataset_test['full_text']],
    sampling_params
)

Processed prompts: 100%|██████████| 34/34 [03:56<00:00,  6.97s/it]  


In [42]:
total_classifications = 0
num_correct_classifications = 0
num_hallucinated = 0
# Evaluate
for id, (input_text, expected_output, model_output, expected_text) in enumerate(zip(pii_dataset_test['full_text'], pii_dataset_test['output'], model_outputs, pii_dataset_test['text'])):
    expected_output_list = re.split(r',?\n', expected_output)
    expected_output_list = [re.sub(r'((^\d+\.)|(\*))\s*', '', output) for output in expected_output_list] # Remove "* " or "1. " if values starts with them
    expected_output_list_lowercased = [re.sub(r'((^\d+\.)|(\*))\s*', '', output).lower().strip() for output in expected_output_list] # Remove "* " or "1. " if values starts with them
    generated_text = model_output.outputs[0].text

    # Process output text
    outputs: list[str] = convert_output_format(generated_text)
    print("Expected Outputs:\n", expected_output_list)
    print("Outputs:\n", outputs)
    print()

    # Total Classifications should be equal to the list of expected outputs
    total_classifications += len(expected_output_list)

    for output in outputs:
        if output.lower().strip() in expected_output_list_lowercased:
            num_correct_classifications += 1
        else:
            num_hallucinated += 1

# TODO: Calculate test accuracy with Rogue-L metric: rogue(pii_dataset_test['text'][i].replace(format_prompt(input)), model_output.outputs[i].text) 

print("Accuracy:", round(num_correct_classifications / total_classifications, 3))
print("Number Hallucinated:", num_hallucinated)

Expected Outputs:
 ['Nathalie Sylla (NAME_STUDENT)']
Outputs:
 ['Nathalie (NAME_STUDENT)']

Expected Outputs:
 ['Nathalie Sylla (NAME_STUDENT)']
Outputs:
 ['Nathalie Sylla (NAME_STUDENT)']

Expected Outputs:
 ['Diego Estrada (NAME_STUDENT)']
Outputs:
 ['Diego Estrada (NAME_STUDENT)']

Expected Outputs:
 ['Diego Estrada (NAME_STUDENT)']
Outputs:
 ['Diego Estrada (NAME_STUDENT)']

Expected Outputs:
 ['Gilberto Gamboa (NAME_STUDENT)']
Outputs:
 ['Gilberto Gamboa (NAME_STUDENT)']

Expected Outputs:
 ['']
Outputs:
 []

Expected Outputs:
 ['Sindy Samaca (NAME_STUDENT)']
Outputs:
 ['Sindy Samaca (NAME_STUDENT)']

Expected Outputs:
 ['']
Outputs:
 []

Expected Outputs:
 ['']
Outputs:
 []

Expected Outputs:
 ['Nadine Born (NAME_STUDENT)']
Outputs:
 ['Nadine (NAME_STUDENT)']

Expected Outputs:
 ['']
Outputs:
 []

Expected Outputs:
 ['']
Outputs:
 []

Expected Outputs:
 ['']
Outputs:
 []

Expected Outputs:
 ['']
Outputs:
 []

Expected Outputs:
 ['Eladio Amaya (NAME_STUDENT)']
Outputs:
 ['Eladio A