In [1]:
%pip install -q -U transformers datasets accelerate peft trl bitsandbytes
%pip install einops

[0mNote: you may need to restart the kernel to use updated packages.
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
%pip install flash-attn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
import pandas as pd

### Set the enviornment as Hugging Face Token
os.environ["HF_TOKEN"] = "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"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_model = "meta-llama/Meta-Llama-3-8B"
#Fine-tune model name
new_model = "llama2-pii"
#Load the Dataset from hugging face
# dataset = load_dataset("sahil2801/CodeAlpaca-20k",split="train")
#Tokenizer
#Load the tokenizer from Llama 2
tokenizer = AutoTokenizer.from_pretrained(base_model)
#In Llama2 we dont have the padding token which is a very big problem, because we have a dataset with different number of tokens in each row.
#So, we need to pad it so they all have the same length and here i am using end of sentence token and this will have an impact on the generation of our model
#I am using End of Sentence token for fine-tuning
tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="left"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [34]:
tokenizer.eos_token

'<|end_of_text|>'

In [35]:
'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>

{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}'''

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{}'

## Loading Alpaca

In [36]:
# alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# {}

# ### Input:
# {}

# ### Response:
# {}"""
alpaca_prompt = '''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>

### Instruction:
{}

### Input:
{}

### Response:
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{}'''

EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset

Dataset({
    features: ['output', 'input', 'instruction'],
    num_rows: 51760
})

In [37]:
# train_dataset = dataset.select([i for i in range(len(dataset)) if i not in set(train_indexes)])
alapca_train_dataset = dataset.map(formatting_prompts_func, batched = True,)
alapca_train_dataset

Map: 100%|██████████| 51760/51760 [00:00<00:00, 158315.21 examples/s]


Dataset({
    features: ['output', 'input', 'instruction', 'text'],
    num_rows: 51760
})

In [38]:
alapca_train_dataset = pd.DataFrame(alapca_train_dataset)

In [39]:
alapca_train_dataset

Unnamed: 0,output,input,instruction,text
0,1. Eat a balanced and nutritious diet: Make su...,,Give three tips for staying healthy.,<|begin_of_text|><|start_header_id|>system<|en...
1,"The three primary colors are red, blue, and ye...",,What are the three primary colors?,<|begin_of_text|><|start_header_id|>system<|en...
2,An atom is the basic building block of all mat...,,Describe the structure of an atom.,<|begin_of_text|><|start_header_id|>system<|en...
3,There are several ways to reduce air pollution...,,How can we reduce air pollution?,<|begin_of_text|><|start_header_id|>system<|en...
4,I had to make a difficult decision when I was ...,,Pretend you are a project manager of a constru...,<|begin_of_text|><|start_header_id|>system<|en...
...,...,...,...,...
51755,Yes,Text: John went out for a walk with his dog Ro...,You will be given a piece of text about an eve...,<|begin_of_text|><|start_header_id|>system<|en...
51756,True,Text: Michael Jordan is an American former pro...,You will be given a paragraph of text with var...,<|begin_of_text|><|start_header_id|>system<|en...
51757,True,Text: A tree fell over in the wind and caused ...,You will be given a piece of text about an eve...,<|begin_of_text|><|start_header_id|>system<|en...
51758,Backwards,"Steps: ['She takes out her books', 'The teache...",I will give you a list of steps. You need to ...,<|begin_of_text|><|start_header_id|>system<|en...


## Load Custom PII Dataset

In [40]:
tokenizer.eos_token

'<|end_of_text|>'

In [41]:
tokenizer.bos_token

'<|begin_of_text|>'

In [42]:
import json
# from datasets import Dataset
import pandas as pd

def format_prompt(text: str, answer: str = ''):
  if answer != '':
    answer += tokenizer.eos_token

  return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>

### Instruction:
You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type. Write each item in the list in the following format: data (PERSONAL INFORMATION TYPE).
If data is not a personal information that fits the previously mentioned criteria, do not include it in the list.


### Input:
{text}

### Response:
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{answer}'''
#   return f'''<|begin_of_text|><|start_header_id|>user<|end_header_id|> Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

# The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
# The email address of a student (EMAIL),
# The username of a student on any platform (USERNAME),
# A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
# A phone number associated with a student (PHONE_NUM),
# A URL that might be used to identify a student (URL_PERSONAL),
# A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

# You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type. Write each item in the list in the following format: data (PERSONAL INFORMATION TYPE).
# If data is not a personal information that fits the previously mentioned criteria, do not include it in the list.

# ### Input:
# {text}

# ### Response:
# [/INST] {answer}'''
def extract_training_data_from_documents(documents, window_size=10000):
    temp_label_buffer = ''
    labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']
    data_finetune_full = []
    data_finetune_no_answer = []

    for document in documents:
        tokens = document['tokens']
        labels_per_token = document['labels']

        token_chunks = [tokens[i:i + window_size] for i in range(0, len(tokens), window_size)]
        label_chunks = [labels_per_token[i:i + window_size] for i in range(0, len(labels_per_token), window_size)]

        for chunk_index in range(len(token_chunks)):
            text = ''
            output = ''
            answers = []

            for token_index in range(len(token_chunks[chunk_index])):
                token = token_chunks[chunk_index][token_index]
                label = label_chunks[chunk_index][token_index]

                if token in ['.', ',', '!', '?', "'", '(', ')', ' ']:
                    text += token
                elif '#' in token:
                    new_token = token.replace('#','')
                    text += new_token
                else:
                    text += ' ' + token

                for label_type in labels:
                    if label == 'B-' + label_type or label == 'I-' + label_type:
                        if len(temp_label_buffer) > 1:
                            temp_label_buffer += (' ' + token)
                        else:
                            temp_label_buffer += token
                    elif len(temp_label_buffer) > 0 and label == 'O' and (label_chunks[chunk_index][token_index - 1] == 'B-' + label_type or
                                            label_chunks[chunk_index][token_index - 1] == 'I-' + label_type):
                        temp_label_buffer += (' ' + f'({label_type})')
                        if temp_label_buffer not in answers:
                            answers.append(temp_label_buffer)

                if label == 'O':
                    temp_label_buffer = ''

            for answer in answers:
                output += answer + '\n'

            data_finetune_full.append({'text': format_prompt(text, output), 'output': output, 'full_text': text})
            data_finetune_no_answer.append({'text': format_prompt(text, ''), 'output': output, 'full_text': text})

    return data_finetune_full, data_finetune_no_answer

def retokenize(mod_30k_data):
  nlp = English()
  # Create a Tokenizer with the default settings for English
  # including punctuation rules and exceptions
  tokenizer = nlp.tokenizer

  for k in range(len(mod_30k_data)):
    new_tokens = []
    tokens = tokenizer(mod_30k_data[k]['source_text'])
    labels = ['O'] * len(tokens)

    # for i in range(len(labels)):
    #   for label in mod_30k_data[k]['privacy_mask']:
    #     if str(tokens[i]) in label['value'] and len(str(tokens[i]))>1:
    #       labels[i] = 'I-'+label['label']
    #   new_tokens.append(str(tokens[i]))

    for i in range(len(labels)):
      for label in mod_30k_data[k]['privacy_mask']:
        if str(tokens[i]) in label['value']:
          labels[i] = 'I-'+label['label']
      new_tokens.append(str(tokens[i]))

    mod_30k_data[k]['labels'] = labels
    mod_30k_data[k]['tokens'] = new_tokens

  return mod_30k_data

def get_new_url(name = None):
  if name == None:
    name = random.randint(10,70)
  s = random.choice(['','s'])
  garbage = random.randint(5000,1000000)
  format = random.randint(0,3)

  if format == 0:
    return f'http{s}://www.linkedin.com/'+str(name)+f'/{str(garbage)}/'
  elif format == 1:
    return f'http{s}://www.instagram.com/'+str(name)+f'/{str(garbage)}/'
  elif format == 2:
    return f'http{s}://www.youtube.com/'+str(name)+f'/{str(garbage)}/'
  elif format == 3:
    return f'http{s}://www.twitter.com/'+str(name)+f'/{str(garbage)}/'

def replace_strings(lst, old_string, new_string):
    return [new_string if item == old_string else item for item in lst]

def label_replace(mod_30k_data):
  old_label = ['LASTNAME1','LASTNAME2','SOCIALNUMBER','TEL','DRIVERLICENSE','STREET','BUILDING','PASSPORT','GIVENNAME1','GIVENNAME2','LASTNAME3','STATE','POSTCODE','CITY','IDCARD','IP']
  new_label = ['NAME_STUDENT','NAME_STUDENT','ID_NUM','PHONE_NUM','ID_NUM','STREET_ADDRESS','STREET_ADDRESS','ID_NUM','NAME_STUDENT','NAME_STUDENT','NAME_STUDENT','STREET_ADDRESS','STREET_ADDRESS','STREET_ADDRESS','ID_NUM','URL_PERSONAL']

  address_builder = ''

  for i in range(len(mod_30k_data)):
    for k in range(len(mod_30k_data[i]['privacy_mask'])):
      for t in range(len(new_label)):
        if mod_30k_data[i]['privacy_mask'][k]['label'] == old_label[t]:
          if mod_30k_data[i]['privacy_mask'][k]['label'] == 'IP' or mod_30k_data[i]['privacy_mask'][k]['label']=='URL_PERSONAL':
            mod_30k_data[i]['privacy_mask'][k]['value'] = get_new_url()
          # if mod_30k_data[i]['privacy_mask'][k]['label'] == 'STREET_ADDRESS':

          # elif len(address_builder) != '':
          #     address_builder = ''

          mod_30k_data[i]['privacy_mask'][k]['label'] = new_label[t]
          continue

  return mod_30k_data

def bert_300_gen(documents):
  labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']
  temp_output_arr = []
  temp_output = ''
  # temp_final_output =''


  for t in range(len(documents)):
    # print(document['privacy_mask'][0]['label'])
    for i in range(len(documents[t]['privacy_mask'])):
      if documents[t]['privacy_mask'][i]['label'] in labels:
        if documents[t]['privacy_mask'][i]['value']+' ('+documents[t]['privacy_mask'][i]['label']+')\n' not in temp_output_arr:
          temp_output_arr.append(documents[t]['privacy_mask'][i]['value']+' ('+documents[t]['privacy_mask'][i]['label']+')\n')

    for strings in temp_output_arr:
      temp_output+=strings

    # temp_final_output = format_prompt(documents[t]['source_text'],temp_output)

    documents[t]['output'] = temp_output
    documents[t]['text'] = format_prompt(documents[t]['source_text'],temp_output)
    documents[t]['testing_text'] = format_prompt(documents[t]['source_text'],'')
    documents[t]['full_text'] = documents[t]['source_text']


    temp_output_arr = []
    temp_output = ''
    # temp_final_output =''
    # document['output_arr'] = temp_output

  return documents



In [43]:
import pandas as pd
data_path = "30k_english_instruction.json"
jsonObj = pd.read_json(path_or_buf=data_path, lines=True)

mod_30k_data = jsonObj.to_dict('records')
mod_30k_data = mod_30k_data[0]
mod_30k_data = label_replace(mod_30k_data)
data = bert_300_gen(mod_30k_data)

In [44]:
mod_30k_data[0].keys()

dict_keys(['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set', 'output', 'final_text', 'testing_text', 'text', 'full_text'])

In [45]:
mod_30k_data[0]['full_text']

'Subject: Group Messaging for Admissions Process\n\nGood morning, everyone,\n\nI hope this message finds you well. As we continue our admissions processes, I would like to update you on the latest developments and key information. Please find below the timeline for our upcoming meetings:\n\n- wynqvrh053 - Meeting at 10:20am\n- luka.burg - Meeting at 21\n- qahil.wittauer - Meeting at quarter past 13\n- gholamhossein.ruschke - Meeting at 9:47 PM\n- pdmjrsyoz1460 '

In [46]:
mod_30k_data[0]['text']

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n### Instruction:\nYou are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:\n\nThe full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),\nThe email address of a student (EMAIL),\nThe username of a student on any platform (USERNAME),\nA number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),\nA phone number associated with a student (PHONE_NUM),\nA URL that might be used to identify a stu

In [47]:
mod_30k_data = list(mod_30k_data.values())

In [48]:
from datasets import Dataset

# Convert the list of dictionaries into a DataFrame
custom_pii_dataset_train = pd.DataFrame(mod_30k_data)

# Convert the DataFrame into a HuggingFace Dataset
# custom_pii_dataset_train = Dataset.from_pandas(df)

In [49]:
custom_pii_dataset_train

Unnamed: 0,source_text,target_text,privacy_mask,span_labels,mbert_text_tokens,mbert_bio_labels,id,language,set,output,final_text,testing_text,text,full_text
0,Subject: Group Messaging for Admissions Proces...,Subject: Group Messaging for Admissions Proces...,"[{'value': 'wynqvrh053', 'start': 287, 'end': ...","[[440, 453, ""USERNAME""], [430, 437, ""TIME""], [...","[Sub, ##ject, :, Group, Mess, ##aging, for, Ad...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40767A,English,train,wynqvrh053 (USERNAME)\nluka.burg (USERNAME)\nq...,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,Subject: Group Messaging for Admissions Proces...
1,- Meeting at 2:33 PM\n- N23 - Meeting at 11:29...,- Meeting at [TIME]\n- [USERNAME] - Meeting at...,"[{'value': '2:33 PM', 'start': 13, 'end': 20, ...","[[74, 81, ""TIME""], [50, 60, ""USERNAME""], [40, ...","[-, Meeting, at, 2, :, 33, PM, -, N, ##23, -, ...","[O, O, O, B-TIME, I-TIME, I-TIME, I-TIME, O, O...",40767B,English,train,N23 (USERNAME)\nwennmann27 (USERNAME)\n,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,- Meeting at 2:33 PM\n- N23 - Meeting at 11:29...
2,Subject: Admission Notification - Great Britai...,Subject: Admission Notification - Great Britai...,"[{'value': '5:24am', 'start': 263, 'end': 269,...","[[395, 407, ""SOCIALNUMBER""], [358, 375, ""EMAIL...","[Sub, ##ject, :, Ad, ##mission, Not, ##ificati...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",40768A,English,train,Balloi (NAME_STUDENT)\nEckrich (NAME_STUDENT)\...,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,Subject: Admission Notification - Great Britai...
3,Card: KB90324ER\n Country: GB\n Building: ...,Card: [IDCARD]\n Country: [COUNTRY]\n Buil...,"[{'value': 'KB90324ER', 'start': 6, 'end': 15,...","[[390, 393, ""STATE""], [368, 378, ""CITY""], [346...","[Card, :, KB, ##90, ##32, ##4, ##ER, \, n, Cou...","[O, O, B-IDCARD, I-IDCARD, I-IDCARD, I-IDCARD,...",40768B,English,train,KB90324ER (ID_NUM)\n163 (STREET_ADDRESS)\nCony...,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,Card: KB90324ER\n Country: GB\n Building: ...
4,"N, WA14 5RW\n Password: r]iD1#8\n\n...and so...","N, WA14 5RW\n Password: [PASS]\n\n...and so ...","[{'value': 'r]iD1#8', 'start': 26, 'end': 33, ...","[[336, 352, ""DATE""], [26, 33, ""PASS""]]","[N, ,, W, ##A, ##14, 5, ##R, ##W, \, n, Pass, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PASS...",40768C,English,train,,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,"N, WA14 5RW\n Password: r]iD1#8\n\n...and so..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29903,e of Birth: [18/01/1962]\n - Passport Number...,e of Birth: [[BOD]]\n - Passport Number: [[P...,"[{'value': '18/01/1962', 'start': 13, 'end': 2...","[[379, 384, ""POSTCODE""], [360, 361, ""TIME""], [...","[e, of, Birth, :, [, 18, /, 01, /, 1962, ], -,...","[O, O, O, O, O, B-BOD, I-BOD, I-BOD, I-BOD, I-...",53616B,English,train,MI (ID_NUM)\n031412682 (ID_NUM)\n350804398 (ID...,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,e of Birth: [18/01/1962]\n - Passport Number...
29904,"2022, in New York City. The assessment include...","2022, in New York City. The assessment include...",[],[],"[2022, ,, in, New, York, City, ., The, assessm...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",53616C,English,train,,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,"2022, in New York City. The assessment include..."
29905,Art Therapy Code of Ethics:\n\nNotice to all a...,Art Therapy Code of Ethics:\n\nNotice to all a...,"[{'value': '281586425', 'start': 410, 'end': 4...","[[410, 419, ""IDCARD""]]","[Art, Therapy, Code, of, Ethics, :, Notice, to...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",53617A,English,train,281586425 (ID_NUM)\n,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,Art Therapy Code of Ethics:\n\nNotice to all a...
29906,"72330716, 8015553273660, N60324048, 743010413,...","72330716, [IDCARD], [IDCARD], [IDCARD], [IDCAR...","[{'value': '8015553273660', 'start': 10, 'end'...","[[380, 397, ""TEL""], [363, 378, ""TEL""], [350, 3...","[723, ##30, ##71, ##6, ,, 801, ##55, ##53, ##2...","[O, O, O, O, O, B-IDCARD, I-IDCARD, I-IDCARD, ...",53617B,English,train,8015553273660 (ID_NUM)\nN60324048 (ID_NUM)\n74...,<s>[INST] <<SYS>>\nYou are an intelligent assi...,<|begin_of_text|><|start_header_id|>system<|en...,<|begin_of_text|><|start_header_id|>system<|en...,"72330716, 8015553273660, N60324048, 743010413,..."


## Load PII Dataset

In [50]:
import json
from datasets import Dataset
import pandas as pd


def format_prompt(text: str, answer: str = ''):
  if answer != '':
    answer += tokenizer.eos_token

  return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>

### Instruction:
You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type. Write each item in the list in the following format: data (PERSONAL INFORMATION TYPE).
If data is not a personal information that fits the previously mentioned criteria, do not include it in the list.


### Input:
{text}

### Response:
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{answer}'''

def extract_training_data_from_documents(documents, window_size=10000):
    temp_label_buffer = ''
    labels = ['NAME_STUDENT', 'EMAIL', 'ID_NUM', 'USERNAME', 'URL_PERSONAL', 'STREET_ADDRESS', 'PHONE_NUM']
    data_finetune_full = []
    data_finetune_no_answer = []

    for document in documents:
        tokens = document['tokens']
        labels_per_token = document['labels']

        token_chunks = [tokens[i:i + window_size] for i in range(0, len(tokens), window_size)]
        label_chunks = [labels_per_token[i:i + window_size] for i in range(0, len(labels_per_token), window_size)]

        for chunk_index in range(len(token_chunks)):
            text = ''
            output = ''
            answers = []

            for token_index in range(len(token_chunks[chunk_index])):
                token = token_chunks[chunk_index][token_index]
                label = label_chunks[chunk_index][token_index]

                if token in ['.', ',', '!', '?', "'", '(', ')', ' ']:
                    text += token
                else:
                    text += ' ' + token

                for label_type in labels:
                    if label == 'B-' + label_type or label == 'I-' + label_type:
                        if len(temp_label_buffer) > 1:
                            temp_label_buffer += (' ' + token)
                        else:
                            temp_label_buffer += token
                    elif label == 'O' and (label_chunks[chunk_index][token_index - 1] == 'B-' + label_type or
                                            label_chunks[chunk_index][token_index - 1] == 'I-' + label_type):
                        temp_label_buffer += (' ' + f'({label_type})')
                        if temp_label_buffer not in answers:
                            answers.append(temp_label_buffer)

                if label == 'O':
                    temp_label_buffer = ''

            for answer in answers:
                output += answer + '\n'

            output = output.strip()
            data_finetune_full.append({'text': format_prompt(text, output), 'output': output, 'full_text': text})
            data_finetune_no_answer.append({'text': format_prompt(text, ''), 'output': output, 'full_text': text})

    return data_finetune_full, data_finetune_no_answer

train_data_path = "train.json"
test_data_path = "test.json"
num_test_Data = 13
# Loading Dataset
with open(train_data_path) as file:
    train_data_json = json.load(file)
    print("Training Data: ", len(train_data_json))

with open(test_data_path ) as file:
    test_data_json = json.load(file)
    print("Test Data: ", len(test_data_json))

training_data, testing_data = extract_training_data_from_documents(train_data_json[num_test_Data:], 400)
# Convert the list of dictionaries into a DataFrame
pii_dataset_train = pd.DataFrame(training_data)

# Convert the DataFrame into a HuggingFace Dataset
# pii_dataset_train = Dataset.from_pandas(df)

Training Data:  6807
Test Data:  10


In [51]:
pii_dataset_train

Unnamed: 0,text,output,full_text
0,<|begin_of_text|><|start_header_id|>system<|en...,Fareed Ponce (NAME_STUDENT),Reflection – Visualization \n\n Fareed Ponce ...
1,<|begin_of_text|><|start_header_id|>system<|en...,,requirements at different stages to potential...
2,<|begin_of_text|><|start_header_id|>system<|en...,,Mind Mapping( Module 3) \n\n Challenge \n\n I...
3,<|begin_of_text|><|start_header_id|>system<|en...,Claudia Sarria (NAME_STUDENT),and build engagement. A lot of people use mi...
4,<|begin_of_text|><|start_header_id|>system<|en...,Rajinder Santos (NAME_STUDENT),Rajinder Santos Design Thinking for Innovati...
...,...,...,...
15850,<|begin_of_text|><|start_header_id|>system<|en...,,Brainstorming \n\n Challenge & Selection \n\n...
15851,<|begin_of_text|><|start_header_id|>system<|en...,,on which ones were the best. We came up with...
15852,<|begin_of_text|><|start_header_id|>system<|en...,,Mind Mapping \n\n Challenge \n\n My consultin...
15853,<|begin_of_text|><|start_header_id|>system<|en...,,had to consider for their portions were : car...


In [52]:
# Counting number of occurances of each label
pii_labels = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
num_labels = {label: 0 for label in pii_labels}

for label in pii_labels:
  for output in pii_dataset_train['output']:
    num_labels[label] += output.count(label)

print("PII Occurrances:\n")
num_labels

PII Occurrances:



{'NAME_STUDENT': 1128,
 'EMAIL': 34,
 'USERNAME': 6,
 'ID_NUM': 71,
 'PHONE_NUM': 5,
 'URL_PERSONAL': 104,
 'STREET_ADDRESS': 2}

In [53]:
# Counting number of occurances of each label
pii_labels = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
num_labels = {label: 0 for label in pii_labels}

for label in pii_labels:
  for output in custom_pii_dataset_train['output']:
    num_labels[label] += output.count(label)

print("Custom PII Occurrances:\n")
num_labels

Custom PII Occurrances:



{'NAME_STUDENT': 21559,
 'EMAIL': 9716,
 'USERNAME': 10867,
 'ID_NUM': 36661,
 'PHONE_NUM': 7262,
 'URL_PERSONAL': 8145,
 'STREET_ADDRESS': 36159}

## Merge Datasets

In [54]:
dataset_train = pd.concat([alapca_train_dataset, pii_dataset_train, custom_pii_dataset_train])

In [55]:
dataset_train = Dataset.from_pandas(dataset_train)

In [56]:
dataset_train

Dataset({
    features: ['output', 'input', 'instruction', 'text', 'full_text', 'source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set', 'final_text', 'testing_text', '__index_level_0__'],
    num_rows: 97523
})

## Model Training

In [57]:
#To reduce the VRAM usage we will load the model in 4 bit precision and we will do quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=False,
    #Quant type
    #We will use the "nf4" format this was introduced in the QLoRA paper
    bnb_4bit_quant_type="nf4",
    #As the model weights are stored using 4 bits and when we want to compute its only going to use 16 bits so we have more accuracy
    # bnb_4bit_compute_dtype=torch.float16,
    #Quantization parameters are quantized
    bnb_4bit_use_double_quant=False,
)


peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(base_model)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map={"": 0},
    use_auth_token=True,
    use_flash_attention_2=True, # use flash attention 2
)


model.config.use_cache = True

# Cast the layernorm in fp32, make output embedding layer require grads, add the upcasting of the lmhead to fp32
#prepare_model_for_kbit_training---> This function basically helps to built the best model possible
model = prepare_model_for_kbit_training(model,use_gradient_checkpointing=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.02s/it]


In [58]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.988 GB.
24.812 GB of memory reserved.


In [59]:
# Set training arguments
training_arguments = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
)


# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    dataset_text_field = "text",
    peft_config=peft_config,
    dataset_num_proc = 2,
    max_seq_length=2048,# In dataset creation we put a threshold 2k for context length (input token limit) but we dont have enough VRAM unfortunately it will take a lot of VRAM to put everything into memory so we are just gonna stop at 512
    tokenizer=tokenizer,
    packing = False,
    args=training_arguments,
)

Map (num_proc=2): 100%|██████████| 97523/97523 [00:42<00:00, 2317.34 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [60]:
tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="left"

In [61]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)



Step,Training Loss
1,2.8627
2,2.9606
3,2.4868
4,2.9673
5,3.294
6,2.6521
7,2.5352
8,2.6966
9,2.5246
10,2.6886


In [66]:
# Empty VRAM
del model
# del pipe
del trainer
import gc
gc.collect()

23722

In [62]:
###Merge the Base Model with the Trained Adapter
# Reload model in FP16 and merge it with LoRA weights
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
#Reload the Base Model and load the QLoRA adapters
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.16s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [63]:
model.save_pretrained("llama2_8b_pii")
tokenizer.save_pretrained("llama2_8b_pii")

('llama2_8b_pii/tokenizer_config.json',
 'llama2_8b_pii/special_tokens_map.json',
 'llama2_8b_pii/tokenizer.json')

In [None]:
model.push_to_hub("javijer/llama2_13b_pii", check_pr=True, use_auth_token="")
tokenizer.push_to_hub("javijer/llama2_13b_pii", check_pr=True, use_auth_token="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")

## vLLM Test Model

In [1]:
%pip install spacy
%pip install kaleido python-multipart typing-extensions
%pip install vllm
%pip install huggingface_hub
%pip install flash-attn
%pip install bitsandbytes



In [1]:
from transformers import AutoTokenizer
from spacy.lang.en import English
from huggingface_hub import login
from vllm import LLM, SamplingParams
import transformers
import torch
import os
import json
import re

# model_name = "meta-llama/Llama-2-7b-chat-hf"
model_name = "javijer/llama2_8b_pii"

access_token = ""

  from .autonotebook import tqdm as notebook_tqdm
2024-04-23 22:31:52,108	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
llm = LLM(model="llama2_8b_pii", gpu_memory_utilization=0.95, max_context_len_to_capture=2048)

INFO 04-23 22:31:55 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='llama2_8b_pii', tokenizer='llama2_8b_pii', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-23 22:31:56 selector.py:16] Using FlashAttention backend.
INFO 04-23 22:32:00 model_runner.py:104] Loading model weights took 14.9595 GB
INFO 04-23 22:32:01 gpu_executor.py:94] # GPU blocks: 2623, # CPU blocks: 2048
INFO 04-23 22:32:01 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 04-23 22:32:01 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 04-23 22:32:05 model_runner.py:867] Graph capturing finished in 4 secs.


In [3]:
import json

train_data_path = "train.json"
test_data_path = "test.json"

# Loading Dataset
with open(train_data_path) as file:
    train_data_json = json.load(file)
    print("Training Data: ", len(train_data_json))

with open(test_data_path ) as file:
    test_data_json = json.load(file)
    print("Test Data: ", len(test_data_json))

Training Data:  6807
Test Data:  10


In [54]:
def format_prompt(text: str, answer: str = ''):
  if answer != '':
    answer += tokenizer.eos_token

  return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>

### Instruction:
You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type. Write each item in the list in the following format: data (PERSONAL INFORMATION TYPE).
If data is not a personal information that fits the previously mentioned criteria, do not include it in the list.


### Input:
John Doe, I live in the 123 Main Street. My website is www.seanhalpin.xyz and my contact number is 888-688-5461.

### Response:
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

John Doe (NAME_STUDENT)
123 Main Street (STREET_ADDRESS)
www.seanhalpin.xyz (URL_PERSONAL)
830-688-0393 (PHONE_NUM)<|eot_id|><|start_header_id|>user<|end_header_id|>

### Instruction:
You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type. Write each item in the list in the following format: data (PERSONAL INFORMATION TYPE).
If data is not a personal information that fits the previously mentioned criteria, do not include it in the list.


### Input:
{text}

### Response:
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{answer}'''

In [27]:
# Test Input
input_text = "Javier Rosa recently moved to 22233 Escalante Run and can be reached at 290-828-2909 for inquiries about the community book club."
input_text

'Javier Rosa recently moved to 22233 Escalante Run and can be reached at 290-828-2909 for inquiries about the community book club.'

In [22]:
format_prompt(input_text)

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n### Instruction:\nYou are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:\n\nThe full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),\nThe email address of a student (EMAIL),\nThe username of a student on any platform (USERNAME),\nA number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),\nA phone number associated with a student (PHONE_NUM),\nA URL that might be used to identify a stu

In [35]:
# Setting Hyperparameters
sampling_params = SamplingParams(temperature=0, max_tokens=512)

In [52]:
# Limiting the data for testing
train_data_size = int(len(train_data_json) * 0.002)
print("Train Data Size: ", train_data_size)

train_data = train_data_json[:train_data_size]

# Test Input
input_text = " ".join(train_data[0]["tokens"][:400])
# input_text = "Javier Rosa recently moved to 22233 Escalante Run and can be reached at 290-828-2909 for inquiries about the community book club."
input_text

Train Data Size:  13


"Design Thinking for innovation reflexion - Avril 2021 - Nathalie Sylla \n\n Challenge & selection \n\n The tool I use to help all stakeholders finding their way through the complexity of a project is the   mind map . \n\n What exactly is a mind map ? According to the definition of Buzan T. and Buzan B. ( 1999 , Dessine - moi   l'intelligence . Paris : Les Éditions d'Organisation . ) , the mind map ( or heuristic diagram ) is a graphic   representation technique that follows the natural functioning of the mind and allows the brain 's   potential to be released . Cf Annex1 \n\n This tool has many advantages : \n\n •   It is accessible to all and does not require significant material investment and can be done   quickly \n\n •   It is scalable \n\n •   It allows categorization and linking of information \n\n •   It can be applied to any type of situation : notetaking , problem solving , analysis , creation of   new ideas \n\n •   It is suitable for all people and is easy to learn \n\n • 

In [48]:
train_data

[{'document': 7,
  'full_text': "Design Thinking for innovation reflexion-Avril 2021-Nathalie Sylla\n\nChallenge & selection\n\nThe tool I use to help all stakeholders finding their way through the complexity of a project is the  mind map.\n\nWhat exactly is a mind map? According to the definition of Buzan T. and Buzan B. (1999, Dessine-moi  l'intelligence. Paris: Les Éditions d'Organisation.), the mind map (or heuristic diagram) is a graphic  representation technique that follows the natural functioning of the mind and allows the brain's  potential to be released. Cf Annex1\n\nThis tool has many advantages:\n\n•  It is accessible to all and does not require significant material investment and can be done  quickly\n\n•  It is scalable\n\n•  It allows categorization and linking of information\n\n•  It can be applied to any type of situation: notetaking, problem solving, analysis, creation of  new ideas\n\n•  It is suitable for all people and is easy to learn\n\n•  It is fun and encourag

In [53]:
# Testing Model
outputs = llm.generate(
    [format_prompt(input_text)],
    sampling_params
    )

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print()
    print("Generated text:\n", generated_text)

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


Generated text:
 NAME_STUDENT (Nathalie Sylla)
EMAIL (nathalie.sylla@etu.univ-rennes1.fr)
USERNAME (nathalie.sylla)
ID_NUM (2021)
PHONE_NUM (06 12 34 56 78)
URL_PERSONAL (https://www.linkedin.com/in/nathalie-sylla-0a4b2a1a/)
STREET_ADDRESS (1 Rue de la République, 35000 Rennes, France)





In [None]:
outputs = re.split(r',?\n', generated_text)
outputs = [re.sub(r'((^\d+\.)|(\*))\s*', '', output) for output in outputs]
outputs = [output.strip() for output in outputs if re.search(f"[^)(\s]+\s?\(({pii_labels_pattern})\)", output)]
print("List of PII:\n", outputs)

## Testing

In [33]:
from transformers import AutoTokenizer
from spacy.lang.en import English
from huggingface_hub import login
import transformers
import torch
import os
import json
import re

nlp = English()
english_tokenizer = nlp.tokenizer

pii_labels = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
pii_labels_pattern = '|'.join(pii_labels)

In [38]:
def find_sequence_indices(list_words, sequence_to_find):
    sequence_length = len(sequence_to_find)
    indices = [i for i in range(len(list_words) - sequence_length + 1) if list_words[i:i+sequence_length] == sequence_to_find]
    return indices

def llama_to_tokens(output):
    nlp = English()

    english_tokenizer = nlp.tokenizer

    tokens = []
    labels = []

    answers = re.split(r'\n',output)
    for i in range(len(answers)):
        tokens.append(re.split(r'\(|\)', answers[i])[:-1])
        labels.append(tokens[-1][-1])
        tokens[-1] = tokens[-1][:-1]

    # print('Tokens', tokens)
    # print('Labels', labels)
    for i in range(len(tokens)):
        # print(tokens[i][0])
        tokenized = english_tokenizer(tokens[i][0])
        tokens[i] = [i.text for i in tokenized]

    return tokens, labels

def categorizer(full_token_list, llm_tokens, labels):
    indices = []
    for i in range(len(llm_tokens)):
        indices.append(find_sequence_indices(full_token_list, llm_tokens[i]))
    # print("Indices", indices)
    result = ['O'] * len(full_token_list) # This will be a list of length full_tokens_list

    for k in range(len(llm_tokens)):
        for i in range(len(indices[k])):
            result[indices[k][i]] = 'B-'+labels[k]
            if len(llm_tokens[k])>1:
                for l in range(len(llm_tokens[k])-1):
                    result[indices[k][i]+l+1] = 'I-' + labels[k]

    return result[:len(full_token_list)]

def assign_labels(full_text, output_text):
    # print('full_text:',full_text)
    tokenized = english_tokenizer(full_text)
    full_text_tokens = [i.text for i in tokenized]
    # print("Full Text Tokens:", full_text_tokens)
    # print('LLM Output:', output_text)

    text_tokens, labels = llama_to_tokens(output_text)
    # print('Text tokens:',text_tokens,'Labels:',labels)

    labeled_output = categorizer(full_text_tokens,text_tokens, labels)
    # print('Final Output:', labeled_output)
    return labeled_output

def curate_labels(labeled_tokens):
    label_pattern = pii_labels_pattern + "|O"

    for i in range(len(labeled_tokens)):
        if(not re.search(label_pattern, labeled_tokens[i])):
            labeled_tokens[i] = 'O'

def get_batches(text, max_length = 400):
  inputs = []
  labels = []
  for j in range(0, len(data["tokens"]), max_length):
      batch_size = min(j + max_length, len(data["tokens"]))
      input_text = " ".join(data["tokens"][j: batch_size])
      output_labels = data["labels"][j: batch_size]
      inputs.append(input_text)
      labels.append(output_labels)

  return (inputs, labels)

In [46]:
def format_prompt(text: str, answer: str = ''):
  if answer != '':
    answer += tokenizer.eos_token

  return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.<|eot_id|><|start_header_id|>user<|end_header_id|>

### Instruction:
You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type. Write each item in the list in the following format: data (PERSONAL INFORMATION TYPE).
If data is not a personal information that fits the previously mentioned criteria, do not include it in the list.


### Input:
{text}

### Response:
<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{answer}'''

In [59]:
train_text_input_ids = []
train_labels_input_ids = []
max_length = 400
total_classifications = 0
num_misclassified = 0
num_hallucinated = 0
error_samples = []
correct_classifications = 0
global_correct_classifications = 0
global_total_classifications = 0
classified_samples = []
row_id = 0

for id, data in enumerate(train_data):
  try:
    document_id = data['document']
    print("Processing Sample:", id)
    print("Document ID:", document_id)
    # Loop through data in batches of 400 tokens
    inputs, labels = get_batches(data)
    print("Number of Text Splits:", len(inputs))
    inputs, labels = get_batches(data)
    print("Number of Text Splits:", len(inputs))

    model_outputs = llm.generate(
        [format_prompt(input) for input in inputs],
        sampling_params
        )
    print()

    min_token_id = 0
    # Print the outputs.
    for input_text, output_labels, model_output in zip(inputs, labels, model_outputs):
        generated_text = model_output.outputs[0].text

        # Process output text
        outputs = re.split(r',?\n', generated_text)
        outputs = [re.sub(r'((^\d+\.)|(\*))\s*', '', output) for output in outputs]
        outputs = [output.strip() for output in outputs if re.search(f"[^)(\s]+\s?\(({pii_labels_pattern})\)", output)]
        print("List of PII:\n", outputs)

        expected_labels = len(output_labels) - output_labels.count('O')
        total_classifications += expected_labels

        if(not outputs):
            num_misclassified += expected_labels
            # print('Invalid Output:')
            # print("Input:\n", input_text)
            # print("Generated Text:\n", generated_text)
            # print("Labels:\n", output_labels)

            continue

        output_text = '\n'.join(outputs)

        # Assigning Labels
        labeled_output = assign_labels(input_text, output_text)
        curate_labels(labeled_output)

        # print("Input:\n", input_text)
        # print("Generated Text:\n", generated_text)
        # print("Labels:\n", output_labels)
        # print("Output:\n", labeled_output)

        # assert len(output_labels) == len(labeled_output)

        # Comparing output with expected labels
        for token_id in range(min_token_id, len(labeled_output)):
            if token_id >= len(output_labels):
                break

            if labeled_output[token_id] != 'O' and output_labels != 'O' and labeled_output[token_id]==output_labels[token_id]:
                print("CORRECT", labeled_output[token_id],output_labels[token_id])
                classified_samples.append({'row_id': row_id, 'document': document_id, 'token': token_id, 'label': output_labels[token_id]})
                total_classifications += 1
                correct_classifications += 1
            elif labeled_output[token_id] != output_labels[token_id] and (labeled_output[token_id]=='O' or output_labels[token_id]=='O'):
                print('Not CORRECT!',labeled_output[token_id],output_labels[token_id])
                classified_samples.append({'row_id': row_id, 'document': document_id, 'token': token_id, 'label': output_labels[token_id]})
                total_classifications += 1
            # else:
                # print("Ignores", labeled_output[token_id],output_labels[token_id])

        min_token_id += len(labeled_output)
        # print("Number Hallucinated:", num_hallucinated)
        # print("Total Missclassified:", num_misclassified)

    print()
    # print("Misclassification:", num_misclassified / total_classifications)
    print(total_classifications, correct_classifications)
    global_correct_classifications += correct_classifications
    global_total_classifications += total_classifications
    print("Accuracy:", (correct_classifications) / total_classifications)
    # print("Accuracy Excluding Hallucinations:", (total_classifications - num_misclassified) / (total_classifications - num_hallucinated))
    print()



  except Exception as error:
     print("Error:", error)
    # error_samples.append((document_id, fragment_id))

print("Gloabal Accuracy:", (global_correct_classifications) / global_total_classifications)
for i, data in enumerate(train_data):
    correct_classifications = 0
    total_classifications = 0
global_correct_classifications = 0
global_total_classifications = 0 # OUTSIDE OF THE LOOP

Processing Sample: 0
Document ID: 7
Number of Text Splits: 2
Number of Text Splits: 2





[A[A[A


[A[A[A


Processed prompts: 100%|██████████| 5/5 [00:11<00:00,  2.32s/it]



List of PII:
 ['Sindy Samaca (NAME_STUDENT)', 'Gitam University (NAME_STUDENT)', '830-688-0393 (PHONE_NUM)', 'www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PH




[A[A[A


Processed prompts: 100%|██████████| 2/2 [00:13<00:00,  6.87s/it]



List of PII:
 ['Diego Estrada (NAME_STUDENT)', '830-688-0393 (PHONE_NUM)', 'www.seanhalpin.xyz (URL_PERSONAL)', '123 Main Street (STREET_ADDRESS) νεφοκάλυψης://userZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidth




[A[A[A


Processed prompts: 100%|██████████| 2/2 [00:20<00:00, 10.29s/it]



List of PII:
 ['Gilberto Gamboa (NAME_STUDENT)', '830-688-0393 (PHONE_NUM)', 'www.seanhalpin.xyz (URL_PERSONAL)', '123 Main Street (STREET_ADDRESS) νεφοκάλυψης://userZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWid




[A[A[A


Processed prompts: 100%|██████████| 3/3 [00:16<00:00,  5.52s/it]



List of PII:
 ['Sindy Samaca (NAME_STUDENT)', 'Gitam University (NAME_STUDENT)', '830-688-0393 (PHONE_NUM)', 'www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PH




[A[A[A


Processed prompts: 100%|██████████| 5/5 [00:25<00:00,  5.13s/it]



List of PII:
 ['Nadine Born (NAME_STUDENT)']
CORRECT B-NAME_STUDENT B-NAME_STUDENT
CORRECT I-NAME_STUDENT I-NAME_STUDENT
List of PII:
 ['The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT)', 'The email address of a student (EMAIL)', 'The username of a student on any platform (USERNAME)']
List of PII:
 []
List of PII:
 ['John Doe (NAME_STUDENT)', '123 Main Street (STREET_ADDRESS)', 'www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης.”', 'The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT)', 'The email address of a student (EMAIL)', 'The username of a student on any platform (USERNAME)', 'A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM)', 'A phone number associated 




[A[A[A


Processed prompts: 100%|██████████| 2/2 [00:10<00:00,  5.28s/it]



List of PII:
 ['Eladio Amaya (NAME_STUDENT)', 'Cheese Startup - Learning Launch (NAME_STUDENT)', '830-688-0393 (PHONE_NUM)', 'www.seanhalpin.xyz (URL_PERSONAL)', '123 Main Street (STREET_ADDRESS)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_PERSONAL)', '만원입니다。www.seanhalpin.xyz (URL_P




[A[A[A


Processed prompts: 100%|██████████| 1/1 [00:11<00:00, 11.23s/it]



List of PII:
 ['Silvia Villalobos (NAME_STUDENT)']
CORRECT B-NAME_STUDENT B-NAME_STUDENT
CORRECT I-NAME_STUDENT I-NAME_STUDENT

41 12
Accuracy: 0.2926829268292683

Processing Sample: 7
Document ID: 104
Number of Text Splits: 2
Number of Text Splits: 2





[A[A[A


Processed prompts: 100%|██████████| 2/2 [00:14<00:00,  7.29s/it]



List of PII:
 ['Dr Sakir Ahmad (NAME_STUDENT)', 'Storytelling   The Path to Innovation (URL_PERSONAL)', 'Challenge & Selection (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL) νεφοκάλυψηςZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpace




[A[A[A


Processed prompts: 100%|██████████| 2/2 [00:20<00:00, 10.40s/it]



List of PII:
 ['Francisco Ferreira (NAME_STUDENT)', 'www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)'




[A[A[A


[A[A[A


Processed prompts: 100%|██████████| 5/5 [00:11<00:00,  2.28s/it]



List of PII:
 ['Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)', 'Stefano Lovato (NAME_STUDENT)']
CORRECT B-NAME_STUDENT B-NAME_STUDENT
CORRECT I-NAME_STUDENT I-NAME_STUDENT
List of PII:
 ['The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT)', 'The email address of a student (EMAIL)', 'The username of a student on any platform (USERNAME)', 'A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM)', 'A phone number associated with a student (PHONE_NUM)', 'A URL that might be used to identify a s




[A[A[A


Processed prompts: 100%|██████████| 2/2 [00:10<00:00,  5.28s/it]



List of PII:
 ['Al (NAME_STUDENT)', '830-688-0393 (PHONE_NUM)', 'www.seanhalpin.xyz (URL_PERSONAL)', '123 Main Street (STREET_ADDRESS) νεφοκάλυψης://userZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWi




[A[A[A


Processed prompts: 100%|██████████| 3/3 [00:10<00:00,  3.60s/it]



List of PII:
 ['Pepa Medrano (NAME_STUDENT)', 'pepa.medrano@gmail.com (EMAIL)', 'pepa.medrano (USERNAME)', '830-688-0393 (PHONE_NUM)', 'www.seanhalpin.xyz (URL_PERSONAL)', '123 Main Street (STREET_ADDRESS) νεφοκάλυψης://网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网刊网




[A[A[A


Processed prompts: 100%|██████████| 3/3 [00:10<00:00,  3.61s/it]


List of PII:
 ['Deiby (NAME_STUDENT)', 'www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψης://www.seanhalpin.xyz (URL_PERSONAL)', '830-688-0393 (PHONE_NUM) νεφοκάλυψηςZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpaceZeroWidthSpa




## Test Model (Ignore)
The generate library seems to be putting out input in another template which messes up our prompt template. Use VLLM instead.

In [None]:
# from peft import PeftModel
# from transformers import AutoTokenizer, AutoModelForCausalLM

# max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
# dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# load_in_4bit = True

# model = AutoModelForCausalLM.from_pretrained(
#     "javijer/llama2_pii",
#     # max_seq_length = max_seq_length,
#     # dtype = dtype,
#     temperature = 0,
#     max_tokens = 2048
#     load_in_4bit = load_in_4bit,
# )
# tokenizer = AutoTokenizer.from_pretrained("javijer/llama2_pii")

In [None]:
def format_prompt(text: str, answer: str = ''):
    if answer != '':
        answer += tokenizer.eos_token

    return f'''<s>[INST] You are an intelligent assistant trained to identify and categorize Personally Identifiable Information (PII) in a given text. You are searching for these different types of information:

* NAME_STUDENT: The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
* EMAIL: A student’s email address.
* USERNAME: A student's username on any platform.
* ID_NUM: A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
* PHONE_NUM: A phone number associated with a student.
* URL_PERSONAL: A URL that might be used to identify a student.
* STREET_ADDRESS: A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information for each type of information it is.
Your OUTPUT should have the following format:
NAME_STUDENT:
* Name 1
* Name 2
EMAIL:
* Email 1
* Email 2
USERNAME:
* Username 1
* Username 2
ID_NUM:
* ID Number 1
* ID Number 2
PHONE_NUM:
* Phone Number 1
* Phone Number 2
URL_PERSONAL:
* URL Personal 1
* URL Personal 2
STREET_ADDRESS:
* Street Address 1
* Street Address 2

TEXT:
{text}
OUTPUT:
[/INST] {answer}'''

In [None]:
def format_prompt(text: str, answer: str = ''):
    if answer != '':
        answer += tokenizer.eos_token

    return f'''You are an intelligent assistant trained to identify and categorize Personally Identifiable Information (PII) in a given text. You are searching for these different types of information:

NAME_STUDENT: The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL: A student’s email address.
USERNAME: A student's username on any platform.
ID_NUM: A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM: A phone number associated with a student.
URL_PERSONAL: A URL that might be used to identify a student.
STREET_ADDRESS: A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information for each type of information it is.
Your OUTPUT should have the following format:
* personal identifiable information (<INFORMATION_TYPE>)
* personal identifiable information (<INFORMATION_TYPE>)

TEXT:
{text}
OUTPUT:
{answer}'''

In [None]:
def format_prompt(prompt: str):
    return f'''<s>[INST]
You are a helpful and honest assistant trained to identify and categorize these different types of personal identifiable information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
A student's email address (EMAIL),
A student's username on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number. (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS)

You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and its type.

TEXT:
{prompt}
OUTPUT:
[/INST]
'''

In [None]:
def format_prompt(prompt: str):
    return f'''<s>[INST]
You are a helpful and honest assistant. You are searching for these different types of personal identifiable information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
A student's email address (EMAIL),
A student's username on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number. (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS)

You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and its type.
Your OUTPUT should have the following format:
<personal identifiable information> (<INFORMATION_TYPE>),
<personal identifiable information> (<INFORMATION_TYPE>)

TEXT:
{prompt}
OUTPUT:
[/INST]
'''

In [None]:
def format_prompt(prompt: str):
    return f'''You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type.

### Input:
{prompt}

### Response:
'''

In [None]:
def format_prompt(text: str, answer: str = ''):
    if answer != '':
        answer = tokenizer.eos_token

    return f'''<s>[INST] You are a helpful and honest assistant trained to identify and categorize Personally Identifiable Information in a given text. You are searching for these different types of Personally Identifiable Information:

The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT),
The email address of a student (EMAIL),
The username of a student on any platform (USERNAME),
A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number (ID_NUM),
A phone number associated with a student (PHONE_NUM),
A URL that might be used to identify a student (URL_PERSONAL),
A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS),

You will be given a text as Input, and your Response will be a list of each instance of Personally Identifiable Information and its type.

### Input:
{text}

### Response:
[/INST] {answer}'''

In [None]:
def format_prompt(text: str, answer: str = ''):
    if answer != '':
        answer += """</s>"""
    return f'''You are an intelligent assistant trained to identify and categorize Personally Identifiable Information (PII) in a given text.

You are searching for these different types of information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student’s email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information and which type of information it is.

TEXT:
{text}
OUTPUT:
'''

In [None]:
def format_prompt(text: str, answer: str = ''):
    if answer != '':
        answer += """</s>"""
    return f'''
You are an intelligent assistant trained to identify and categorize Personally Identifiable Information (PII) in a given text.
You are searching for these different types of information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student’s email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information and which type of information it is.

TEXT:
{text}
OUTPUT:
'''


In [None]:
train_data_path = "train.json"
test_data_path = "test.json"

# Loading Dataset
with open(train_data_path) as file:
    train_data_json = json.load(file)
    print("Training Data: ", len(train_data_json))

with open(test_data_path ) as file:
    test_data_json = json.load(file)
    print("Test Data: ", len(test_data_json))

Training Data:  6807
Test Data:  10


In [None]:
# Limiting the data for testing
train_data_size = int(len(train_data_json) * 0.003)
print("Train Data Size: ", train_data_size)

train_data = train_data_json[:train_data_size]

Train Data Size:  20


In [6]:
# Test Input
# input_text = " ".join(train_data[0]["tokens"][:400])
# input_text = "Heloo, my name is Javier. It is a pleasure to meet you Natalia with phone 210-988-8099"
# input_text += " Javier Rosa to do it."
input_text = "My name is Bryce and my sister's name is Sara. My email is tombombadill@gmail.com and my contact number is 830 688 0393."

In [7]:
inputs = tokenizer([format_prompt(input_text)], return_tensors = "pt").to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True)
with torch.cuda.amp.autocast():
  outputs = model.generate(**inputs, max_new_tokens = 2048, use_cache = True, do_sample = True, temperature = 0.001)
    # model.generate(**tokenizer("test", return_tensors="pt").to("cuda"))
responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print the outputs.
for response in responses:
  generated_text = response.replace(format_prompt(input_text), '').strip()
  print("Generated text:\n", generated_text)

KeyboardInterrupt: 

In [None]:
inputs = tokenizer([format_prompt(input_text)], return_tensors = "pt").to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True)
with torch.cuda.amp.autocast():
  outputs = model.generate(**inputs, max_new_tokens = 2048, use_cache = True, do_sample = True, temperature = 0.001)
    # model.generate(**tokenizer("test", return_tensors="pt").to("cuda"))
responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print the outputs.
for response in responses:
  generated_text = response.replace(format_prompt(input_text), '').strip()
  print("Generated text:\n", generated_text)

In [None]:
response

In [None]:

pii_labels = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
pii_labels_pattern = '|'.join(pii_labels)

In [None]:
import re

outputs = re.split(r',?\n', generated_text)
print(outputs)
outputs = [output.strip() for output in outputs if re.search(f"[^)(\s]+\s?\(({pii_labels_pattern})\)", output)]
print("List of PII:\n", outputs)

['[INST] <<SYS>>', 'You are an intelligent assistant trained to identify and categorize Personally Identifiable Information (PII) in a given text.', '<</SYS>>', 'You are searching for these different types of personal identifiable information:', '', 'The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names (NAME_STUDENT)', "A student's email address (EMAIL)", "A student's username on any platform (USERNAME)", 'A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number. (ID_NUM)', 'A phone number associated with a student (PHONE_NUM)', 'A URL that might be used to identify a student (URL_PERSONAL)', 'A full or partial street address that is associated with the student, such as their home address (STREET_ADDRESS)', '', 'You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and i