In [None]:
!pip install torch==2.1.0
!pip install spacy
!pip install vllm
!pip install kaleido python-multipart typing-extensions
!pip install huggingface_hub

Collecting vllm
  Downloading vllm-0.4.0.post1-cp310-cp310-manylinux1_x86_64.whl (97.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Collecting ninja (from vllm)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
Collecting ray>=2.9 (from vllm)
  Downloading ray-2.10.0-cp310-cp310-manylinux2014_x86_64.whl (65.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.1/65.1 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Collecting torch==2.1.2 (from vllm)
  Downloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.39.1 (from vllm)
  Downloading transformers-4.39.3-py3-none-

In [None]:
from transformers import AutoTokenizer
from spacy.lang.en import English
from huggingface_hub import login
from vllm import LLM, SamplingParams
import transformers
import torch
import os
import json
import re

model_name = "meta-llama/Llama-2-7b-chat-hf"

# Never commit access tokens to a notebook: read the Hugging Face token from the
# HF_TOKEN environment variable instead. Any token that was previously hardcoded
# here must be treated as leaked and revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN")

# When HF_TOKEN is unset, access_token is None and `login` falls back to asking
# for the token interactively.
login(token=access_token, add_to_git_credential=True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Load the chat model into a vLLM engine. gpu_memory_utilization is the fraction
# of GPU memory vLLM is allowed to reserve (0.9 = 90%).
llm = LLM(model=model_name, gpu_memory_utilization=0.9)

In [None]:
# Build a blank English spaCy pipeline and keep a handle on its tokenizer;
# assign_labels uses it to split the input text into tokens.
nlp = English()
english_tokenizer = nlp.tokenizer

In [None]:
# The PII categories the model is asked to detect (the same names appear in the prompts).
pii_labels = ['NAME_STUDENT', 'EMAIL', 'USERNAME', 'ID_NUM', 'PHONE_NUM', 'URL_PERSONAL', 'STREET_ADDRESS']
# Regex alternation ("A|B|C") over the label names, used to validate model output.
pii_labels_pattern = '|'.join(pii_labels)

In [None]:
def find_sequence_indices(list_words, sequence_to_find):
    """Return every start index at which `sequence_to_find` occurs in `list_words`.

    Args:
        list_words: list of tokens to search in.
        sequence_to_find: list of tokens to look for as a contiguous run.

    Returns:
        A list of start indices (ascending). Overlapping occurrences are all
        reported. An empty `sequence_to_find` yields an empty list.
    """
    sequence_length = len(sequence_to_find)
    # An empty needle would otherwise "match" at every position, including
    # len(list_words), which is not a valid index for callers to write to.
    if sequence_length == 0:
        return []

    last_start = len(list_words) - sequence_length
    return [
        start
        for start in range(last_start + 1)
        if list_words[start:start + sequence_length] == sequence_to_find
    ]

def llama_to_tokens(output):
    """Parse model output lines of the form "<text> (<LABEL>)" into tokens and labels.

    Each line is split on parentheses. The text before the first parenthesis is
    tokenized with a blank English spaCy tokenizer, and the last parenthesised
    group on the line is taken as the label.

    Args:
        output: model output, one "<text> (<LABEL>)" entry per line.

    Returns:
        A tuple (tokens, labels): tokens[i] is the list of token strings for
        entry i and labels[i] is its label string. Lines that do not contain a
        parenthesised label are skipped instead of raising IndexError.
    """
    tokenizer = English().tokenizer

    tokens = []
    labels = []

    for line in output.split('\n'):
        # Splitting on "(" or ")" gives [text, label, trailing]; drop the trailing piece.
        parts = re.split(r'\(|\)', line)[:-1]
        if len(parts) < 2:
            # No "(LABEL)" on this line (blank or malformed) - nothing to extract.
            continue

        labels.append(parts[-1])
        tokens.append([token.text for token in tokenizer(parts[0])])

    return tokens, labels

def categorizer(full_token_list, llm_tokens, labels):
    """Produce one BIO tag per token of `full_token_list`.

    For every token sequence in `llm_tokens`, each occurrence inside
    `full_token_list` is tagged: the first token gets 'B-<label>' and the
    remaining tokens get 'I-<label>', where <label> is the entry of `labels`
    at the same position. Tokens that are never matched stay 'O'. Later
    sequences overwrite tags written by earlier ones.
    """
    match_starts = [find_sequence_indices(full_token_list, seq) for seq in llm_tokens]

    tagged = ['O'] * len(full_token_list)

    for k, seq in enumerate(llm_tokens):
        for start in match_starts[k]:
            tagged[start] = 'B-' + labels[k]
            # Continuation tokens of a multi-token match.
            for offset in range(1, len(seq)):
                tagged[start + offset] = 'I-' + labels[k]

    return tagged

def assign_labels(full_text, output_text):
    """Turn the model's PII list for `full_text` into per-token BIO labels.

    `full_text` is tokenized with the module-level `english_tokenizer`,
    `output_text` is parsed by `llama_to_tokens`, and `categorizer` projects
    the parsed entries onto the tokens of `full_text`.

    Returns:
        A list with one label string per token of `full_text`.
    """
    reference_tokens = [token.text for token in english_tokenizer(full_text)]
    pii_token_groups, pii_group_labels = llama_to_tokens(output_text)
    return categorizer(reference_tokens, pii_token_groups, pii_group_labels)

def curate_labels(labeled_tokens):
    """Reset every label that is not a valid BIO tag to 'O', in place.

    A label is valid when it is exactly 'O' or exactly 'B-<TYPE>' / 'I-<TYPE>'
    with <TYPE> being one of the names in the module-level `pii_labels_pattern`.

    Args:
        labeled_tokens: list of label strings; modified in place.

    Returns:
        None.
    """
    # Match the whole label. An unanchored search with an "|O" alternative would
    # accept any label that merely contains the letter "O" (e.g. "B-FOO").
    valid_label = re.compile(rf"(?:[BI]-(?:{pii_labels_pattern})|O)")

    for idx, label in enumerate(labeled_tokens):
        if not valid_label.fullmatch(label):
            labeled_tokens[idx] = 'O'

def get_batches(text, max_length = 400):
    """Split one sample into consecutive chunks of at most `max_length` tokens.

    Args:
        text: mapping with a "tokens" list and a "labels" list (assumed to be
            aligned one-to-one).
        max_length: maximum number of tokens per chunk.

    Returns:
        A tuple (inputs, labels): inputs[i] is the chunk's tokens joined with
        single spaces, labels[i] is the matching slice of the sample's labels.
    """
    # Read from the `text` argument; relying on a global named `data` only
    # worked when the caller's loop variable happened to have that name.
    tokens = text["tokens"]
    token_labels = text["labels"]

    inputs = []
    labels = []
    for start in range(0, len(tokens), max_length):
        end = min(start + max_length, len(tokens))
        inputs.append(" ".join(tokens[start:end]))
        labels.append(token_labels[start:end])

    return (inputs, labels)

In [None]:
def format_prompt(prompt: str):
    """Build a few-shot Llama-2 chat prompt (with system message) for PII extraction.

    The prompt contains three solved examples, each in its own [INST] turn,
    followed by the task description and `prompt` as the TEXT to analyse.

    NOTE: this notebook defines `format_prompt` again in later cells; when all
    cells are run in order, the last definition is the one in effect.

    Args:
        prompt: the text in which PII should be found.

    Returns:
        The complete prompt string, ending with an open "[/INST]" turn.
    """
    # The second example's answer must quote the phone number that appears in
    # its TEXT (888-688-5461); answering with a different number would
    # demonstrate hallucination to the model.
    return f'''<s>[INST] <<SYS>>
You are a helpful and honest assistant.
<</SYS>>

You are searching for these different types of information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student’s email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information and which type of information it is.

TEXT:
My name is Bryce and my sister's name is Sara. My email is tombombadill@gmail.com and my contact number is 830 688 0393.
OUTPUT:
[/INST]
Bryce (NAME_STUDENT),
Sara (NAME_STUDENT),
tombombadill@gmail.com (EMAIL),
830 688 0393 (PHONE_NUM)
</s>
<s>[INST]
You are searching for these different types of information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student's email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information belonging to the previous types and which type they are.

TEXT:
John Doe , I live in the 123 Main Street. My website is www.seanhalpin.xyz and my contact number is 888-688-5461.
OUTPUT:
[/INST]
John Doe (NAME_STUDENT),
123 Main Street (STREET_ADDRESS),
www.seanhalpin.xyz (URL_PERSONAL),
888-688-5461 (PHONE_NUM)
</s>
<s>[INST]
You are searching for these different types of information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student's email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information belonging to the previous type and which type they are.

TEXT:
The hallways of Greenwood High, everyone knew that if you needed help with calculus, you would look for Jamie Turner whose ID is GHS20241015. She known to have a knack for numbers.
OUTPUT:
[/INST]
Jamie Turner (NAME_STUDENT),
GHS20241015 (ID_NUM)
</s>
<s>[INST]
You are searching for these different types of information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student's email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of information belonging to the previous type and which type they are.
Please, format the list in the following format:
<information> (<TYPE>),
<information> (<TYPE>)

TEXT:
{prompt}
OUTPUT:
[/INST]
'''

In [None]:
def format_prompt(prompt: str):
    """Build a few-shot PII-extraction prompt that uses a single [INST] turn.

    Two solved examples and the final task are all placed inside one
    instruction block; `prompt` is inserted as the last TEXT.

    NOTE: this notebook defines `format_prompt` in several cells; when all
    cells are run in order, the last definition is the one in effect.

    Args:
        prompt: the text in which PII should be found.

    Returns:
        The complete prompt string, ending with "[/INST]" and a newline.
    """
    # The second example's answer must quote the phone number that appears in
    # its TEXT (888-688-5461); answering with a different number would
    # demonstrate hallucination to the model.
    return f'''<s>[INST]
You are searching for these different types of personal identifiable information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student’s email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and its type.
Please, format the list in the following format:
<personal identifiable information> (<INFORMATION_TYPE>),
<personal identifiable information> (<INFORMATION_TYPE>)

TEXT:
My name is Bryce and my sister's name is Sara. My email is tombombadill@gmail.com and my contact number is 830 688 0393.
OUTPUT:
Bryce (NAME_STUDENT),
Sara (NAME_STUDENT),
tombombadill@gmail.com (EMAIL),
830 688 0393 (PHONE_NUM)

You are searching for these different types of personal identifiable information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student's email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and its type.
Please, format the list in the following format:
<personal identifiable information> (<INFORMATION_TYPE>),
<personal identifiable information> (<INFORMATION_TYPE>)

TEXT:
John Doe , I live in the 123 Main Street. My website is www.seanhalpin.xyz and my contact number is 888-688-5461.
OUTPUT:
John Doe (NAME_STUDENT),
123 Main Street (STREET_ADDRESS),
www.seanhalpin.xyz (URL_PERSONAL),
888-688-5461 (PHONE_NUM)

You are searching for these different types of personal identifiable information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student's email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and its type.
Please, format the list in the following format:
<personal identifiable information> (<INFORMATION_TYPE>),
<personal identifiable information> (<INFORMATION_TYPE>)

TEXT:
{prompt}
OUTPUT:
[/INST]
'''

In [None]:
def format_prompt(prompt: str):
    """Build a few-shot Llama-2 chat prompt for PII extraction.

    Two solved examples are given as separate [INST] ... [/INST] turns with
    their answers, followed by a final turn whose TEXT is `prompt`.

    NOTE: earlier cells of this notebook define other variants of
    `format_prompt`; when all cells are run in order, this last definition is
    the one in effect.

    Args:
        prompt: the text in which PII should be found.

    Returns:
        The complete prompt string, ending with an open "[/INST]" turn.
    """
    # The second example's answer must quote the phone number that appears in
    # its TEXT (888-688-5461); answering with a different number would
    # demonstrate hallucination to the model. The formatting instruction is
    # stated once per turn.
    return f'''<s>[INST]
You are searching for these different types of personal identifiable information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student’s email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and its type.
Please, format the list in the following format:
<personal identifiable information> (<INFORMATION_TYPE>),
<personal identifiable information> (<INFORMATION_TYPE>)

TEXT:
My name is Bryce and my sister's name is Sara. My email is tombombadill@gmail.com and my contact number is 830 688 0393.
OUTPUT:
[/INST]
Bryce (NAME_STUDENT),
Sara (NAME_STUDENT),
tombombadill@gmail.com (EMAIL),
830 688 0393 (PHONE_NUM)
</s>
<s>[INST]
You are searching for these different types of personal identifiable information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student's email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and its type.
Please, format the list in the following format:
<personal identifiable information> (<INFORMATION_TYPE>),
<personal identifiable information> (<INFORMATION_TYPE>)

TEXT:
John Doe , I live in the 123 Main Street. My website is www.seanhalpin.xyz and my contact number is 888-688-5461.
OUTPUT:
[/INST]
John Doe (NAME_STUDENT),
123 Main Street (STREET_ADDRESS),
www.seanhalpin.xyz (URL_PERSONAL),
888-688-5461 (PHONE_NUM)
</s>
<s>[INST]
You are searching for these different types of personal identifiable information:

NAME_STUDENT - The full or partial name of a student that is not necessarily the author of the essay. This excludes instructors, authors, and other person names.
EMAIL - A student's email address.
USERNAME - A student's username on any platform.
ID_NUM - A number or sequence of characters that could be used to identify a student, such as a student ID or a social security number.
PHONE_NUM - A phone number associated with a student.
URL_PERSONAL - A URL that might be used to identify a student.
STREET_ADDRESS - A full or partial street address that is associated with the student, such as their home address.

You will be given a TEXT, and your OUTPUT will be a list of each instance of personal identifiable information and its type.
Your OUTPUT should have the following format:
<personal identifiable information> (<INFORMATION_TYPE>),
<personal identifiable information> (<INFORMATION_TYPE>)

TEXT:
{prompt}
OUTPUT:
[/INST]
'''

In [None]:
train_data_path = "pii-detection-data/train.json"
test_data_path = "pii-detection-data/test.json"

# Loading Dataset
# Open the JSON files as UTF-8 explicitly so decoding does not depend on the
# platform's default encoding.
with open(train_data_path, encoding="utf-8") as file:
    train_data_json = json.load(file)
print("Training Data: ", len(train_data_json))

with open(test_data_path, encoding="utf-8") as file:
    test_data_json = json.load(file)
print("Test Data: ", len(test_data_json))

In [None]:
# Limiting the data for testing
# Fraction of the training set to evaluate on; kept small so a run stays quick.
TRAIN_SAMPLE_FRACTION = 0.002

# Keep at least one sample: later cells index train_data[0], which would fail
# if the fraction rounded down to zero on a small dataset.
train_data_size = max(1, int(len(train_data_json) * TRAIN_SAMPLE_FRACTION))
print("Train Data Size: ", train_data_size)

train_data = train_data_json[:train_data_size]

In [None]:
# Preview the prompt built from the first 400 tokens of the first training sample.
input_text = " ".join(train_data[0]["tokens"][:400])
format_prompt(input_text)

In [None]:
# Setting Hyperparameters
# temperature=0 requests greedy decoding; max_tokens caps the generated length.
sampling_params = SamplingParams(temperature=0, max_tokens=2048)

In [None]:
# Test Input
# Everything after the first 400 tokens of the first training sample, joined with spaces.
input_text = " ".join(train_data[0]["tokens"][400:])
input_text

In [None]:
# Testing Model
# Generate a completion for the single test prompt built from `input_text`.
outputs = llm.generate(
    [format_prompt(input_text)],
    sampling_params
    )

# Print the outputs.
# `generated_text` keeps the text of the last result; the next cell parses it.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print()
    print("Generated text:\n", generated_text)

In [None]:
# Split the generation into lines, dropping the comma that ends each list item.
outputs = re.split(r',?\n', generated_text)
print(outputs)
# Keep only lines shaped like "<text> (<LABEL>)" with a known PII label.
# The pattern is a raw f-string so that \S, \s and \( reach the regex engine
# unchanged instead of being treated as (invalid) string escape sequences.
outputs = [output.strip() for output in outputs if re.search(rf"\S+\s?\(({pii_labels_pattern})\)", output)]
print("List of PII:\n", outputs)

In [None]:
train_text_input_ids = []
train_labels_input_ids = []
max_length = 400
total_classifications = 0
num_misclassified = 0
num_hallucinated = 0

# Debugging context reported by the error handler below. It is reset here so
# the handler never reads undefined names or stale values left by earlier cells.
input_text = output_labels = generated_text = outputs = labeled_output = None


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


try:
    for i, data in enumerate(train_data):
        print("Processing Sample:", i)
        # Loop through data in batches of max_length tokens
        inputs, labels = get_batches(data, max_length)
        print("Number of Text Splits:", len(inputs))

        model_outputs = llm.generate(
            [format_prompt(chunk) for chunk in inputs],
            sampling_params
            )
        print()

        # Score every chunk of this sample against its expected labels.
        for input_text, output_labels, model_output in zip(inputs, labels, model_outputs):
            labeled_output = None
            generated_text = model_output.outputs[0].text

            # Process output text: keep only lines shaped like "<text> (<LABEL>)".
            outputs = re.split(r',?\n', generated_text)
            outputs = [output.strip() for output in outputs if re.search(rf"\S+\s?\(({pii_labels_pattern})\)", output)]
            print("List of PII:\n", outputs)

            expected_labels = len(output_labels) - output_labels.count('O')
            total_classifications += expected_labels

            if(not outputs):
                # Nothing usable was generated: every expected PII token counts as missed.
                num_misclassified += expected_labels
                print('Invalid Output:')
                print("Input:\n", input_text)
                print("Generated Text:\n", generated_text)
                print("Labels:\n", output_labels)

                continue

            output_text = '\n'.join(outputs)

            # Assigning Labels
            labeled_output = assign_labels(input_text, output_text)
            curate_labels(labeled_output)

            print("Input:\n", input_text)
            print("Generated Text:\n", generated_text)
            print("Labels:\n", output_labels)
            print("Output:\n", labeled_output)

            # NOTE(review): the chunk text is re-tokenized inside assign_labels;
            # this check fails if that yields a different token count than the
            # dataset's own tokenization - confirm the two always agree.
            assert len(output_labels) == len(labeled_output)

            # Comparing output with expected labels
            for predicted, expected in zip(labeled_output, output_labels):
                if(predicted == expected):
                    continue

                if(expected == 'O'):
                    # The model tagged a token that carries no PII.
                    num_hallucinated += 1
                    num_misclassified += 1
                    total_classifications += 1
                else:
                    num_misclassified += 1

            print("Number Hallucinated:", num_hallucinated)
            print("Total Missclassified:", num_misclassified)

        # The denominators are zero until at least one PII token (or one
        # hallucination) has been counted, so the ratios are computed safely.
        print()
        print("Misclassification:", _safe_ratio(num_misclassified, total_classifications))
        print("Accuracy:", _safe_ratio(total_classifications - num_misclassified, total_classifications))
        print("Accuracy Excluding Hallucinations:", _safe_ratio(total_classifications - num_misclassified, total_classifications - num_hallucinated))
        print()

except Exception as error:
    print("\nError Occured for the following input:")
    print("INPUT:", input_text)
    print("EXPECTED OUTPUT:", output_labels)
    print("GENERATED TEXT:", generated_text)
    print("PROCESSED OUTPUT:", outputs)
    print("LABELED OUTPUT:", labeled_output)
    print("ERROR:", error)
    # Re-raise so the traceback is shown and partial metrics are not mistaken
    # for a completed evaluation.
    raise
