<a href="https://colab.research.google.com/github/jenniferreyesdev/jenniferreyesdev/blob/main/JReyes_PII_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Prerequisites**

Install from PyPI:

In [None]:
!pip install presidio_analyzer
!pip install presidio_anonymizer
!python -m spacy download en_core_web_lg
!pip install pdfminer.six
!pip install pikepdf
!pip install pandas
!pip install numpy
!pip install sdv
!pip install datasets
!pip install -U accelerate
!pip install -U transformers
!pip install torch==1.13.1


Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
# For Presidio
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# For extracting text
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine

#For grouping
from operator import itemgetter
from itertools import groupby
import pandas as pd
import numpy as np

#For Transform & Anonymize
from sdv.metadata import SingleTableMetadata
from rdt import HyperTransformer
from rdt.transformers.pii import PseudoAnonymizedFaker


import logging
from functools import partial
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union
import click
import numpy as np
from datasets import Dataset, load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
PreTrainedTokenizer,
Trainer,
TrainingArguments,
set_seed,
)

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import os
import torch

# Colab-only: flush pending writes and unmount Google Drive.
# NOTE(review): no visible cell in this notebook mounts Drive or reads from it —
# confirm this call is still needed.
from google.colab import drive
drive.flush_and_unmount()


**Analyze the text in the PDF**

In [None]:
# Path of the PDF that is scanned for PII.
PDF_PATH = "./PII_Sample.pdf"

analyzer = AnalyzerEngine()

analyzed_character_sets = []  # one {"characters", "entity_type"} record per detected entity
characters_final = []         # character list of every text container, in extraction order
characters_2 = []             # per detected entity: the character list of its container
start_lst = []                # per detected entity: start offset inside that container
end_lst = []                  # per detected entity: end offset (exclusive) inside that container

for page_layout in extract_pages(PDF_PATH):
    for text_container in page_layout:
        if not isinstance(text_container, LTTextContainer):
            continue

        # The element is a LTTextContainer, containing a paragraph of text.
        text_to_anonymize = text_container.get_text()

        # Analyze the text using the analyzer engine
        analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')

        # Kept as a list of characters because the anonymization cell slices and joins it.
        characters = list(text_to_anonymize)

        # Record the matched text and its position for every analyzer result.
        for result in analyzer_results:
            analyzed_character_sets.append({
                "characters": text_to_anonymize[result.start:result.end],
                "entity_type": result.entity_type,
            })
            characters_2.append(characters)
            start_lst.append(result.start)
            end_lst.append(result.end)
        characters_final.append(characters)

# Group the matched strings by entity type. A plain dict is used so that an empty
# result set, or a document without any PERSON match, does not raise.
entities_by_type = {}
for record in analyzed_character_sets:
    entities_by_type.setdefault(record["entity_type"], []).append(record["characters"])

# A PERSON match may span several lines; split it into one name per line and drop
# the empty fragments a leading/trailing newline would leave behind.
new_person = [
    name
    for match in entities_by_type.get("PERSON", [])
    for name in match.split('\n')
    if name
]
if "PERSON" in entities_by_type:
    entities_by_type["PERSON"] = new_person

# Series of lists indexed by entity type (sorted, matching the former groupby ordering).
df = pd.Series(
    {entity_type: entities_by_type[entity_type] for entity_type in sorted(entities_by_type)},
    dtype=object,
).rename_axis("entity_type")

# One column per entity type; shorter columns are padded with NaN.
data = pd.DataFrame({
    entity_type: pd.Series(values, dtype=object) for entity_type, values in df.items()
})
print(data)




    DATE_TIME      LOCATION MEDICAL_LICENSE           PERSON  \
0  1982-02-22      TN 48511       CV6642133      James Clark   
1  1971-04-20  West Gabriel       EX4578247  Kathryn Alvarez   
2  2018-11-19           NaN             NaN  Nathaniel Smith   
3  2007-06-17           NaN             NaN      North Aaron   
4         NaN           NaN             NaN    Billy Parrish   

        PHONE_NUMBER  
0       805-454-3206  
1       668.951.1735  
2  971.439.0527x7186  
3                NaN  
4                NaN  


**Transform & Anonymize Data**

In [None]:
# SDV metadata describing the detected-entity table.
# NOTE(review): `metadata` is not referenced again in the visible cells; the
# HyperTransformer below is configured independently of it.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=data)
metadata.update_column(
    column_name='DATE_TIME',
    sdtype='datetime',
    datetime_format='%Y-%m-%d')
metadata.update_column(
    column_name='MEDICAL_LICENSE',
    sdtype='bban')
metadata.update_column(
    column_name='PERSON',
    sdtype='name')
metadata.update_column(
    column_name='PHONE_NUMBER',
    sdtype='phone_number')

# Configure and fit the transformer that swaps each PII value for a Faker-generated stand-in.
synthesizer = HyperTransformer()
synthesizer.detect_initial_config(data)
synthesizer.update_sdtypes(column_name_to_sdtype={
  'DATE_TIME': 'datetime',
  'MEDICAL_LICENSE': 'pii',
  'PERSON': 'pii',
  'PHONE_NUMBER': 'pii'
})
synthesizer.update_transformers(column_name_to_transformer={
    'MEDICAL_LICENSE': PseudoAnonymizedFaker(provider_name='bank', function_name='bban'),
    'PERSON': PseudoAnonymizedFaker(provider_name='person', function_name='name'),
    'PHONE_NUMBER': PseudoAnonymizedFaker(provider_name='phone_number', function_name='phone_number'),
})
synthesizer.fit(data)

# Fetch the fitted transformers once instead of rebuilding the config per column.
fitted_transformers = synthesizer.get_config()['transformers']
med_license = fitted_transformers['MEDICAL_LICENSE']
paf = fitted_transformers['PERSON']
phone_number = fitted_transformers['PHONE_NUMBER']

# Parallel arrays: pii_k[i] is an original value, pii_v[i] its replacement.
# The reverse-transformation cell relies on both names.
pii_k = []
pii_v = []
for transformer in (med_license, paf, phone_number):
    for original_value, fake_value in transformer.get_mapping().items():
        pii_k.append(original_value)
        pii_v.append(fake_value)

pii_k = np.array(pii_k)
pii_v = np.array(pii_v)

# original -> fake lookup; on duplicate originals the first entry wins, which is
# what the previous first-match array scan returned.
pii_lookup = {}
for original_value, fake_value in zip(pii_k.tolist(), pii_v.tolist()):
    pii_lookup.setdefault(original_value, fake_value)

# Flatten the per-container character lists into the full document text.
characters_final = ''.join(item for sublist in characters_final for item in sublist)

for container_chars, start, end in zip(characters_2, start_lst, end_lst):
    pii_1 = ''.join(container_chars[start:end])
    # PERSON matches were split on newlines before fitting, so a multi-line span is
    # stored under its individual lines; fall back to those so it is still replaced.
    candidates = [pii_1] if pii_1 in pii_lookup else pii_1.split('\n')
    for candidate in candidates:
        # Skip empty fragments: str.replace('') would insert the fake value between every character.
        if candidate and candidate in pii_lookup:
            characters_final = characters_final.replace(candidate, pii_lookup[candidate])

#Final Anonymized string which hides PII
print(characters_final)



DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Dear Madam/Sir,
Andrea Lawrence
13002 Roberts Mountain Apt. 930
Beckshire, GU 38939
Phone No.: (054)080-8788
Can you please provide an update for the debit card for the checking account number
IVTXKT500282463110248943623?
Thanks,
Dated: 1982-02-22
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Hello,
Colleen Williams
31253 Smith Bypass Suite 355
Rogersfurt, IA 59691
Phone No.: 745.786.3221x2401
Phone
Where is the card I ordered for my bank account number EINOGH868803981530131508389?
Best,
Dated: 1971-04-20
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Debbie Byrd
5271 Bolton Cliffs Suite 617
Bryan Harris, TN 48511
Phone No.: +1-539-158-3596x328
Is it confirmed if I have a new card coming in the mail for the checking account number
QQQS27528544402592?
Best,
Dated: 2018-11-19
ACCOUNT WITHDRAWAL REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Robert Hunt
2516 Sara Mountains
West Gab

**Reverse Transformation of PII**

In [None]:
# Undo the pseudonymization: walk the (fake, original) pairs in order and put
# each original value back wherever its stand-in occurs in the anonymized text.
characters_original = characters_final
for fake_value, original_value in zip(pii_v, pii_k):
  characters_original = characters_original.replace(fake_value, original_value)
print(characters_original)

DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Dear Madam/Sir,
James Clark
13002 Roberts Mountain Apt. 930
Beckshire, GU 38939
Phone No.: 805-454-3206
Can you please provide an update for the debit card for the checking account number
IVCV66421338943623?
Thanks,
Dated: 1982-02-22
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Hello,
Kathryn Alvarez
31253 Smith Bypass Suite 355
Rogersfurt, IA 59691
Phone No.: 668.951.1735
Phone
Where is the card I ordered for my bank account number EIEX45782471508389?
Best,
Dated: 1971-04-20
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Nathaniel Smith
5271 Bolton Cliffs Suite 617
North Aaron, TN 48511
Phone No.: 971.439.0527x7186
Is it confirmed if I have a new card coming in the mail for the checking account number
QQQS27528544402592?
Best,
Dated: 2018-11-19
ACCOUNT WITHDRAWAL REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Billy Parrish
2516 Sara Mountains
West Gabriel, NY 17877
Phone No.: 

**Extract questions from anonymized string for use in LLM**

In [None]:
words = characters_final

# A question is the text between a 'No.:' marker and the next '?' that follows it.
sub1 = 'No.:'
sub2 = '?'

questions = []
while True:
  idx1 = words.find(sub1)
  if idx1 == -1:
    break
  # Search for the terminator only after the marker. Stop when none is left:
  # previously a missing '?' left `words` unchanged and the loop never ended.
  idx2 = words.find(sub2, idx1 + len(sub1))
  if idx2 == -1:
    break

  # The extra +1 skips the character that directly follows the marker.
  res = words[idx1 + len(sub1) + 1:idx2]
  questions.append(res)
  # Continue scanning after the '?' that closed this question.
  words = words[idx2+1:]

questions = np.array(questions)
# Hard-coded labels, one per extracted question; the DataFrame constructor below
# raises if the counts differ.
# NOTE(review): presumably banking77 intent ids — confirm against the dataset.
labels = np.array([11,11,11,46])

fine_tuning_training_set = pd.DataFrame({'text':questions, 'label':labels})
print(fine_tuning_training_set)



                                                text  label
0  (054)080-8788\nCan you please provide an updat...     11
1  745.786.3221x2401\nPhone\nWhere is the card I ...     11
2  +1-539-158-3596x328\nIs it confirmed if I have...     11
3  411-904-6891\nCan you prepare a withdrawal req...     46


**LLM Fine-Tuning**

In [None]:
# `logging` is imported at the top of the notebook, but no visible cell creates the
# `logger` that the functions in this section use.
logger = logging.getLogger(__name__)


def _add_text(rec):
  """Add a formatted ``text`` field to one instruction/response record.

  Args:
    rec: Mapping with non-empty ``"instruction"`` and ``"response"`` entries
      and an optional ``"context"`` entry.

  Returns:
    The same record, with ``rec["text"]`` set.

  Raises:
    ValueError: If the instruction or the response is missing or empty.
  """
  instruction = rec["instruction"]
  response = rec["response"]
  context = rec.get("context")
  if not instruction:
    raise ValueError(f"Expected an instruction in: {rec}")
  if not response:
    raise ValueError(f"Expected a response in: {rec}")
  # NOTE(review): PROMPT_WITH_INPUT_FORMAT / PROMPT_NO_INPUT_FORMAT are not defined
  # in any visible cell — they must be provided before this function is called.
  if context:
    rec["text"] = PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, response=response, input=context)
  else:
    rec["text"] = PROMPT_NO_INPUT_FORMAT.format(instruction=instruction, response=response)
  # Return the record on both branches (previously only the no-context branch returned).
  return rec


def load_training_dataset(path_or_dataset: str = "PolyAI/banking77") -> Dataset:
  """Load the ``train`` split of a dataset and add a ``text`` column to every row.

  Args:
    path_or_dataset: Dataset identifier or path passed to ``load_dataset``.

  Returns:
    The ``train`` split after mapping ``_add_text`` over it.
  """
  logger.info(f"Loading dataset from {path_or_dataset}")
  dataset = load_dataset(path_or_dataset)["train"]
  logger.info("Found %d rows", dataset.num_rows)
  # NOTE(review): _add_text requires "instruction" and "response" fields — confirm
  # that the default dataset actually provides them.
  dataset = dataset.map(_add_text)
  return dataset

In [None]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed=42) -> Dataset:
  """Load the training set, run batch preprocessing on it, drop over-long rows and shuffle.

  Args:
    tokenizer: Tokenizer forwarded to ``preprocess_batch``.
    max_length: Forwarded to ``preprocess_batch``; rows whose ``input_ids`` are not
      strictly shorter than this are filtered out afterwards.
    seed: Seed passed to ``Dataset.shuffle``.

  Returns:
    The preprocessed, filtered and shuffled dataset.
  """
  raw_dataset = load_training_dataset()
  print('dataset')
  print(raw_dataset[0])

  logger.info("Preprocessing dataset")
  # Bind the fixed arguments so that map() only has to supply the batch.
  batch_fn = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
  columns_to_drop = ["instruction", "context", "response", "text", "category"]
  processed = raw_dataset.map(batch_fn, batched=True, remove_columns=columns_to_drop)
  logger.info("Processed dataset has %d rows", processed.num_rows)

  # Keep only the rows whose token count stays below max_length.
  kept = processed.filter(lambda rec: len(rec["input_ids"]) < max_length)
  logger.info("Processed dataset has %d rows after filtering for truncated records", kept.num_rows)

  logger.info("Shuffling dataset")
  shuffled = kept.shuffle(seed=seed)

  logger.info("Done preprocessing")
  return shuffled

In [None]:
def train(
  *,
  input_model: str,
  local_output_dir: str,
  dbfs_output_dir: str,
  epochs: int,
  per_device_train_batch_size: int,
  per_device_eval_batch_size: int,
  lr: float,
  seed: int,
  deepspeed: str,
  gradient_checkpointing: bool,
  local_rank: str,
  bf16: bool,
  logging_steps: int,
  save_steps: int,
  eval_steps: int,
  test_size: Union[float, int],
  save_total_limit: int,
  warmup_steps: int,
  ):
  """Fine-tune ``input_model`` on the preprocessed training set and save the result.

  Steps: seed the RNGs, load model and tokenizer, determine the maximum sequence
  length, preprocess and split the dataset, run a ``Trainer`` and save the model to
  ``local_output_dir`` and, when given, to ``dbfs_output_dir``.

  Args:
    input_model: Model name or path handed to ``get_model_tokenizer``.
    local_output_dir: Output directory for checkpoints, logs and the final model.
    dbfs_output_dir: Optional second location for the final model; skipped when falsy.
    epochs: Number of training epochs.
    per_device_train_batch_size: Training batch size per device.
    per_device_eval_batch_size: Evaluation batch size per device.
    lr: Learning rate.
    seed: Seed for ``set_seed``, the shuffle and the train/test split.
    deepspeed: Forwarded to ``TrainingArguments(deepspeed=...)``.
    gradient_checkpointing: Forwarded to model loading and ``TrainingArguments``.
    local_rank: Forwarded to ``TrainingArguments(local_rank=...)``.
    bf16: Forwarded to ``TrainingArguments(bf16=...)``; fp16 is always disabled.
    logging_steps: Logging interval in steps.
    save_steps: Checkpoint interval in steps.
    eval_steps: Evaluation interval in steps.
    test_size: Size of the held-out split, as accepted by ``train_test_split``.
    save_total_limit: Forwarded to ``TrainingArguments(save_total_limit=...)``.
    warmup_steps: Forwarded to ``TrainingArguments(warmup_steps=...)``.
  """
  # NOTE(review): get_model_tokenizer and DataCollatorForCompletionOnlyLM are not
  # defined in any visible cell — confirm they are provided before calling train().
  set_seed(seed)

  model, tokenizer = get_model_tokenizer(
    pretrained_model_name_or_path=input_model, gradient_checkpointing=gradient_checkpointing
  )

  # Use the same max length that the model supports. Fall back to 1024 if the setting can't be found.
  # The configuration for the length can be stored under different names depending on the model. Here we attempt
  # a few possible names we've encountered.
  conf = model.config
  max_length = None
  for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
    max_length = getattr(conf, length_setting, None)
    if max_length:
      logger.info(f"Found max lenth: {max_length}")
      break
  # Apply the default only after every candidate name was tried; doing this inside the
  # loop logged "Using default" even when a later name supplied a real value.
  if not max_length:
    max_length = 1024
    logger.info(f"Using default max length: {max_length}")

  processed_dataset = preprocess_dataset(tokenizer=tokenizer, max_length=max_length, seed=seed)

  split_dataset = processed_dataset.train_test_split(test_size=test_size, seed=seed)

  logger.info("Train data size: %d", split_dataset["train"].num_rows)
  logger.info("Test data size: %d", split_dataset["test"].num_rows)

  data_collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8
  )

  if not dbfs_output_dir:
    logger.warning("Will NOT save to DBFS")

  training_args = TrainingArguments(
    output_dir=local_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    fp16=False,
    bf16=bf16,
    learning_rate=lr,
    num_train_epochs=epochs,
    deepspeed=deepspeed,
    gradient_checkpointing=gradient_checkpointing,
    logging_dir=f"{local_output_dir}/runs",
    logging_strategy="steps",
    logging_steps=logging_steps,
    evaluation_strategy="steps",
    eval_steps=eval_steps,
    save_strategy="steps",
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    load_best_model_at_end=False,
    report_to="tensorboard",
    disable_tqdm=True,
    remove_unused_columns=False,
    local_rank=local_rank,
    warmup_steps=warmup_steps,
  )

  logger.info("Instantiating Trainer")

  trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    data_collator=data_collator,
  )

  logger.info("Training")
  trainer.train()

  logger.info(f"Saving Model to {local_output_dir}")
  trainer.save_model(output_dir=local_output_dir)

  # Only write the second copy when a DBFS directory was actually supplied.
  if dbfs_output_dir:
    logger.info(f"Saving Model to {dbfs_output_dir}")
    trainer.save_model(output_dir=dbfs_output_dir)

  logger.info("Done.")

In [None]:


# load pre-trained language model and tokenizer
# Imported here because the notebook's import cell does not provide this class.
from transformers import AutoModelForSequenceClassification

model_name = "microsoft/CodeGPT-small-java"
# Size of the classification head.
# NOTE(review): 77 assumes the hard-coded labels are banking77 intent ids — confirm.
NUM_LABELS = 77

tokenizer = AutoTokenizer.from_pretrained(model_name)
# The training data carries one integer class label per text. A causal-LM head expects
# token-level labels shaped like input_ids and fails on such targets, so a
# sequence-classification head is loaded instead.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)
# The batch is padded below; tell the model which id is padding so it can locate the
# last real token of each sequence.
model.config.pad_token_id = tokenizer.pad_token_id

# prepare data for fine-tuning
train_texts = fine_tuning_training_set['text'].tolist()
train_labels = [int(label) for label in fine_tuning_training_set['label']]
assert all(0 <= label < NUM_LABELS for label in train_labels), "label outside the classifier's range"

print(train_texts)
print(train_labels)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)

class FTDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per sample.

    ``encodings`` is a mapping from field name to a per-sample indexable collection;
    ``labels`` is an indexable collection whose length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset has as many samples as there are labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested sample ...
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # ... plus the sample's label under the 'label' key.
        sample['label'] = torch.tensor(self.labels[idx])
        return sample

train_dataset = FTDataset(train_encodings, train_labels)
print(train_dataset[0])

# fine-tune the model
# No evaluation strategy is requested: the Trainer below receives no eval_dataset,
# so asking for per-epoch evaluation could not succeed.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
trainer.train()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['(054)080-8788\nCan you please provide an update for the debit card for the checking account number\nIVTXKT500282463110248943623', '745.786.3221x2401\nPhone\nWhere is the card I ordered for my bank account number EINOGH868803981530131508389', '+1-539-158-3596x328\nIs it confirmed if I have a new card coming in the mail for the checking account number\nQQQS27528544402592', '411-904-6891\nCan you prepare a withdrawal request in the amount of $15,000 from my checking account\nnumber TTVY24567636706751']
[11, 11, 11, 46]
{'input_ids': tensor([   10,    18,  6065,    11,    18,  2679,    15,  5939,  4760,   201,
         2203,  5749, 16276,  8959,   916,  1400,   438,   463,   578,  4449,
         8348,   438,   463, 10801,  4198,  2276,   201,  5697,  9993, 36500,
         7943,  4422,  2936, 12151, 22031,  2936,  5775,    22,  5629,  3987,
            1,     1,     1,     1,     1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       



ValueError: ignored