<a href="https://colab.research.google.com/github/jenniferreyesdev/Synthetic-Data-For-PII-Masking/blob/main/JReyes_PII_Legal_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Prerequisites**

Install the required packages from PyPI and download the spaCy English model used by Presidio:

In [1]:
!pip install presidio_analyzer
!pip install presidio_anonymizer
!python -m spacy download en_core_web_lg
!pip install pdfminer.six
!pip install pikepdf
!pip install pandas
!pip install numpy
!pip install sdv
!pip install datasets
!pip install -U accelerate
!pip install -U transformers
!pip install torch==1.13.1


2024-01-24 06:03:24.938051: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-24 06:03:24.942302: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-24 06:03:24.944101: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-24 06:03:24.959299: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-lg==3.6.0
  Downloading https:

In [2]:
# For Presidio
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# For extracting text
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine

#For grouping
from operator import itemgetter
from itertools import groupby
import pandas as pd
import numpy as np

#For Transform & Anonymize
from sdv.metadata import SingleTableMetadata
from rdt import HyperTransformer
from rdt.transformers.pii import PseudoAnonymizedFaker


import logging
from functools import partial
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union
import click
import numpy as np
from datasets import Dataset, load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
PreTrainedTokenizer,
Trainer,
TrainingArguments,
set_seed,
)

from transformers import (pipeline,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    TopKLogitsWarper,
    TemperatureLogitsWarper,
    StoppingCriteriaList,
    MaxLengthCriteria,
    Trainer,
    TrainingArguments,
    GenerationConfig)
import os
import torch

from google.colab import drive
# Flush and unmount Google Drive; when no drive is mounted this only prints a notice.
drive.flush_and_unmount()


Drive not mounted, so nothing to flush and unmount.


**Analyze the text in the PDF**

In [3]:
# Run Presidio over every text container of the PDF and record, for each
# detected entity, its text, its entity type, and where it sits in its paragraph.
analyzer = AnalyzerEngine()

analyzed_character_sets = []   # one {"characters", "entity_type"} record per detection
characters_final = []          # per-paragraph character lists, in document order
characters_2 = []              # paragraph character list, repeated once per detection
start_lst = []                 # detection start offsets (parallel to characters_2)
end_lst = []                   # detection end offsets (parallel to characters_2)

for page_layout in extract_pages("./PII_Legal_LLM_JReyes.pdf"):
    for text_container in page_layout:
        # Only text containers carry a paragraph of text; skip everything else.
        if not isinstance(text_container, LTTextContainer):
            continue

        text_to_anonymize = text_container.get_text()

        # Analyze the paragraph using the analyzer engine
        analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')

        # The paragraph as a list of single characters
        characters = list(text_to_anonymize)

        # Slice out the characters that match each analyzer result.
        for result in analyzer_results:
            start = result.start
            end = result.end
            analyzed_character_sets.append({
                "characters": ''.join(characters[start:end]),
                "entity_type": result.entity_type,
            })
            characters_2.append(characters)
            start_lst.append(start)
            end_lst.append(end)
        characters_final.append(characters)

# One list of detected strings per entity type (a Series indexed by entity_type).
df = pd.DataFrame.from_records(analyzed_character_sets).groupby(['entity_type'])['characters'].apply(list)

# A PERSON detection that spans several lines is broken into one entry per line.
new_person = []
for x in df.PERSON:
  new_person.extend(x.split('\n'))
df.PERSON = new_person

# One column per entity type; shorter columns are padded with NaN.
data = pd.DataFrame({k: pd.Series(v) for k, v in df.items()})
print(data)




    DATE_TIME      IN_PAN      LOCATION MEDICAL_LICENSE           PERSON  \
0  1982-02-22  Rogersfurt      TN 48511       CV6642133      James Clark   
1  1971-04-20  withdrawal  West Gabriel       EX4578247  Kathryn Alvarez   
2  2018-11-19         NaN           NaN             NaN  Nathaniel Smith   
3  2007-06-17         NaN           NaN             NaN      North Aaron   
4         NaN         NaN           NaN             NaN    Billy Parrish   

        PHONE_NUMBER  
0       805-454-3206  
1       668.951.1735  
2  971.439.0527x7186  
3                NaN  
4                NaN  


**Transform & Anonymize Data**

In [4]:
# --- Describe the table -----------------------------------------------------
# NOTE(review): `metadata` is configured here but is not passed to the
# HyperTransformer below; it is kept in case something outside this cell uses it.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=data)
metadata.update_column(
    column_name='DATE_TIME',
    sdtype='datetime',
    datetime_format='%Y-%m-%d')
metadata.update_column(
    column_name='MEDICAL_LICENSE',
    sdtype='bban')
metadata.update_column(
    column_name='PERSON',
    sdtype='name')
metadata.update_column(
    column_name='PHONE_NUMBER',
    sdtype='phone_number')

# --- Fit the pseudo-anonymizing transformers --------------------------------
# NOTE(review): DATE_TIME is marked 'pii' but gets no PseudoAnonymizedFaker and
# its mapping is never collected, so dates are left as-is in the final text.
synthesizer = HyperTransformer()
synthesizer.detect_initial_config(data)
synthesizer.update_sdtypes(column_name_to_sdtype={
  'DATE_TIME': 'pii',
  'MEDICAL_LICENSE': 'pii',
  'PERSON': 'pii',
  'PHONE_NUMBER': 'pii'
})
synthesizer.update_transformers(column_name_to_transformer={
    'MEDICAL_LICENSE': PseudoAnonymizedFaker(provider_name='bank', function_name='bban'),
    'PERSON': PseudoAnonymizedFaker(provider_name='person', function_name='name'),
    'PHONE_NUMBER': PseudoAnonymizedFaker(provider_name='phone_number', function_name='phone_number'),
})
synthesizer.fit(data)

# Read the fitted transformers from a single config snapshot.
fitted_transformers = synthesizer.get_config()['transformers']
med_license = fitted_transformers['MEDICAL_LICENSE']
paf = fitted_transformers['PERSON']
phone_number = fitted_transformers['PHONE_NUMBER']

# --- Collect original -> fake pairs ------------------------------------------
# pii_k[i] is an original value and pii_v[i] its replacement; the
# reverse-transformation cell relies on these two parallel arrays.
pii_k = []
pii_v = []
for fitted in (med_license, paf, phone_number):
    for k, v in fitted.get_mapping().items():
        pii_k.append(k)
        pii_v.append(v)

pii_k = np.array(pii_k)
pii_v = np.array(pii_v)

# Dictionary view of the same pairs for direct lookup. setdefault keeps the
# first pair when the same original value appears in more than one column.
pii_lookup = {}
for original_value, fake_value in zip(pii_k.tolist(), pii_v.tolist()):
    pii_lookup.setdefault(original_value, fake_value)

# --- Replace every detected span in the document text ------------------------
# Flatten the per-paragraph character lists into one string.
characters_final = ''.join([item for sublist in characters_final for item in sublist])
for span_chars, span_start, span_end in zip(characters_2, start_lst, end_lst):
  pii_1 = ''.join(span_chars[span_start:span_end])
  if pii_1 in pii_lookup:
    candidates = [pii_1]
  else:
    # PERSON spans containing a newline were split into one value per line
    # before fitting, so the raw span has no mapping of its own; fall back to
    # its individual lines so those values are masked as well.
    candidates = [part for part in pii_1.split('\n') if part in pii_lookup]
  for candidate in candidates:
    # An empty search string would insert the fake value between every character.
    if candidate:
      characters_final = characters_final.replace(candidate, pii_lookup[candidate])

#Final Anonymized string which hides PII
print(characters_final)





DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Dear Madam/Sir,
Colleen Williams
13002 Roberts Mountain Apt. 930
Beckshire, GU 38939
Phone No.: 274.357.8632x2124
Has account number IVNOGH868803981530138943623 been closed?
Thanks,
Dated: 1982-02-22
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Hello,
Bryan Harris
31253 Smith Bypass Suite 355
Rogersfurt, IA 59691
Phone No.: (705)540-8087
Please expedite the credit card application for account number EITXKT500282463110241508389. When
will this application be ready?
Best,
Dated: 1971-04-20
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Debbie Byrd
5271 Bolton Cliffs Suite 617
Robert Hunt, TN 48511
Phone No.: 001-591-758-3596x3282
Has the mortgage request been processed for account number QQQS27528544402592?
Best,
Dated: 2018-11-19
ACCOUNT WITHDRAWAL REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Andrea Lawrence
2516 Sara Mountains
West Gabriel, NY 17877
Phone No.: 411-904-6891
Ca

**Reverse Transformation of PII**

In [5]:
# Undo the masking: walk the fake/original pairs in order and put each
# original value back wherever its fake replacement appears in the text.
characters_original = characters_final
for fake_value, original_value in zip(pii_v, pii_k):
  characters_original = characters_original.replace(fake_value, original_value)
print(characters_original)

DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Dear Madam/Sir,
James Clark
13002 Roberts Mountain Apt. 930
Beckshire, GU 38939
Phone No.: 805-454-3206
Has account number IVCV66421338943623 been closed?
Thanks,
Dated: 1982-02-22
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Hello,
Kathryn Alvarez
31253 Smith Bypass Suite 355
Rogersfurt, IA 59691
Phone No.: 668.951.1735
Please expedite the credit card application for account number EIEX45782471508389. When
will this application be ready?
Best,
Dated: 1971-04-20
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Nathaniel Smith
5271 Bolton Cliffs Suite 617
North Aaron, TN 48511
Phone No.: 971.439.0527x7186
Has the mortgage request been processed for account number QQQS27528544402592?
Best,
Dated: 2018-11-19
ACCOUNT WITHDRAWAL REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Billy Parrish
2516 Sara Mountains
West Gabriel, NY 17877
Phone No.: 411-904-6891
Can you please prepare a withd

**Extract questions from anonymized string for use in LLM**

In [6]:
def extract_questions(text, start_marker='No.:', end_marker='?'):
  """Pull the question out of each letter in ``text``.

  For every occurrence of ``start_marker`` the text up to the next
  ``end_marker`` that follows it is taken, its first line (the remainder of the
  marker's own line, e.g. the phone number) is dropped, and the remaining lines
  are joined with single spaces. The end marker itself is not included.

  Args:
    text: The full document text to scan.
    start_marker: Substring that precedes each question block.
    end_marker: Substring that terminates each question.

  Returns:
    A list of extracted question strings, in document order.
  """
  extracted = []
  remaining = text
  while True:
    marker_idx = remaining.find(start_marker)
    if marker_idx == -1:
      break
    # Skip the marker and the single character that follows it.
    body_start = marker_idx + len(start_marker) + 1
    # Look for the terminator only after the marker; stop when there is none
    # instead of looping forever on an unterminated trailing block.
    end_idx = remaining.find(end_marker, body_start)
    if end_idx == -1:
      break
    body = remaining[body_start:end_idx]
    extracted.append(' '.join(body.split('\n')[1:]))
    remaining = remaining[end_idx + 1:]
  return extracted


words = characters_final
print(words)

questions = np.array(extract_questions(words))
print(questions)
labels = np.array(['Sure. Your debit card is on its way.', 'Of course. Your debit card is on its way.', 'Yes, you do.', 'Right away.'])

# The labels are hand-written for this document; fail with a clear message if
# the number of extracted questions no longer matches.
if len(questions) != len(labels):
  raise ValueError(
      f"Extracted {len(questions)} question(s) but {len(labels)} label(s) are defined.")

fine_tuning_training_set = pd.DataFrame({'text':questions, 'label':labels})
print(fine_tuning_training_set)



DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Dear Madam/Sir,
Colleen Williams
13002 Roberts Mountain Apt. 930
Beckshire, GU 38939
Phone No.: 274.357.8632x2124
Has account number IVNOGH868803981530138943623 been closed?
Thanks,
Dated: 1982-02-22
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Hello,
Bryan Harris
31253 Smith Bypass Suite 355
Rogersfurt, IA 59691
Phone No.: (705)540-8087
Please expedite the credit card application for account number EITXKT500282463110241508389. When
will this application be ready?
Best,
Dated: 1971-04-20
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Debbie Byrd
5271 Bolton Cliffs Suite 617
Robert Hunt, TN 48511
Phone No.: 001-591-758-3596x3282
Has the mortgage request been processed for account number QQQS27528544402592?
Best,
Dated: 2018-11-19
ACCOUNT WITHDRAWAL REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Andrea Lawrence
2516 Sara Mountains
West Gabriel, NY 17877
Phone No.: 411-904-6891
Ca

**Query the Language Model (FLAN-T5) with the Extracted Questions**

In [8]:

# Run each extracted question through a text2text-generation pipeline and
# print the generated answer.
model_name = "google/flan-t5-small"
nlp = pipeline("text2text-generation", model=model_name)
train_texts = fine_tuning_training_set.text.to_numpy().tolist()

min_len = 1
for x in train_texts:
  # Allow up to half as many tokens as the prompt has words, but never fewer
  # than min_len: a prompt of zero or one word would otherwise give max_length=0.
  max_len = max(min_len, len(x.split()) // 2)
  response = nlp(x, truncation=True, min_length=min_len, max_length=max_len)
  print(response)



[{'generated_text': 'no'}]
[{'generated_text': '- 16:00'}]
[{'generated_text': 'no'}]
[{'generated_text': 'Is there anything else I can help you with'}]
