**Prerequisites**

Install from PyPI:

In [1]:
!pip install presidio_analyzer
!pip install presidio_anonymizer
!python -m spacy download en_core_web_lg
!pip install pdfminer.six
!pip install pikepdf
!pip install pandas
!pip install numpy
!pip install sdv
!pip install datasets
!pip install -U accelerate
!pip install -U transformers
!pip install torch==1.13.1


Collecting presidio_analyzer
  Downloading presidio_analyzer-2.2.351-py3-none-any.whl (80 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/80.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.1/80.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting tldextract (from presidio_analyzer)
  Downloading tldextract-5.1.0-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.7/97.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting phonenumbers<9.0.0,>=8.12 (from presidio_analyzer)
  Downloading phonenumbers-8.13.24-py2.py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests-file>=1.4 (from tldextract->presidio_analyzer)
  Downloading requests_file-1.5.1-py2.py3-none-any.whl (3.7 kB)
Installing collected packages: phonenumbers, requests-file, tldextract, 

Collecting sdv
  Downloading sdv-1.6.0-py2.py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2,>=1.15.0 (from sdv)
  Downloading boto3-1.28.84-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<2,>=1.18 (from sdv)
  Downloading botocore-1.31.84-py3-none-any.whl (11.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
Collecting copulas<0.10,>=0.9.0 (from sdv)
  Downloading copulas-0.9.2-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctgan<0.8,>=0.7.4 (from sdv)
  Downloading ctgan-0.7.5-py2.py3-none-any.whl (27 kB)
Collecting deepecho<0.5,>=0.4.2 (from sdv

In [24]:
# For Presidio
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# For extracting text
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTTextLine

#For grouping
from operator import itemgetter
from itertools import groupby
import pandas as pd
import numpy as np

#For Transform & Anonymize
from sdv.metadata import SingleTableMetadata
from rdt import HyperTransformer
from rdt.transformers.pii import PseudoAnonymizedFaker


import logging
from functools import partial
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union
import click
import numpy as np
from datasets import Dataset, load_dataset
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
PreTrainedTokenizer,
Trainer,
TrainingArguments,
set_seed,
)

from transformers import BertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments
import os
import torch

from google.colab import drive
drive.flush_and_unmount()


Drive not mounted, so nothing to flush and unmount.


**Analyze the text in the PDF**

In [25]:
analyzer = AnalyzerEngine()

analyzed_character_sets = []
characters_final = []
characters_2 = []
start_lst = []
end_lst = []

for page_layout in extract_pages("./PII_Sample.pdf"):
    for text_container in page_layout:
        if isinstance(text_container, LTTextContainer):

            # The element is a LTTextContainer, containing a paragraph of text.
            text_to_anonymize = text_container.get_text()

            # Analyze the text using the analyzer engine
            analyzer_results = analyzer.analyze(text=text_to_anonymize, language='en')


            characters = list([])

            # Grab the characters from the PDF
            for text_line in text_to_anonymize:
                    characters.append(text_line)

            # Slice out the characters that match the analyzer results.
            for i in range(0, len(analyzer_results)):
                start = analyzer_results[i].start
                end = analyzer_results[i].end
                analyzed_character_sets.append({"characters": ''.join(characters[start:end]), "entity_type": analyzer_results[i].entity_type})
                characters_2.append(characters)
                start_lst.append(start)
                end_lst.append(end)
            characters_final.append(characters)
df = pd.DataFrame.from_records(analyzed_character_sets).groupby(['entity_type'])['characters'].apply(list)
new_person = []
for x in df.PERSON:
  if '\n' in x:
    for y in x.split('\n'):
      new_person.append(y)
  else:
    new_person.append(x)
df.PERSON = new_person
data = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in df.items() ]))
print(data)




    DATE_TIME      IN_PAN      LOCATION MEDICAL_LICENSE           PERSON  \
0  1982-02-22  Rogersfurt      TN 48511       CV6642133      James Clark   
1  1971-04-20  withdrawal  West Gabriel       EX4578247  Kathryn Alvarez   
2  2018-11-19         NaN           NaN             NaN  Nathaniel Smith   
3  2007-06-17         NaN           NaN             NaN      North Aaron   
4         NaN         NaN           NaN             NaN    Billy Parrish   

        PHONE_NUMBER  
0       805-454-3206  
1       668.951.1735  
2  971.439.0527x7186  
3                NaN  
4                NaN  


**Transform & Anonymize Data**

In [27]:
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=data)
metadata.update_column(
    column_name='DATE_TIME',
    sdtype='datetime',
    datetime_format='%Y-%m-%d')
metadata.update_column(
    column_name='MEDICAL_LICENSE',
    sdtype='bban')
metadata.update_column(
    column_name='PERSON',
    sdtype='name')
metadata.update_column(
    column_name='PHONE_NUMBER',
    sdtype='phone_number')
synthesizer = HyperTransformer()
synthesizer.detect_initial_config(data)
synthesizer.update_sdtypes(column_name_to_sdtype={
  'DATE_TIME': 'datetime',
  'MEDICAL_LICENSE': 'pii',
  'PERSON': 'pii',
  'PHONE_NUMBER': 'pii'
})
synthesizer.update_transformers(column_name_to_transformer={
    'MEDICAL_LICENSE': PseudoAnonymizedFaker(provider_name='bank', function_name='bban'),
    'PERSON': PseudoAnonymizedFaker(provider_name='person', function_name='name'),
    'PHONE_NUMBER': PseudoAnonymizedFaker(provider_name='phone_number', function_name='phone_number'),
})
synthesizer.fit(data)
med_license = synthesizer.get_config()['transformers']['MEDICAL_LICENSE']
paf = synthesizer.get_config()['transformers']['PERSON']
phone_number = synthesizer.get_config()['transformers']['PHONE_NUMBER']

pii_k = []
pii_v = []
for k, v in med_license.get_mapping().items():
    pii_k.append(k)
    pii_v.append(v)
for k, v in paf.get_mapping().items():
    pii_k.append(k)
    pii_v.append(v)
for k, v in phone_number.get_mapping().items():
    pii_k.append(k)
    pii_v.append(v)

pii_k = np.array(pii_k)
pii_v = np.array(pii_v)

characters_final = ''.join([item for sublist in characters_final for item in sublist])
for i in range(0, len(characters_2)):
  pii_1 = ''.join(characters_2[i][start_lst[i]:end_lst[i]])
  if pii_1 in characters_final:
    value_1 = pii_v[np.argwhere(pii_1==pii_k).flatten()]
    if value_1.size > 0:
      characters_final = characters_final.replace(pii_1, value_1[0])

#Final Anonymized string which hides PII
print(characters_final)



DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Dear Madam/Sir,
Robert Hunt
13002 Roberts Mountain Apt. 930
Beckshire, GU 38939
Phone No.: 274.357.8632x2124
Can you please provide an update for the debit card for the checking account number
IVTXKT500282463110248943623?
Thanks,
Dated: 1982-02-22
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Hello,
Debbie Byrd
31253 Smith Bypass Suite 355
Rogersfurt, IA 59691
Phone No.: 001-591-758-3596x3282
Phone
Where is the card I ordered for my bank account number EINOGH868803981530131508389?
Best,
Dated: 1971-04-20
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Bryan Harris
5271 Bolton Cliffs Suite 617
Colleen Williams, TN 48511
Phone No.: (705)540-8087
Is it confirmed if I have a new card coming in the mail for the checking account number
QQQS27528544402592?
Best,
Dated: 2018-11-19
ACCOUNT WITHDRAWAL REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Andrea Lawrence
2516 Sara Mountains
West G

**Reverse Transformation of PII**

In [28]:
characters_original = characters_final
for i in range(0, len(pii_v)):
  characters_original = characters_original.replace(pii_v[i], pii_k[i])
print(characters_original)

DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Dear Madam/Sir,
James Clark
13002 Roberts Mountain Apt. 930
Beckshire, GU 38939
Phone No.: 805-454-3206
Can you please provide an update for the debit card for the checking account number
IVCV66421338943623?
Thanks,
Dated: 1982-02-22
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Hello,
Kathryn Alvarez
31253 Smith Bypass Suite 355
Rogersfurt, IA 59691
Phone No.: 668.951.1735
Phone
Where is the card I ordered for my bank account number EIEX45782471508389?
Best,
Dated: 1971-04-20
DEBIT CARD REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Nathaniel Smith
5271 Bolton Cliffs Suite 617
North Aaron, TN 48511
Phone No.: 971.439.0527x7186
Is it confirmed if I have a new card coming in the mail for the checking account number
QQQS27528544402592?
Best,
Dated: 2018-11-19
ACCOUNT WITHDRAWAL REQUEST
To:
The Branch Manager
Bank of America
Good Morning,
Billy Parrish
2516 Sara Mountains
West Gabriel, NY 17877
Phone No.: 

**Extract questions from anonymized string for use in LLM**

In [53]:
words = characters_final

questions = []
while True:
  sub1 = 'No.:'
  sub2 = '?'

  idx1 = words.find(sub1)
  if idx1 == -1:
    break
  idx2 = words.find(sub2)

  for i in range(0, len(words)):
    res = ''
    for idx in range(idx1 + len(sub1) + 1, idx2):
        res = res + words[idx]
  res = ' '.join(res.split('\n')[1:])
  questions.append(res)
  words = words[idx2+1:]

questions = np.array(questions)
labels = np.array(['Sure.', 'It is on its way.', 'Yes', 'Yes'])

fine_tuning_training_set = pd.DataFrame({'text':questions, 'label':labels})
print(fine_tuning_training_set)



                                                text              label
0  Can you please provide an update for the debit...              Sure.
1  Phone Where is the card I ordered for my bank ...  It is on its way.
2  Is it confirmed if I have a new card coming in...                Yes
3  Can you prepare a withdrawal request in the am...                Yes


**LLM Model**

In [56]:


# load pre-trained language model and tokenizer
model_name = "prajjwal1/bert-tiny"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# prepare data for fine-tuning
train_texts = fine_tuning_training_set.text.to_numpy().tolist()
train_labels = fine_tuning_training_set.label.to_numpy().tolist()
print(train_texts)
print(train_labels)

input_ids = tokenizer(
    train_texts,
    return_tensors="pt", padding=True, truncation=True,
).input_ids

labels = tokenizer(
    train_texts,
    return_tensors="pt", padding=True, truncation=True,
).input_ids

print('training...')

# the forward function automatically creates the correct decoder_input_ids
loss = model(input_ids=input_ids, labels=labels).loss
print('loss')
print(loss)

banking_question_example = (
    "Is my debit card on its way?"
)
input_ids = tokenizer(banking_question_example, return_tensors="pt", padding=True, truncation=True).input_ids

# autoregressively generate summary (uses greedy decoding by default)
generated_ids = model.generate(input_ids)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print('example prompt')
print(banking_question_example)
print(generated_text)

Some weights of BertLMHeadModel were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.1.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.1.crossattention.self.value.bias', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.self.query.bias', 'bert.encoder

['Can you please provide an update for the debit card for the checking account number IVTXKT500282463110248943623', 'Phone Where is the card I ordered for my bank account number EINOGH868803981530131508389', 'Is it confirmed if I have a new card coming in the mail for the checking account number QQQS27528544402592', 'Can you prepare a withdrawal request in the amount of $15,000 from my checking account number TTVY24567636706751']
['Sure.', 'It is on its way.', 'Yes', 'Yes']
training...
loss
tensor(10.6331, grad_fn=<NllLossBackward0>)


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


example prompt
Is my debit card on its way?
) ( ( ) ) ) ( ( ( ( ( ) ) ) ( ( ( ( (
