In [0]:
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting presidio_analyzer
  Using cached presidio_analyzer-2.2.354-py3-none-any.whl (92 kB)
Collecting presidio_anonymizer
  Using cached presidio_anonymizer-2.2.354-py3-none-any.whl (31 kB)
Collecting phonenumbers<9.0.0,>=8.12
  Using cached phonenumbers-8.13.34-py2.py3-none-any.whl (2.6 MB)
Collecting tldextract
  Using cached tldextract-5.1.2-py3-none-any.whl (97 kB)
Collecting pycryptodome>=3.10.1
  Using cached pycryptodome-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
Collecting requests-file>=1.4
  Using cached requests_file-2.0.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: phonenumbers, pycryptodome, requests-file, presidio_anonymizer, tldextract, presidio_analyzer
Successfully installed phonenumbers-8.13.34 presidio_analyzer-2.2.354 presidio_anonymizer-2.2.354 pycryptodome-3.20.0 requests-file-2.0.0 tldextract-5.1.2
[4

In [0]:
%pip install openai --upgrade
%pip install databricks-genai
%pip install databricks-genai-inference
%pip install mlflow

dbutils.library.restartPython()

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting openai
  Using cached openai-1.17.1-py3-none-any.whl (268 kB)
Collecting typing-extensions<5,>=4.7
  Using cached typing_extensions-4.11.0-py3-none-any.whl (34 kB)
Collecting httpx<1,>=0.23.0
  Using cached httpx-0.27.0-py3-none-any.whl (75 kB)
Collecting httpcore==1.*
  Using cached httpcore-1.0.5-py3-none-any.whl (77 kB)
Collecting h11<0.15,>=0.13
  Using cached h11-0.14.0-py3-none-any.whl (58 kB)
Installing collected packages: typing-extensions, h11, httpcore, httpx, openai
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.4.0
    Not uninstalling typing-extensions at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-66aaf5d2-c2e1-4dfe-9332-cebd23419596
    Can't uninstall 'typing_extensions'. No files were found to uninstall.
  Attempting uninstall: open

In [0]:
import pprint
import os
import pandas as pd
from openai import OpenAI
import mlflow
import json
import re

databricks_token = mlflow.utils.databricks_utils.get_databricks_host_creds().token

In [0]:
from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalyzerEngine, RecognizerRegistry
import mlflow
from openai import OpenAI
from typing import List

class DBRXRecognizer(EntityRecognizer):
    def __init__(self, model="databricks-dbrx-instruct", api_key=None, supported_entities=None):
        self.model = model
        self.api_key = api_key
        self.supported_language = "en" 
        self.supported_entities = supported_entities if supported_entities else self.ENTITIES
        self.client = OpenAI(
            api_key=self.api_key,
            base_url="https://e2-demo-field-eng.cloud.databricks.com/serving-endpoints"
        )
        
        super().__init__(
            supported_entities=self.supported_entities,
            supported_language=self.supported_language,
            name="DBRX Recognizer",
        )

    def load(self) -> None:
        """Load the model, not used as model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """Return supported entities by this model."""
        return self.supported_entities

    def analyze(self, text, entities, language="en", nlp_artifacts=None) -> List[RecognizerResult]:
        prompt = f"""
            Your role is to identify and list personally identifiable information such as names, locations, and organizations in the given text.

            Instructions:

            Your response should be an array of valid json objects with findings and nothing else. No commentary or additional explanations. 
            Here is an example of how you should respond: [{{"type": "PERSON", "start": "10", "end": "14", "score": "1.0"}}, {{"type": "LOCATION", "start": "27", "end": "37", "score": "1.0"}}]
            The score is your level of confidence for the detected personally identifiable information.
            Provide nothing else in the response other than the findings array, no commentary, explanations, or anything other than array response.
            input: my name is juan and I live in New Jersey
            output: [{{"type": "PERSON", "start": "12", "end": "16", "score": "1.0"}}, {{"type": "LOCATION", "start": "27", "end": "37", "score": "1.0"}}]
            input: {{text}}
            output: 
        """

        response = self.client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an AI system helping detect, classify, and anonymize sensitive PII data"},
                {"role": "user", "content": prompt}
            ],
            model=self.model,
            max_tokens=1024
        )

        detected_entities = self.parse_response(response)
        return detected_entities


    def parse_response(self, response):

        results = []
        if response.choices and len(response.choices) > 0:
            try:
                completion_text = response.choices[0].message.content

                print("completion_text -> \n")
                print(completion_text)

                data = json.loads(completion_text)
                
                print("data -> \n")
                print(data)

                # Iterate over each item in the parsed JSON data
                for item in data:
                    entity_type = item.get("type")
                    start_pos = item.get("start")
                    end_pos = item.get("end")
                    score = item.get("score")

                    # Convert start_pos and end_pos to integers
                    if start_pos is not None:
                        start_pos = int(start_pos)
                    if end_pos is not None:
                        end_pos = int(end_pos)

                    # Validate all required fields are present
                    if all([entity_type, start_pos is not None, end_pos is not None, score is not None]):
                        result = RecognizerResult(
                            entity_type=entity_type,
                            start=start_pos,
                            end=end_pos,
                            score=score
                        )
                        results.append(result)

            except json.JSONDecodeError:
                print("Error decoding JSON from response")

        return results



In [0]:
# Usage example
api_key = mlflow.utils.databricks_utils.get_databricks_host_creds().token
supported_entities = ["PERSON", "LOCATION", "ORGANIZATION"]

openai_recognizer = DBRXRecognizer(model="databricks-dbrx-instruct", api_key=api_key, supported_entities=supported_entities)

registry = RecognizerRegistry()
registry.add_recognizer(openai_recognizer)

analyzer = AnalyzerEngine(registry=registry)

text = "Hello, my name is Juan and I live in New Jersey and work for Databricks."
results = analyzer.analyze(text=text, language="en", return_decision_process=True)

print("final results -> \n ")
print(results)

parse_response -> 

ChatCompletion(id='5ac7936a-f890-445b-93c2-65669cc45752', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='[{"type": "PERSON", "start": "<start_index_of_name>", "end": "<end_index_of_name>", "score": "0.9"}, {"type": "LOCATION", "start": "<start_index_of_location>", "end": "<end_index_of_location>", "score": "0.9"}]\n\nPlease note that the start and end indices should correspond to the actual start and end positions of the PII data in the input text. Also, the confidence score is set to 0.9 as an example, please adjust it based on your actual confidence level.\n\nFor example, if the input text is "Hi, I am John from New York", the output should be:\n\n[{"type": "PERSON", "start": "7", "end": "11", "score": "0.9"}, {"type": "LOCATION", "start": "20", "end": "27", "score": "0.9"}]\n\nWhere "John" is the name and "New York" is the location.', role='assistant', function_call=None, tool_calls=None))], created=1713189971