In [None]:
# Install the Presidio analyzer and anonymizer packages into the notebook environment.
# NOTE(review): versions are unpinned, so a rerun may pull different releases.
%pip install presidio_analyzer presidio_anonymizer
# Download the large English spaCy model (presumably for the default NLP engine
# that AnalyzerEngine builds when none is passed — confirm).
# NOTE(review): `!python` runs in a subshell; verify it targets the same
# interpreter as the notebook kernel so the model is importable afterwards.
!python -m spacy download en_core_web_lg

In [None]:
# Upgrade the OpenAI client; it is used below to call the serving endpoint.
%pip install openai --upgrade
# %pip install databricks-genai
# NOTE(review): databricks-genai-inference is not imported in the visible cells —
# confirm it is still needed.
%pip install databricks-genai-inference
# %pip install mlflow

# Restart the Python process so the packages installed above are picked up.
# NOTE(review): presumably this clears Python state, so keep it ahead of the
# import cells — confirm against the Databricks utilities documentation.
dbutils.library.restartPython()

In [None]:
import pprint
import os
import pandas as pd
from openai import OpenAI
import mlflow
import json
import re
import xml.etree.ElementTree as ET

# Read the access token from the host credentials resolved by MLflow's Databricks helper.
# NOTE(review): `databricks_token` is not referenced again in the visible cells;
# the usage cell re-reads the same token into `api_key`.
databricks_token = mlflow.utils.databricks_utils.get_databricks_host_creds().token

In [None]:
import logging
from typing import List

import mlflow
from openai import OpenAI
from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalyzerEngine, RecognizerRegistry, AnalysisExplanation

class DBRXRecognizer(EntityRecognizer):
    """Presidio recognizer that asks a chat-completion model to spot PII.

    The input text is embedded in a few-shot prompt and sent through an
    OpenAI-compatible client. The model is asked to answer with an
    ``<answer>`` XML document listing ``<entity>`` elements. Each entity's
    ``<item>`` text is then searched for (case-insensitively) in the original
    input to produce the character spans handed back to Presidio; the
    ``<start>``/``<end>`` offsets reported by the model are ignored.
    """

    # Entity types used when the caller does not pass ``supported_entities``.
    ENTITIES = ["PERSON", "LOCATION", "ORGANIZATION"]

    # Endpoint used when the caller does not pass ``base_url``.
    DEFAULT_BASE_URL = "https://e2-demo-field-eng.cloud.databricks.com/serving-endpoints"

    # Score used when the model omits the score or returns a non-numeric one.
    DEFAULT_SCORE = 0.5

    _logger = logging.getLogger(__name__)

    def __init__(self, model="databricks-dbrx-instruct", api_key=None, supported_entities=None, base_url=None):
        """Create the recognizer and its OpenAI-compatible client.

        Args:
            model: Name of the served chat model to query.
            api_key: Token passed to the OpenAI client.
            supported_entities: Entity types this recognizer reports; falls
                back to ``ENTITIES`` when empty or ``None``.
            base_url: Serving-endpoint URL; falls back to ``DEFAULT_BASE_URL``
                when ``None``.
        """
        self.model = model
        self.api_key = api_key
        self.supported_language = "en"
        # Copy so later mutation of the caller's list (or of ENTITIES) cannot leak in.
        self.supported_entities = list(supported_entities) if supported_entities else list(self.ENTITIES)
        self.client = OpenAI(
            api_key=self.api_key,
            base_url=base_url if base_url is not None else self.DEFAULT_BASE_URL,
        )

        super().__init__(
            supported_entities=self.supported_entities,
            supported_language=self.supported_language,
            name="DBRX Recognizer",
        )

    def load(self) -> None:
        """Load the model, not used as the client is created during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """Return supported entities by this model."""
        return self.supported_entities

    def _build_prompt(self, text):
        """Return the few-shot prompt with ``text`` appended as the final input."""
        return f"""
            Your role is to identify and list personally identifiable information such as names, locations, and organizations in the given text.

            Instructions:

            Valid types in the entity is PERSON, LOCATION, ORGANIZATION.
            PERSON: is for people 
            LOCATION: is physical/geographic locations
            ORGANIZATION: is names of entities such as companies, etc 

            Your response should be xml with findings and nothing else. No commentary or additional explanations. 
            Here is an example of how you should respond:
            <answer>
                <entities>
                    <entity>
                        <type>PERSON</type>
                        <start>10</start>
                        <end>14</end>
                        <item>John</item>
                        <score>1.0</score>
                    </entity>
                    <entity>
                        <type>LOCATION</type>
                        <start>27</start>
                        <end>37</end>
                        <item>New Jersey</item>
                        <score>1.0</score>
                    </entity>
                </entities>
            </answer>

            
            The item is the exact text of the detected entity as it appears in the input.
            The score is your level of confidence for the detected personally identifiable information.
            Provide nothing else in the response other than the <answer> xml, no commentary, explanations, or anything other than the xml response.
            input: my name is juan and I live in New Jersey, and I work at WHO
            output: 
            <answer>
            <entities>
                <entity>
                    <type>PERSON</type>
                    <start>11</start>
                    <end>15</end>
                    <item>juan</item>
                    <score>1.0</score>
                </entity>
                <entity>
                    <type>LOCATION</type>
                    <start>30</start>
                    <end>40</end>
                    <item>New Jersey</item>
                    <score>1.0</score>
                </entity>
                <entity>
                    <type>ORGANIZATION</type>
                    <start>56</start>
                    <end>59</end>
                    <item>WHO</item>
                    <score>1.0</score>
                </entity>
                </entities>
            </answer>
            input: {text}
            output: 
        """

    def analyze(self, text, entities, language="en", nlp_artifacts=None) -> List[RecognizerResult]:
        """Ask the model for PII in ``text`` and return the matching spans.

        Args:
            text: Text to scan. Empty or whitespace-only text returns ``[]``
                without calling the model.
            entities: Entity types requested by the caller. When empty or
                ``None`` the recognizer's own supported entities are used.
            language: Accepted for interface compatibility; not used.
            nlp_artifacts: Accepted for interface compatibility; not used.

        Returns:
            A list of ``RecognizerResult`` restricted to the requested types
            (possibly empty).
        """
        if not text or not text.strip():
            return []

        response = self.client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an AI system helping detect, classify, and anonymize sensitive PII data"},
                {"role": "user", "content": self._build_prompt(text)}
            ],
            model=self.model,
            max_tokens=1024
        )

        detected_entities = self.parse_response(text, response)
        # The prompt always asks for every type, so restrict the answer to what was requested.
        allowed_types = set(entities) if entities else set(self.supported_entities)
        return [result for result in detected_entities if result.entity_type in allowed_types]

    def find_start_end_all(self, input_string, search_string):
        """Return ``(start, end)`` for every case-insensitive occurrence of ``search_string``.

        An empty or ``None`` ``search_string`` yields ``[]``; without this guard
        the empty pattern would match at every position of ``input_string``.
        """
        if not search_string:
            return []
        return [
            (match.start(), match.end())
            for match in re.finditer(re.escape(search_string), input_string, re.IGNORECASE)
        ]

    @classmethod
    def _coerce_score(cls, raw_score):
        """Convert the model's score text to a float clamped to ``[0.0, 1.0]``.

        Missing, non-numeric or NaN values fall back to ``DEFAULT_SCORE`` so a
        detected entity is not dropped just because its score is unusable.
        """
        try:
            score = float(raw_score)
        except (TypeError, ValueError):
            return cls.DEFAULT_SCORE
        if score != score:  # NaN is the only value not equal to itself
            return cls.DEFAULT_SCORE
        return min(max(score, 0.0), 1.0)

    def parse_response(self, input_text, response):
        """Turn the model's XML answer into ``RecognizerResult`` objects.

        Args:
            input_text: The text that was analyzed; entity items are located in it.
            response: Chat-completion response whose first choice holds the answer.

        Returns:
            A list of ``RecognizerResult`` (one per occurrence of each reported
            item). Returns ``[]`` when the response has no choices, no
            ``<answer>`` block, or XML that cannot be parsed.
        """
        choices = getattr(response, "choices", None)
        if not choices:
            self._logger.warning("Model response contained no choices.")
            return []

        completion_text = choices[0].message.content or ""
        # Non-greedy so a repeated <answer> block does not produce a multi-root document.
        xml_match = re.search(r'<answer>.*?</answer>', completion_text, re.DOTALL)
        if xml_match is None:
            # The raw completion is deliberately not logged: it may contain PII.
            self._logger.warning("Model response contained no <answer> block.")
            return []

        try:
            root = ET.fromstring(xml_match.group(0))
        except ET.ParseError as exc:
            self._logger.warning("Model response was not well-formed XML: %s", exc)
            return []

        # De-duplicate on (type, item) so each distinct finding is searched once;
        # a later duplicate overrides the score of an earlier one.
        findings = {}
        for entity in root.findall('.//entity'):
            entity_type = (entity.findtext('type') or '').strip()
            item = (entity.findtext('item') or '').strip()
            if not entity_type or not item:
                continue
            findings[(entity_type, item.lower())] = {
                'type': entity_type,
                'item': item,
                'score': self._coerce_score(entity.findtext('score')),
            }

        final_list = []
        for finding in findings.values():
            for (start, end) in self.find_start_end_all(input_text, finding['item']):
                final_list.append(RecognizerResult(
                    entity_type=finding['type'],
                    start=start,
                    end=end,
                    score=finding['score'],
                    # One explanation object per result: they are mutable.
                    analysis_explanation=AnalysisExplanation(
                        recognizer=self.name,
                        original_score=finding['score'],
                        textual_explanation=f"Found item: {finding['item']}",
                    ),
                ))
        return final_list

In [None]:
# Usage example: run the DBRX-backed recognizer through a Presidio AnalyzerEngine.
supported_entities = ["PERSON", "LOCATION", "ORGANIZATION"]
api_key = mlflow.utils.databricks_utils.get_databricks_host_creds().token

openai_recognizer = DBRXRecognizer(
    model="databricks-dbrx-instruct",
    api_key=api_key,
    supported_entities=supported_entities,
)

# Register the custom recognizer and hand the registry to the engine.
registry = RecognizerRegistry()
registry.add_recognizer(openai_recognizer)
analyzer = AnalyzerEngine(registry=registry)

# Shorter alternative input:
# text = "Hello, my name is Juan and I live in New Jersey and work for Databricks."
text = (
    "Hello, my name is Juan and I live in New Jersey and am playing baseball. "
    "Jenny is going to Starbucks too. "
    "Hmm maybe I wonder if Databricks has a Starbucks located inside it. "
    "What about Estee? Do they want some coffee. "
    "Toby seemed like he wanted some coffee."
)

results = analyzer.analyze(text=text, language="en", return_decision_process=True)

# Header and results, each on its own line.
print("final results -> \n ", results, sep="\n")

# Archive
