In [None]:
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg

In [None]:
%pip install openai --upgrade
%pip install databricks-genai
%pip install databricks-genai-inference
%pip install mlflow

# Restart the notebook's Python process through the Databricks utility.
# Presumably required so the packages installed above are picked up by later cells — TODO confirm.
dbutils.library.restartPython()

In [None]:
import pprint
import os
import pandas as pd
from openai import OpenAI
import mlflow
import json
import re

# Token taken from the Databricks host credentials that MLflow resolves for this environment.
# NOTE(review): presumably meant as the API key for the serving endpoint — confirm it is still needed.
databricks_token = mlflow.utils.databricks_utils.get_databricks_host_creds().token

In [None]:
from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalyzerEngine, RecognizerRegistry
import mlflow
from openai import OpenAI
from typing import List

class DBRXRecognizer(EntityRecognizer):
    """Presidio ``EntityRecognizer`` that delegates PII detection to a chat model.

    The text is sent to an OpenAI-compatible chat-completions endpoint and the
    JSON array returned by the model is converted into ``RecognizerResult``
    objects.
    """

    # Entity types used when the caller does not pass ``supported_entities``.
    ENTITIES = ["PERSON", "LOCATION", "ORGANIZATION"]

    # Serving endpoint used when the caller does not pass ``base_url``.
    DEFAULT_BASE_URL = "https://e2-demo-field-eng.cloud.databricks.com/serving-endpoints"

    def __init__(self, model="databricks-dbrx-instruct", api_key=None, supported_entities=None, base_url=None):
        """Create the recognizer and its API client.

        Args:
            model: Name of the served chat model to query.
            api_key: Token passed to the ``OpenAI`` client.
            supported_entities: Entity types this recognizer reports; falls
                back to ``ENTITIES`` when empty or ``None``.
            base_url: Serving-endpoints URL; falls back to ``DEFAULT_BASE_URL``.
        """
        self.model = model
        self.api_key = api_key
        self.supported_language = "en"
        # Copy the class-level default so instances never share (and mutate) one list.
        self.supported_entities = list(supported_entities) if supported_entities else list(self.ENTITIES)
        self.client = OpenAI(
            api_key=self.api_key,
            base_url=base_url or self.DEFAULT_BASE_URL,
        )

        super().__init__(
            supported_entities=self.supported_entities,
            supported_language=self.supported_language,
            name="DBRX Recognizer",
        )

    def load(self) -> None:
        """Load the model, not used as model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """Return supported entities by this model."""
        return self.supported_entities

    def analyze(self, text, entities, language="en", nlp_artifacts=None) -> List[RecognizerResult]:
        """Ask the model for PII spans in ``text``.

        Args:
            text: The text to analyze.
            entities: Entity types requested by the caller; when non-empty,
                findings of any other type are dropped.
            language: Unused; kept for interface compatibility.
            nlp_artifacts: Unused; kept for interface compatibility.

        Returns:
            The findings parsed from the model response, restricted to spans
            that lie inside ``text``.
        """
        if not text:
            # Nothing to analyze; avoid a pointless remote call.
            return []

        # Doubled braces are literal braces for the JSON examples; ``{text}`` is
        # the only interpolated field and carries the text to analyze.
        prompt = f"""
            Your role is to identify and list personally identifiable information such as names, locations, and organizations in the given text.

            Instructions:

            Your response should be an array of valid json objects with findings and nothing else. No commentary or additional explanations. 
            Here is an example of how you should respond: [{{"type": "PERSON", "start": "10", "end": "14", "score": "1.0"}}, {{"type": "LOCATION", "start": "27", "end": "37", "score": "1.0"}}]
            The score is your level of confidence for the detected personally identifiable information.
            Provide nothing else in the response other than the findings array, no commentary, explanations, or anything other than array response.
            input: my name is juan and I live in New Jersey
            output: [{{"type": "PERSON", "start": "11", "end": "15", "score": "1.0"}}, {{"type": "LOCATION", "start": "30", "end": "40", "score": "1.0"}}]
            input: {text}
            output: 
        """

        response = self.client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an AI system helping detect, classify, and anonymize sensitive PII data"},
                {"role": "user", "content": prompt}
            ],
            model=self.model,
            max_tokens=1024
        )

        detected_entities = self.parse_response(response)

        # Model-reported offsets are not guaranteed to be valid; keep only spans inside the text.
        text_length = len(text)
        detected_entities = [
            finding for finding in detected_entities if finding.end <= text_length
        ]

        # Honour the caller's entity filter when one was supplied.
        if entities:
            detected_entities = [
                finding for finding in detected_entities if finding.entity_type in entities
            ]
        return detected_entities

    @staticmethod
    def _load_findings(completion_text):
        """Decode the model output as JSON, tolerating text around the array.

        Raises:
            json.JSONDecodeError: If neither the full text nor the outermost
                ``[...]`` slice is valid JSON.
        """
        try:
            return json.loads(completion_text)
        except json.JSONDecodeError:
            # Models sometimes wrap the array in commentary or code fences;
            # retry on the outermost bracketed slice.
            first = completion_text.find("[")
            last = completion_text.rfind("]")
            if first == -1 or last <= first:
                raise
            return json.loads(completion_text[first:last + 1])

    def parse_response(self, response):
        """Convert a chat-completions response into ``RecognizerResult`` objects.

        Items that are not objects, lack a field, or carry values that cannot
        be converted to numbers are skipped. Returns an empty list when the
        response has no choices or its content is not decodable JSON.
        """
        results = []
        choices = getattr(response, "choices", None)
        if not choices:
            return results

        # ``content`` may be None; treat that as an empty completion.
        completion_text = choices[0].message.content or ""

        print("completion_text -> \n")
        print(completion_text)

        try:
            data = self._load_findings(completion_text)
        except json.JSONDecodeError:
            print("Error decoding JSON from response")
            return results

        print("data -> \n")
        print(data)

        # Accept a single finding object as well as the requested array.
        if isinstance(data, dict):
            data = [data]
        elif not isinstance(data, list):
            return results

        for item in data:
            if not isinstance(item, dict):
                continue

            entity_type = item.get("type")
            try:
                # The prompt asks for string-encoded numbers, so convert explicitly.
                start_pos = int(item["start"])
                end_pos = int(item["end"])
                score = float(item["score"])
            except (KeyError, TypeError, ValueError):
                continue

            if not entity_type or start_pos < 0 or end_pos <= start_pos:
                continue

            results.append(
                RecognizerResult(
                    entity_type=entity_type,
                    start=start_pos,
                    end=end_pos,
                    # Keep the confidence inside the [0, 1] range.
                    score=min(max(score, 0.0), 1.0),
                )
            )

        return results



In [None]:
# Usage example: register the DBRX-backed recognizer with Presidio and analyze a sample sentence.
supported_entities = ["PERSON", "LOCATION", "ORGANIZATION"]
api_key = mlflow.utils.databricks_utils.get_databricks_host_creds().token

openai_recognizer = DBRXRecognizer(
    model="databricks-dbrx-instruct",
    api_key=api_key,
    supported_entities=supported_entities,
)

# A registry holding only the custom recognizer backs the analyzer engine.
registry = RecognizerRegistry()
registry.add_recognizer(openai_recognizer)
analyzer = AnalyzerEngine(registry=registry)

text = "Hello, my name is Juan and I live in New Jersey and work for Databricks."
results = analyzer.analyze(
    text=text,
    language="en",
    return_decision_process=True,
)

print("final results -> \n ")
print(results)

In [None]:
import re
import xml.etree.ElementTree as ET

# Example: a string in which an XML <answer> payload is preceded by arbitrary text.

full_string = """
Some arbitrary text here
<answer>
    <person>
        <name>John</name>
        <age>30</age>
        <address>123 Main St</address>
    </person>
    <person>
        <name>Alice</name>
        <age>25</age>
        <address>456 Elm St</address>
    </person>
</answer>
"""

# Extract the XML payload. The pattern must name the tag that actually wraps the
# payload (<answer>); re.DOTALL lets `.` span the newlines inside it.
xml_match = re.search(r'<answer>.*</answer>', full_string, re.DOTALL)
if xml_match:
    xml_content = xml_match.group(0)

    # Parse the extracted XML content
    root = ET.fromstring(xml_content)

    # Print name and age for every <person> that provides both elements.
    for person in root.findall('person'):
        name_element = person.find('name')
        age_element = person.find('age')
        if name_element is not None and age_element is not None:
            name = name_element.text
            age = age_element.text
            print(f"Name: {name}, Age: {age}")