In [0]:
%sql
-- Demo fixtures: three two-column STRING tables. Each one is rebuilt from
-- scratch with CREATE OR REPLACE and then seeded with three rows, so this
-- cell can be re-run without accumulating duplicate rows.

-- streaming_inbox: rows s_msg1..s_msg3
CREATE OR REPLACE TABLE streaming_inbox (message_id STRING, message_body STRING);

INSERT INTO streaming_inbox (message_id, message_body)
VALUES
  ('s_msg1', 'Hello, this is a test message!'),
  ('s_msg2', 'Check out our new streaming offer'),
  ('s_msg3', 'SPAM: You have won a prize, click here!');

-- incoming_messages: rows inc_msg1..inc_msg3
CREATE OR REPLACE TABLE incoming_messages (message_id STRING, message_body STRING);

INSERT INTO incoming_messages (message_id, message_body)
VALUES
  ('inc_msg1', 'Welcome to our service'),
  ('inc_msg2', 'SPAM: Win a free vacation now!'),
  ('inc_msg3', 'Reminder: meeting at 3 PM');

-- my_documents: rows doc1..doc3, free text stored in text_column
CREATE OR REPLACE TABLE my_documents (document_id STRING, text_column STRING);

INSERT INTO my_documents (document_id, text_column)
VALUES
  ('doc1', 'Databricks AI Query allows you to run LLM inferences at scale.'),
  ('doc2', 'Batch and streaming modes are both supported by AI Query.'),
  ('doc3', 'This is a short text that needs summarization.');

In [0]:
# Sanity check: print a header and the current rows of each demo table,
# in the same order the tables were created.
for table_name in ("streaming_inbox", "incoming_messages", "my_documents"):
    print(f"===== {table_name} sample rows =====")
    spark.sql(f"SELECT * FROM {table_name}").show()

In [0]:
%sql
-- Batch spam classification over incoming_messages.
-- Step 1 (CTE): build one prompt per row. regexp_replace removes every
--   character that is not a letter, digit, whitespace or colon, so the
--   message cannot break out of the single quotes wrapped around it.
-- Step 2: send each prompt to the serving endpoint, requesting a response
--   that follows the spam_detection JSON schema (integer field spam_flag).
WITH spam_prompts AS (
  SELECT
    message_id,
    CONCAT(
      "You are a strict spam classifier. The user wrote: '",
      regexp_replace(message_body, "[^a-zA-Z0-9\\s:]", ""),
      "'. Reply ONLY in valid JSON with an integer field named spam_flag. 0=not spam, 1=spam."
    ) AS prompt
  FROM incoming_messages
)
SELECT
  message_id,
  ai_query(
    endpoint       => "databricks-meta-llama-3-1-8b-instruct",
    request        => prompt,
    responseFormat => '{
      "type": "json_schema",
      "json_schema": {
        "name": "spam_detection",
        "schema": {
          "type": "object",
          "properties": {
            "spam_flag": {"type": "integer"}
          }
        }
      }
    }',
    failOnError    => true
  ) AS parsed_response
FROM spam_prompts;


In [0]:
from pyspark.sql.functions import expr, parse_json

# Read the streaming_inbox table as a streaming DataFrame.
df_stream = spark.readStream.table("streaming_inbox")

# Inference step: LLM spam detection constrained by a JSON schema.
#
# Notes on the SQL expression below:
# * It is a RAW Python string. Without the r-prefix Python collapses `\\s` to
#   `\s`, and Spark's SQL string-literal unescaping then drops the backslash,
#   leaving the character class `[^a-zA-Z0-9s:]` -- which deletes all
#   whitespace from the message before it is sent to the model.
# * Arguments are passed by name. The third positional parameter of ai_query
#   is `returnType`, so a positional response-format spec would be bound to
#   the wrong parameter.
# * The response format type is "json_schema" (matching the batch SQL query in
#   this notebook) because a schema payload is supplied.
scored_stream = df_stream.withColumn(
    "parsed_response",
    expr(r"""
      ai_query(
        endpoint       => 'databricks-meta-llama-3-1-8b-instruct',
        request        => CONCAT(
          'You are a strict spam classifier. The user wrote: "',
          regexp_replace(message_body, '[^a-zA-Z0-9\\s:]', ''),
          '". Reply ONLY in valid JSON with an integer field named spam_flag. 0=not spam, 1=spam.'
        ),
        responseFormat => '{"type":"json_schema","json_schema":{"name":"spam_detection","schema":{"type":"object","properties":{"spam_flag":{"type":"integer"}}}}}',
        failOnError    => true
      )
    """)
)

# Start the stream and render results in the notebook; progress is tracked in
# the given checkpoint directory.
display(scored_stream, checkpointLocation='/Volumes/workspace/default/checkpoints/1')

In [0]:
%sql
-- Free-form text classification over incoming_messages.
-- Step 1 (CTE): build one prompt per row; regexp_replace keeps only letters,
--   digits, whitespace and colons from the message body.
-- Step 2: call the serving endpoint with the text_classifier JSON schema
--   (label: string, confidence: number) and run the reply through parse_json.
WITH classification_prompts AS (
  SELECT
    message_id,
    CONCAT(
      "Analyze the text: '",
      regexp_replace(message_body, "[^a-zA-Z0-9\\s:]", ""),
      "'. Reply in valid JSON with fields: label (string) and confidence (float)."
    ) AS prompt
  FROM incoming_messages
)
SELECT
  message_id,
  parse_json(ai_query(
    endpoint       => "databricks-meta-llama-3-1-8b-instruct",
    request        => prompt,
    responseFormat => '{
      "type": "json_schema",
      "json_schema": {
        "name": "text_classifier",
        "schema": {
          "type": "object",
          "properties": {
            "label": {"type": "string"},
            "confidence": {"type": "number"}
          }
        }
      }
    }',
    failOnError    => true
  )) AS classification_result
FROM classification_prompts;
