### Creating initial resources:
- source data
- pipelines
- dashboards
- Genie space
- Databricks app


In [0]:
%run ../00.set_variables

In [0]:
# Provision the Unity Catalog objects used by the demo. Order matters: the
# catalog must exist before its schema, and the schema before its volume.
uc_object_names = {
    "CATALOG": catalog_name,
    "SCHEMA": f"{catalog_name}.{schema_name}",
    "VOLUME": f"{catalog_name}.{schema_name}.{volume_name}",
}
for object_kind, qualified_name in uc_object_names.items():
    spark.sql(f"CREATE {object_kind} IF NOT EXISTS {qualified_name}")

# Deliberately broad grant: every account user gets all privileges on the
# catalog (crude, but it avoids permission issues in the downstream steps).
grant_statement = f"GRANT ALL PRIVILEGES ON CATALOG {catalog_name} TO `account users`"
spark.sql(grant_statement)

In [0]:
#Create files from kaggle dataset in volumes

import requests
import zipfile
import io

def create_mock_plant_equipment():
    """Build a five-row mock equipment DataFrame.

    Returns:
        A Spark DataFrame with columns ``name``, ``id`` and ``is_active``.
        Ids run from 1 to 5 in list order; the last row ("Thermometer") has
        ``is_active`` set to ``None``.
    """
    equipment_names = ["Spectrophotometer", "pH Meter", "Oven", "Remote Arm", "Thermometer"]
    active_flags = [True, True, True, False, None]

    # Ids are simply the 1-based position of each piece of equipment.
    equipment_rows = [
        (equipment_name, equipment_id, active_flag)
        for equipment_id, (equipment_name, active_flag) in enumerate(
            zip(equipment_names, active_flags), start=1
        )
    ]

    return spark.createDataFrame(equipment_rows, ["name", "id", "is_active"])

# Download the zip file
url = "https://www.kaggle.com/api/v1/datasets/download/edumagalhaes/quality-prediction-in-a-mining-process"
# A timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=300)
# Fail here with the HTTP status rather than later with an opaque BadZipFile
# when the response body is an error page instead of an archive.
response.raise_for_status()

# Extract the CSV file to a UC volume
csv_filename = "MiningProcess_Flotation_Plant_Database.csv"
volume_dir = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/"
with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
    zip_file.extract(csv_filename, volume_dir)

# Define the file path (same location the archive member was extracted to)
file_path = f"{volume_dir}{csv_filename}"

# Read the CSV file into a DataFrame; the first row is the header and column
# types are inferred by Spark.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(file_path)
)

from pyspark.sql.functions import col, hour, when, concat_ws, lit, avg
from pyspark.sql import functions as F
import random
import pandas as pd

# Rename columns to remove invalid characters: spaces become "_" and "%"
# becomes "Percent".
# The loop variables are deliberately NOT called `col`: a module-level
# `for col in ...` would rebind the `col` function imported from
# pyspark.sql.functions to a plain string for the rest of the notebook.
df = df.select(
    [
        F.col(raw_name).alias(raw_name.replace(" ", "_").replace("%", "Percent"))
        for raw_name in df.columns
    ]
)

# Every column except "date" uses a decimal comma: swap it for a decimal point
# and cast to double. A single select keeps the column order and avoids
# stacking one projection per column, as repeated withColumn calls would.
df = df.select(
    [
        F.col(column_name)
        if column_name == "date"
        else F.regexp_replace(column_name, r",", ".").cast("double").alias(column_name)
        for column_name in df.columns
    ]
)

# Process-sensor columns: the timestamp, the reagent / pulp measurements, then
# the air flow and level readings of flotation columns 01 through 07.
flotation_columns = [
    "date",
    "Starch_Flow",
    "Amina_Flow",
    "Ore_Pulp_Flow",
    "Ore_Pulp_pH",
    "Ore_Pulp_Density",
    *[f"Flotation_Column_{column_no:02d}_Air_Flow" for column_no in range(1, 8)],
    *[f"Flotation_Column_{column_no:02d}_Level" for column_no in range(1, 8)],
]

# Lab-measurement columns: the timestamp plus iron / silica percentages for
# the feed and for the concentrate.
lab_data_columns = ["date"] + [
    f"Percent_{mineral}_{stream}"
    for stream in ("Feed", "Concentrate")
    for mineral in ("Iron", "Silica")
]
# Persist the process-sensor columns as parquet in the volume, replacing any
# previous output.
df_flotation = df.select(flotation_columns)
(
    df_flotation.write
    .format("parquet")
    .mode("overwrite")
    .save(f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/flotation_data/")
)

# Collapse the lab measurements to one row per distinct "date" value by
# averaging each measurement; output columns keep their original names.
df_lab = df.select(lab_data_columns)
lab_measure_columns = [name for name in lab_data_columns if name != "date"]
df_lab_hourly = df_lab.groupBy("date").agg(
    *[avg(measure).alias(measure) for measure in lab_measure_columns]
)

# Mock shift operators for PII demo

# Fake operator names to attach to the lab data
operator_names = [
    "Alice Johnson", "Ben Carter", "Cindy Lee", "David Smith", "Emma Wright",
    "Frank Miller", "Grace Kim", "Henry Jones", "Isla Clarke", "Jack White"
]

# Label each row as a "day" shift (hour 6 up to, but not including, 18) or a
# "night" shift (every other hour).
df_with_shift = df_lab_hourly.withColumn(
    "shift_type",
    when((hour("date") >= 6) & (hour("date") < 18), "day").otherwise("night"),
)

# Create a shift identifier from the calendar date and the shift type
# (e.g., "2024-05-13_day")
df_with_shift = df_with_shift.withColumn(
    "shift_id",
    concat_ws("_", F.to_date("date"), F.col("shift_type")),
)

# Get distinct shifts (collected to the driver)
distinct_shifts = df_with_shift.select("shift_id").distinct().collect()

# Assign a pseudo-random operator to each shift.
# collect() returns rows in no guaranteed order, so the shift ids are sorted
# and a dedicated seeded generator is used: re-running the notebook then
# produces the same operator for the same shift, without touching the global
# `random` state.
operator_rng = random.Random(42)
shift_operator_map = {
    shift_id: operator_rng.choice(operator_names)
    for shift_id in sorted(row["shift_id"] for row in distinct_shifts)
}

# Convert to a DataFrame for joining. An explicit schema means an empty
# mapping still yields a valid (empty) DataFrame instead of a
# schema-inference error.
shift_df = spark.createDataFrame(
    list(shift_operator_map.items()),
    schema="shift_id string, operator_name string",
)

# Join operator name back to the main DataFrame; shift_id was only needed as
# the join key, so it is dropped again.
df_with_operator = df_with_shift.join(shift_df, on="shift_id", how="left").drop("shift_id")

(
    df_with_operator.write
    .format("parquet")
    .mode("overwrite")
    .save(f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/lab_data_hourly/")
)

# Mock equipment table: one row has a None is_active value, which can be used
# to demo expectations.
equipment_output_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}/equipment/"
df_equipment = create_mock_plant_equipment()
(
    df_equipment.write
    .format("parquet")
    .mode("overwrite")
    .save(equipment_output_path)
)

#display(df_with_operator)
#display(df_flotation)


In [0]:
from pathlib import Path

# Ask the notebook context for the workspace path of this notebook.
notebook_context = dbutils.entry_point.getDbutils().notebook().getContext()
current_notebook_path = notebook_context.notebookPath().get()

# Despite the name, this is the directory two levels above the notebook file
# (the parent of the folder that contains this notebook), stored as a string.
current_notebook_dir = str(Path(current_notebook_path).parent.parent)
print(f"current notebook location: {current_notebook_dir}")

In [0]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.jobs import JobSettings

job_name = f"deploy-iops-demo-{schema_name}"

# NOTE(review): the four names below are not referenced anywhere in the visible
# notebook and look like leftovers from an SDK example — remove them if nothing
# else relies on them.
notebook_path_1 = "/Workspace/Users/your.name@databricks.com/my-notebook"
notebook_path_2 = "/Workspace/Users/your.name@databricks.com/my-second-notebook"
task_key_1 = "my-task"
task_key_2 = "my-second-task"


def _notebook_task(task_key, relative_path, depends_on=None):
    """Build the settings dict for one workspace-notebook job task.

    Args:
        task_key: Unique key of the task within the job.
        relative_path: Notebook path relative to ``current_notebook_dir``.
        depends_on: Key of the single upstream task, or ``None`` for a task
            with no dependency.

    Returns:
        A dict with ``task_key`` and ``notebook_task`` entries, plus a
        ``depends_on`` entry when an upstream task is given.
    """
    task = {
        "task_key": task_key,
        "notebook_task": {
            "notebook_path": f"{current_notebook_dir}/{relative_path}",
            "source": "WORKSPACE",
        },
    }
    if depends_on is not None:
        task["depends_on"] = [{"task_key": depends_on}]
    return task


# The first six tasks form a chain; the last five all fan out from
# "run-model-creation-notebook".
# The paths for "create-deployment-job" and "create-serving-endpoints"
# previously ended in a stray space, which made them point at notebook names
# that differ from the intended ones.
job_settings = JobSettings.from_dict({
    "name": job_name,
    "tasks": [
        _notebook_task(
            "create-pipeline-and-ingest",
            "demo_setup/01.create_pipeline",
        ),
        _notebook_task(
            "create-dashboard",
            "demo_setup/02.create_dashboard",
            depends_on="create-pipeline-and-ingest",
        ),
        _notebook_task(
            "create-deployment-job",
            "demo_setup/model_deploy_jobs/create-deployment-job",
            depends_on="create-dashboard",
        ),
        _notebook_task(
            "run-governance-notebook",
            "notebooks/01b. Unity Catalog, Governance and Auditability",
            depends_on="create-deployment-job",
        ),
        _notebook_task(
            "run-eda-featurestore-notebook",
            "notebooks/02. EDA and Feature Store",
            depends_on="run-governance-notebook",
        ),
        _notebook_task(
            "run-model-creation-notebook",
            "notebooks/03. Model Training and Experimentation",
            depends_on="run-eda-featurestore-notebook",
        ),
        _notebook_task(
            "create-genie-space",
            "demo_setup/03.create_genie_space",
            depends_on="run-model-creation-notebook",
        ),
        _notebook_task(
            "create-app",
            "demo_setup/04.create_app",
            depends_on="run-model-creation-notebook",
        ),
        _notebook_task(
            "create-serving-endpoints",
            "demo_setup/05.create_serving_endpoints",
            depends_on="run-model-creation-notebook",
        ),
        _notebook_task(
            "run-optimization-notebook",
            "notebooks/05. Optimisation",
            depends_on="run-model-creation-notebook",
        ),
        _notebook_task(
            "enable-anomaly-detection-classification",
            "demo_setup/06.enable_data_classification_and_anomaly_detection",
            depends_on="run-model-creation-notebook",
        ),
    ],
})

# NOTE(review): every execution of this cell calls jobs.create again, so
# re-running it presumably leaves several jobs with the same name — confirm
# whether that is acceptable.
w = WorkspaceClient()
job = w.jobs.create(**job_settings.as_shallow_dict())
print(f"View the job at {w.config.host}/#job/{job.job_id}")

In [0]:
import time

# Trigger the job and poll until the run is finished.
run = w.jobs.run_now(job_id=job.job_id)
run_id = run.run_id
# The URL does not change between polls, so print it once up front.
print(f"View the job run at {w.config.host}/#job/{job.job_id}/run/{run_id}")

# Life-cycle states after which a run no longer changes. Checking these in
# addition to result_state stops the loop from spinning forever when a run
# ends without a result state being reported.
terminal_life_cycle_states = {"TERMINATED", "SKIPPED", "INTERNAL_ERROR"}

while True:
    run_status = w.jobs.get_run(run_id=run_id)
    run_state = run_status.state  # may be None if no state has been reported yet
    life_cycle_state = run_state.life_cycle_state if run_state is not None else None
    result_state = run_state.result_state if run_state is not None else None

    # The SDK reports states as enums; compare on their string value and fall
    # back to the raw object for plain strings / None.
    life_cycle_name = getattr(life_cycle_state, "value", life_cycle_state)
    if result_state is not None or life_cycle_name in terminal_life_cycle_states:
        print(f"Run ID has finished: {run_id}, Life Cycle State: {life_cycle_state}, Result State: {result_state}")
        break

    print(f"Run ID is still going: {run_id}, Life Cycle State: {life_cycle_state}, Result State: {result_state}")
    time.sleep(30)