**Notebook parameters**

In [None]:
# Service principal used to authenticate against Azure Resource Manager.
# These three values are passed to ClientSecretCredential later in the notebook.
# NOTE(review): avoid saving a real secret in this cell; presumably it is meant
# to be injected at run time (pipeline parameter / secret store) - confirm.
tenant_id = ""
client_id = ""
client_secret = ""
# Warehouse: the tables are addressed as "<warehouse_name>.<warehouse_schema>.<table>"
warehouse_name = ""
warehouse_schema = ""
warehouse_recommendations_table = "recommendations_advisor"
warehouse_consumption_table = "consumoAzure_agg"

# Config
top_number_subscriptions = 5 # Top N subscriptions that will be queried against the Advisor API, ordered by consumption amount
min_amount_cost_optimization = 5000 # Minimum annual amount for cost-optimization recommendations


**Libraries and configuration of the notebook**

Modify the Lakehouse and/or Warehouse variables in the parameters cell above if required.

In [None]:
# ==========================
# BIBLIOTECAS
# ==========================

import requests
import json
import datetime
import re
import time
from azure.identity import ClientSecretCredential
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, DateType, FloatType
from pyspark.sql.functions import lit,col,current_date, year, month, lower,sum,desc,get_json_object
import com.microsoft.spark.fabric

# ==========================
# CONFIGURATION
# ==========================

# Reuse the active Spark session, or create one if none exists
spark = SparkSession.builder.getOrCreate()

# ==========================
# 1. Acquire the authentication token
# ==========================

try:
    credential = ClientSecretCredential(
        tenant_id=tenant_id,
        client_id=client_id,
        client_secret=client_secret,
    )
    # Request an access token for the Azure Resource Manager scope
    access_token = credential.get_token("https://management.azure.com/.default")
    token = access_token.token
except Exception as auth_error:
    # Any failure while building the credential or fetching the token stops the notebook
    raise Exception(f"Error en autenticación: {auth_error}")

**Define the schema of the Advisor recommendations table**

In [None]:
# ==========================
# Schema of the Advisor recommendations table
# ==========================

# (column name, Spark type) pairs in table order; every column is nullable.
_advisor_columns = [
    ("id", StringType()),
    ("suscripcionId", StringType()),
    ("suscripcion", StringType()),
    ("grupoRecursos", StringType()),
    ("categoria", StringType()),
    ("subcategoria", StringType()),
    ("impacto", StringType()),
    ("registroImpactado", StringType()),
    ("IdInstancia", StringType()),
    ("ultimaActualizacion", StringType()),
    ("problema", StringType()),
    ("solucion", StringType()),
    ("implicacionCostoRecomendacion", StringType()),
    ("nivelMadurez", StringType()),
    ("montoAhorroAnual", FloatType()),
    ("extendedProperties_json", StringType()),
    ("resourceId", StringType()),
]

schema_advisor = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _advisor_columns]
)

In [None]:
# ==========================
# 2. Find the subscriptions with the highest consumption
# ==========================

# Load the consumption table from the warehouse
consumption_table_path = f"{warehouse_name}.{warehouse_schema}.{warehouse_consumption_table}"
consumption_df = spark.read.synapsesql(consumption_table_path)

# Total ACR per (suscripcionID, suscripcion) pair.
# `sum` here is the pyspark.sql.functions import, not the Python builtin.
total_acr = sum("ACR").alias("totalACR")
consumption_agg_df = consumption_df.groupBy("suscripcionID", "suscripcion").agg(total_acr)

# Keep the top_number_subscriptions pairs with the largest totalACR
result_df = (
    consumption_agg_df
    .orderBy(col("totalACR").desc())
    .limit(top_number_subscriptions)
)

display(result_df)
    

In [None]:
# ==========================
# 3. Iterate over each subscription ID to fetch its recommendations
# ==========================

# Request headers shared by every Advisor API call: bearer token + JSON content type
bearer_token = f"Bearer {token}"
headers = {"Authorization": bearer_token, "Content-Type": "application/json"}

# === Paginated fetch from the API (follows nextLink) ===
def fetch_all_recommendations(subscription_id: str, subscription: str, headers: dict,
                              timeout: float = 60, max_retries: int = 3):
    """Return every Advisor recommendation of one subscription as a list of dicts.

    Pages are requested one after another, following the ``nextLink`` field of
    each response until it is absent.

    Args:
        subscription_id: Subscription ID interpolated into the request URL.
        subscription: Subscription display name. Not used by this function;
            kept so the existing call signature stays valid.
        headers: HTTP headers sent with every request.
        timeout: Seconds to wait for each HTTP request before giving up.
        max_retries: How many times a throttled (HTTP 429) page is retried
            before the error is raised.

    Returns:
        The concatenated ``value`` arrays of all pages.

    Raises:
        RuntimeError: On any non-200 response (for 429, once retries run out).
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    url = f"https://management.azure.com/subscriptions/{subscription_id}/providers/Microsoft.Advisor/recommendations?api-version=2025-01-01"
    results = []
    retries_left = max_retries

    while url:
        # Without a timeout a stalled connection would block the notebook forever
        resp = requests.get(url, headers=headers, timeout=timeout)

        if resp.status_code == 429 and retries_left > 0:
            # Throttled: wait as instructed by the server, then retry the same page
            retries_left -= 1
            try:
                wait_seconds = float(resp.headers.get("Retry-After", 5))
            except (TypeError, ValueError):
                wait_seconds = 5.0
            time.sleep(max(wait_seconds, 0))
            continue

        if resp.status_code != 200:
            raise RuntimeError(f"Error {resp.status_code}: {resp.text}")

        retries_left = max_retries  # the retry budget applies per page
        payload = resp.json() or {}
        # `or []` also covers a page whose "value" is an explicit null
        results.extend(payload.get("value") or [])
        # Pagination: no nextLink ends the loop
        url = payload.get("nextLink")
    return results

# === Safe flattening of a single recommendation ===
def normalize_recommendation(rec: dict, load_ts: datetime.datetime,
                             subscription_id: str = None, subscription: str = None):
    """Flatten one Advisor recommendation payload into a flat row dict.

    Args:
        rec: One element of the API response ``value`` array.
        load_ts: Load timestamp. Currently unused; kept so the existing call
            signature stays valid.
        subscription_id: Subscription ID stored in the row. When omitted, the
            notebook-level variable ``subscription_id`` is used, which is what
            the function read implicitly before this parameter existed.
        subscription: Subscription name stored in the row. When omitted, the
            notebook-level variable ``subscription`` is used.

    Returns:
        A dict with the keys id, suscripcionId, suscripcion, grupoRecursos,
        categoria, subcategoria, impacto, registroImpactado, IdInstancia,
        ultimaActualizacion, problema, solucion, implicacionCostoRecomendacion,
        nivelMadurez, montoAhorroAnual, extendedProperties_json and resourceId.
        ``montoAhorroAnual`` is a float, or None when the amount is missing or
        not convertible. ``grupoRecursos`` is "N/A" when the resource ID has
        no resource-group segment.
    """
    # Two-argument calls keep working: fall back to the notebook globals.
    if subscription_id is None:
        subscription_id = globals().get("subscription_id")
    if subscription is None:
        subscription = globals().get("subscription")

    # `or {}` guards against keys that are present but null
    props = rec.get("properties") or {}
    short_desc = props.get("shortDescription") or {}
    ext = props.get("extendedProperties") or {}
    rmeta = props.get("resourceMetadata") or {}

    # Missing, empty or non-numeric amounts (including non-string types) become None
    try:
        annual_savings = float(ext.get("annualSavingsAmount"))
    except (TypeError, ValueError):
        annual_savings = None

    # A null resourceId is treated like a missing one
    resource_id = rmeta.get("resourceId") or ""

    # Capture the segment after 'resourceGroups/'. The match ignores case so
    # that IDs spelled '/resourcegroups/' are recognised as well.
    match = re.search(r"/resourceGroups/([^/]+)", resource_id, flags=re.IGNORECASE)
    resource_group = match.group(1) if match else "N/A"

    return {
        "id": rec.get("id", ""),
        "suscripcionId": subscription_id,
        "suscripcion": subscription,
        "grupoRecursos": resource_group,
        "categoria": props.get("category", ""),
        "subcategoria": ext.get("recommendationSubCategory", ""),
        "impacto": props.get("impact", ""),
        "registroImpactado": props.get("impactedField", ""),
        "IdInstancia": props.get("impactedValue", ""),
        "ultimaActualizacion": props.get("lastUpdated", ""),
        "problema": short_desc.get("problem", ""),
        "solucion": short_desc.get("solution", ""),
        "implicacionCostoRecomendacion": ext.get("recommendationCostImplication", ""),
        "nivelMadurez": ext.get("maturityLevel", ""),
        "montoAhorroAnual": annual_savings,
        # Full extendedProperties kept as JSON so keys not mapped above are not lost
        "extendedProperties_json": json.dumps(ext, ensure_ascii=False),
        "resourceId": resource_id,
    }



# === Run extraction and normalization ===
load_ts = datetime.datetime.now()

# Rows from every subscription are accumulated here and turned into a single
# DataFrame at the end, instead of unioning one DataFrame per subscription.
all_rows = []

# `subscription_id` and `subscription` are intentionally notebook-level names:
# normalize_recommendation reads them as globals in the two-argument call below.
for row in result_df.collect():
    subscription_id = row["suscripcionID"]
    subscription = row["suscripcion"]
    if not subscription_id:
        # A null or empty ID cannot be queried; the request would fail and abort the run
        continue
    raw_recs = fetch_all_recommendations(subscription_id, subscription, headers)
    norm_rows = [normalize_recommendation(r, load_ts) for r in raw_recs]
    all_rows.extend(norm_rows)

# An empty list is fine because the schema is given explicitly
recommendations_df = spark.createDataFrame(all_rows, schema=schema_advisor)


In [None]:

# ==========================
# 4. Data processing
# ==========================

# Drop the recommendations that matter least from a cost perspective.
# Each predicate below marks rows to discard.
in_databricks_rg = lower(col("grupoRecursos")).contains("databricks")  # Databricks managed resource group
low_value_cost = (col("categoria") == "Cost") & (col("montoAhorroAnual") < min_amount_cost_optimization)
low_impact = ~col("impacto").isin(["Medium", "High"])  # impact is neither Medium nor High
excluded_category = col("categoria").isin(["OperationalExcellence", "Security"])
other_subcategory = col("subcategoria") == "Other"

# NOTE(review): for a Cost row with a null montoAhorroAnual the combined
# condition is presumably null, so the row is not kept - confirm this is intended.
discard = in_databricks_rg | low_value_cost | low_impact | excluded_category | other_subcategory
recommendations_df = recommendations_df.filter(~discard)

# Pull sku, region and recommendationMessage out of extendedProperties_json
for json_field in ("sku", "region", "recommendationMessage"):
    recommendations_df = recommendations_df.withColumn(
        json_field,
        get_json_object(col("extendedProperties_json"), f"$.{json_field}"),
    )



In [None]:

# ==========================
# 5. Write the data to the warehouse table
# ==========================

# Fully qualified target table; "overwrite" replaces its previous contents
recommendations_table_path = f"{warehouse_name}.{warehouse_schema}.{warehouse_recommendations_table}"
recommendations_writer = recommendations_df.write.mode("overwrite")
recommendations_writer.synapsesql(recommendations_table_path)