## **Boosting Data Workloads:** [MS Fabric Resource Profile Strategies](https://learn.microsoft.com/en-us/fabric/data-engineering/configure-resource-profile-configurations)

**Microsoft Fabric Resource Profiles** are predefined configurations designed to optimize Apache Spark workloads within the Microsoft Fabric platform. These profiles simplify Spark tuning by applying best-practice settings tailored to specific workload types, such as read-heavy analytics, write-heavy ingestion, or hybrid scenarios.

Benefits:
- ✅ Performance by default – Optimized settings out‑of‑the‑box 
- ✅ Flexibility – Choose or customize profiles per workload 
- ✅ Reduced overhead – Avoid trial‑and‑error tuning


#### Review the current Spark workload profile pre-configured in Microsoft Fabric.

In [1]:
spark.conf.get("spark.fabric.resourceProfile")

StatementMeta(, d65860aa-c735-4637-a84a-f0f4e6e5f3ab, 3, Finished, Available, Finished)

res9: String = writeHeavy


#### Retrieving Spark Configurations for the 'writeHeavy' Profile

In [2]:
from pyspark.sql import SparkSession
# Get the current Spark session
spark = SparkSession.builder.getOrCreate()

# Unified list of relevant Spark config keys across all profiles
config_keys = [
    "spark.sql.parquet.vorder.default",
    "spark.databricks.delta.optimizeWrite.enabled",
    "spark.databricks.delta.optimizeWrite.binSize",
    "spark.databricks.delta.optimizeWrite.partitioned.enabled",
    "spark.databricks.delta.stats.collect"
]

# List of profiles to display
profiles = ["writeHeavy"] # , "readHeavyForPBI", "readHeavyForSpark"

# Display current config values grouped by profile name
for profile in profiles:
    print(f"\n--- {profile} Profile ---")
    for key in config_keys:
        try:
            value = spark.conf.get(key)
            print(f"{key} = {value}")
        except Exception as e:
            print(f"{key} not set or not accessible: {e}")

StatementMeta(, d65860aa-c735-4637-a84a-f0f4e6e5f3ab, 5, Finished, Available, Finished)


--- writeHeavy Profile ---
spark.sql.parquet.vorder.default = false
spark.databricks.delta.optimizeWrite.enabled = None
spark.databricks.delta.optimizeWrite.binSize = 128
spark.databricks.delta.optimizeWrite.partitioned.enabled = true
spark.databricks.delta.stats.collect = true


#### Configure Spark Session for Read-Heavy Power BI Workloads

In [3]:
spark.conf.set("spark.fabric.resourceProfile", "readHeavyForPBI") #readHeavyForSpark

StatementMeta(, d65860aa-c735-4637-a84a-f0f4e6e5f3ab, 6, Finished, Available, Finished)

#### Validate the Spark workload profile currently set up in Fabric.

In [4]:
spark.conf.get("spark.fabric.resourceProfile")

StatementMeta(, d65860aa-c735-4637-a84a-f0f4e6e5f3ab, 7, Finished, Available, Finished)

'readHeavyForPBI'

#### Display Relevant Spark Configurations for Selected Profiles

In [5]:
fb_spk_profile=[]
fb_spk_profile = spark.conf.get("spark.fabric.resourceProfile")
print(fb_spk_profile)

StatementMeta(, d65860aa-c735-4637-a84a-f0f4e6e5f3ab, 8, Finished, Available, Finished)

readHeavyForPBI


#### Retrieving the new Spark Configurations for the 'readHeavyForPBI' Profile
- Notice: 
    - `spark.sql.parquet.vorder.default` is to `true`
    - `spark.databricks.delta.optimizeWrite.enabled` is  set to `true`
    - `spark.databricks.delta.optimizeWrite.binSize` is set to 1g

In [6]:
from pyspark.sql import SparkSession
# Get the current Spark session
spark = SparkSession.builder.getOrCreate()

# Unified list of relevant Spark config keys across all profiles
config_keys = [
    "spark.sql.parquet.vorder.default",
    "spark.databricks.delta.optimizeWrite.enabled",
    "spark.databricks.delta.optimizeWrite.binSize",
    "spark.databricks.delta.optimizeWrite.partitioned.enabled",
    "spark.databricks.delta.stats.collect"
]

# List of profiles to display
profiles = fb_spk_profile # , "readHeavyForPBI", "readHeavyForSpark"

# Display current config values grouped by profile name

print(f"\n--- {profile} Profile ---")
for key in config_keys:
    try:
        value = spark.conf.get(key)
        print(f"{key} = {value}")
    except Exception as e:
        print(f"{key} not set or not accessible: {e}")

StatementMeta(, d65860aa-c735-4637-a84a-f0f4e6e5f3ab, 9, Finished, Available, Finished)


--- writeHeavy Profile ---
spark.sql.parquet.vorder.default = true
spark.databricks.delta.optimizeWrite.enabled = true
spark.databricks.delta.optimizeWrite.binSize = 1g
spark.databricks.delta.optimizeWrite.partitioned.enabled = true
spark.databricks.delta.stats.collect = true


#### Alter Session-Level Spark Configuration for the Resource Profiles

In [ ]:
# we can alter a single setting
spark.conf.set("spark.databricks.delta.optimizeWrite.binSize", "3g")

spark.conf.get("spark.databricks.delta.optimizeWrite.binSize")

In [ ]:
# we can alter bulk settings

from pyspark.sql import SparkSession

# Get the current Spark session
spark = SparkSession.builder.getOrCreate()

# Define expected config values for the selected profile
resource_profile_settings = {
    "spark.sql.parquet.vorder.default": "false",
    "spark.databricks.delta.optimizeWrite.binSize": "1g"
}

# Set the active profile name (example assignment)
fb_spk_profile = "writeHeavy"  # Replace with actual profile name if needed

print(f"\nApplying '{fb_spk_profile}' profile settings:")
for key, value in resource_profile_settings.items():  # ✅ Fixed typo: .items()
    spark.conf.set(key, value)
    print(f"Set {key} = {value}")


In [7]:
from pyspark.sql import SparkSession
# Get the current Spark session
spark = SparkSession.builder.getOrCreate()

# Unified list of relevant Spark config keys across all profiles
config_keys = [
    "spark.sql.parquet.vorder.default",
    "spark.databricks.delta.optimizeWrite.enabled",
    "spark.databricks.delta.optimizeWrite.binSize",
    "spark.databricks.delta.optimizeWrite.partitioned.enabled",
    "spark.databricks.delta.stats.collect"
]

# List of profiles to display
profiles = fb_spk_profile # , "readHeavyForPBI", "readHeavyForSpark"

# Display current config values grouped by profile name

print(f"\n--- {profile} Profile ---")
for key in config_keys:
    try:
        value = spark.conf.get(key)
        print(f"{key} = {value}")
    except Exception as e:
        print(f"{key} not set or not accessible: {e}")

StatementMeta(, 9d547e54-7305-4748-8672-5a06440e38bb, 10, Finished, Available, Finished)

Error: <console>:1: error: ';' expected but '.' found.

#### Display all settings

In [ ]:
from pyspark import SparkConf
import pandas as pd
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()

# Retrieve all default Spark configurations
default_conf = SparkConf().getAll()
default_df = pd.DataFrame(default_conf, columns=['Config', 'Default'])

# Fetch runtime values for each configuration key
runtime_df = pd.DataFrame({
    'Config': default_df['Config'],
    'Runtime_Value': [spark.conf.get(key) for key in default_df['Config']]
})

# Display combined configuration overview
display(runtime_df)

In [1]:
%%spark
spark.read.table("ecomsales2").count



StatementMeta(, 483a0ff0-f379-434c-af9d-854b27db5812, 3, Finished, Available, Finished)

res9: Long = 5000000


In [ ]:
%%spark
spark.read.table("ecomsales").queryExecution.optimizedPlan.stats
spark.read.table("ecomsales").queryExecution.optimizedPlan.stats.attributeStats

In [ ]:
# example of creating a custom resource profile
spark.conf.set(
    "spark.fabric.resourceProfile.conformanceCheck",
    """
    {
        "spark.sql.shuffle.partitions": "200",
        "spark.sql.adaptive.enabled": "true",
        "spark.sql.adaptive.shuffle.targetPostShuffleInputSize": "64MB",
        "spark.sql.broadcastTimeout": "600",

        "spark.sql.caseSensitive": "false",
        "spark.sql.parquet.filterPushdown": "true",
        "spark.sql.parquet.mergeSchema": "false",
        "spark.sql.files.ignoreCorruptFiles":  false,
        “ spark. sql.sources.partitionOverwriteMode”: “dynamic”,

        “ spark.executor.memoryOverhead”: “512”,
        “ spark. sql.execution.arrow.pyspark.enabled”: true,
        “ spark. sql.autoBroadcastJoinThreshold”: -1,

        “ spark. sql.queryExecutionListeners”: org.apache.spark.util.QueryExecutionListener”,
        “ spark.databricks.queryWatchdog.enabled: true,

        ” spark. sql.join.preferSortMergeJoin: true,
        ” spark.databricks.io.cache.enabled: true
    }
    """
)

image.png

In [ ]:
%%pyspark
# Retrieve the current resource profile setting
print("Current Resource Profile:", spark.conf.get("spark.fabric.resourceProfile"))

# Set the resource profile to 'readHeavyForSpark' for the current session
spark.conf.set("spark.fabric.resourceProfile", "readHeavyForSpark")

# Confirm the updated resource profile setting
print("Updated Resource Profile:", spark.conf.get("spark.fabric.resourceProfile"))

In [ ]:
%%sql
SELECT COUNT(1) AS NbrOfRows FROM LH01.dbo.ecomsales;


In [ ]:
%%sql
SELECT * FROM LH01.dbo.ecomsales limit 5;

In [ ]:
%%pyspark
# Set the resource profile to 'readHeavyForSpark' for the current session
spark.conf.set("spark.fabric.resourceProfile", "writeHeavy")

# Confirm the updated resource profile setting
print("Updated Resource Profile:", spark.conf.get("spark.fabric.resourceProfile"))

In [1]:
print(1)

StatementMeta(, eaf27727-b3ad-4e1c-896c-7ab9cbfb0535, 3, Finished, Available, Finished)

1

In [3]:
%%pyspark
# Retrieve the current resource profile setting
print("Current Resource Profile:", spark.conf.get("spark.fabric.resourceProfile"))

StatementMeta(, 025a9858-5dfb-4042-970b-bb8891b5e83c, 6, Finished, Available, Finished)

Current Resource Profile: writeHeavy


In [7]:
%%pyspark
sql_query="""
SELECT 
    Country,
    CAST(SUM(Amount) AS DECIMAL(12,2)) AS TotalAmount,
    RANK() OVER (ORDER BY SUM(Amount) DESC) AS Rank,
    DENSE_RANK() OVER (ORDER BY SUM(Amount) DESC) AS DenseRank,
    NTILE(4) OVER (ORDER BY SUM(Amount) DESC) AS Quartile
FROM 
    LH01.dbo.ecomsales2
GROUP BY 
    Country;
"""
output = spark.sql(sql_query)
display(output.limit(5))

StatementMeta(, 025a9858-5dfb-4042-970b-bb8891b5e83c, 10, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, be355b29-a789-4a78-9277-e7975389ee97)

In [5]:
rdd.checkpoint()

StatementMeta(, 9d547e54-7305-4748-8672-5a06440e38bb, 8, Finished, Available, Finished)

Error: <console>:25: error: not found: value rdd

In [12]:
%%pyspark
# Set the resource profile to 'readHeavyForSpark' for the current session
spark.conf.set("spark.fabric.resourceProfile", "writeHeavy")

# Confirm the updated resource profile setting
print("Updated Resource Profile:", spark.conf.get("spark.fabric.resourceProfile"))

StatementMeta(, d65860aa-c735-4637-a84a-f0f4e6e5f3ab, 15, Finished, Available, Finished)

Updated Resource Profile: writeHeavy


In [4]:
%%pyspark
# Set the resource profile to 'readHeavyForSpark' for the current session
spark.conf.set("spark.fabric.resourceProfile", "readHeavyForSpark")

# Confirm the updated resource profile setting
print("Updated Resource Profile:", spark.conf.get("spark.fabric.resourceProfile"))

StatementMeta(, 483a0ff0-f379-434c-af9d-854b27db5812, 7, Finished, Available, Finished)

Updated Resource Profile: readHeavyForSpark


In [5]:
from pyspark.sql import SparkSession
# Get the current Spark session
spark = SparkSession.builder.getOrCreate()

# Unified list of relevant Spark config keys across all profiles
config_keys = [
    "spark.sql.parquet.vorder.default",
    "spark.databricks.delta.optimizeWrite.enabled",
    "spark.databricks.delta.optimizeWrite.binSize",
    "spark.databricks.delta.optimizeWrite.partitioned.enabled",
    "spark.databricks.delta.stats.collect"
]

# List of profiles to display
profiles = ["readHeavyForSpark"] # , "readHeavyForPBI", "readHeavyForSpark"

# Display current config values grouped by profile name
for profile in profiles:
    print(f"\n--- {profile} Profile ---")
    for key in config_keys:
        try:
            value = spark.conf.get(key)
            print(f"{key} = {value}")
        except Exception as e:
            print(f"{key} not set or not accessible: {e}")

StatementMeta(, 483a0ff0-f379-434c-af9d-854b27db5812, 8, Finished, Available, Finished)


--- readHeavyForSpark Profile ---
spark.sql.parquet.vorder.default = false
spark.databricks.delta.optimizeWrite.enabled = true
spark.databricks.delta.optimizeWrite.binSize = 128
spark.databricks.delta.optimizeWrite.partitioned.enabled = true
spark.databricks.delta.stats.collect = true


In [7]:
%%pyspark
sql_query="""
SELECT 
    Country,
    CAST(SUM(Amount) AS DECIMAL(12,2)) AS TotalAmount,
    RANK() OVER (ORDER BY SUM(Amount) DESC) AS Rank,
    DENSE_RANK() OVER (ORDER BY SUM(Amount) DESC) AS DenseRank,
    NTILE(4) OVER (ORDER BY SUM(Amount) DESC) AS Quartile
FROM 
    LH01.dbo.ecomsales
GROUP BY 
    Country;
"""
output = spark.sql(sql_query)
display(output.unpersist())

StatementMeta(, 483a0ff0-f379-434c-af9d-854b27db5812, 10, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 09b679fb-80b7-41c7-af16-342f353d2865)

In [ ]:
%%pyspark

# Clear Spark's in-memory cache to ensure fresh execution
spark.catalog.clearCache()

# Define and execute the SQL query with ranking functions
sql_query = """
SELECT 
    Country,
    CAST(SUM(Amount) AS DECIMAL(12,2)) AS TotalAmount,
    RANK() OVER (ORDER BY SUM(Amount) DESC) AS Rank,
    DENSE_RANK() OVER (ORDER BY SUM(Amount) DESC) AS DenseRank,
    NTILE(4) OVER (ORDER BY SUM(Amount) DESC) AS Quartile
FROM 
    LH01.dbo.ecomsales
GROUP BY 
    Country;
"""

output = spark.sql(sql_query)
display(output.limit(5))