# **Azure Synapse Spark Pool Configuration**

In [None]:
# Query the Spark configuration of the current session.
#
# `spark.conf.get(key)` without a default raises when the key has no value,
# which would abort the whole cell. A default is supplied so every setting is
# always reported, and the four lookups share one code path.
NOT_SET = "<not set>"

spark_conf_keys = [
    "spark.executor.instances",
    "spark.executor.cores",
    "spark.executor.memory",
    "spark.driver.memory",
]

spark_settings = {key: spark.conf.get(key, NOT_SET) for key in spark_conf_keys}

for key, value in spark_settings.items():
    print(f"{key} {value}")

# Keep the individual variables the original cell defined.
spark_executor_instances = spark_settings["spark.executor.instances"]
spark_executor_cores = spark_settings["spark.executor.cores"]
spark_executor_memory = spark_settings["spark.executor.memory"]
spark_driver_memory = spark_settings["spark.driver.memory"]

In [None]:
%%configure -f
{
    // Session parameters follow the Livy request body: https://github.com/cloudera/livy#request-body.
    // Comments use `//`, the syntax shown in the documented %%configure example.
    "driverMemory": "28g", // Recommended values: ["28g", "56g", "112g", "224g", "400g", "472g"]
    "driverCores": 4, // Recommended values: [4, 8, 16, 32, 64, 80]
    "executorMemory": "28g",
    "executorCores": 4,
    // Placeholders: replace <...> (and pick abfs/abfss, wasb/wasbs) with real jar locations, or remove "jars".
    "jars": ["abfs[s]://<file_system>@<account_name>.dfs.core.windows.net/<path>/myjar.jar", "wasb[s]://<containername>@<accountname>.blob.core.windows.net/<path>/myjar1.jar"],
    "conf":
    {
        // Example of a standard Spark property; more properties: https://spark.apache.org/docs/latest/configuration.html#application-properties.
        "spark.driver.maxResultSize": "10g",
        // Example of a customized property: "livy.rsc.sql.num-rows" sets how many lines Spark SQL returns.
        "livy.rsc.sql.num-rows": "3000"
    }
}

In [None]:
# Libraries available in this Azure Synapse Analytics Spark pool.
#
# `pkg_resources` is deprecated in setuptools, so the standard-library
# `importlib.metadata` is preferred; the original API is kept only as a
# fallback for runtimes that do not ship it (Python < 3.8).
try:
    from importlib.metadata import distributions
except ImportError:
    import pkg_resources

    installed_packages = {str(dist) for dist in pkg_resources.working_set}
else:
    installed_packages = {
        f"{dist.metadata.get('Name')} {dist.version}"
        for dist in distributions()
        # Skip broken installs that carry no readable name.
        if dist.metadata and dist.metadata.get("Name")
    }

# Sorted, de-duplicated "name version" lines are easier to scan and to diff.
installed_packages = sorted(installed_packages, key=str.lower)
print("\n".join(installed_packages))

In [None]:
# 1. Toy records: one dict per row, keyed by column name.
#    "Truth" is deliberately None for one row.
data = [
    dict(Category='A', ID=1, Value=121.44, Truth=True),
    dict(Category='B', ID=2, Value=300.01, Truth=False),
    dict(Category='C', ID=3, Value=10.99, Truth=None),
    dict(Category='E', ID=4, Value=33.87, Truth=True),
]

# 2. No SparkSession setup is done here: the cell uses the existing `spark`
#    variable. Where none exists, one would be created with:
#        from pyspark.sql import SparkSession
#        spark = SparkSession.builder.getOrCreate()

# 3. Build the DataFrame from the records, then show its type as the cell
#    output to confirm what createDataFrame returned.
df = spark.createDataFrame(data)
type(df)

In [None]:
# Explore the data
# Render the toy DataFrame built in the previous cell with display().
display(df)

In [None]:
# Convert the Spark DataFrame to a pandas DataFrame.
# NOTE(review): toPandas() is expected to pull every row onto the driver;
# fine for this four-row toy frame, confirm before using on large data.
pd_df = df.toPandas()

In [None]:
# Show the first row of the pandas DataFrame as a plain Python list.
pd_df.iloc[0].tolist()

In [None]:
# Explore Spark command
# Read parquet file using spark

# Data Exploration & Assumptions

- ## Assumptions about the contents of the data
- ## Assumptions about the distribution of the data

In [None]:
%%pyspark
# Read Data From Azure Data Lake Storage
DailyTop5Sales = spark.read.load('abfss://root@adlesilabs.dfs.core.windows.net/demofiles/csv/PurchaseOrderDetail.csv'
               , format='csv'
                 )

display(DailyTop5Sales.limit(3))

## Inspect the schema: this is not the correct struct schema
- timestamp vs. date data type
- more work is needed to reshape the data

In [None]:
# Print the column names and types Spark assigned to the CSV just loaded.
DailyTop5Sales.printSchema()

In [None]:
# `date_format` is used below and must be imported here: on a fresh kernel no
# earlier cell provides it, so the cell would stop with a NameError.
# Note that `sum` and `max` shadow the Python built-ins from this cell onward.
from pyspark.sql.functions import col, to_date, sum, avg, max, count, date_format
from pyspark.sql.types import DecimalType

group_cols = ["PurchaseOrderID", "DueDate", "ModifiedDate"]

# Total LineTotal per purchase order, then tidy the result: SubTotal as a
# fixed-point decimal and both dates rendered as MM-dd-yyyy strings.
# This overwrites DailyTop5Sales with the aggregate; re-run the read cell
# before re-running this one, because LineTotal no longer exists afterwards.
DailyTop5Sales = (
    DailyTop5Sales.groupBy(group_cols)
    .agg(sum("LineTotal").alias("SubTotal"))
    .withColumn("SubTotal", col("SubTotal").cast(DecimalType(18, 2)))
    .withColumn("DueDate", date_format(col("DueDate"), "MM-dd-yyyy"))
    .withColumn("ModifiedDate", date_format(col("ModifiedDate"), "MM-dd-yyyy"))
)

# Five purchase orders with the largest SubTotal.
display(
    DailyTop5Sales.select("PurchaseOrderID", "DueDate", "ModifiedDate", "SubTotal")
    .orderBy(col("SubTotal").desc())
    .limit(5)
)

#df.orderBy(col("score").desc()).head(5)
#display(
#    DailyTop5Sales.orderBy(col("SubTotal").desc()).head(5)
#)

# You can predefine the struct schema

In [None]:
%%pyspark
from pyspark.sql.types import *
from pyspark.sql.functions import *

orderSchema = StructType([
    StructField("PurchaseOrderID", IntegerType()),
    StructField("PurchaseOrderDetailID", IntegerType()),
    StructField("DueDate", DateType()),
    StructField("OrderQty", IntegerType()),
    StructField("ProductID", IntegerType()),
    StructField("UnitPrice", DecimalType(12,2)),
    StructField("LineTotal", DecimalType(12,2)),
    StructField("ReceivedQty", DecimalType(12,2)),
    StructField("RejectedQty", DecimalType(12,2)),
    StructField("StockedQty", DecimalType(12,2)),
    StructField("ModifiedDate", DateType())
    ])

DailyTop5Sales = spark.read.load('abfss://root@adlesilabs.dfs.core.windows.net/demofiles/csv/PurchaseOrderDetail.csv'
             ,format='csv'
             ,schema=orderSchema
             ,header=True
)
display(DailyTop5Sales.limit(3))

In [None]:
# Print the schema again to confirm the predefined column types were applied.
DailyTop5Sales.printSchema()

In [None]:
group_cols = ["PurchaseOrderID", "DueDate", "ModifiedDate"]

# `sum` here is the Spark column function imported by an earlier cell, not the
# Python built-in.
subtotal = sum("LineTotal").alias("SubTotal")

# One row per purchase order / due date / modified date with its total.
# DailyTop5Sales is replaced by the aggregate, so this cell cannot be re-run
# without first re-running the read cell (LineTotal is gone afterwards).
DailyTop5Sales = DailyTop5Sales.groupBy(*group_cols).agg(subtotal)

# Five largest subtotals, biggest first.
top_five = (
    DailyTop5Sales
    .select(*group_cols, "SubTotal")
    .orderBy("SubTotal", ascending=False)
    .limit(5)
)
display(top_five)


In [None]:
ranked_columns = ["PurchaseOrderID", "DueDate", "ModifiedDate", "SubTotal"]

# Every row, ranked by SubTotal (largest first): the frame meant for readers
# who prefer to continue in SQL rather than Python.
DailyTop5Sales_sql = DailyTop5Sales.select(*ranked_columns).orderBy(col("SubTotal").desc())

# The same ranking cut to its first five rows: the frame meant for readers
# who continue in Python.
DailyTop5Sales = DailyTop5Sales_sql.limit(5)

## Different way to select column(s)

In [None]:
# Alternative to .select(): index the DataFrame with the column names to pick
# DueDate and SubTotal.
display(DailyTop5Sales["DueDate", "SubTotal"])


# Integrate SQL and Apache Spark pools in Azure Synapse Analytics

In [None]:
# Native SQL cannot be typed straight into a Python cell against a DataFrame:
# DailyTop5Sales_sql is a Python variable, not a table the SQL engine knows,
# so it has to be converted (registered as a view) first.
#
# The bare statement is a Python SyntaxError, which would stop "Run All".
# It is compiled here instead so the failure is shown without aborting.
raw_sql = "select * from DailyTop5Sales_sql"

try:
    compile(raw_sql, "<notebook cell>", "exec")
except SyntaxError as exc:
    raw_sql_is_valid_python = False
    print(f"SyntaxError: {exc.msg} -> {raw_sql!r}")
else:
    raw_sql_is_valid_python = True

In [None]:
# Register the ranked DataFrame as a temporary view named 'DailyTop5Sales' so
# the SQL cells below can query it by that name.
DailyTop5Sales_sql.createOrReplaceTempView('DailyTop5Sales')

In [None]:
%%sql
-- Five rows with the largest SubTotal from the DailyTop5Sales temporary view.
SELECT * FROM DailyTop5Sales ORDER BY SubTotal DESC LIMIT 5

In [None]:
# Same query issued from Python with spark.sql(); the returned object is the
# cell's last expression and is not passed to display().
spark.sql("SELECT * FROM DailyTop5Sales ORDER BY SubTotal DESC LIMIT 5")

In [None]:
# Same query again, this time rendered with display().
display(spark.sql("SELECT * FROM DailyTop5Sales ORDER BY SubTotal DESC LIMIT 5"))

In [None]:
# Keep the query result in `results` so it can be reused (it is saved as a
# table in a later cell), then render it.
results = spark.sql("SELECT * FROM DailyTop5Sales ORDER BY SubTotal DESC LIMIT 5")
display(results)

In [None]:
# This may fail - can you guess why?
# Drops the TopDailySales database if it exists; CASCADE presumably removes
# the tables inside it as well (confirm before running on shared data).
spark.sql("DROP DATABASE IF EXISTS TopDailySales CASCADE")

In [None]:
# Make sure the target database exists before writing into it.
spark.sql("CREATE DATABASE IF NOT EXISTS TopDailySales")

# Persist the top-five query result as a table, replacing any earlier copy.
results.write.saveAsTable("TopDailySales.DailyTop5Sales", mode="overwrite")

In [None]:
# Load every row of the saved table back into a DataFrame and render it.
# Note: `df` is reused here and now refers to this table, not the toy data.
df = spark.table("TopDailySales.DailyTop5Sales")
display(df)

In [None]:
%%spark
// Connector imports: Constants supplies INTERNAL, and SqlAnalyticsConnector._
// adds the synapsesql method to the DataFrame writer.
import com.microsoft.spark.sqlanalytics.utils.Constants
import org.apache.spark.sql.SqlAnalyticsConnector._

// Make sure the name of the dedicated SQL pool (SQLPool01 below) matches the name of your SQL pool.
// The DailyTop5Sales view holds every ranked row, so the top five are selected
// here to match the target table name and the other queries on this view.
val df = spark.sqlContext.sql("select * from DailyTop5Sales order by SubTotal desc limit 5")

// Write the rows to an internal table in the dedicated SQL pool.
df.write.synapsesql("SQLPool01.dbo.Top5Purchases", Constants.INTERNAL)

In [None]:
# Write `df` to an internal dedicated SQL pool table using AAD authentication.
# (Other samples cover SQL basic auth.) Replace every <...> placeholder first.
import com.microsoft.spark.sqlanalytics
from com.microsoft.spark.sqlanalytics.Constants import Constants

internal_writer = (
    df.write
    # Optional: when Constants.SERVER is omitted, the <database_name> part of
    # the three-part table name is used to infer the dedicated SQL endpoint.
    .option(Constants.SERVER, "<sql-server-name>.sql.azuresynapse.net")
    # Optional: when Constants.TEMP_FOLDER is omitted, the connector uses the
    # runtime staging directory config (see the connector's Configuration Options).
    .option(Constants.TEMP_FOLDER, "abfss://<container_name>@<storage_account_name>.dfs.core.windows.net/<some_base_path_for_temporary_staging_folders>")
    # Save mode: "error" / "errorifexists" (default), "overwrite", "append", "ignore".
    # https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html#save-modes
    .mode("overwrite")
)

# Required: three-part name of the table the data is written to.
internal_writer.synapsesql("<database_name>.<schema_name>.<table_name>")


# Write `df` to an external dedicated SQL pool table using AAD authentication.
# (Other samples cover SQL basic auth.) Replace every <...> placeholder first.
import com.microsoft.spark.sqlanalytics
from com.microsoft.spark.sqlanalytics.Constants import Constants

external_writer = (
    df.write
    # Optional: when Constants.SERVER is omitted, the <database_name> part of
    # the three-part table name is used to infer the dedicated SQL endpoint.
    .option(Constants.SERVER, "<sql-server-name>.sql.azuresynapse.net")
    # Name of the data source definition created with database scoped credentials.
    # https://learn.microsoft.com/sql/t-sql/statements/create-external-data-source-transact-sql?view=sql-server-ver15&tabs=dedicated#h-create-external-data-source-to-access-data-in-azure-storage-using-the-abfs-interface
    .option(Constants.DATA_SOURCE, "<data_source_name>")
    # Save mode: "error" / "errorifexists" (default), "overwrite", "append", "ignore".
    # https://spark.apache.org/docs/latest/sql-data-sources-load-save-functions.html#save-modes
    .mode("overwrite")
)

external_writer.synapsesql(
    # Required: three-part name of the table the data is written to.
    "<database_name>.<schema_name>.<table_name>",
    # Optional table type; the default is Constants.INTERNAL, external tables
    # use Constants.EXTERNAL.
    Constants.EXTERNAL,
    # Optional base folder of the external table; defaults to
    # `database_name/schema_name/table_name`.
    "/path/to/external/table",
)

[Reference:](https://learn.microsoft.com/en-us/azure/synapse-analytics/spark/synapse-spark-sql-pool-import-export?tabs=scala%2Cpython1%2Cpython2%2Cpython3%2Cscala4%2Cscala5)