## Creating a Table with Spark SQL

In [1]:
%%sql

-- Create the demo table with three columns.
-- IF NOT EXISTS keeps this cell re-runnable (e.g. Restart & Run All): a plain
-- CREATE TABLE fails once the table is already registered from an earlier run.
CREATE TABLE IF NOT EXISTS healthcare_sql2 (
    `PatientId` INT,
    `Gender` STRING,
    `Scholarship` BOOLEAN
)

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 2, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

## Creating a Table with PySpark and the Delta Table Builder API

#### 1 - Define a Schema with PySpark

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

# Delta table schema: three nullable columns, appended one at a time with StructType.add
healthcare_reduced_schema = (
    StructType()
    .add("PatientId", IntegerType(), True)
    .add("Gender", StringType(), True)
    .add("Scholarship", BooleanType(), True)
)

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 4, Finished, Available, Finished)

#### 2 - Create the Table with the Delta Table Builder

In [3]:
from delta.tables import DeltaTable

# Create the Delta table with the schema defined above.
# createIfNotExists (instead of create) keeps this cell re-runnable: create()
# raises once "healthcare_delta" already exists, e.g. on Restart & Run All.
# The builder's execute() result is left as the cell's last expression so the
# DeltaTable handle is still displayed as output.
(
    DeltaTable.createIfNotExists(spark)
    .tableName("healthcare_delta")
    .addColumns(healthcare_reduced_schema)
    .execute()
)

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 5, Finished, Available, Finished)

<delta.tables.DeltaTable at 0x71bb6fa5be80>

## Updating the Schema with createOrReplace

#### 1. Define the Updated Schema

In [4]:
# Updated schema: the reduced schema's three columns followed by a new
# nullable integer "Age" column.
updated_healthcare_schema = StructType(
    healthcare_reduced_schema.fields
    + [StructField("Age", IntegerType(), True)]
)

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 6, Finished, Available, Finished)

#### 2. Use createOrReplace to Update the Table Schema

In [5]:
# Re-declare "healthcare_delta" with the four-column schema via createOrReplace.
# NOTE(review): createOrReplace replaces the table definition rather than
# altering it in place — confirm that any rows already in the table may be discarded.
(
    DeltaTable.createOrReplace(spark)
    .tableName("healthcare_delta")
    .addColumns(updated_healthcare_schema)
    .execute()
)

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 7, Finished, Available, Finished)

<delta.tables.DeltaTable at 0x71bb6fb1e140>

## Incremental Loading with Spark SQL

In [6]:
# Load the initial CSV extract from the Files area; the first row is treated
# as a header and columns are typed by the explicit four-column schema.
df = spark.read.csv(
    "Files/initial_healthcare_data.csv",
    schema=updated_healthcare_schema,
    header="true",
)

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 8, Finished, Available, Finished)

In [7]:
# Expose the loaded DataFrame as a temp view named "initial_healthcare_load"
# so the following %%sql MERGE cell can reference it as its source.
df.createOrReplaceTempView("initial_healthcare_load")

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 9, Finished, Available, Finished)

In [8]:
%%sql
-- Upsert the staged rows into healthcare_delta, matching on PatientId:
-- rows that already exist are overwritten, all others are inserted.
MERGE INTO healthcare_delta AS tgt
USING initial_healthcare_load AS src
    ON tgt.PatientId = src.PatientId
WHEN MATCHED THEN
    UPDATE SET *
WHEN NOT MATCHED THEN
    INSERT *

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 10, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 4 fields>

In [10]:
# Read the table back and render it to inspect the result of the SQL merge
df_view = spark.read.table('healthcare_delta')
display(df_view)

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f5cf37cb-5091-4211-b88f-b1a9bb0c50ba)

## Incremental Loading with the Delta library 

###### Documentation: https://docs.delta.io/delta-update/#upsert-into-a-table-using-merge

In [13]:
# Get a handle to the existing healthcare Delta table by name
healthcare_delta = DeltaTable.forName(spark, "healthcare_delta")

# Create a DataFrame of the updates (from the Lakehouse Files area)
updates_df = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(updated_healthcare_schema)
    .load("Files/update_healthcare_data.csv")
)

# Perform the merge (upsert) keyed on PatientId: matched rows are updated with
# all source columns, unmatched source rows are inserted.
# The key is spelled "PatientId", exactly as declared in the schema and in the
# SQL MERGE cell, so the condition also resolves when spark.sql.caseSensitive
# is enabled (the previous "PatientID" only worked case-insensitively).
(
    healthcare_delta.alias("target")
    .merge(
        updates_df.alias("source"),
        "target.PatientId = source.PatientId",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 15, Finished, Available, Finished)

In [14]:
# Read the table back and render it to inspect the result of the Delta-library merge
df_view = spark.read.table('healthcare_delta')
display(df_view)

StatementMeta(, 3f906f11-66b8-4ae2-8cb8-eb0f1aa39251, 16, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 05869f67-b169-412e-9c19-be8a8d6a6c6e)