### Databricks DLT Pipeline Code

This notebook contains the Python DLT pipeline code for creating the bronze tables and the silver tables built from them.

Look for `<CHANGE_HERE: ...>` placeholders in the code and replace them with your values. Detailed instructions follow below.

#### Table Naming Instructions
Before running the code, you need to specify where your tables will be stored. You can use any of these three formats:

1. Three level catalog.schema.table format:
   - Replace `<CHANGE_HERE: catalog>.<CHANGE_HERE: schema>` with your Unity Catalog and schema names
   - Example: `unity_catalog.my_schema.table_name`

2. Two level schema.table format:
   - Replace `<CHANGE_HERE: schema>` with your schema name
   - The default catalog will be used
   - Example: `my_schema.table_name`

3. Simple table name format:
   - Use just the table name
   - Both default catalog and schema will be used
   - Example: `table_name`

#### Table Documentation and Configuration
For each table in the code:

1. Table Comments:
   - Replace `<CHANGE_HERE: enter_table_comment>` with a descriptive comment about the table's purpose and contents
   - Example: "Bronze table containing raw customer transaction data"

2. For Change Feed Tables:
   - Replace `<CHANGE_HERE: 1/2>` with either 1 or 2 to specify the SCD (Slowly Changing Dimension) type:
     - Type 1: Overwrites the old value with the new value
     - Type 2: Maintains history by creating new records for each change

#### Learn More
- [Streaming Tables Documentation](https://docs.databricks.com/aws/en/dlt/streaming-tables) - Learn about streaming tables and their use cases for data ingestion and low-latency streaming transformations.
- [Materialized Views Documentation](https://docs.databricks.com/aws/en/dlt/materialized-views) - Understand how materialized views work and their benefits for incremental data processing.

#### Table: valve_compliance_changes

In [0]:
# The decorators and dlt.* calls in this notebook need the DLT module in scope.
import dlt


# Bronze table: streams the raw valve_compliance_changes CSV files from the
# Gas_Emissions volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.valve_compliance_changes")
def source():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/valve_compliance_changes/")
    )

# Silver target for the valve_compliance_changes change feed.
# Replace the comment placeholder with a description of the table.
dlt.create_streaming_table(
    comment="<CHANGE_HERE: enter_table_comment>",
    name="silver.valve_compliance_changes",
)

# Apply the bronze change feed to the silver target, matching rows on
# (valve_id, asset_id) and ordering changes by change_timestamp.
# Replace the SCD placeholder with 1 or 2 before running the pipeline.
dlt.apply_changes(
    source="bronze.valve_compliance_changes",
    target="silver.valve_compliance_changes",
    keys=["valve_id", "asset_id"],
    sequence_by="change_timestamp",
    stored_as_scd_type="<CHANGE_HERE: 1/2>",
)


#### Table: asset_config_changes

In [0]:
# Bronze table: streams the raw asset_config_changes CSV files from the
# Gas_Emissions volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.asset_config_changes")
def source():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/asset_config_changes/")
    )

# Silver target for the asset_config_changes change feed.
# Replace the comment placeholder with a description of the table.
dlt.create_streaming_table(
    comment="<CHANGE_HERE: enter_table_comment>",
    name="silver.asset_config_changes",
)

# Apply the bronze change feed to the silver target, matching rows on
# (config_id, asset_id) and ordering changes by change_timestamp.
# Replace the SCD placeholder with 1 or 2 before running the pipeline.
dlt.apply_changes(
    source="bronze.asset_config_changes",
    target="silver.asset_config_changes",
    keys=["config_id", "asset_id"],
    sequence_by="change_timestamp",
    stored_as_scd_type="<CHANGE_HERE: 1/2>",
)


#### Table: calibration_records

In [0]:
# Bronze table: streams the raw calibration_records CSV files from the
# Gas_Emissions volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.calibration_records")
def calibration_records_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/calibration_records/")
    )

# Silver table: streams rows from bronze.calibration_records without any
# transformation. The valid_drift_percentage expectation checks
# `drift_percentage BETWEEN 0 AND 100`.
@dlt.table(name="silver.calibration_records")
@dlt.expect("valid_drift_percentage", "drift_percentage BETWEEN 0 AND 100")
def calibration_records_silver():
    bronze_stream = spark.readStream.table("bronze.calibration_records")
    return bronze_stream


#### Table: inspectors

In [0]:
# Bronze table: streams the raw inspectors CSV files from the Gas_Emissions
# volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.inspectors")
def inspectors_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/inspectors/")
    )

# Silver table: streams rows from bronze.inspectors without any
# transformation; no expectations are declared for this table.
@dlt.table(name="silver.inspectors")
def inspectors_silver():
    bronze_stream = spark.readStream.table("bronze.inspectors")
    return bronze_stream


#### Table: shift_schedule

In [0]:
# Bronze table: streams the raw shift_schedule CSV files from the
# Gas_Emissions volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.shift_schedule")
def shift_schedule_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/shift_schedule/")
    )

# Silver table: streams rows from bronze.shift_schedule without any
# transformation; no expectations are declared for this table.
@dlt.table(name="silver.shift_schedule")
def shift_schedule_silver():
    bronze_stream = spark.readStream.table("bronze.shift_schedule")
    return bronze_stream


#### Table: site_info

In [0]:
# Bronze table: streams the raw site_info CSV files from the Gas_Emissions
# volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.site_info")
def site_info_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/site_info/")
    )

# Silver table: streams rows from bronze.site_info without any
# transformation; no expectations are declared for this table.
@dlt.table(name="silver.site_info")
def site_info_silver():
    bronze_stream = spark.readStream.table("bronze.site_info")
    return bronze_stream


#### Table: daily_weather

In [0]:
# Bronze table: streams the raw daily_weather CSV files from the
# Gas_Emissions volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.daily_weather")
def daily_weather_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/daily_weather/")
    )

# Silver table: streams rows from bronze.daily_weather without any
# transformation. Expectations declared on it:
#   valid_temperature_celsius -> temperature_celsius BETWEEN -40 AND 50
#   valid_humidity_percentage -> humidity_percentage BETWEEN 0 AND 100
@dlt.table(name="silver.daily_weather")
@dlt.expect("valid_temperature_celsius", "temperature_celsius BETWEEN -40 AND 50")
@dlt.expect("valid_humidity_percentage", "humidity_percentage BETWEEN 0 AND 100")
def daily_weather_silver():
    bronze_stream = spark.readStream.table("bronze.daily_weather")
    return bronze_stream


#### Table: alert_history

In [0]:
# Bronze table: streams the raw alert_history CSV files from the
# Gas_Emissions volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.alert_history")
def alert_history_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/alert_history/")
    )

# Silver table: streams rows from bronze.alert_history without any
# transformation. The valid_threshold_value expectation checks
# `threshold_value BETWEEN 0 AND 1000`.
@dlt.table(name="silver.alert_history")
@dlt.expect("valid_threshold_value", "threshold_value BETWEEN 0 AND 1000")
def alert_history_silver():
    bronze_stream = spark.readStream.table("bronze.alert_history")
    return bronze_stream


#### Table: asset

In [0]:
# Bronze table: streams the raw asset CSV files from the Gas_Emissions volume
# with Auto Loader (cloudFiles).
@dlt.table(name="bronze.asset")
def asset_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/asset/")
    )

# Silver table: streams rows from bronze.asset without any transformation;
# no expectations are declared for this table.
@dlt.table(name="silver.asset")
def asset_silver():
    bronze_stream = spark.readStream.table("bronze.asset")
    return bronze_stream


#### Table: compliance_regulations

In [0]:
# Bronze table: streams the raw compliance_regulations CSV files from the
# Gas_Emissions volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.compliance_regulations")
def compliance_regulations_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/compliance_regulations/")
    )

# Silver table: streams rows from bronze.compliance_regulations without any
# transformation; no expectations are declared for this table.
@dlt.table(name="silver.compliance_regulations")
def compliance_regulations_silver():
    bronze_stream = spark.readStream.table("bronze.compliance_regulations")
    return bronze_stream


#### Table: gas_production

In [0]:
# Bronze table: streams the raw gas_production CSV files from the
# Gas_Emissions volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.gas_production")
def gas_production_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/gas_production/")
    )

# Silver table: streams rows from bronze.gas_production without any
# transformation. Expectations declared on it:
#   valid_gas_volume_m3 -> gas_volume_m3 BETWEEN 0 AND 10000
#   valid_gas_pressure  -> gas_pressure BETWEEN 0 AND 500
@dlt.table(name="silver.gas_production")
@dlt.expect("valid_gas_volume_m3", "gas_volume_m3 BETWEEN 0 AND 10000")
@dlt.expect("valid_gas_pressure", "gas_pressure BETWEEN 0 AND 500")
def gas_production_silver():
    bronze_stream = spark.readStream.table("bronze.gas_production")
    return bronze_stream


#### Table: maintenance_record

In [0]:
# Bronze table: streams the raw maintenance_record CSV files from the
# Gas_Emissions volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.maintenance_record")
def maintenance_record_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/maintenance_record/")
    )

# Silver table: streams rows from bronze.maintenance_record without any
# transformation. The valid_cost expectation checks
# `cost BETWEEN 0 AND 100000`.
@dlt.table(name="silver.maintenance_record")
@dlt.expect("valid_cost", "cost BETWEEN 0 AND 100000")
def maintenance_record_silver():
    bronze_stream = spark.readStream.table("bronze.maintenance_record")
    return bronze_stream


#### Table: sensor_emissions

In [0]:
# Bronze table: streams the raw sensor_emissions CSV files from the
# Gas_Emissions volume with Auto Loader (cloudFiles).
@dlt.table(name="bronze.sensor_emissions")
def sensor_emissions_bronze():
    return (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "csv")
        # Consume the first line of each file as column names; the CSV
        # `header` option defaults to false, which would otherwise ingest the
        # header line as a data row. Assumes the files carry a header row.
        .option("header", "true")
        .option("cloudFiles.inferColumnTypes", "true")
        .option("multiLine", "true")
        .load("/Volumes/harrison_chen_catalog/synthetic_energy/energy_volume/Gas_Emissions/sensor_emissions/")
    )

# Silver table: streams rows from bronze.sensor_emissions without any
# transformation. Expectations declared on it:
#   valid_methane_level -> methane_level BETWEEN 0 AND 1000
#   valid_co2_level     -> co2_level BETWEEN 0 AND 1000
#   valid_nox_level     -> nox_level BETWEEN 0 AND 1000
#   valid_temperature   -> temperature BETWEEN -20 AND 100
#   valid_pressure      -> pressure BETWEEN 0 AND 1000
#   valid_flow_rate     -> flow_rate BETWEEN 0 AND 10000
@dlt.table(name="silver.sensor_emissions")
@dlt.expect("valid_methane_level", "methane_level BETWEEN 0 AND 1000")
@dlt.expect("valid_co2_level", "co2_level BETWEEN 0 AND 1000")
@dlt.expect("valid_nox_level", "nox_level BETWEEN 0 AND 1000")
@dlt.expect("valid_temperature", "temperature BETWEEN -20 AND 100")
@dlt.expect("valid_pressure", "pressure BETWEEN 0 AND 1000")
@dlt.expect("valid_flow_rate", "flow_rate BETWEEN 0 AND 10000")
def sensor_emissions_silver():
    bronze_stream = spark.readStream.table("bronze.sensor_emissions")
    return bronze_stream
