In [0]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the mdd modules imported below) are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

from mdd.logger import Logger
import logging
import datetime
from mdd.environment import Environment
from mdd.datareader import AutoLoaderReader
from mdd.datawriter import DeltaTableWriter

# Logging setup for this test run. The values stay as notebook-level names and
# are passed positionally to Logger.init in the order folder, file name,
# timestamp, debug flag.
log_folder, log_file_name = "mdd_test", "test_datareader"
log_timestamp = datetime.datetime.now()  # captured once, when this cell executes
debug = False  # value given to the logger; `debug` is rebound further down in this cell
Logger.init(log_folder, log_file_name, log_timestamp, debug)

# From this point on `debug` is True: this is the flag handed to the reader
# below (Logger.init above was given False).
debug = True

# Options placed under "source_options". The "cloudFiles.*" keys are Auto Loader
# settings; how AutoLoaderReader applies each entry is not visible in this notebook.
autoloader_options = {
    # possible formats: avro, binaryFile, csv, json, orc, parquet, text, xml
    "cloudFiles.format": "csv",
    "cloudFiles.schemaLocation": "loc1",
    "cloudFiles.maxFilesPerTrigger": 1000,
    "cloudFiles.includeExistingFiles": "true",
    "cloudFiles.schemaEvolutionMode": "rescue",
    "rescuedDataColumn": "_rescued_data",
    "pathGlobFilter": "*MID352_CombinedCards_*.csv",
    "skipRows": 2,
    "header": False,
    "ignoreLeadingWhiteSpace": True,
    "ignoreTrailingWhiteSpace": True,
}

# Reader configuration: sink name, source format/path, the options above and an
# explicit column schema written as a comma-separated "name type" list.
config_reader = {
    "sink_name": "bronze.paytronix_mid352_combinedcards",
    "source_format": "cloudFiles",
    "source_relative_path": "paytronix/",
    "source_options": autoloader_options,
    "source_schema": """
        combine_datetime timestamp
        ,primary_card_number string
        ,added_card_number string
        ,card_template string
        ,current_account_status string
        ,no_cards_in_account int
        ,email string
        ,surviving_account_number string
        ,abandoned_account_number string
        ,account_code string
    """,
}

# Build the reader from the notebook's `spark` session and obtain the streaming
# DataFrame that the writer section consumes as `df`.
reader = AutoLoaderReader(spark, config_reader, debug)
df = reader.read_stream()


# Options placed under "sink_write_options".
writer_stream_options = {
    "mergeSchema": False,
    # use a different number per source when there are multiple sources
    "checkpointLocation": "loc1",
}

# Writer configuration passed to DeltaTableWriter. Entries set to None are left
# unset for this test run.
config_writer = {
    "sink_name": "bronze.paytronix_mid352_combinedcards",
    "sink_projected_script": None,
    # write mode: append or update (complete is not supported)
    "sink_write_mode": "append",
    # primary key columns; mandatory for update mode
    "sink_primarykey": "combine_datetime, primary_card_number, added_card_number",
    "sink_watermark_column": None,
    "sink_update_changes_only": False,
    "sink_write_options": writer_stream_options,
    # trigger: availableNow or ProcessingTime
    "sink_write_trigger": "availableNow",
    # true / false when the trigger is AvailableNow
    "sink_write_trigger_value": True,
    "sink_write_prescript": None,
    "sink_write_postscript": None,
    "sink_validators": None,
}

# write
# Start the streaming write of `df` with the writer configuration and block
# until the query terminates. Any exception is printed and then re-raised so
# the cell still fails; logging.shutdown() runs in every case.
try:
    writer = DeltaTableWriter(spark, df, config_writer, debug)
    query = writer.write_stream()
    query.awaitTermination()

except Exception as e:
    print(e)
    # Bare `raise` re-raises the active exception as-is, without adding a
    # redundant traceback entry for this line.
    raise
finally:
    logging.shutdown()

In [0]:
%sql
-- All columns and rows of the target table.
-- NOTE(review): a %sql cell holding several statements may only render the
-- result of the last one; confirm whether this first result is needed and,
-- if so, move it to its own cell.
SELECT *
FROM lakehouse.bronze.paytronix_mid352_combinedcards;

-- Row count per (_source_name, _record_timestamp) pair, ordered by the same
-- two columns.
SELECT
  `_source_name`,
  `_record_timestamp`,
  count(*)
FROM lakehouse.bronze.paytronix_mid352_combinedcards
GROUP BY `_source_name`, `_record_timestamp`
ORDER BY `_source_name`, `_record_timestamp`

In [0]:
%sql
-- Rows for one primary_card_number, with _record_id and _record_timestamp
-- listed first, followed by every column of the table.
-- NOTE(review): the card number is a hard-coded literal; confirm it is test data.
SELECT
  `_record_id`,
  `_record_timestamp`,
  *
FROM lakehouse.bronze.paytronix_mid352_combinedcards
WHERE primary_card_number = '6000200155719787';

In [0]:
%sql
-- NOTE(review): this is the same query as the preceding cell (same table, same
-- primary_card_number filter); confirm whether both cells are needed.
-- Returns _record_id and _record_timestamp first, then every column.
SELECT
  `_record_id`,
  `_record_timestamp`,
  *
FROM
  lakehouse.bronze.paytronix_mid352_combinedcards
WHERE
  primary_card_number = '6000200155719787';