In [0]:
%run /Users/hilla.abramov@gmail.com/utils/stream_utils/extract_aws_credentials

In [0]:
class StreamProcessor:
    """Read a Kinesis stream, clean it, and append it to a Delta table.

    The three stages are exposed as separate methods and chained by
    ``run_pipeline``.  The class depends on names that are not imported in
    this cell and must already exist in the notebook scope: ``spark``,
    ``dbutils``, ``from_json``, ``ACCESS_KEY`` and ``SECRET_KEY``
    (presumably provided by the ``%run`` credentials notebook and the
    Databricks runtime -- confirm).

    Args:
        kinesis_stream_name: Name of the Kinesis stream to read.  Also used
            as the alias of the parsed JSON struct column.
        delta_table_name: Name of the target table; also used to build the
            checkpoint path.
        json_schema: Schema handed to ``from_json`` to parse each record.
        cleaning_function: Callable that receives the parsed streaming
            DataFrame and returns the cleaned one.
        region: Value for the Kinesis source ``region`` option.
        initial_position: Value for the Kinesis source ``initialPosition``
            option.
    """

    def __init__(
        self,
        kinesis_stream_name,
        delta_table_name,
        json_schema,
        cleaning_function,
        *,
        region="us-east-1",
        initial_position="earliest",
    ):
        self.kinesis_stream_name = kinesis_stream_name
        self.delta_table_name = delta_table_name
        self.json_schema = json_schema
        self.cleaning_function = cleaning_function
        self.region = region
        self.initial_position = initial_position
        # Populated by the read and clean stages respectively.
        self.input_streaming_df = None
        self.cleaned_streaming_df = None

    def read_stream_from_kinesis_to_spark_df(self):
        """Load the Kinesis stream and expand its JSON payload into columns.

        The ``data`` field of each record is cast to a string, parsed with
        ``json_schema`` into a struct, and the struct's fields are then
        selected as top-level columns.  The result is stored on
        ``self.input_streaming_df``.
        """
        raw_stream = (
            spark.readStream
            .format('kinesis')
            .option('streamName', self.kinesis_stream_name)
            .option('initialPosition', self.initial_position)
            .option('region', self.region)
            .option('awsAccessKey', ACCESS_KEY)
            .option('awsSecretKey', SECRET_KEY)
            .load()
        )
        parsed = (
            raw_stream
            .selectExpr("cast (data as STRING) jsonData")
            .select(
                from_json("jsonData", self.json_schema)
                .alias(self.kinesis_stream_name)
            )
        )
        self.input_streaming_df = parsed.select(f"{self.kinesis_stream_name}.*")

    def clean_stream_data(self):
        """Apply ``cleaning_function`` to the input DataFrame.

        Stores the function's return value on ``self.cleaned_streaming_df``.
        """
        self.cleaned_streaming_df = self.cleaning_function(self.input_streaming_df)

    def write_stream_to_delta_table(self):
        """Reset the checkpoint and start appending the stream to the table.

        Returns:
            Whatever ``writeStream...table(...)`` returns (expected to be the
            streaming query handle -- confirm for the runtime in use), so the
            caller can monitor or stop the stream.
        """
        checkpoint_location = f"/tmp/delta/{self.delta_table_name}/_checkpoints/"

        # Remove any previous checkpoint.  The path is deleted directly
        # instead of being looked up in ``dbutils.fs.ls("/tmp")``: that
        # listing holds file-info entries for the top level of /tmp only, so
        # a string membership test against it can never match this nested
        # path.  ``rm`` is expected to return False rather than raise when
        # the path does not exist -- confirm.
        # NOTE(review): with the checkpoint gone the query starts over from
        # the configured initial position, so rows may be appended again to
        # an existing table -- confirm this is the intended behaviour.
        dbutils.fs.rm(checkpoint_location, True)

        # Write the stream to the Delta table.
        return (
            self.cleaned_streaming_df.writeStream
            .format("delta")
            .outputMode("append")
            .option("checkpointLocation", checkpoint_location)
            .table(self.delta_table_name)
        )

    def run_pipeline(self):
        """Run read, clean and write in order.

        Returns:
            The value returned by ``write_stream_to_delta_table``.
        """
        self.read_stream_from_kinesis_to_spark_df()
        self.clean_stream_data()
        return self.write_stream_to_delta_table()