In [0]:
class TopicBatchProcessor:
    """
    ETL helper for one topic of the Batch Layer data.

    An instance builds the path of the topic's JSON files under a mounted S3 bucket, reads them into a Spark DataFrame, and applies a caller-supplied cleaning function to that DataFrame.

    Parameters:
    ----------
    s3_connector_instance: S3Connector
        An instance of the S3Connector class, providing connection to the S3 bucket on AWS. Only its _bucket_mount_name attribute is read, to build the _file_location attribute of the TopicBatchProcessor object.

    topic_name: str
        The name of the topic to ingest the data from. This should match the naming of the Kafka topic created for this dataset in the Kafka (EC2) Client, and the subsequently named sub-directories in the S3 bucket that has been mounted.

    Attributes:
    ----------
    _topic_name: str
        The name of the topic the data is being ingested from.

    _file_location: str
        Glob path "<bucket mount name>/topics/<topic name>/partition=0/*.json", assigned on initialisation from the _bucket_mount_name attribute of the S3Connector object argument.

    _extracted_batch_data: NoneType or pyspark.sql.DataFrame
        This is initialised as None; when data is read from the JSON files into a Spark DataFrame, a cache of that DataFrame is set to this attribute.

    _cleaned_batch_data: NoneType or pyspark.sql.DataFrame
        This is initialised as None; when the _extracted_batch_data DataFrame is cleaned, a cache of that DataFrame is set to this attribute.

    """
    def __init__(self, s3_connector_instance, topic_name: str) -> None:
        """
        See help(TopicBatchProcessor) for an accurate signature.
        """
        self._topic_name = topic_name
        self._file_location = f"{s3_connector_instance._bucket_mount_name}/topics/{self._topic_name}/partition=0/*.json"
        # Both DataFrame slots stay None until the corresponding ETL step has run.
        self._extracted_batch_data = None
        self._cleaned_batch_data = None

    def read_json_files_from_s3_bucket_to_df(self) -> None:
        """
        Method that reads the JSON files from the relevant topic subfolder of mounted S3 bucket to a Spark DataFrame that is cached to the object's _extracted_batch_data attribute.

        Relies on a module-level `spark` session being in scope (not defined in this class).
        """
        file_type = "json"
        infer_schema = "true"
        # NOTE(review): "inferSchema" is kept from the original reader options; confirm it is needed for the JSON source.
        df = spark.read.format(file_type) \
                        .option("inferSchema", infer_schema) \
                        .load(self._file_location)

        # Mark the DataFrame for caching, as documented and as clean_dataframe does for its result.
        self._extracted_batch_data = df.cache()

    def clean_dataframe(self, cleaning_function) -> None:
        """
        Method that cleans the DataFrame assigned to the _extracted_batch_data attribute using a bespoke cleaning function, and caches the cleaned DataFrame to the object's _cleaned_batch_data attribute.

        Nothing is raised to the caller: if no data has been extracted yet, or if the cleaning function (or caching its result) raises, an error message is printed and _cleaned_batch_data is left unchanged.

        Argument:
        --------
        cleaning_function: Callable
            A function defining how the dataframe of extracted data should be cleaned in the case of this dataset (and returns the cleaned dataframe).
        """
        if self._extracted_batch_data is None:
            # Adjacent literals instead of a backslash continuation inside the string,
            # which would embed the next line's indentation in the printed message.
            print(
                f"Error occured during cleaning of {self._topic_name} extracted data: "
                "no dataframe of extracted data saved to object instance."
            )
            return

        try:
            cleaned_df = cleaning_function(self._extracted_batch_data)
            self._cleaned_batch_data = cleaned_df.cache()
        except Exception as e:
            # Deliberately broad: the cleaning function is caller-supplied, and a
            # failure is reported rather than propagated.
            print(f"Error occured during cleaning of {self._topic_name} extracted data: {e}")