In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, DoubleType, ArrayType
import pyspark.sql.functions as F
import logging

In [0]:
%run ./logger

In [0]:
# Logger used by the bronze cells below (the Bronze class logs through this
# module-level name). `Logger` is not defined in the visible cells — presumably
# it is brought in by the `%run ./logger` cell above; confirm there what
# `.setup()` returns and how the `logging.INFO` argument is applied.
bronze_logger = Logger("bronze_logger", logging.INFO).setup()
bronze_logger.info("Logging Started...in bronze ")

In [0]:
# Date token embedded in the landing file name ('LMS_' + date + '.csv').
# Hard-coded: edit this value to ingest a different landing file.
# NOTE(review): '09-01-2023' could be MM-DD-YYYY or DD-MM-YYYY — confirm
# against the landing file naming convention.
date = '09-01-2023'


In [0]:
# Three-part name of the target table: <catalog>.<schema>.<table>.
catalog_name = "students_data"
bronze_schema_name = "students_bronze"
bronze_table_name = "bronze_students_table"

# Landing file for the selected `date`: <base_path>/LMS_<date>.csv
base_path = '/Volumes/students_data/default/landing'
filename = f'LMS_{date}.csv'
file_path = f'{base_path}/{filename}'


In [0]:
class Bronze:
    """Load one landing CSV file into a bronze Delta table.

    The class relies on names supplied by the notebook environment rather
    than passed in: the ``spark`` session, the ``F`` alias for
    ``pyspark.sql.functions``, and the module-level ``bronze_logger``.

    Args:
        file_path: Path of the CSV file to read.
        catalog_name: Catalog part of the target table name.
        schema_name: Schema part of the target table name.
        table_name: Table part of the target table name.
    """

    def __init__(self, file_path, catalog_name, schema_name, table_name):
        self.file_path = file_path
        self.catalog_name = catalog_name
        self.schema_name = schema_name
        self.table_name = table_name

    def get_schema(self):
        """Return the explicit schema applied when reading the landing CSV.

        Every column is declared nullable. Enrollment_Date and
        Completion_Date are declared as strings; no date parsing happens in
        this class.

        Returns:
            StructType: One StructField per CSV column, in file order.
        """
        # (column name, Spark type) in the order the columns are declared.
        columns = [
            ('Student_ID', IntegerType()),
            ('Name', StringType()),
            ('Age', IntegerType()),
            ('Gender', StringType()),
            ('Grade_Level', IntegerType()),
            ('Course_ID', StringType()),
            ('Course_Name', StringType()),
            ('Enrollment_Date', StringType()),
            ('Completion_Date', StringType()),
            ('Status', StringType()),
            ('Final_Grade', StringType()),
            ('Attendance_Rate', IntegerType()),
            ('Time_Spent_on_Course_hrs', IntegerType()),
            ('Assignments_Completed', IntegerType()),
            ('Quizzes_Completed', IntegerType()),
            ('Forum_Posts', IntegerType()),
            ('Messages_Sent', IntegerType()),
            ('Quiz_Average_Score', IntegerType()),
            ('Assignment_Scores', StringType()),
            ('Assignment_Average_Score', DoubleType()),
            ('Project_Score', IntegerType()),
            ('Extra_Credit', IntegerType()),
            ('Overall_Performance', IntegerType()),
            ('Feedback_Score', DoubleType()),
            ('Parent_Involvement', StringType()),
            ('Demographic_Group', StringType()),
            ('Internet_Access', StringType()),
            ('Learning_Disabilities', StringType()),
            ('Preferred_Learning_Style', StringType()),
            ('Language_Proficiency', StringType()),
            ('Participation_Rate', StringType()),
            ('Completion_Time_Days', IntegerType()),
            ('Performance_Score', DoubleType()),
            ('Course_Completion_Rate', DoubleType()),
        ]
        return StructType([StructField(name, dtype, True) for name, dtype in columns])

    def read_landing(self):
        """Read the landing CSV with the explicit schema and add audit columns.

        Two columns are appended to the file's own columns:
        ``Load_Time`` (``F.current_timestamp()``) and ``File_Name`` (the
        literal ``file_path`` this instance was created with).

        Returns:
            DataFrame: The landing data plus the two audit columns.
        """
        reader = (
            spark.read.format("csv")
            .option("header", True)
            .schema(self.get_schema())
        )
        df = (
            reader.load(self.file_path)
            .withColumn("Load_Time", F.current_timestamp())
            .withColumn("File_Name", F.lit(self.file_path))
        )
        # Spark builds DataFrames lazily, so this message marks the read
        # being defined; rows are scanned when an action (e.g. the write) runs.
        bronze_logger.info("bronze data read completed...")
        return df

    def write_df(self, df, mode="overwrite"):
        """Save ``df`` as the Delta table ``catalog.schema.table``.

        Args:
            df: DataFrame to persist.
            mode: Save mode handed to ``DataFrameWriter.mode``. Defaults to
                ``"overwrite"``, which replaces the table contents on every
                call; pass ``"append"`` to accumulate successive landing
                files instead.
        """
        target_table = f"{self.catalog_name}.{self.schema_name}.{self.table_name}"
        df.write.format("delta").mode(mode).saveAsTable(target_table)
        bronze_logger.info("bronze data write completed...")

# Run the bronze load for the configured landing file: build the loader,
# read the CSV (with audit columns), then persist it to the bronze table.
bronze = Bronze(
    file_path=file_path,
    catalog_name=catalog_name,
    schema_name=bronze_schema_name,
    table_name=bronze_table_name,
)
df = bronze.read_landing()
bronze.write_df(df)
        