In [None]:
from pathlib import Path
import findspark
from pyspark.sql import SparkSession
import logging
from setLogger import setLogger
from resolvePath import resolvePath
from transform import transform

In [None]:
# Build (or reuse, via getOrCreate) the Spark session shared by every cell below.
# It runs against a "local" master and sets spark.sql.repl.eagerEval.enabled to True.
spark = (
    SparkSession.builder
    .master("local")
    .appName("cg-pyspark-assignment-nb")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)

In [None]:
# Wrap this module's logger in the project's setLogger helper (tagged 'main').
logger_obj = setLogger(logging.getLogger(__name__), 'main')

# set_handler() hands back the logger that the rest of the notebook writes to.
logger = logger_obj.set_handler()
logger.info('Logger succesfully created')

In [None]:
# Resolve the data directory, the per-brand source paths and the log path.
path_obj = resolvePath()
(
    data_dir,
    clp_path,
    cogo_path,
    dats_path,
    okay_path,
    spar_path,
    log_path,
) = path_obj.get_path()


def _load_brand(brand, source_path):
    """Create a transform object for ``source_path`` and fetch ``brand`` from it.

    Returns a ``(transform_object, data)`` pair, where ``data`` is whatever
    ``get_data_by_brand(brand)`` returns, so later cells can keep calling
    methods on the same transform object.
    """
    brand_obj = transform(source_path, spark)
    return brand_obj, brand_obj.get_data_by_brand(brand)


# One transform object and one loaded frame per brand, in the same order as before.
clp_obj, clp_df = _load_brand('CLP', clp_path)
cogo_obj, cogo_df = _load_brand('COGO', cogo_path)
dats_obj, dats_df = _load_brand('DATS', dats_path)
okay_obj, okay_df = _load_brand('OKAY', okay_path)
spar_obj, spar_df = _load_brand('SPAR', spar_path)

In [None]:
# Run transform_tempClosure_attr on the CLP, COGO, OKAY and DATS frames,
# logging the row count of each result as soon as it is produced.

clp_df2 = clp_obj.transform_tempClosure_attr(clp_df)
df2_count = clp_df2.count()
logger.info(f'CLP DF2 count > {df2_count}')

cogo_df2 = cogo_obj.transform_tempClosure_attr(cogo_df)
df2_count = cogo_df2.count()
logger.info(f'COGO DF2 count > {df2_count}')

okay_df2 = okay_obj.transform_tempClosure_attr(okay_df)
df2_count = okay_df2.count()
logger.info(f'OKAY DF2 count > {df2_count}')

dats_df2 = dats_obj.transform_tempClosure_attr(dats_df)
df2_count = dats_df2.count()
logger.info(f'DATS DF2 count > {df2_count}')


In [None]:
# Explode the 'handoverServices' array attribute for CLP, COGO and OKAY (in that order).
clp_df3, cogo_df3, okay_df3 = (
    brand_obj.explode_array_attribute(brand_df, 'handoverServices')
    for brand_obj, brand_df in (
        (clp_obj, clp_df2),
        (cogo_obj, cogo_df2),
        (okay_obj, okay_df2),
    )
)

In [None]:
# Explode the 'sellingPartners' array attribute for CLP, COGO and OKAY (in that order).
clp_df4, cogo_df4, okay_df4 = (
    brand_obj.explode_array_attribute(brand_df, 'sellingPartners')
    for brand_obj, brand_df in (
        (clp_obj, clp_df3),
        (cogo_obj, cogo_df3),
        (okay_obj, okay_df3),
    )
)

In [None]:
# Combine the CLP, COGO and OKAY frames through transform.union_brands
# (the DATS and SPAR frames are not part of this union), then log the row count.
clp_cogo_okay = transform.union_brands(
    clp_df4,
    cogo_df4,
    okay_df4,
)
union_count = clp_cogo_okay.count()
logger.info(f'clp_cogo_okay count > {union_count}')

In [None]:
# DataFrame.printSchema() writes the schema tree to stdout and returns None, so
# passing its result to logger.info() only ever logged the text "None".
# Print the tree for the cell output and log the schema itself instead.
okay_df4.printSchema()
logger.info(f'okay_df4 schema > {okay_df4.schema.simpleString()}')

In [None]:
# placeSearchOpeningHours attribute transformation:
# explode the 'placeSearchOpeningHours' array attribute for CLP, COGO and OKAY.
clp_df5 = clp_obj.explode_array_attribute(clp_df4, 'placeSearchOpeningHours')
cogo_df5 = cogo_obj.explode_array_attribute(cogo_df4, 'placeSearchOpeningHours')
okay_df5 = okay_obj.explode_array_attribute(okay_df4, 'placeSearchOpeningHours')

# DataFrame.printSchema() prints to stdout and returns None, so logging its
# return value recorded "None". Print the tree, then log the schema itself.
okay_df5.printSchema()
logger.info(f'okay_df5 schema > {okay_df5.schema.simpleString()}')

In [None]:
# Field names to pull out of the 'placeSearchOpeningHours' struct.
struct_placeSearchOpeningHours_attr_list = [
    'date',
    'opens',
    'closes',
    'isToday',
    'isOpenForTheDay',
]

# Left as the final expression so the notebook displays whatever the call returns.
okay_obj.extract_struct_attributes(
    okay_df5,
    'placeSearchOpeningHours',
    struct_placeSearchOpeningHours_attr_list,
)