# Transform data by using Spark

This Jupyter Notebook demonstrates how to use Apache Spark to transform and process data. It includes steps to set up the Spark session, create necessary directories, populate a list of files to be processed, convert these files to Parquet format, and read the transformed data for further analysis.

In [2]:
import os
from pyspark.sql.functions import lit
from lib.common_functions import *
from lib.configuration import *
from files.output.ad_works.silver.schemas.dimdate import *

In [3]:
# Obtain the Spark session for the 'ad_works' job. get_spark_session is not
# defined in this notebook; presumably it comes from one of the star imports
# above (lib.common_functions) -- TODO confirm.
spark = get_spark_session('ad_works')
# Last expression of the cell, so whatever active() returns is rendered as the
# cell output.
spark.active()

In [4]:

# Ensure the data lake root directory exists under the ad_works input path,
# and report whether it had to be created or was already there.
data_lake_path = f'{ad_works_input_path}/data_lake'

if os.path.exists(data_lake_path):
    print(f'{data_lake_path} already exists')
else:
    os.makedirs(data_lake_path)
    print(f'{data_lake_path} created')

/home/jovyan/code/files/input/ad_works/data_lake already exists


In [5]:

def populate_file_dict_list(directory):
    """List the files in ``directory`` together with the delimiter used to parse them.

    Sub-directories are ignored. Files whose name ends with ``.txt`` are
    treated as tab-delimited; every other file is treated as comma-delimited.

    Args:
        directory: Path of the directory to scan (non-recursive).

    Returns:
        A list of dicts, one per file, sorted by file name. Each dict has the
        keys ``'file_to_load'`` (the bare file name) and ``'file_delimiter'``
        (``'\\t'`` or ``','``). The list is empty when the directory holds no
        files.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot be
            read (for example, it does not exist).
    """
    file_dict_list = []

    # os.listdir returns entries in arbitrary order; sort so that the files are
    # processed in the same order on every run and on every machine.
    for file_name in sorted(os.listdir(directory)):
        # Only plain entries are loaded; skip sub-directories before doing any work.
        if os.path.isdir(os.path.join(directory, file_name)):
            continue

        file_delimiter = '\t' if file_name.endswith(".txt") else ','
        file_dict_list.append({'file_to_load': file_name, 'file_delimiter': file_delimiter})

    return file_dict_list

# Build the list of bronze files to process. `file_dict_list` is kept at module
# level because the conversion loop in a later cell iterates over it.
directory = f'{ad_works_input_path}/bronze'
file_dict_list = populate_file_dict_list(directory)
print(file_dict_list)

[{'file_to_load': 'DimCurrency_20250201.txt', 'file_delimiter': '\t'}, {'file_to_load': 'DimCustomer_20250201.txt', 'file_delimiter': '\t'}, {'file_to_load': 'DimDate_20250201.txt', 'file_delimiter': '\t'}, {'file_to_load': 'DimDate_20250202.txt', 'file_delimiter': '\t'}, {'file_to_load': 'DimDate_20250203.txt', 'file_delimiter': '\t'}]


In [6]:
def _parse_bronze_file_name(file_to_load):
    """Split ``<Table>_<YYYYMMDD>.<ext>`` into ``(stem, table, year, month, day)``."""
    # Everything before the first dot, e.g. 'DimDate_20250201'.
    stem = file_to_load.split('.')[0]
    tokens = stem.split('_')

    # The date is the first underscore-separated token (after the table name)
    # that starts with eight digits. Searching for it, rather than assuming it
    # is the second token, lets table names contain underscores themselves.
    for position, token in enumerate(tokens[1:], start=1):
        if len(token) >= 8 and token[:8].isdigit():
            table = '_'.join(tokens[:position]).lower()
            if table:
                return stem, table, token[:4], token[4:6], token[6:8]
            break

    raise ValueError(
        f"Cannot parse '{file_to_load}': expected a name like <Table>_<YYYYMMDD>.<ext>"
    )


def convert_to_parquet(file_info):
    """Convert one bronze delimited file into a date-partitioned Parquet directory.

    The file name must look like ``<Table>_<YYYYMMDD>.<ext>``. The data is
    written to
    ``{ad_works_input_path}/data_lake/<table>/year=YYYY/month=MM/day=DD`` where
    ``<table>`` is the lower-cased table name. If that directory already
    exists the file is skipped. Progress is reported with ``print``.

    Args:
        file_info: Dict with the keys ``'file_to_load'`` (file name inside the
            bronze directory) and ``'file_delimiter'`` (column separator).

    Raises:
        ValueError: If the file name does not contain a table name followed by
            an eight-digit date.
    """
    file_name_date, file_name, year, month, day = _parse_bronze_file_name(file_info['file_to_load'])

    # Built once and used both for the existence check and for the write.
    output_dir = f'{ad_works_input_path}/data_lake/{file_name}/year={year}/month={month}/day={day}'

    if os.path.exists(output_dir):
        print(f'{file_name_date} already exists in parquet')
        return

    file_path = f'{ad_works_input_path}/bronze/{file_info["file_to_load"]}'

    # The files are read without a header row, so Spark assigns positional
    # column names (_c0, _c1, ...).
    df = spark.read.format("csv").load(file_path, header=False, inferSchema=True, delimiter=file_info['file_delimiter'])

    # Stamp every row with the load date taken from the file name.
    load_date = f"{year}-{month}-{day}"
    df = df.withColumn('CreatedDate', lit(load_date)).withColumn('UpdatedDate', lit(load_date))

    # NOTE(review): the directory is created before the write, so a failed
    # write leaves a directory behind and the next run will skip this file.
    # Consider checking for a completed-write marker instead of the directory.
    os.makedirs(output_dir, exist_ok=True)

    df.write.parquet(output_dir, mode='overwrite')

    print(f'{file_name_date} moved to parquet')

# Convert every bronze file found earlier. convert_to_parquet skips a file when
# its partition directory already exists, so re-running this cell only reports
# "already exists" for files that were converted before.
for file_info in file_dict_list:
    convert_to_parquet(file_info)

DimCurrency_20250201 already exists in parquet
DimCustomer_20250201 already exists in parquet
DimDate_20250201 already exists in parquet
DimDate_20250202 already exists in parquet
DimDate_20250203 already exists in parquet


In [7]:
# Read back one day of the DimDate data written by convert_to_parquet. That
# function lower-cases the table name when building the directory, so the path
# must use 'dimdate'; 'DimDate' only resolves on a case-insensitive filesystem
# or when a directory from an older run happens to exist.
# The variable name is kept as-is, although the partition read here is the
# 2025-02-03 load.
DimDate_2020 = spark.read.parquet(f'{ad_works_input_path}/data_lake/dimdate/year=2025/month=02/day=03')
# Show a small sample instead of up to 10000 rows to keep the notebook output readable.
DimDate_2020.show(n=10)

+--------+----------+---+---------+---------+--------+---+---+---+------+------+----+----+----+----+----+----+----+----+-----------+-----------+
|     _c0|       _c1|_c2|      _c3|      _c4|     _c5|_c6|_c7|_c8|   _c9|  _c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|CreatedDate|UpdatedDate|
+--------+----------+---+---------+---------+--------+---+---+---+------+------+----+----+----+----+----+----+----+----+-----------+-----------+
|20230801|01/08/2023|  3|  Tuesday|   Martes|   Mardi|  1|213| 31|August|Agosto|Août|   8|   4|2023|   2|   1|2024|   1| 2025-02-03| 2025-02-03|
|20230802|02/08/2023|  4|Wednesday|Miércoles|Mercredi|  2|214| 31|August|Agosto|Août|   8|   4|2023|   2|   1|2024|   1| 2025-02-03| 2025-02-03|
|20230803|03/08/2023|  5| Thursday|   Jueves|   Jeudi|  3|215| 31|August|Agosto|Août|   8|   4|2023|   2|   1|2024|   1| 2025-02-03| 2025-02-03|
|20230804|04/08/2023|  6|   Friday|  Viernes|Vendredi|  4|216| 31|August|Agosto|Août|   8|   4|2023|   2|   1|2024|   1| 2025-02-0

In [8]:
# sql_transform = spark.sql("SELECT *, YEAR(OrderDate) AS Year, MONTH(OrderDate) AS Month FROM sales_orders")
# display(sql_transform.limit(5))
# sql_transform.write.partitionBy("Year","Month").saveAsTable('transformed_orders', format='parquet', mode='overwrite', path='/transformed_orders_table')