In [1]:
import os
import sys
import json
from input.Reader import Reader
import logging

# Point both PySpark interpreter variables at the Python executable that is
# running this notebook, so Spark does not pick up a different interpreter.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

In [2]:
# Path of the JSON metadata file; it is parsed with json.load in a later cell
# and its 'sources', 'transformations' and 'sinks' entries drive the pipeline.
metadata_file = "./metadata/conf.json"
# Path of the text file that the logging FileHandler writes to.
logs_file = "./Logs/application_logs.txt"

In [3]:
# Configure the logging system: INFO and above go both to `logs_file` and to
# the console, and every record is tagged with the metadata file in use.

# logging.FileHandler opens its file as soon as it is constructed and raises
# FileNotFoundError when the parent directory is missing, so create it first.
log_dir = os.path.dirname(logs_file)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

# The metadata path is embedded literally in the %-style format string.
# Escape any '%' in it so it is not interpreted as a format directive.
log_format = (
    "%(asctime)s - "
    + str(metadata_file).replace("%", "%%")
    + " - %(name)s - %(levelname)s - %(message)s"
)

logging.basicConfig(
    level=logging.INFO,  # Set the logging level
    format=log_format,  # Log message format
    handlers=[
        logging.FileHandler(logs_file),  # Dump logs to a file
        logging.StreamHandler(),  # Also print logs to the console
    ],
)

In [4]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: master "local", application
# name "Dataflow". getOrCreate() hands back an existing session if there is one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Dataflow")
    .getOrCreate()
)


In [5]:
# Load the pipeline metadata (its 'sources', 'transformations' and 'sinks'
# entries are consumed by later cells) from the JSON configuration file.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale default, which can mis-read non-ASCII values.
with open(metadata_file, "r", encoding="utf-8") as metadata_fh:
    metadata = json.load(metadata_fh)

In [6]:
# Build a Reader around the Spark session and load the inputs described under
# metadata['sources']. The result is used below as a mapping from input name
# (e.g. 'person_inputs') to the loaded DataFrame.
reader = Reader(spark)
sources = reader.load_df(metadata['sources'])

2024-12-18 13:42:33,547 - ./metadata/conf.json - Reader - INFO - Schema conversion successful: StructType([StructField('name', StringType(), nullable=True),StructField('age', IntegerType(), nullable=True),StructField('office', StringType(), nullable=True)])
2024-12-18 13:42:33,549 - ./metadata/conf.json - Reader - INFO - Options rertrieved: {'dropFieldIfAllNull': False}
2024-12-18 13:42:35,767 - ./metadata/conf.json - Reader - INFO - Data sources loaded successfully.


In [7]:
# Preview the 'person_inputs' source as loaded, before the transformation step runs.
sources['person_inputs'].show()

+--------+----+---------+
|    name| age|   office|
+--------+----+---------+
|  xavier|  32|barcelona|
|  miguel|  12|santander|
|  manuel|  56|   murcia|
|  miguel|  56|         |
|ricardio|NULL|   murcia|
|    juan|  45|   getafe|
| ricardo|  37| valencia|
|    fran|  29| alicante|
+--------+----+---------+



In [8]:
from transformations.TransformationManager import TransformationManager
# Run the configured transformations over every loaded input and store the
# result back under the same key, so `sources` now holds the transformed frames.
# NOTE: this replaces the entries in place, so re-running the cell applies the
# transformations to already-transformed data.
transformator = TransformationManager()
for input_name in list(sources):
    sources[input_name] = transformator.apply_transformations(
        df=sources[input_name],
        transformations=metadata["transformations"],
        input_name=input_name,
    )

2024-12-18 13:42:40,680 - ./metadata/conf.json - TransformationManager - INFO - Applying transformation: validate_fields with params: {'validations': [{'field': 'office', 'validations': ['notEmpty']}, {'field': 'age', 'validations': ['notNull']}]}
2024-12-18 13:42:40,917 - ./metadata/conf.json - TransformationManager - INFO - Applying transformation: add_fields with params: {'addFields': [{'name': 'dt', 'function': 'current_timestamp'}]}
2024-12-18 13:42:40,918 - ./metadata/conf.json - AddFields - INFO - Adding current timestamp column 'dt'.
2024-12-18 13:42:40,932 - ./metadata/conf.json - AddFields - INFO - Added field 'dt' with function 'current_timestamp'.
2024-12-18 13:42:40,935 - ./metadata/conf.json - TransformationManager - INFO - All transformations were applied successfully.


In [9]:
# Preview 'person_inputs' again now that the transformation step has replaced
# the entries in `sources`.
sources['person_inputs'].show()

+--------+----+---------+--------------------+--------------------+
|    name| age|   office|   validation_errors|                  dt|
+--------+----+---------+--------------------+--------------------+
|  xavier|  32|barcelona|                  []|2024-12-18 13:42:...|
|  miguel|  12|santander|                  []|2024-12-18 13:42:...|
|  manuel|  56|   murcia|                  []|2024-12-18 13:42:...|
|  miguel|  56|         |[notEmpty: office...|2024-12-18 13:42:...|
|ricardio|NULL|   murcia|[notNull: age mus...|2024-12-18 13:42:...|
|    juan|  45|   getafe|                  []|2024-12-18 13:42:...|
| ricardo|  37| valencia|                  []|2024-12-18 13:42:...|
|    fran|  29| alicante|                  []|2024-12-18 13:42:...|
+--------+----+---------+--------------------+--------------------+



In [10]:
from output.Writer import Writer
# Build a Writer around the Spark session and hand it the `sources` mapping
# together with the sink definitions from metadata['sinks'].
writer = Writer(spark_session=spark)
writer.write_dataframes(sources, metadata['sinks'])

2024-12-18 13:42:41,690 - ./metadata/conf.json - Writer - INFO - Processing sink 'ok' for input 'person_inputs'.
2024-12-18 13:42:41,692 - ./metadata/conf.json - Writer - INFO - Writing DataFrame to path: ./data/output/ok/person with format: csv and save mode: OVERWRITE


2024-12-18 13:42:42,930 - ./metadata/conf.json - Writer - INFO - Successfully wrote DataFrame to path: ./data/output/ok/person
2024-12-18 13:42:42,932 - ./metadata/conf.json - Writer - INFO - Successfully processed sink 'ok' for input 'person_inputs'.
2024-12-18 13:42:42,978 - ./metadata/conf.json - Writer - INFO - Processing sink 'ko' for input 'person_inputs'.
2024-12-18 13:42:42,979 - ./metadata/conf.json - Writer - INFO - Writing DataFrame to path: ./data/output/ko/person with format: json and save mode: OVERWRITE
2024-12-18 13:42:43,471 - ./metadata/conf.json - Writer - INFO - Successfully wrote DataFrame to path: ./data/output/ko/person
2024-12-18 13:42:43,474 - ./metadata/conf.json - Writer - INFO - Successfully processed sink 'ko' for input 'person_inputs'.
