In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
import sys
import os
# Make the local odibi_de_v2 directory importable. Guarded so that re-running
# this cell does not keep appending duplicate entries to sys.path.
_odibi_path = os.path.abspath('./odibi_de_v2')
if _odibi_path not in sys.path:
    sys.path.append(_odibi_path)

# PYTHONDONTWRITEBYTECODE is only read at interpreter startup, so setting it
# here affects child processes only; sys.dont_write_bytecode is the switch
# that stops this already-running kernel from writing .pyc files.
os.environ["PYTHONDONTWRITEBYTECODE"] = "1"
sys.dont_write_bytecode = True

In [0]:
from odibi_de_v2.transformer import PandasColumnAdder

In [0]:
from typing import Union, Dict, Any, List
import pandas as pd
from pyspark.sql import DataFrame as SparkDataFrame

from odibi_de_v2.core import Framework, ErrorType
from odibi_de_v2.utils import enforce_types, log_call
from odibi_de_v2.logger import log_exceptions
from odibi_de_v2.transformer import TransformerProvider


class TransformerFromConfig:
    """
    Applies one or more transformations to a Pandas or Spark DataFrame using a config-driven approach.

    Each config step names a transformer and its keyword arguments. Steps are
    dispatched through ``TransformerProvider.transform`` for the framework
    chosen at construction time and are applied in order, each step receiving
    the output of the previous one.

    Args:
        framework (Framework): Enum indicating whether to use Pandas or Spark transformers.

    Example Config Format:
        [
            {
                "transformer": "PandasColumnRenamer",
                "params": {"column_map": {"old": "new"}}
            },
            {
                "transformer": "PandasColumnDropper",
                "params": {"columns_to_drop": ["unnecessary_col"]}
            }
        ]
    """

    @log_call(module="TRANSFORMATION", component="TransformerFromConfig")
    @enforce_types(strict=True)
    @log_exceptions(
        module="TRANSFORMATION",
        component="TransformerFromConfig",
        error_type=ErrorType.INIT_ERROR,
        raise_type=RuntimeError)
    def __init__(self, framework: Framework):
        """
        Stores the framework and builds the provider used to run each step.

        Args:
            framework (Framework): Framework whose transformers should be used.
        """
        self.framework = framework
        self.provider = TransformerProvider(framework=framework)

    @log_call(module="TRANSFORMATION", component="TransformerFromConfig")
    @enforce_types(strict=True)
    @log_exceptions(
        module="TRANSFORMATION",
        component="TransformerFromConfig",
        error_type=ErrorType.TRANSFORM_ERROR,
        raise_type=RuntimeError)
    def transform(
        self,
        data: Union[pd.DataFrame, SparkDataFrame],
        config: Union[dict, list]
    ) -> Union[pd.DataFrame, SparkDataFrame]:
        """
        Applies one or multiple transformations using config-driven logic.

        Args:
            data (DataFrame): The input DataFrame.
            config (Union[Dict, List[Dict]]): One or more transformer configs.
                A single dict is treated as a one-step pipeline.
                Each dict must include:
                    - "transformer": Name of the transformer class
                Each dict may include:
                    - "params": Dict of keyword arguments for the transformer
                      (missing or None means no keyword arguments)

        Returns:
            DataFrame: The transformed DataFrame (the input itself when
            ``config`` is an empty list).

        Raises:
            RuntimeError: Presumably raised by the ``log_exceptions`` wrapper
                (``raise_type=RuntimeError``) when a step fails, e.g. a step
                without a "transformer" key -- confirm against the decorator.
        """
        # The annotation is Union[dict, list] (plain classes, like the original
        # `list`) so that the strict type check lets the documented single-dict
        # shorthand reach this normalisation instead of rejecting it up front.
        steps = [config] if isinstance(config, dict) else config

        for step in steps:
            transformer_name = step["transformer"]
            # `or {}` also covers an explicit "params": None, which would
            # otherwise fail when unpacked with ** below.
            transformer_params = step.get("params") or {}
            data = self.provider.transform(transformer_name, data, **transformer_params)

        return data


In [0]:
from odibi_de_v2.core import Framework
# from odibi_de_v2.transformer import TransformerFromConfig
from pyspark.sql import SparkSession
import pandas as pd

# ---------- Setup ----------
# Two small frames holding the same rows, one per framework, so both
# pipelines below run against equivalent input.
spark = SparkSession.builder.appName("TransformerFromConfigTest").getOrCreate()
spark_df = spark.createDataFrame([(1, "A", 3.0), (2, "B", 6.0)], ["col1", "col2", "col3"])
pandas_df = pd.DataFrame({"col1": [1, 2], "col2": ["A", "B"], "col3": [3.0, 6.0]})

# ---------- Shared Configs ----------
# Parameter values reused verbatim by both the Pandas and the Spark pipeline.
column_map = {"col1": "id", "col2": "name"}
columns_to_drop = ["col3"]
value_map = {"name": {"A": "X", "B": "Y"}}
column_order = ["name", "id"]
case_style = "snake_case"
exclude_columns = ["name"]


def build_step_configs(prefix: str) -> list:
    """
    Builds the six-step pipeline config for one framework.

    The Pandas and Spark pipelines differ only in the transformer-name prefix,
    so both are generated from this single definition to keep them in sync.

    Args:
        prefix (str): Transformer class-name prefix, "Pandas" or "Spark".

    Returns:
        list: Step dicts in the {"transformer": ..., "params": ...} format
        consumed by TransformerFromConfig.transform.
    """
    return [
        {"transformer": f"{prefix}ColumnRenamer", "params": {"column_map": column_map}},
        {"transformer": f"{prefix}ColumnDropper", "params": {"columns_to_drop": columns_to_drop}},
        {"transformer": f"{prefix}ValueReplacer", "params": {"value_map": value_map}},
        {"transformer": f"{prefix}ColumnReorderer", "params": {"column_order": column_order, "retain_unspecified": True}},
        {"transformer": f"{prefix}ColumnAdder", "params": {"column_name": "static_col", "value": "static"}},
        {"transformer": f"{prefix}ColumnNameStandardizer", "params": {"case_style": case_style, "exclude_columns": exclude_columns}},
    ]


# ---------- Pandas Config ----------
pandas_configs = build_step_configs("Pandas")

# ---------- Spark Config ----------
spark_configs = build_step_configs("Spark")

# ---------- Run Pandas ----------
print("PANDAS RESULTS")
pandas_runner = TransformerFromConfig(framework=Framework.PANDAS)
df_pandas_transformed = pandas_runner.transform(pandas_df, pandas_configs)
print(df_pandas_transformed)

# ---------- Run Spark ----------
print("SPARK RESULTS")
spark_runner = TransformerFromConfig(framework=Framework.SPARK)
df_spark_transformed = spark_runner.transform(spark_df, spark_configs)
df_spark_transformed.show()
