In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload. Learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
import sys
import os

# Append the absolute path of ./odibi_de_v2 to the module search path.
# Guarded so that re-running this cell does not add the same entry repeatedly.
if os.path.abspath('./odibi_de_v2') not in sys.path:
    sys.path.append(os.path.abspath('./odibi_de_v2'))

# PYTHONDONTWRITEBYTECODE is only consulted when an interpreter starts, so
# setting it here affects child processes only. The running kernel reads
# sys.dont_write_bytecode, which therefore has to be set explicitly as well.
os.environ["PYTHONDONTWRITEBYTECODE"] = "1"
sys.dont_write_bytecode = True

from odibi_de_v2.connector import AzureBlobConnection, LocalConnection
from odibi_de_v2.ingestion import SparkStreamingDataReader
from odibi_de_v2.storage import SparkStreamingDataSaver
from odibi_de_v2.core import DataType
from odibi_de_v2.core import Framework
from pyspark.sql import SparkSession


In [0]:
from typing import Optional, Union
import pandas as pd
from pyspark.sql import SparkSession, DataFrame as SparkDataFrame

from odibi_de_v2.core.enums import DataType, Framework
from odibi_de_v2.core import BaseConnection
from odibi_de_v2.connector import LocalConnection
from odibi_de_v2.ingestion.spark.spark_data_reader import SparkDataReader
from odibi_de_v2.ingestion.spark.spark_streaming_data_reader import SparkStreamingDataReader
from odibi_de_v2.ingestion.pandas.pandas_data_reader import PandasDataReader


class ReaderProvider:
    """
    Unified provider for reading data using either Pandas, Spark, or Spark Streaming.

    Uses the framework and optional is_stream flag to determine behavior.
    """

    def __init__(
        self,
        connector: Optional[BaseConnection] = None,
        local_engine: Framework = Framework.PANDAS):
        self.connector = connector or LocalConnection()
        self.local_engine = local_engine

    def read(
        self,
        data_type: DataType,
        container: str,
        path_prefix: str,
        object_name: str,
        spark: Optional[SparkSession] = None,
        is_stream: Optional[bool] = False,
        **kwargs
    ) -> Union[pd.DataFrame, SparkDataFrame]:
        """
        Reads data from local or cloud sources using the appropriate engine.

        Args:
            data_type (DataType): File format (CSV, JSON, etc.).
            container (str): Top-level bucket/container.
            path_prefix (str): Folder inside the container.
            object_name (str): File name.
            spark (SparkSession, optional): Required for Spark-based reads.
            is_stream (bool, optional): For Spark only; triggers streaming.
            **kwargs: Reader-specific options.

        Returns:
            pd.DataFrame or Spark DataFrame
        """
        if not self.connector:
            raise ValueError("Connector is required to resolve paths.")

        file_path = self.connector.get_file_path(
            container=container,
            path_prefix=path_prefix,
            object_name=object_name
        )

        options = self.connector.get_storage_options()

        framework = self.connector.framework
        if framework == Framework.LOCAL:
            framework = self.local_engine

        match framework:
            case Framework.PANDAS:
                reader = PandasDataReader()
                return reader.read_data(
                    data_type=data_type,
                    file_path=file_path,
                    storage_options=options,
                    **kwargs
                )

            case Framework.SPARK:
                if spark is None:
                    raise ValueError("Spark session is required for Spark reads.")

                if options:
                    for key, value in options.items():
                        spark.conf.set(key, value)

                reader = SparkStreamingDataReader() if is_stream else SparkDataReader()
                return reader.read_data(
                    data_type=data_type,
                    file_path=file_path,
                    spark=spark,
                    **kwargs
                )

            case _:
                raise NotImplementedError(f"Framework {self.connector.framework} is not supported.")


In [0]:
# Azure Blob connector whose credentials come from Databricks secrets.
# NOTE(review): this connector is not handed to the ReaderProvider below, and it
# is built with Framework.PANDAS despite its name. Confirm both are intended.
azure_spark_connector = AzureBlobConnection(
    dbutils.secrets.get(scope="scope", key="account_name"),
    dbutils.secrets.get(scope="scope", key="account_key"),
    Framework.PANDAS,
)

# No connector is passed, so the provider falls back to its default connection;
# local_engine presumably routes local reads through Spark. Verify against
# LocalConnection's framework.
provider = ReaderProvider(local_engine=Framework.SPARK)

# Batch (non-streaming) read of a JSON file.
df = provider.read(
    data_type=DataType.JSON,
    container="",
    path_prefix="dbfs:/FileStore",
    object_name="Calendar.json",
    spark=spark,
    is_stream=False,
)
