In [0]:
import os

from pyspark.sql.functions import col, to_timestamp, initcap, trim, year, month, dayofmonth, date_format, expr, round, when
from etl_lib.transformations import rename_column
from etl_lib.io import *
from etl_lib.steps import clean_df, add_features
from utils.env_utils import get_spark_session, get_logger



In [None]:
# Obtain the Spark session and a logger named after this module; both factories
# come from the project's utils.env_utils helpers.
spark = get_spark_session()
logger = get_logger(__name__)

In [0]:
# Service-principal credentials are read from the environment so that no secret
# is stored in the notebook source or in its saved outputs. The variable names
# follow the azure-identity convention. A missing variable raises KeyError
# right here instead of surfacing later as an opaque authentication failure.
client_id = os.environ["AZURE_CLIENT_ID"]
client_secret = os.environ["AZURE_CLIENT_SECRET"]
tenant_id = os.environ["AZURE_TENANT_ID"]
storage_account = "stgetlprj01"

# Set up OAuth connection to ADLS Gen2.
# Every ABFS setting is scoped to this storage account by suffixing the
# configuration key with the account's DFS endpoint host.
adls_host = f"{storage_account}.dfs.core.windows.net"
abfs_oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type":
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint":
        f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for conf_key, conf_value in abfs_oauth_settings.items():
    spark.conf.set(f"{conf_key}.{adls_host}", conf_value)

In [0]:
# The glob matches data.csv under every year=* directory of the bronze container.
bronze_path = f"abfss://bronze@{storage_account}.dfs.core.windows.net/retail/year=*/data.csv"

# The options dict is handed to read_from_adls as-is; presumably it is forwarded
# to Spark's CSV reader (header row + schema inference) — confirm in etl_lib.io.
df_all_years = read_from_adls(
    path=bronze_path,
    spark=spark,
    options={"header": "true", "inferSchema": "true"},
)

# Print the first five rows as a quick sanity check of the load.
df_all_years.show(5)

### 1. Create a working reference to the dataset

In [0]:
# Bind a second name to the same DataFrame object. This is a reference, not a
# copy: no data is duplicated. Assuming this is a Spark DataFrame (immutable),
# later transformations return new DataFrames and leave df_all_years unchanged.
df_bronze = df_all_years


### 2. Clean data and add features

In [None]:
# Always derive the processed frame from the untouched source frame rather than
# from a previous result of this cell. Re-running the cell therefore never
# applies clean_df / add_features on top of already-processed data.
rows_before = df_all_years.count()
logger.info(f"Number of rows before cleaning: {rows_before}")

df_bronze = add_features(clean_df(df_all_years))

# This count is taken on the final frame, i.e. after both clean_df and
# add_features have been applied.
rows_after = df_bronze.count()
logger.info(f"Number of rows after cleaning: {rows_after}")

In [0]:
# Render the cleaned, feature-enriched DataFrame for visual inspection.
# NOTE(review): `display` is not imported in this notebook; presumably it is
# supplied by the notebook runtime (e.g. Databricks) — confirm.
display(df_bronze)

### 3. Save in Parquet format

In [0]:
# Destination folder for the processed dataset in the silver container.
silver_path = f"abfss://silver@{storage_account}.dfs.core.windows.net/retail/"

# NOTE(review): no output format is passed here; presumably write_to_adls
# defaults to Parquet, and mode="overwrite" replaces existing data — confirm.
write_to_adls(
    df=df_bronze,
    path=silver_path,
    mode="overwrite",
)