In [None]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
from snowflake.connector.pandas_tools import write_pandas
from snowflake.snowpark.session import Session

# CONFIGURATION

In [None]:
# Input file for the extraction step (path is relative to the notebook).
CSV_FILE = '../data/sales_2.csv'

# Snowflake connection settings. Each value is read from the environment
# variable of the same name so real credentials never need to be saved inside
# the notebook; the placeholder is only used when the variable is not set.
SNOWFLAKE_ACCOUNT = os.environ.get('SNOWFLAKE_ACCOUNT', '<snowflake_identifier>')
SNOWFLAKE_USER = os.environ.get('SNOWFLAKE_USER', '<login_user_name>')
SNOWFLAKE_PASSWORD = os.environ.get('SNOWFLAKE_PASSWORD', '<password>')
SNOWFLAKE_WAREHOUSE = os.environ.get('SNOWFLAKE_WAREHOUSE', '<warehouse_name>')
SNOWFLAKE_DATABASE = os.environ.get('SNOWFLAKE_DATABASE', '<database_name>')
SNOWFLAKE_SCHEMA = os.environ.get('SNOWFLAKE_SCHEMA', '<schema_name>')
SNOWFLAKE_ROLE = os.environ.get('SNOWFLAKE_ROLE', '<role>')

# Snowflake table that receives the transformed rows.
TARGET_TABLE = 'ETL_TRANSFORMED_SALES'


## Create Snowflake connection parameters

In [None]:

# Settings handed to Session.builder.configs() in the load step.
connection_parameters = {
    "account": SNOWFLAKE_ACCOUNT,
    "user": SNOWFLAKE_USER,
    "password": SNOWFLAKE_PASSWORD,
    # Pass the configured role explicitly; without it the session falls back
    # to the user's default role and SNOWFLAKE_ROLE has no effect.
    "role": SNOWFLAKE_ROLE,
    "warehouse": SNOWFLAKE_WAREHOUSE,
    "database": SNOWFLAKE_DATABASE,
    "schema": SNOWFLAKE_SCHEMA
}

# Extraction (E) Step

In [None]:
# Start from None so the guard at the bottom of this cell can detect a failed
# read. Without this, a failure on a fresh kernel raises NameError at the
# guard, and a failure on a re-run silently keeps the previous run's frame.
df = None

try:
    df = pd.read_csv(CSV_FILE)
    print(f"Data extracted successfully from '{CSV_FILE}'.")
    print(f"Initial DataFrame shape: {df.shape}")
    display(df.head())

except FileNotFoundError:
    print(f"Error: The file '{CSV_FILE}' was not found.")
    print("Please check and ensure that the file exists in the correct folder")

except Exception as e:
    print(f"Error during CSV read: {e}")

# Stop the ETL run here if nothing was loaded.
if df is None:
    raise SystemExit("Exiting ETL process due to extraction failure.")

# TRANSFORMATION (T)

In [None]:
# T1: Clean column names (convert to lowercase snake_case)
# Examples: "OrderDate", "Order Date" and "order_date" all become "order_date";
# "CustomerID" becomes "customer_id".
df.columns = (
    df.columns
    .str.strip()
    # Insert an underscore only at a lower/digit -> upper boundary, so that
    # acronyms such as "ID" are not split into "i_d".
    .str.replace(r'(?<=[a-z0-9])(?=[A-Z])', '_', regex=True)
    # Collapse any run of whitespace/underscores into a single underscore, so
    # "Order Date" does not turn into "order__date".
    .str.replace(r'[\s_]+', '_', regex=True)
    .str.lower()
    .str.strip('_')
)
print("Column names cleaned.")

In [None]:
# Parse the order_date column into pandas datetime values.
df['order_date'] = df['order_date'].pipe(pd.to_datetime)

In [None]:
# Coerce sales_amount to a numeric dtype (raises if a value cannot be parsed).
df['sales_amount'] = df['sales_amount'].pipe(pd.to_numeric)

In [None]:
# Derived unit price column, computed as twice the sales amount.
# NOTE(review): doubling the sales amount looks unusual for a "unit price"
# (amount / quantity would be the usual definition) -- confirm the formula.
df['unit_price'] = df['sales_amount'].mul(2)

In [None]:
# Bucket every row into a sales tier based on its sales amount.
# Rules are evaluated in order and the first match wins; rows matching no rule
# (amount below 100, or missing) get the default tier.
amount = df['sales_amount']
tier_rules = {
    'High Value': amount >= 500,
    'Medium Value': amount >= 100,
}
df['sales_tier'] = np.select(
    list(tier_rules.values()),
    list(tier_rules.keys()),
    default='Low Value',
)
print("Categorical transformation 'sales_tier' created.")

In [None]:
# T5: Filtering (Basic Data Quality Check)
# Keep only rows with a strictly positive sales amount (rows with a missing
# amount fail the comparison and are dropped as well).
initial_rows = len(df)
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise SettingWithCopyWarning.
df = df[df['sales_amount'] > 0].copy()
rows_removed = initial_rows - len(df)
print(f"Data filtered: Removed {rows_removed} rows with non-positive sales amounts.")

In [None]:
# Preview the transformed rows (first few only, rather than the whole frame).
df.head()

In [None]:
# Add load timestamp -> must be pandas datetime64
# datetime.utcnow() is deprecated since Python 3.12. Take the current UTC time
# and drop the timezone so the column stays timezone-naive UTC, as before.
df['load_timestamp'] = pd.Timestamp.now(tz='UTC').tz_localize(None)

In [None]:
# IMPORTANT: Snowflake convention is uppercase identifiers, so upper-case every
# column name before loading.
df.columns = df.columns.map(str.upper)

# LOAD (L)

In [None]:
# Defined up front so the `finally` clause is safe even when session creation
# itself fails; otherwise it would raise NameError and hide the real error.
session = None

try:
    session = Session.builder.configs(connection_parameters).create()
    print("Snowflake Session created successfully.")

    # Convert pandas DataFrame -> Snowpark DataFrame
    snowpark_df = session.create_dataframe(df)

    # Append the rows to the target table.
    # NOTE: because the mode is "append", re-running this cell loads the same
    # rows again.
    snowpark_df.write.mode("append").save_as_table(TARGET_TABLE)

    print(f"Data successfully loaded to Snowflake table: {TARGET_TABLE}")

except Exception as e:
    # Failures are reported here rather than re-raised.
    print(f"An error occurred during Snowflake load: {e}")

finally:
    # Always release the connection if one was opened.
    if session is not None:
        session.close()
        print("Snowflake Session closed.")