In [None]:
import pandas as pd
from azure.storage.blob import BlobServiceClient, BlobClient
import os
from io import StringIO , BytesIO

# Connection string
# The connection string embeds the storage account key, which grants full
# access to the account, so it is read from the environment instead of being
# written into the notebook. Any key that has ever been saved in a notebook or
# committed to version control should be rotated.
CONNECTION_STRING_ENV_VAR = "AZURE_STORAGE_CONNECTION_STRING"
connection_string = os.environ.get(CONNECTION_STRING_ENV_VAR)
if not connection_string:
    raise RuntimeError(
        f"Set the {CONNECTION_STRING_ENV_VAR} environment variable to the "
        "Azure Storage connection string before running this notebook."
    )

# Source and destination containers: CSVs are downloaded from the first and
# the processed tables are uploaded to the second.
source_container = "raw-data"
dest_container = "clean-data"
# Initialize the BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
# Get container clients
source_client = blob_service_client.get_container_client(source_container)
dest_client = blob_service_client.get_container_client(dest_container)

In [None]:
# Map each short dataset key to the name of the CSV blob in the source container.
datasets = {
    "customers": "olist_customers_dataset.csv",
    "orders": "olist_orders_dataset.csv",
    "products": "olist_products_dataset.csv",
    "sellers": "olist_sellers_dataset.csv",
    "order_items": "olist_order_items_dataset.csv",
    "order_payments": "olist_order_payments_dataset.csv",
    "order_reviews": "olist_order_reviews_dataset.csv",
    "geolocation": "olist_geolocation_dataset.csv",
    "product_category_name_translation": "product_category_name_translation.csv"
}


def _read_blob_csv(container_client, blob_name):
    """Download ``blob_name`` fully into memory and parse it as a CSV DataFrame."""
    payload = container_client.get_blob_client(blob_name).download_blob().readall()
    return pd.read_csv(BytesIO(payload))


# Load every dataset into a DataFrame, keyed by its short dataset name.
dataframes = {}
for key, blob_name in datasets.items():
    dataframes[key] = _read_blob_csv(source_client, blob_name)

In [None]:
# Remove fully duplicated rows (all columns equal) from the listed tables,
# keeping the first occurrence of each.
tables_to_dedupe = ["geolocation"]
for key in tables_to_dedupe:
    dataframes[key] = dataframes[key].drop_duplicates()

In [None]:
# Keep only product rows where every descriptive attribute below is present.
# The column names are used exactly as written here ("lenght" spelling
# included) — presumably they mirror the CSV header; verify before renaming.
required_product_columns = [
    'product_category_name',
    'product_name_lenght',
    'product_description_lenght',
    'product_photos_qty',
]
df_products = dataframes["products"].dropna(subset=required_product_columns)
dataframes["products"] = df_products

In [None]:
# Handle missing values in the orders and order_reviews tables.

df_orders = dataframes["orders"]
df_order_reviews = dataframes["order_reviews"]

# Status-aware fills. Each entry is:
#   (column to fill, column to copy from, rows whose status allows the fill)
#   - approval time  <- purchase time,       unless the order was canceled
#   - carrier date   <- purchase time,       only for shipped/delivered orders
#   - customer date  <- estimated delivery,  only for delivered orders
order_status = df_orders['order_status']
order_date_fills = [
    ('order_approved_at', 'order_purchase_timestamp',
     order_status.ne('canceled')),
    ('order_delivered_carrier_date', 'order_purchase_timestamp',
     order_status.isin(['shipped', 'delivered'])),
    ('order_delivered_customer_date', 'order_estimated_delivery_date',
     order_status.eq('delivered')),
]
for target_col, source_col, status_ok in order_date_fills:
    rows_to_fill = df_orders[target_col].isna() & status_ok
    df_orders.loc[rows_to_fill, target_col] = df_orders.loc[rows_to_fill, source_col]

# Fallback: fill whatever is still missing in the same columns from the same
# source columns, regardless of order status.
# NOTE(review): this unconditional pass also fills the rows the status-aware
# rules above skipped (e.g. canceled or undelivered orders), so once it runs
# the status conditions no longer affect the result — confirm this is intended.
for target_col, source_col, _ in order_date_fills:
    df_orders[target_col] = df_orders[target_col].fillna(df_orders[source_col])

# Replace missing review titles/messages with placeholder text.
review_placeholders = {
    'review_comment_title': 'No Title',
    'review_comment_message': 'No Message',
}
df_order_reviews = df_order_reviews.fillna(review_placeholders)

dataframes["orders"] = df_orders
dataframes["order_reviews"] = df_order_reviews

In [None]:
# Serialize each table to UTF-8 CSV (without the index column) and upload it
# to the clean-data container as "<key>.csv", replacing any existing blob.
for key, df in dataframes.items():
    payload = df.to_csv(index=False).encode("utf-8")
    dest_client.get_blob_client(f"{key}.csv").upload_blob(payload, overwrite=True)