In [None]:
import pandas as pd
from azure.storage.blob import BlobServiceClient, ContainerClient
import os
from io import BytesIO

# Function for basic checks
def dataset_checks(df, name, id_columns=None, datetime_columns=None):
    """Print a quick data-quality report for one DataFrame.

    The report covers shape, column names, dtypes, per-column null counts,
    the number of fully duplicated rows, ``df.describe()`` output, and the
    ten most frequent values of every object-dtype column.

    Args:
        df: DataFrame to inspect.
        name: Label printed in the report header.
        id_columns: Optional list of column names; for each one the number
            of distinct values is printed next to the row count. Every name
            must exist in ``df``.
        datetime_columns: Optional list of column names to convert with
            ``pd.to_datetime``. A column that is missing or cannot be
            parsed is reported and skipped; it does not abort the report.

    Returns:
        None. Everything is written to stdout.

    Side effects:
        Each column in ``datetime_columns`` that parses successfully is
        replaced in ``df`` itself by its datetime version, so the caller's
        DataFrame is modified.
    """
    print(f"\n{'='*30}\nDataset: {name}\n{'='*30}")

    # Overview
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Data Types:\n{df.dtypes}\n")

    # Null / missing values per column
    print(f"Missing Values:\n{df.isnull().sum()}\n")

    # Rows that are exact duplicates of an earlier row
    print(f"Duplicated Rows: {df.duplicated().sum()} rreshta")

    # Distinct-value count for each identifier column
    if id_columns:
        for col in id_columns:
            unique_ids = df[col].nunique()
            print(f"Unique IDs in {col}: {unique_ids} / {len(df)} rows")

    # Summary statistics as produced by DataFrame.describe()
    print("\nStatistical Summary (Numerical Columns):")
    print(df.describe())

    # Most frequent values of each object-dtype column
    for col in df.select_dtypes(include='object').columns:
        print(f"\nUnique Values in {col} (Top 10):")
        print(df[col].value_counts().head(10))

    # Datetime consistency
    if datetime_columns:
        for col in datetime_columns:
            try:
                df[col] = pd.to_datetime(df[col])
                print(f"Parsed {col} successfully to datetime.")
            except Exception as e:
                # Best effort: a missing or unparseable column is reported,
                # and the remaining columns are still processed.
                print(f"Error parsing {col}: {e}")

        # With two or more datetime columns, show their first rows side by
        # side so their chronological order can be inspected by eye.
        if len(datetime_columns) >= 2:
            print("\nDatetime logical checks:")
            # Select only columns that exist: a missing name was already
            # reported above and would raise KeyError if selected here.
            present = [col for col in datetime_columns if col in df.columns]
            print(df[present].head())

In [None]:
# Connection string for the Azure storage account.
# It is read from the environment so the account key is never stored in the
# notebook. The value must be the full connection string of the storage
# account (this notebook was written against the `lhindstorage` account).
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not connection_string:
    raise RuntimeError(
        "Set the AZURE_STORAGE_CONNECTION_STRING environment variable to the "
        "storage account connection string before running this notebook."
    )

# Source container holding the raw files used for the initial checks
source_container = "raw-data"

# Service-level client for the storage account
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Container-scoped client used to fetch the individual blobs
container_client = blob_service_client.get_container_client(source_container)

In [None]:
# Customers: get a client handle for the CSV blob in the source container
customers = container_client.get_blob_client(
    'olist_customers_dataset.csv'
)
# Download the blob's bytes and parse them as CSV into a DataFrame
df_customers = pd.read_csv(
    BytesIO(customers.download_blob().readall())
)
# Print the checks report, treating customer_id as the identifier column
dataset_checks(
    df=df_customers,
    name="Customers",
    id_columns=['customer_id'],
)

In [None]:
# Orders: fetch the CSV blob and load it into a DataFrame
orders = container_client.get_blob_client(
    'olist_orders_dataset.csv'
)
df_orders = pd.read_csv(
    BytesIO(orders.download_blob().readall())
)
# Print the checks report; the five timestamp columns are converted to datetime
dataset_checks(
    df=df_orders,
    name="Orders",
    id_columns=['order_id'],
    datetime_columns=[
        'order_purchase_timestamp',
        'order_approved_at',
        'order_delivered_carrier_date',
        'order_delivered_customer_date',
        'order_estimated_delivery_date',
    ],
)

In [None]:
# Products: fetch the CSV blob and load it into a DataFrame
products = container_client.get_blob_client(
    'olist_products_dataset.csv'
)
df_products = pd.read_csv(
    BytesIO(products.download_blob().readall())
)
# Print the checks report, treating product_id as the identifier column
dataset_checks(
    df=df_products,
    name="Products",
    id_columns=['product_id'],
)

In [None]:
# Sellers: fetch the CSV blob and load it into a DataFrame
sellers = container_client.get_blob_client(
    'olist_sellers_dataset.csv'
)
df_sellers = pd.read_csv(
    BytesIO(sellers.download_blob().readall())
)
# Print the checks report, treating seller_id as the identifier column
dataset_checks(
    df=df_sellers,
    name="Sellers",
    id_columns=['seller_id'],
)

In [None]:
# Order items: fetch the CSV blob and load it into a DataFrame
order_items = container_client.get_blob_client(
    'olist_order_items_dataset.csv'
)
df_order_items = pd.read_csv(
    BytesIO(order_items.download_blob().readall())
)
# Print the checks report with distinct-value counts for all four key columns
dataset_checks(
    df=df_order_items,
    name="Order Items",
    id_columns=[
        'order_id',
        'order_item_id',
        'product_id',
        'seller_id',
    ],
)

In [None]:
# Order payments: fetch the CSV blob and load it into a DataFrame
order_payments = container_client.get_blob_client(
    'olist_order_payments_dataset.csv'
)
df_order_payments = pd.read_csv(
    BytesIO(order_payments.download_blob().readall())
)
# Print the checks report, treating order_id as the identifier column
dataset_checks(
    df=df_order_payments,
    name="Order Payments",
    id_columns=['order_id'],
)

In [None]:
# Order reviews: fetch the CSV blob and load it into a DataFrame
order_reviews = container_client.get_blob_client(
    'olist_order_reviews_dataset.csv'
)
df_order_reviews = pd.read_csv(
    BytesIO(order_reviews.download_blob().readall())
)
# Print the checks report; both review timestamp columns are converted to datetime
dataset_checks(
    df=df_order_reviews,
    name="Order Reviews",
    id_columns=['order_id', 'review_id'],
    datetime_columns=[
        'review_creation_date',
        'review_answer_timestamp',
    ],
)

In [None]:
# Geolocation: fetch the CSV blob and load it into a DataFrame
geolocation = container_client.get_blob_client(
    'olist_geolocation_dataset.csv'
)
df_geolocation = pd.read_csv(
    BytesIO(geolocation.download_blob().readall())
)
# Print the checks report; no identifier or datetime columns are passed
dataset_checks(
    df=df_geolocation,
    name="Geolocation",
)

In [None]:
# Category name translation: fetch the CSV blob and load it into a DataFrame
product_category_name_translation = container_client.get_blob_client(
    'product_category_name_translation.csv'
)
df_product_category_name_translation = pd.read_csv(
    BytesIO(product_category_name_translation.download_blob().readall())
)
# Print the checks report; no identifier or datetime columns are passed
dataset_checks(
    df=df_product_category_name_translation,
    name="Product category name translation",
)