In [1]:
# =============================================================================
# CELLA 1: IMPORTS
# =============================================================================
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import psycopg2
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# =============================================================================
# CELLA 2: TEST CONNESSIONE DATABASE
# =============================================================================
engine = create_engine('postgresql://postgres:postgres@localhost:5432/ecommerce')

try:
    with engine.connect() as conn:
        result = conn.execute(text("SELECT version()"))
        version = result.fetchone()[0]
        print("✅ Connessione PostgreSQL riuscita!")
        print(f"Versione: {version}")
except Exception as e:
    print("❌ Errore connessione:")
    print(str(e))
    print("\n🔧 Verifica:")
    print("1. PostgreSQL è avviato?")
    print("2. Password corretta?")
    print("3. Database 'ecommerce' esiste?")

✅ Connessione PostgreSQL riuscita!
Versione: PostgreSQL 17.5 on x86_64-windows, compiled by msvc-19.43.34808, 64-bit


In [3]:
# =============================================================================
# CELLA 3: TROVA FILE CSV
# =============================================================================
import os
print("🔍 Ricerca file CSV nella struttura 3-layer...")

# Determina la directory corrente e la struttura del progetto
current_dir = os.getcwd()
print(f"📂 Directory corrente: {current_dir}")

def find_csv_files(directory):
    """Trova tutti i file CSV in una directory e sottodirectory"""
    csv_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                full_path = os.path.join(root, file)
                csv_files.append(full_path)
    return csv_files

# Possibili percorsi per i dati sorgente nella nuova struttura
possible_paths = [
    '1_source_layer/datasets_olist',          # Se eseguito dalla root del progetto
    '../1_source_layer/datasets_olist',       # Se eseguito da una sottocartella
    '../../1_source_layer/datasets_olist',     # Se eseguito da una sottocartella più profonda
    'datasets_olist',                          # Struttura originale (fallback)
    '../datasets_olist',                       # Struttura originale da sottocartella
    '.'                                        # Directory corrente
]

# Trova il percorso corretto
csv_files = []
correct_path = None

for path in possible_paths:
    if os.path.exists(path):
        print(f"✅ Trovato percorso valido: {path}")
        csv_files = find_csv_files(path)
        if csv_files:
            correct_path = path
            break
        else:
            print(f"   ⚠️ Percorso esiste ma nessun CSV trovato")

if not correct_path:
    print("❌ Nessun percorso valido trovato. Cercando in tutta la struttura...")
    # Cerca dalla directory corrente
    csv_files = find_csv_files('.')

# Mostra i file trovati
print(f"\n📊 Trovati {len(csv_files)} file CSV totali:")
if len(csv_files) > 0:
    for file in csv_files[:10]:  # Mostra solo i primi 10
        print(f"   📄 {file}")
    if len(csv_files) > 10:
        print(f"   ... e altri {len(csv_files) - 10} file")

# Cerca specificamente i file Olist
olist_files = [f for f in csv_files if 'olist' in f.lower()]
print(f"\n🛒 File Olist trovati ({len(olist_files)}):")
for file in olist_files:
    print(f"   📄 {os.path.basename(file)}")

# Verifica struttura dataset Olist
expected_files = [
    'olist_orders_dataset.csv',
    'olist_order_items_dataset.csv',
    'olist_customers_dataset.csv',
    'olist_products_dataset.csv',
    'olist_sellers_dataset.csv',
    'olist_order_payments_dataset.csv',
    'olist_order_reviews_dataset.csv',
    'olist_geolocation_dataset.csv',
    'product_category_name_translation.csv'
]

print("\n📋 Verifica completezza dataset:")
found_names = [os.path.basename(f) for f in olist_files]
for expected in expected_files:
    if expected in found_names:
        print(f"   ✅ {expected}")
    else:
        print(f"   ❌ {expected} - MANCANTE!")

# Prova a caricare un file per test
if olist_files:
    test_file = olist_files[0]
    print(f"\n🧪 Test caricamento: {os.path.basename(test_file)}")
    try:
        df_test = pd.read_csv(test_file)
        print(f"✅ Caricamento riuscito! Righe: {len(df_test):,}, Colonne: {len(df_test.columns)}")
        print("Prime colonne:", list(df_test.columns[:5]))
        
        # Suggerisci il BASE_PATH corretto per le celle successive
        if correct_path:
            suggested_base = os.path.join(correct_path, '')  # Aggiunge / finale
            
    except Exception as e:
        print(f"❌ Errore caricamento: {e}")
else:
    print("❌ Nessun file Olist trovato!")

🔍 Ricerca file CSV nella struttura 3-layer...
📂 Directory corrente: C:\Users\Leonardo\Desktop\PROGETTO_DM\3_dw_layer
✅ Trovato percorso valido: ../1_source_layer/datasets_olist

📊 Trovati 13 file CSV totali:
   📄 ../1_source_layer/datasets_olist\olist_customers_dataset.csv
   📄 ../1_source_layer/datasets_olist\olist_geolocation_dataset.csv
   📄 ../1_source_layer/datasets_olist\olist_orders_dataset.csv
   📄 ../1_source_layer/datasets_olist\olist_order_items_dataset.csv
   📄 ../1_source_layer/datasets_olist\olist_order_payments_dataset.csv
   📄 ../1_source_layer/datasets_olist\olist_order_reviews_dataset.csv
   📄 ../1_source_layer/datasets_olist\olist_products_dataset.csv
   📄 ../1_source_layer/datasets_olist\olist_sellers_dataset.csv
   📄 ../1_source_layer/datasets_olist\product_category_name_translation.csv
   📄 ../1_source_layer/datasets_olist\.ipynb_checkpoints\olist_geolocation_dataset-checkpoint.csv
   ... e altri 3 file

🛒 File Olist trovati (13):
   📄 olist_customers_dataset.csv


In [4]:
# =============================================================================
# CELLA 4: CARICAMENTO DATASET
# =============================================================================
print("📁 Caricamento dataset...")
BASE_PATH = '1_source_layer/datasets_olist/'
print(f"✅ Usando percorso rilevato: {BASE_PATH}")
    
# Verifica se il percorso esiste
if not os.path.exists(BASE_PATH):
    # Prova percorsi alternativi
    alt_paths = ['../1_source_layer/datasets_olist/', 'datasets_olist/', '../datasets_olist/']
    for alt_path in alt_paths:
        if os.path.exists(alt_path):
            BASE_PATH = alt_path
            break
    else:
        print(f"❌ Percorso {BASE_PATH} non trovato!")
        print("🔧 Assicurati di:")
        print("   1. Essere nella directory root del progetto")
        print("   2. Aver creato la struttura 3-layer")
        print("   3. Aver copiato i CSV in 1_source_layer/datasets/olist/")

# Lista dei file da caricare
files_to_load = {
    'orders': 'olist_orders_dataset.csv',
    'order_items': 'olist_order_items_dataset.csv', 
    'customers': 'olist_customers_dataset.csv',
    'products': 'olist_products_dataset.csv',
    'sellers': 'olist_sellers_dataset.csv',
    'payments': 'olist_order_payments_dataset.csv',
    'reviews': 'olist_order_reviews_dataset.csv',
    'geolocation': 'olist_geolocation_dataset.csv',
    'translation': 'product_category_name_translation.csv'
}

# Carica i dataset con gestione errori
datasets = {}
failed_files = []

print(f"\n📂 Caricamento da: {BASE_PATH}")
print("-" * 60)

for dataset_name, filename in files_to_load.items():
    filepath = os.path.join(BASE_PATH, filename)
    try:
        print(f"📊 Caricando {filename}...", end='')
        datasets[dataset_name] = pd.read_csv(filepath)
        print(f" ✅ {len(datasets[dataset_name]):,} righe")
    except FileNotFoundError:
        print(f" ❌ Non trovato")
        failed_files.append(filename)
        # Prova percorso alternativo senza join
        try:
            alt_filepath = BASE_PATH + filename
            datasets[dataset_name] = pd.read_csv(alt_filepath)
            print(f"   ✅ Trovato con percorso alternativo: {len(datasets[dataset_name]):,} righe")
            failed_files.remove(filename)
        except:
            pass
    except Exception as e:
        print(f" ❌ Errore: {str(e)[:50]}...")
        failed_files.append(filename)

print("-" * 60)

# Assegna alle variabili globali solo se caricamento riuscito
if len(failed_files) == 0:
    orders = datasets['orders']
    order_items = datasets['order_items']
    customers = datasets['customers']
    products = datasets['products']
    sellers = datasets['sellers']
    payments = datasets['payments']
    reviews = datasets['reviews']
    geolocation = datasets['geolocation']
    translation = datasets['translation']
    
    print(f"\n✅ Tutti i dataset caricati con successo!")
    print("\n📊 Riepilogo dataset:")
    print(f"   Orders:      {len(orders):,} righe")
    print(f"   Order Items: {len(order_items):,} righe")
    print(f"   Customers:   {len(customers):,} righe")
    print(f"   Products:    {len(products):,} righe")
    print(f"   Sellers:     {len(sellers):,} righe")
    print(f"   Payments:    {len(payments):,} righe")
    print(f"   Reviews:     {len(reviews):,} righe")
    print(f"   Geolocation: {len(geolocation):,} righe")
    print(f"   Translation: {len(translation):,} categorie")
    
    # Mostra informazioni aggiuntive
    print(f"\n📈 Statistiche rapide:")
    print(f"   Periodo ordini: {orders['order_purchase_timestamp'].min()[:10]} - {orders['order_purchase_timestamp'].max()[:10]}")
    print(f"   Stati coperti: {customers['customer_state'].nunique()}")
    print(f"   Categorie prodotti: {products['product_category_name'].nunique()}")
    
else:
    print(f"\n❌ Caricamento fallito per {len(failed_files)} file:")

📁 Caricamento dataset...
✅ Usando percorso rilevato: 1_source_layer/datasets_olist/

📂 Caricamento da: ../1_source_layer/datasets_olist/
------------------------------------------------------------
📊 Caricando olist_orders_dataset.csv... ✅ 99,441 righe
📊 Caricando olist_order_items_dataset.csv... ✅ 112,650 righe
📊 Caricando olist_customers_dataset.csv... ✅ 99,441 righe
📊 Caricando olist_products_dataset.csv... ✅ 32,951 righe
📊 Caricando olist_sellers_dataset.csv... ✅ 3,095 righe
📊 Caricando olist_order_payments_dataset.csv... ✅ 103,886 righe
📊 Caricando olist_order_reviews_dataset.csv... ✅ 99,224 righe
📊 Caricando olist_geolocation_dataset.csv... ✅ 1,000,163 righe
📊 Caricando product_category_name_translation.csv... ✅ 71 righe
------------------------------------------------------------

✅ Tutti i dataset caricati con successo!

📊 Riepilogo dataset:
   Orders:      99,441 righe
   Order Items: 112,650 righe
   Customers:   99,441 righe
   Products:    32,951 righe
   Sellers:     3,095

In [5]:
# =============================================================================
# CELLA 5: PULIZIA COMPLETA DATABASE
# =============================================================================
def clean_database():
    """Pulisce tutte le tabelle nell'ordine corretto per rispettare i vincoli FK"""
    print("🧹 Pulizia completa database...")
    
    try:
        with engine.connect() as conn:
            # 1. Prima elimina la fact table (che ha le FK)
            print("🗑️ Eliminazione FACT_SALES...")
            conn.execute(text("DELETE FROM fact_sales"))
            
            # 2. Poi elimina le dimensioni
            print("🗑️ Eliminazione dimensioni...")
            conn.execute(text("DELETE FROM dim_customer"))
            conn.execute(text("DELETE FROM dim_product"))
            conn.execute(text("DELETE FROM dim_seller"))
            conn.execute(text("DELETE FROM dim_time"))
            conn.execute(text("DELETE FROM dim_payment"))
            conn.execute(text("DELETE FROM dim_geography"))  # Nuova dimensione
            
            # 3. Commit tutte le operazioni
            conn.commit()
            print("✅ Database pulito con successo!")
            
    except Exception as e:
        print(f"❌ Errore pulizia database: {e}")
        raise

# Esegui la pulizia
clean_database()

🧹 Pulizia completa database...
🗑️ Eliminazione FACT_SALES...
🗑️ Eliminazione dimensioni...
✅ Database pulito con successo!


In [6]:
# =============================================================================
# CELLA 6: PREPARAZIONE GEOGRAFIA E CALCOLO DISTANZE
# =============================================================================
print("🗺️ PREPARAZIONE DATI GEOGRAFICI...")

# Crea lookup table geografica ottimizzata
print("📍 Creando lookup geografica...")
geo_lookup = geolocation.groupby('geolocation_zip_code_prefix').agg({
    'geolocation_lat': 'mean',  # Media delle coordinate per ZIP
    'geolocation_lng': 'mean',
    'geolocation_city': 'first',  # Prima città per ZIP
    'geolocation_state': 'first'
}).reset_index()

print(f"✅ Lookup geografica creata: {len(geo_lookup):,} ZIP codes unici")

# Funzione per calcolo distanze
def haversine_distance(lat1, lon1, lat2, lon2):
    """Calcola distanza in km tra due punti usando formula haversine"""
    if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
        return None
    
    # Converti in radianti
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    # Formula haversine
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Raggio Terra in km
    return c * r

# Funzione per determinare regione
def get_region(state):
    regions = {
        'AC': 'Norte', 'AP': 'Norte', 'AM': 'Norte', 'PA': 'Norte', 
        'RO': 'Norte', 'RR': 'Norte', 'TO': 'Norte',
        'AL': 'Nordeste', 'BA': 'Nordeste', 'CE': 'Nordeste', 'MA': 'Nordeste',
        'PB': 'Nordeste', 'PE': 'Nordeste', 'PI': 'Nordeste', 'RN': 'Nordeste', 'SE': 'Nordeste',
        'GO': 'Centro-Oeste', 'MT': 'Centro-Oeste', 'MS': 'Centro-Oeste', 'DF': 'Centro-Oeste',
        'ES': 'Sudeste', 'MG': 'Sudeste', 'RJ': 'Sudeste', 'SP': 'Sudeste',
        'PR': 'Sul', 'RS': 'Sul', 'SC': 'Sul'
    }
    return regions.get(state, 'Unknown')

🗺️ PREPARAZIONE DATI GEOGRAFICI...
📍 Creando lookup geografica...
✅ Lookup geografica creata: 19,015 ZIP codes unici


In [7]:
# =============================================================================
# CELLA 7: ETL DIM_GEOGRAPHY (NUOVA DIMENSIONE)
# =============================================================================
print("🌍 ETL DIM_GEOGRAPHY...")

# Prepara dimensione geografia
dim_geography_data = geo_lookup.copy()
dim_geography_data = dim_geography_data.rename(columns={
    'geolocation_zip_code_prefix': 'zip_code_prefix',
    'geolocation_lat': 'latitude',
    'geolocation_lng': 'longitude',
    'geolocation_city': 'city',
    'geolocation_state': 'state'
})

# Assicura che zip_code_prefix sia string
dim_geography_data['zip_code_prefix'] = dim_geography_data['zip_code_prefix'].astype(str)

# Aggiungi regione
dim_geography_data['region'] = dim_geography_data['state'].apply(get_region)

print(f"✅ DIM_GEOGRAPHY preparata: {len(dim_geography_data):,} ZIP codes")

# Carica nel database
try:
    dim_geography_data.to_sql('dim_geography', engine, if_exists='append', index=False, method='multi')
    
    with engine.connect() as conn:
        result = conn.execute(text("SELECT COUNT(*) FROM dim_geography"))
        count = result.fetchone()[0]
        print(f"✅ DIM_GEOGRAPHY caricata: {count:,} righe")
        
except Exception as e:
    print(f"❌ Errore caricamento DIM_GEOGRAPHY: {e}")

🌍 ETL DIM_GEOGRAPHY...
✅ DIM_GEOGRAPHY preparata: 19,015 ZIP codes
✅ DIM_GEOGRAPHY caricata: 19,015 righe


In [8]:
# =============================================================================
# CELLA 8: ETL DIM_CUSTOMER CON COORDINATE
# =============================================================================
print("🔄 ETL DIM_CUSTOMER (con coordinate)...")

# Merge customers con geolocation
customers_enriched = customers.merge(
    geo_lookup, 
    left_on='customer_zip_code_prefix', 
    right_on='geolocation_zip_code_prefix', 
    how='left'
)

# Pulisci e prepara
customers_clean = customers_enriched.drop_duplicates(subset=['customer_id'])
customers_clean['customer_city'] = customers_clean['customer_city'].fillna('Unknown')
customers_clean['customer_state'] = customers_clean['customer_state'].fillna('XX')

# Ottieni geography_key
dim_geography_keys = pd.read_sql("SELECT geography_key, zip_code_prefix FROM dim_geography", engine)

# Prepara dati finali
dim_customer_data = customers_clean[[
    'customer_id', 'customer_city', 'customer_state', 
    'customer_zip_code_prefix', 'geolocation_lat', 'geolocation_lng'
]].copy()

dim_customer_data = dim_customer_data.rename(columns={
    'geolocation_lat': 'customer_latitude',
    'geolocation_lng': 'customer_longitude'
})

# Converti zip_code_prefix a string in entrambi i DataFrame per evitare type mismatch
dim_customer_data['customer_zip_code_prefix'] = dim_customer_data['customer_zip_code_prefix'].astype(str)
dim_geography_keys['zip_code_prefix'] = dim_geography_keys['zip_code_prefix'].astype(str)

# Aggiungi geography_key
dim_customer_data = dim_customer_data.merge(
    dim_geography_keys,
    left_on='customer_zip_code_prefix',
    right_on='zip_code_prefix',
    how='left'
)

# Rimuovi colonna duplicata
dim_customer_data = dim_customer_data.drop('zip_code_prefix', axis=1)

print(f"✅ DIM_CUSTOMER preparata: {len(dim_customer_data):,} righe")

# Carica nel database
try:
    dim_customer_data.to_sql('dim_customer', engine, if_exists='append', index=False, method='multi', chunksize=1000)
    
    with engine.connect() as conn:
        result = conn.execute(text("SELECT COUNT(*) FROM dim_customer"))
        count = result.fetchone()[0]
        print(f"✅ DIM_CUSTOMER caricata: {count:,} righe")
        
except Exception as e:
    print(f"❌ Errore caricamento DIM_CUSTOMER: {e}")

🔄 ETL DIM_CUSTOMER (con coordinate)...
✅ DIM_CUSTOMER preparata: 99,441 righe
   Con coordinate: 99,163 clienti
✅ DIM_CUSTOMER caricata: 99,441 righe


In [9]:
# =============================================================================
# CELLA 9: ETL DIM_PRODUCT CON TRADUZIONI E CLASSIFICAZIONI
# =============================================================================
print("🔄 ETL DIM_PRODUCT (con traduzioni e classificazioni)...")

# Pulizia products
products_clean = products.copy()
products_clean = products_clean.drop_duplicates(subset=['product_id'])
products_clean['product_category_name'] = products_clean['product_category_name'].fillna('outros')

# Join con traduzioni
print("🔗 Aggiungendo traduzioni categorie...")
products_with_translation = products_clean.merge(
    translation[['product_category_name', 'product_category_name_english']], 
    on='product_category_name', 
    how='left'
)

# Se non c'è traduzione, usa il nome originale
products_with_translation['product_category_name_english'] = (
    products_with_translation['product_category_name_english']
    .fillna(products_with_translation['product_category_name'])
)

# Funzioni di classificazione
def classify_weight(weight):
    if pd.isna(weight) or weight == 0:
        return 'Unknown'
    elif weight < 500:
        return 'Light'
    elif weight < 2000:
        return 'Medium'
    elif weight < 5000:
        return 'Heavy'
    else:
        return 'Very Heavy'

def classify_size(length, height, width):
    if pd.isna(length) or pd.isna(height) or pd.isna(width):
        return 'Unknown'
    volume = length * height * width
    if volume < 1000:
        return 'Small'
    elif volume < 8000:
        return 'Medium'
    else:
        return 'Large'

# Applica classificazioni
products_with_translation['weight_category'] = products_with_translation['product_weight_g'].apply(classify_weight)
products_with_translation['size_category'] = products_with_translation.apply(
    lambda x: classify_size(x['product_length_cm'], x['product_height_cm'], x['product_width_cm']), 
    axis=1
)

# Riempi valori numerici mancanti
dim_product_data = products_with_translation.fillna(0)

print(f"✅ DIM_PRODUCT preparata: {len(dim_product_data):,} righe")
print("Top categorie (inglese):")
print(dim_product_data['product_category_name_english'].value_counts().head())

# Carica nel database
try:
    dim_product_data.to_sql('dim_product', engine, if_exists='append', index=False, method='multi', chunksize=1000)
    
    with engine.connect() as conn:
        result = conn.execute(text("SELECT COUNT(*) FROM dim_product"))
        count = result.fetchone()[0]
        print(f"✅ DIM_PRODUCT caricata: {count:,} righe")
        
except Exception as e:
    print(f"❌ Errore caricamento DIM_PRODUCT: {e}")

🔄 ETL DIM_PRODUCT (con traduzioni e classificazioni)...
🔗 Aggiungendo traduzioni categorie...
✅ DIM_PRODUCT preparata: 32,951 righe
Top categorie (inglese):
product_category_name_english
bed_bath_table     3029
sports_leisure     2867
furniture_decor    2657
health_beauty      2444
housewares         2335
Name: count, dtype: int64
✅ DIM_PRODUCT caricata: 32,951 righe


In [10]:
# =============================================================================
# CELLA 10: ETL DIM_SELLER CON COORDINATE
# =============================================================================
print("🔄 ETL DIM_SELLER (con coordinate)...")

# Merge sellers con geolocation
sellers_enriched = sellers.merge(
    geo_lookup, 
    left_on='seller_zip_code_prefix', 
    right_on='geolocation_zip_code_prefix', 
    how='left'
)

# Pulisci e prepara
sellers_clean = sellers_enriched.drop_duplicates(subset=['seller_id'])
sellers_clean['seller_city'] = sellers_clean['seller_city'].fillna('Unknown')
sellers_clean['seller_state'] = sellers_clean['seller_state'].fillna('XX')

# Prepara dati finali
dim_seller_data = sellers_clean[[
    'seller_id', 'seller_city', 'seller_state', 
    'seller_zip_code_prefix', 'geolocation_lat', 'geolocation_lng'
]].copy()

dim_seller_data = dim_seller_data.rename(columns={
    'geolocation_lat': 'seller_latitude',
    'geolocation_lng': 'seller_longitude'
})

# Converti zip_code_prefix a string per evitare type mismatch
dim_seller_data['seller_zip_code_prefix'] = dim_seller_data['seller_zip_code_prefix'].astype(str)
# dim_geography_keys['zip_code_prefix'] è già convertito nella cella 8

# Aggiungi geography_key
dim_seller_data = dim_seller_data.merge(
    dim_geography_keys,
    left_on='seller_zip_code_prefix',
    right_on='zip_code_prefix',
    how='left'
)

# Rimuovi colonna duplicata
dim_seller_data = dim_seller_data.drop('zip_code_prefix', axis=1)

print(f"✅ DIM_SELLER preparata: {len(dim_seller_data):,} righe")

# Carica nel database
try:
    dim_seller_data.to_sql('dim_seller', engine, if_exists='append', index=False, method='multi', chunksize=1000)
    
    with engine.connect() as conn:
        result = conn.execute(text("SELECT COUNT(*) FROM dim_seller"))
        count = result.fetchone()[0]
        print(f"✅ DIM_SELLER caricata: {count:,} righe")
        
except Exception as e:
    print(f"❌ Errore caricamento DIM_SELLER: {e}")

🔄 ETL DIM_SELLER (con coordinate)...
✅ DIM_SELLER preparata: 3,095 righe
   Con coordinate: 3,088 venditori
✅ DIM_SELLER caricata: 3,095 righe


In [11]:
# =============================================================================
# CELLA 11: ETL DIM_TIME CON STAGIONI BRASILIANE
# =============================================================================
print("🔄 ETL DIM_TIME (con stagioni)...")

# Estrai tutte le date uniche dagli ordini
orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])
unique_dates = orders['order_purchase_timestamp'].dt.date.unique()

# Funzione per stagione brasiliana
def get_season_brazil(month):
    if month in [12, 1, 2]:
        return 'Verão'  # Estate
    elif month in [3, 4, 5]:
        return 'Outono'  # Autunno
    elif month in [6, 7, 8]:
        return 'Inverno'  # Inverno
    else:
        return 'Primavera'  # Primavera

# Crea dimensione tempo
time_data = []
for date in unique_dates:
    if pd.notna(date):
        dt = pd.to_datetime(date)
        time_data.append({
            'full_date': date,
            'day_of_week': dt.dayofweek,
            'day_name': dt.strftime('%A'),
            'day_of_month': dt.day,
            'month_num': dt.month,
            'month_name': dt.strftime('%B'),
            'quarter': dt.quarter,
            'year': dt.year,
            'is_weekend': dt.dayofweek >= 5,
            'season_brazil': get_season_brazil(dt.month)
        })

dim_time_data = pd.DataFrame(time_data)
dim_time_data = dim_time_data.sort_values('full_date')

print(f"✅ DIM_TIME preparata: {len(dim_time_data):,} righe")
print(f"Range date: {dim_time_data['full_date'].min()} - {dim_time_data['full_date'].max()}")

# Carica nel database
try:
    dim_time_data.to_sql('dim_time', engine, if_exists='append', index=False, method='multi')
    
    with engine.connect() as conn:
        result = conn.execute(text("SELECT COUNT(*) FROM dim_time"))
        count = result.fetchone()[0]
        print(f"✅ DIM_TIME caricata: {count:,} righe")
        
except Exception as e:
    print(f"❌ Errore caricamento DIM_TIME: {e}")

🔄 ETL DIM_TIME (con stagioni)...
✅ DIM_TIME preparata: 634 righe
Range date: 2016-09-04 - 2018-10-17
✅ DIM_TIME caricata: 634 righe


In [12]:
# =============================================================================
# CELLA 12: ETL DIM_PAYMENT CON CATEGORIE
# =============================================================================
print("🔄 ETL DIM_PAYMENT (con categorie)...")

# Funzione per classificare rate
def classify_installments(installments):
    if installments == 1:
        return 'Cash/Single Payment'
    elif installments <= 6:
        return 'Short Term'
    elif installments <= 12:
        return 'Medium Term'
    else:
        return 'Long Term'

# Crea combinazioni uniche payment_type + installments
payment_combinations = payments[['payment_type', 'payment_installments']].drop_duplicates()
payment_combinations = payment_combinations.fillna({'payment_installments': 1})

# Aggiungi categorizzazione
payment_combinations['installment_category'] = payment_combinations['payment_installments'].apply(classify_installments)

dim_payment_data = payment_combinations.copy()

print(f"✅ DIM_PAYMENT preparata: {len(dim_payment_data):,} righe")
print("Tipi di pagamento:")
print(dim_payment_data['payment_type'].value_counts())

# Carica nel database
try:
    dim_payment_data.to_sql('dim_payment', engine, if_exists='append', index=False, method='multi')
    
    with engine.connect() as conn:
        result = conn.execute(text("SELECT COUNT(*) FROM dim_payment"))
        count = result.fetchone()[0]
        print(f"✅ DIM_PAYMENT caricata: {count:,} righe")
        
except Exception as e:
    print(f"❌ Errore caricamento DIM_PAYMENT: {e}")

🔄 ETL DIM_PAYMENT (con categorie)...
✅ DIM_PAYMENT preparata: 28 righe
Tipi di pagamento:
payment_type
credit_card    24
boleto          1
voucher         1
debit_card      1
not_defined     1
Name: count, dtype: int64
✅ DIM_PAYMENT caricata: 28 righe


In [13]:
# =============================================================================
# CELLA 13: ETL FACT TABLE - PREPARAZIONE
# =============================================================================
print("🔄 ETL FACT_SALES (preparazione)...")

# Unisci tutti i dataset per creare il fatto
print("📊 Joining datasets...")

# Base: order_items
fact_base = order_items.copy()

# Join con orders per data
fact_base = fact_base.merge(
    orders[['order_id', 'customer_id', 'order_purchase_timestamp', 'order_status']], 
    on='order_id', 
    how='left'
)

# Solo ordini delivered
fact_base = fact_base[fact_base['order_status'] == 'delivered']

# Join con payments (aggrega per ordine)
payments_agg = payments.groupby('order_id').agg({
    'payment_type': 'first',  # Prendi il primo metodo di pagamento
    'payment_installments': 'first',
    'payment_value': 'sum'
}).reset_index()

fact_base = fact_base.merge(payments_agg, on='order_id', how='left')

# Join con reviews (aggrega per ordine)
reviews_agg = reviews.groupby('order_id').agg({
    'review_score': 'mean'
}).reset_index()

fact_base = fact_base.merge(reviews_agg, on='order_id', how='left')

# Pulisci e prepara
fact_base['order_purchase_timestamp'] = pd.to_datetime(fact_base['order_purchase_timestamp'])
fact_base['order_date'] = fact_base['order_purchase_timestamp'].dt.date

# Calcola misure derivate
fact_base['total_item_value'] = fact_base['price'] + fact_base['freight_value']
fact_base['quantity'] = 1  # Ogni riga è un item

# Rimuovi righe con valori critici mancanti
fact_base = fact_base.dropna(subset=['customer_id', 'product_id', 'seller_id', 'order_date'])

print(f"✅ Fact base preparata: {len(fact_base):,} righe")
print(f"Periodo: {fact_base['order_date'].min()} - {fact_base['order_date'].max()}")
print(f"Valore totale: €{fact_base['price'].sum():,.2f}")

🔄 ETL FACT_SALES (preparazione)...
📊 Joining datasets...
✅ Fact base preparata: 110,197 righe
Periodo: 2016-09-15 - 2018-08-29
Valore totale: €13,221,498.11


In [14]:
# =============================================================================
# CELLA 14: CREAZIONE FOREIGN KEYS E CALCOLO METRICHE
# =============================================================================
print("🔗 Creazione foreign keys e calcolo metriche geografiche...")

# Carica le dimensioni con le chiavi surrogate E coordinate
dim_customer_keys = pd.read_sql("""
    SELECT customer_key, customer_id, customer_latitude, customer_longitude, customer_state 
    FROM dim_customer
""", engine)

dim_product_keys = pd.read_sql("SELECT product_key, product_id FROM dim_product", engine)

dim_seller_keys = pd.read_sql("""
    SELECT seller_key, seller_id, seller_latitude, seller_longitude, seller_state 
    FROM dim_seller
""", engine)

dim_time_keys = pd.read_sql("SELECT time_key, full_date FROM dim_time", engine)

dim_payment_keys = pd.read_sql("""
    SELECT payment_key, payment_type, payment_installments 
    FROM dim_payment
""", engine)

# Join per ottenere le foreign keys E coordinate
fact_with_keys = fact_base.copy()

# Customer key e coordinate
fact_with_keys = fact_with_keys.merge(dim_customer_keys, on='customer_id', how='left')

# Product key
fact_with_keys = fact_with_keys.merge(dim_product_keys, on='product_id', how='left')

# Seller key e coordinate
fact_with_keys = fact_with_keys.merge(dim_seller_keys, on='seller_id', how='left')

# Time key
fact_with_keys = fact_with_keys.merge(dim_time_keys, left_on='order_date', right_on='full_date', how='left')

# Payment key
fact_with_keys = fact_with_keys.merge(dim_payment_keys, on=['payment_type', 'payment_installments'], how='left')

# Calcola distanze e metriche geografiche
print("📏 Calcolando distanze customer-seller...")
fact_with_keys['customer_seller_distance_km'] = fact_with_keys.apply(
    lambda row: haversine_distance(
        row['customer_latitude'], row['customer_longitude'],
        row['seller_latitude'], row['seller_longitude']
    ), axis=1
)

# Flag cross-state
fact_with_keys['is_cross_state_sale'] = (
    fact_with_keys['customer_state'] != fact_with_keys['seller_state']
)

# Shipping type
fact_with_keys['shipping_type'] = fact_with_keys.apply(
    lambda x: 'Local' if x['customer_state'] == x['seller_state'] else 'Interstate',
    axis=1
)

# Metriche derivate
fact_with_keys['freight_percentage'] = (
    fact_with_keys['freight_value'] / fact_with_keys['price'] * 100
).round(2)
fact_with_keys['net_revenue'] = fact_with_keys['price'] - fact_with_keys['freight_value']

# Seleziona colonne finali per fact table
fact_final = fact_with_keys[[
    'customer_key', 'product_key', 'time_key', 'seller_key', 'payment_key',
    'order_id', 'order_item_id',
    'price', 'freight_value', 'quantity', 'payment_value', 'review_score',
    'total_item_value', 'customer_seller_distance_km', 'is_cross_state_sale', 
    'shipping_type', 'freight_percentage', 'net_revenue'
]].copy()

# Rimuovi righe con foreign keys mancanti
before_count = len(fact_final)
fact_final = fact_final.dropna(subset=['customer_key', 'product_key', 'time_key', 'seller_key'])
after_count = len(fact_final)

print(f"✅ Foreign keys create e metriche calcolate!")
print(f"Righe prima pulizia: {before_count:,}")
print(f"Righe dopo pulizia: {after_count:,}")
print(f"Righe rimosse: {before_count - after_count:,}")
print(f"📊 Statistiche distanze:")
print(f"   - Media: {fact_final['customer_seller_distance_km'].mean():.1f} km")
print(f"   - Vendite cross-state: {fact_final['is_cross_state_sale'].sum():,}")

🔗 Creazione foreign keys e calcolo metriche geografiche...
📏 Calcolando distanze customer-seller...
✅ Foreign keys create e metriche calcolate!
Righe prima pulizia: 110,197
Righe dopo pulizia: 110,197
Righe rimosse: 0
📊 Statistiche distanze:
   - Media: 596.2 km
   - Vendite cross-state: 70,331


In [15]:
# =============================================================================
# CELLA 15: CARICA FACT_SALES ARRICCHITA
# =============================================================================
print("📊 Caricamento FACT_SALES...")

try:
    # Carica in batch per performance
    batch_size = 5000
    total_rows = len(fact_final)
    
    for i in range(0, total_rows, batch_size):
        batch = fact_final.iloc[i:i+batch_size]
        batch.to_sql('fact_sales', engine, if_exists='append', index=False, method='multi')
        print(f"Caricato batch {i//batch_size + 1}: {len(batch)} righe")
    
    # Verifica caricamento finale
    with engine.connect() as conn:
        result = conn.execute(text("SELECT COUNT(*) FROM fact_sales"))
        count = result.fetchone()[0]
        print(f"✅ FACT_SALES caricata: {count:,} righe")
        
        # Statistiche finali
        stats = conn.execute(text("""
            SELECT 
                MIN(price) as min_price,
                MAX(price) as max_price,
                AVG(price) as avg_price,
                SUM(price) as total_revenue,
                AVG(customer_seller_distance_km) as avg_distance,
                SUM(CASE WHEN is_cross_state_sale THEN 1 ELSE 0 END) as cross_state_sales
            FROM fact_sales
            WHERE price IS NOT NULL
        """))
        row = stats.fetchone()
        
        if row and row[0] is not None:
            print(f"📈 Statistiche finali:")
            print(f"   Prezzo min: €{row[0]:.2f}")
            print(f"   Prezzo max: €{row[1]:.2f}")
            print(f"   Prezzo medio: €{row[2]:.2f}")
            print(f"   Revenue totale: €{row[3]:,.2f}")
            print(f"   Distanza media: {row[4]:.1f} km")
            print(f"   Vendite cross-state: {row[5]:,}")
        
except Exception as e:
    print(f"❌ Errore caricamento FACT_SALES: {e}")

📊 Caricamento FACT_SALES...
Caricato batch 1: 5000 righe
Caricato batch 2: 5000 righe
Caricato batch 3: 5000 righe
Caricato batch 4: 5000 righe
Caricato batch 5: 5000 righe
Caricato batch 6: 5000 righe
Caricato batch 7: 5000 righe
Caricato batch 8: 5000 righe
Caricato batch 9: 5000 righe
Caricato batch 10: 5000 righe
Caricato batch 11: 5000 righe
Caricato batch 12: 5000 righe
Caricato batch 13: 5000 righe
Caricato batch 14: 5000 righe
Caricato batch 15: 5000 righe
Caricato batch 16: 5000 righe
Caricato batch 17: 5000 righe
Caricato batch 18: 5000 righe
Caricato batch 19: 5000 righe
Caricato batch 20: 5000 righe
Caricato batch 21: 5000 righe
Caricato batch 22: 5000 righe
Caricato batch 23: 197 righe
✅ FACT_SALES caricata: 110,197 righe
📈 Statistiche finali:
   Prezzo min: €0.85
   Prezzo max: €6735.00
   Prezzo medio: €119.98
   Revenue totale: €13,221,498.11
   Distanza media: 596.2 km
   Vendite cross-state: 70,331


In [16]:
# =============================================================================
# CELLA 16: CREAZIONE VISTE MATERIALIZZATE
# =============================================================================
print("\n🔨 Creazione viste materializzate per performance...")

try:
    with engine.connect() as conn:
        # Vista geografica
        print("📍 Creando vista materializzata geografica...")
        conn.execute(text("DROP MATERIALIZED VIEW IF EXISTS mv_geographic_sales"))
        conn.execute(text("""
            CREATE MATERIALIZED VIEW mv_geographic_sales AS
            SELECT 
                dc.customer_state,
                ds.seller_state,
                dg_c.region as customer_region,
                dg_s.region as seller_region,
                COUNT(*) as total_sales,
                AVG(fs.customer_seller_distance_km) as avg_distance,
                SUM(CASE WHEN fs.is_cross_state_sale THEN 1 ELSE 0 END) as cross_state_sales,
                SUM(fs.price) as total_revenue,
                AVG(fs.freight_value) as avg_freight,
                AVG(fs.freight_percentage) as avg_freight_percentage
            FROM fact_sales fs
            JOIN dim_customer dc ON fs.customer_key = dc.customer_key
            JOIN dim_seller ds ON fs.seller_key = ds.seller_key
            LEFT JOIN dim_geography dg_c ON dc.geography_key = dg_c.geography_key
            LEFT JOIN dim_geography dg_s ON ds.geography_key = dg_s.geography_key
            GROUP BY dc.customer_state, ds.seller_state, dg_c.region, dg_s.region
        """))
        
        # Vista categorie
        print("📦 Creando vista materializzata categorie...")
        conn.execute(text("DROP MATERIALIZED VIEW IF EXISTS mv_category_performance"))
        conn.execute(text("""
            CREATE MATERIALIZED VIEW mv_category_performance AS
            SELECT 
                dp.product_category_name,
                dp.product_category_name_english,
                dp.weight_category,
                dp.size_category,
                COUNT(*) as total_sales,
                SUM(fs.price) as total_revenue,
                AVG(fs.review_score) as avg_rating,
                AVG(fs.customer_seller_distance_km) as avg_shipping_distance,
                AVG(fs.freight_percentage) as avg_freight_percentage
            FROM fact_sales fs
            JOIN dim_product dp ON fs.product_key = dp.product_key
            WHERE dp.product_category_name_english IS NOT NULL
            GROUP BY dp.product_category_name, dp.product_category_name_english, 
                     dp.weight_category, dp.size_category
        """))
        
        # Vista performance temporale
        print("📅 Creando vista materializzata temporale...")
        conn.execute(text("DROP MATERIALIZED VIEW IF EXISTS mv_temporal_performance"))
        conn.execute(text("""
            CREATE MATERIALIZED VIEW mv_temporal_performance AS
            SELECT 
                dt.year,
                dt.quarter,
                dt.month_name,
                dt.season_brazil,
                dt.is_weekend,
                COUNT(*) as total_sales,
                SUM(fs.price) as total_revenue,
                AVG(fs.price) as avg_order_value,
                COUNT(DISTINCT fs.customer_key) as unique_customers,
                AVG(fs.review_score) as avg_satisfaction
            FROM fact_sales fs
            JOIN dim_time dt ON fs.time_key = dt.time_key
            GROUP BY dt.year, dt.quarter, dt.month_name, dt.month_num, 
                     dt.season_brazil, dt.is_weekend
            ORDER BY dt.year, dt.month_num
        """))
        
        conn.commit()
        print("✅ Viste materializzate create con successo!")
        
except Exception as e:
    print(f"❌ Errore creazione viste: {e}")



🔨 Creazione viste materializzate per performance...
📍 Creando vista materializzata geografica...
📦 Creando vista materializzata categorie...
📅 Creando vista materializzata temporale...
✅ Viste materializzate create con successo!


In [17]:
# =============================================================================
# CELLA 17: VERIFICA FINALE
# =============================================================================
print("\n" + "="*60)
print("🎯 VERIFICA FINALE ETL COMPLETO")
print("="*60)

try:
    with engine.connect() as conn:
        # Conta righe in tutte le tabelle
        tables = ['dim_geography', 'dim_customer', 'dim_product', 'dim_seller', 
                  'dim_time', 'dim_payment', 'fact_sales']
        
        print("📊 CONTEGGIO RIGHE:")
        for table in tables:
            result = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
            count = result.fetchone()[0]
            print(f"   {table.upper()}: {count:,} righe")
        
        # Verifiche enrichment
        print("\n✨ VERIFICHE ENRICHMENT:")
        
        # Geografia
        result = conn.execute(text("""
            SELECT COUNT(DISTINCT region) as regions, COUNT(*) as total_zips
            FROM dim_geography
        """))
        geo_stats = result.fetchone()
        print(f"   📍 Regioni geografiche: {geo_stats[0]}")
        print(f"   📍 ZIP codes totali: {geo_stats[1]:,}")
        
        # Customers con coordinate
        result = conn.execute(text("""
            SELECT 
                COUNT(*) as total,
                SUM(CASE WHEN customer_latitude IS NOT NULL THEN 1 ELSE 0 END) as with_coords
            FROM dim_customer
        """))
        cust_stats = result.fetchone()
        print(f"   👥 Customers con coordinate: {cust_stats[1]:,} su {cust_stats[0]:,} ({cust_stats[1]/cust_stats[0]*100:.1f}%)")
        
        # Prodotti con traduzioni
        result = conn.execute(text("""
            SELECT 
                COUNT(DISTINCT product_category_name) as original,
                COUNT(DISTINCT product_category_name_english) as translated
            FROM dim_product
            WHERE product_category_name_english != product_category_name
        """))
        prod_stats = result.fetchone()
        print(f"   📦 Categorie con traduzione: {prod_stats[1]} su {prod_stats[0]}")
        
        # Fact con distanze
        result = conn.execute(text("""
            SELECT 
                AVG(customer_seller_distance_km) as avg_dist,
                MIN(customer_seller_distance_km) as min_dist,
                MAX(customer_seller_distance_km) as max_dist,
                SUM(CASE WHEN is_cross_state_sale THEN 1 ELSE 0 END) * 100.0 / COUNT(*) as cross_state_pct
            FROM fact_sales
            WHERE customer_seller_distance_km IS NOT NULL
        """))
        fact_stats = result.fetchone()
        print(f"   📏 Distanza media vendite: {fact_stats[0]:.1f} km")
        print(f"   📏 Range distanze: {fact_stats[1]:.1f} - {fact_stats[2]:.1f} km")
        print(f"   🚛 Vendite cross-state: {fact_stats[3]:.1f}%")
        
        # Test query complessa
        print("\n🧪 TEST QUERY COMPLESSA:")
        test_query = """
        SELECT 
            dp.product_category_name_english as category,
            dg.region,
            COUNT(*) as sales
        FROM fact_sales fs
        JOIN dim_product dp ON fs.product_key = dp.product_key
        JOIN dim_customer dc ON fs.customer_key = dc.customer_key
        JOIN dim_geography dg ON dc.geography_key = dg.geography_key
        WHERE dp.product_category_name_english IS NOT NULL
            AND dg.region != 'Unknown'
        GROUP BY dp.product_category_name_english, dg.region
        LIMIT 5
        """
        result = conn.execute(text(test_query))
        print("   ✅ Query multi-join con enrichment funzionante!")
        
        print("\n🎉 ETL COMPLETATO CON SUCCESSO!")
        print("\n📊 RIEPILOGO SCHEMA ENRICHED:")
        print("   ✅ 6 dimensioni")
        print("   ✅ Traduzioni categorie prodotti integrate")
        print("   ✅ Coordinate GPS per customers e sellers")
        print("   ✅ Distanze calcolate nella fact table")
        print("   ✅ Classificazioni prodotti (peso/dimensione)")
        print("   ✅ Categorizzazione pagamenti")
        print("   ✅ Stagionalità brasiliana")
        print("   ✅ 3 viste materializzate per performance")
        
except Exception as e:
    print(f"❌ Errore verifica finale: {e}")


🎯 VERIFICA FINALE ETL COMPLETO
📊 CONTEGGIO RIGHE:
   DIM_GEOGRAPHY: 19,015 righe
   DIM_CUSTOMER: 99,441 righe
   DIM_PRODUCT: 32,951 righe
   DIM_SELLER: 3,095 righe
   DIM_TIME: 634 righe
   DIM_PAYMENT: 28 righe
   FACT_SALES: 110,197 righe

✨ VERIFICHE ENRICHMENT:
   📍 Regioni geografiche: 5
   📍 ZIP codes totali: 19,015
   👥 Customers con coordinate: 99,163 su 99,441 (99.7%)
   📦 Categorie con traduzione: 64 su 64
   📏 Distanza media vendite: 596.2 km
   📏 Range distanze: 0.0 - 8677.9 km
   🚛 Vendite cross-state: 63.7%

🧪 TEST QUERY COMPLESSA:
   ✅ Query multi-join con enrichment funzionante!

🎉 ETL COMPLETATO CON SUCCESSO!

📊 RIEPILOGO SCHEMA ENRICHED:
   ✅ 6 dimensioni
   ✅ Traduzioni categorie prodotti integrate
   ✅ Coordinate GPS per customers e sellers
   ✅ Distanze calcolate nella fact table
   ✅ Classificazioni prodotti (peso/dimensione)
   ✅ Categorizzazione pagamenti
   ✅ Stagionalità brasiliana
   ✅ 3 viste materializzate per performance
