# **EXTRACT, LOAD, & TRANSFORM**

In [3]:
%pip install mysql-connector-python

Collecting mysql-connector-python
  Downloading mysql_connector_python-9.4.0-cp39-cp39-win_amd64.whl.metadata (7.5 kB)
Downloading mysql_connector_python-9.4.0-cp39-cp39-win_amd64.whl (16.4 MB)
   ---------------------------------------- 0.0/16.4 MB ? eta -:--:--
    --------------------------------------- 0.3/16.4 MB ? eta -:--:--
   - -------------------------------------- 0.5/16.4 MB 1.5 MB/s eta 0:00:11
   - -------------------------------------- 0.8/16.4 MB 1.3 MB/s eta 0:00:12
   -- ------------------------------------- 1.0/16.4 MB 1.3 MB/s eta 0:00:12
   --- ------------------------------------ 1.3/16.4 MB 1.3 MB/s eta 0:00:12
   --- ------------------------------------ 1.6/16.4 MB 1.3 MB/s eta 0:00:12
   ---- ----------------------------------- 1.8/16.4 MB 1.3 MB/s eta 0:00:12
   ---- ----------------------------------- 1.8/16.4 MB 1.3 MB/s eta 0:00:12
   ----- ---------------------------------- 2.1/16.4 MB 1.2 MB/s eta 0:00:12
   ----- ---------------------------------- 2.4/16

In [5]:
import pandas as pd
from sqlalchemy import create_engine, text
import os
import time

In [6]:
# Database connection settings. Each value can be overridden through an
# environment variable so credentials do not have to be stored in the
# notebook; the defaults reproduce the original local setup.
DB_USER = os.environ.get('OLIST_DB_USER', 'root')
DB_PASSWORD = os.environ.get('OLIST_DB_PASSWORD', '')
DB_HOST = os.environ.get('OLIST_DB_HOST', 'localhost')
DB_PORT = os.environ.get('OLIST_DB_PORT', '3306')
DB_NAME = os.environ.get('OLIST_DB_NAME', 'olist_elt_db')

# Folder that holds the raw CSV files, relative to the working directory.
DATASET_DIR = '../raw/'

# Source CSV file name -> name of the raw table it is loaded into.
FILES_MAPPING = {
    'olist_customers_dataset.csv': 'raw_customers',
    'olist_geolocation_dataset.csv': 'raw_geolocation',
    'olist_order_items_dataset.csv': 'raw_order_items',
    'olist_order_payments_dataset.csv': 'raw_order_payments',
    'olist_order_reviews_dataset.csv': 'raw_order_reviews',
    'olist_orders_dataset.csv': 'raw_orders',
    'olist_products_dataset.csv': 'raw_products',
    'olist_sellers_dataset.csv': 'raw_sellers',
    'product_category_name_translation.csv': 'raw_category_translation',
    'brazil.inflation.monthly (statbureau.org).csv': 'raw_brazil_inflation'
}

In [7]:
def init_db_connection():
    """Create the target database if it is missing and return an engine for it.

    Connects to the MySQL server without selecting a schema, issues
    ``CREATE DATABASE IF NOT EXISTS`` for ``DB_NAME``, then builds a second
    engine whose URL includes the database name.

    Returns:
        A SQLAlchemy engine whose URL points at ``DB_NAME``.
    """
    str_conn_root = f"mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}"
    engine_root = create_engine(str_conn_root)
    try:
        with engine_root.connect() as conn:
            # Backtick-quote the identifier (doubling embedded backticks) so a
            # database name with dashes or reserved words stays valid SQL.
            quoted_name = DB_NAME.replace("`", "``")
            conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{quoted_name}`"))
    finally:
        # The server-level engine exists only for this bootstrap statement;
        # release its connection pool instead of leaving it open.
        engine_root.dispose()

    return create_engine(f"{str_conn_root}/{DB_NAME}")

def load_raw_data():
    """Load every CSV listed in ``FILES_MAPPING`` into its raw table.

    Each file is read with all columns as strings, column names are stripped
    and lower-cased, and the target table is replaced. Missing files are
    skipped with a message; a failure on one file is printed and the loop
    moves on to the next file.
    """
    warehouse = init_db_connection()
    print("\n--- MULAI PROSES ELT: PHASE 1 (INGEST RAW DATA) ---")

    for source_name in FILES_MAPPING:
        target_table = FILES_MAPPING[source_name]
        source_path = os.path.join(DATASET_DIR, source_name)

        if os.path.exists(source_path):
            print(f"üìÇ Processing: {source_name} -> {target_table}")

            try:
                frame = pd.read_csv(source_path, dtype=str)
                # Normalise headers so later SQL can use plain lower-case names.
                frame.columns = [label.strip().lower() for label in frame.columns]
                frame.to_sql(
                    name=target_table,
                    con=warehouse,
                    if_exists='replace',
                    index=False,
                    chunksize=5000,
                )
                print(f"   ‚úÖ Sukses! ({len(frame)} baris)")
            except Exception as err:
                print(f"   ‚ùå GAGAL: {err}")
        else:
            print(f"‚ö†Ô∏è  SKIP: File {source_name} tidak ditemukan.")

# Run the ingest phase when this cell is executed directly.
if __name__ == "__main__":
    load_raw_data()


--- MULAI PROSES ELT: PHASE 1 (INGEST RAW DATA) ---
üìÇ Processing: olist_customers_dataset.csv -> raw_customers
   ‚úÖ Sukses! (99441 baris)
üìÇ Processing: olist_geolocation_dataset.csv -> raw_geolocation
   ‚úÖ Sukses! (1000163 baris)
üìÇ Processing: olist_order_items_dataset.csv -> raw_order_items
   ‚úÖ Sukses! (112650 baris)
üìÇ Processing: olist_order_payments_dataset.csv -> raw_order_payments
   ‚úÖ Sukses! (103886 baris)
üìÇ Processing: olist_order_reviews_dataset.csv -> raw_order_reviews
   ‚úÖ Sukses! (99224 baris)
üìÇ Processing: olist_orders_dataset.csv -> raw_orders
   ‚úÖ Sukses! (99441 baris)
üìÇ Processing: olist_products_dataset.csv -> raw_products
   ‚úÖ Sukses! (32951 baris)
üìÇ Processing: olist_sellers_dataset.csv -> raw_sellers
   ‚úÖ Sukses! (3095 baris)
üìÇ Processing: product_category_name_translation.csv -> raw_category_translation
   ‚úÖ Sukses! (71 baris)
üìÇ Processing: brazil.inflation.monthly (statbureau.org).csv -> raw_brazil_inflation
   ‚úÖ

Proses Indexing pada data raw

In [8]:
# Module-level engine bound to the warehouse database; add_indexes_to_raw_data
# below reads it as a global.
connection_str = f"mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(connection_str)

def add_indexes_to_raw_data():
    """Create prefix indexes (first 50 characters) on the raw join keys.

    Issues one ``CREATE INDEX`` per key column through the module-level
    ``engine``. A failing statement is printed as a note and the remaining
    statements still run.
    """
    print("\n--- OPTIMASI: MENAMBAHKAN INDEX KE RAW DATA ---")

    # (index name, table, column) for every index to create.
    index_specs = [
        ("idx_orders_order_id", "raw_orders", "order_id"),
        ("idx_items_order_id", "raw_order_items", "order_id"),
        ("idx_items_product_id", "raw_order_items", "product_id"),
        ("idx_products_product_id", "raw_products", "product_id"),
    ]
    statements = [
        f"CREATE INDEX {index_name} ON {table}({column}(50));"
        for index_name, table, column in index_specs
    ]

    with engine.connect() as db:
        for statement in statements:
            try:
                # Show only the "table(column(len));" part that follows ON.
                _, _, target = statement.partition('ON')
                print(f"‚öôÔ∏è  Executing: {target}...")
                db.execute(text(statement))
            except Exception as err:
                print(f"   ‚ö†Ô∏è Note: {err}")
        print("‚úÖ Indexing Selesai!")

# Run the indexing step when this cell is executed directly.
if __name__ == "__main__":
    add_indexes_to_raw_data() 



--- OPTIMASI: MENAMBAHKAN INDEX KE RAW DATA ---
‚öôÔ∏è  Executing:  raw_orders(order_id(50));...
‚öôÔ∏è  Executing:  raw_order_items(order_id(50));...
‚öôÔ∏è  Executing:  raw_order_items(product_id(50));...
‚öôÔ∏è  Executing:  raw_products(product_id(50));...
‚úÖ Indexing Selesai!


In [9]:
# Rebuild the engine for the warehouse database (same URL as the earlier cells).
connection_str = f"mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(connection_str)

# Ordered list of warehouse-building steps. Each entry has a display "name"
# and a single "sql" statement. run_transformations extracts the target table
# name from the text that follows "CREATE TABLE IF NOT EXISTS ", so every
# statement must keep that exact prefix followed by the table name and a space.
TRANSFORM_QUERIES = [
    # Customer dimension: distinct customer rows that have a customer_id.
    {
        "name": "1. Membuat Dimensi Customers (Cleaned)",
        "sql": """
            CREATE TABLE IF NOT EXISTS dim_customers AS
            SELECT DISTINCT 
                customer_id, 
                customer_unique_id, 
                customer_zip_code_prefix, 
                customer_city, 
                customer_state
            FROM raw_customers
            WHERE customer_id IS NOT NULL;
        """
    },
    # Product dimension: the category name falls back from the English
    # translation to the original category name, then to 'Unknown'.
    {
        "name": "2. Membuat Dimensi Products (Enriched)",
        "sql": """
            CREATE TABLE IF NOT EXISTS dim_products AS
            SELECT DISTINCT 
                p.product_id,
                COALESCE(t.product_category_name_english, p.product_category_name, 'Unknown') as category_name,
                p.product_weight_g, 
                p.product_length_cm, 
                p.product_height_cm, 
                p.product_width_cm
            FROM raw_products p
            LEFT JOIN raw_category_translation t ON p.product_category_name = t.product_category_name
            WHERE p.product_id IS NOT NULL;
        """
    },
    # Sales fact: orders joined to their items and to dim_products. Timestamps
    # are cast to DATETIME (empty strings become NULL first); price and freight
    # are cast to DECIMAL(10,2). Rows need a purchase timestamp and
    # non-negative price and freight.
    # NOTE(review): order_item_id is not selected, so SELECT DISTINCT merges
    # items of one order that share product, seller, price and freight. The
    # recorded outputs show 112650 raw order items versus 102425 fact rows -
    # confirm whether multi-quantity purchases are being collapsed and revenue
    # understated.
    {
        "name": "3. Membuat Fact Sales (Cleaned & Integrated)",
        "sql": """
            CREATE TABLE IF NOT EXISTS fact_sales AS
            SELECT DISTINCT 
                o.order_id, 
                o.customer_id, 
                oi.product_id, 
                oi.seller_id, 
                o.order_status,
                CAST(NULLIF(o.order_purchase_timestamp, '') AS DATETIME) as purchase_date,
                CAST(NULLIF(o.order_approved_at, '') AS DATETIME) as approved_date,
                CAST(NULLIF(o.order_delivered_carrier_date, '') AS DATETIME) as carrier_date,
                CAST(NULLIF(o.order_delivered_customer_date, '') AS DATETIME) as delivered_date,
                CAST(NULLIF(o.order_estimated_delivery_date, '') AS DATETIME) as estimated_date,
                CAST(oi.price AS DECIMAL(10,2)) as price,
                CAST(oi.freight_value AS DECIMAL(10,2)) as freight_value
            FROM raw_orders o
            JOIN raw_order_items oi ON o.order_id = oi.order_id
            INNER JOIN dim_products p ON oi.product_id = p.product_id 
            WHERE 
                NULLIF(o.order_purchase_timestamp, '') IS NOT NULL 
                AND CAST(oi.price AS DECIMAL(10,2)) >= 0 
                AND CAST(oi.freight_value AS DECIMAL(10,2)) >= 0;
        """
    }
]

def run_transformations():
    """Rebuild every warehouse table described in ``TRANSFORM_QUERIES``.

    For each step the target table name is taken from the text following
    ``CREATE TABLE IF NOT EXISTS``, the table is dropped, and the step's SQL
    is executed through the module-level ``engine``. A failing step is
    printed and the next step still runs.
    """
    print("\n--- MULAI PROSES ELT: PHASE 2 (TRANSFORM WAREHOUSE - FIX DATE & CLEANING) ---")
    create_prefix = "CREATE TABLE IF NOT EXISTS "

    with engine.connect() as db:
        for step in TRANSFORM_QUERIES:
            print(f"üîÑ Running: {step['name']}...")
            try:
                ddl = step['sql']
                # The table name is the first space-delimited token after the prefix.
                after_prefix = ddl.split(create_prefix)[1]
                target = after_prefix.split(" ")[0]

                # Drop first so the CREATE ... AS SELECT always rebuilds the table.
                db.execute(text(f"DROP TABLE IF EXISTS {target}"))
                db.execute(text(ddl))
                print("   ‚úÖ Selesai!")
            except Exception as err:
                print(f"   ‚ùå GAGAL: {err}")

# Run the transform phase when this cell is executed directly.
if __name__ == "__main__":
    run_transformations()


--- MULAI PROSES ELT: PHASE 2 (TRANSFORM WAREHOUSE - FIX DATE & CLEANING) ---
üîÑ Running: 1. Membuat Dimensi Customers (Cleaned)...
   ‚úÖ Selesai!
üîÑ Running: 2. Membuat Dimensi Products (Enriched)...
   ‚úÖ Selesai!
üîÑ Running: 3. Membuat Fact Sales (Cleaned & Integrated)...
   ‚úÖ Selesai!


# **INFLATION ANALYSIS**

In [11]:
# Rebuild the engine for the warehouse database (same URL as the earlier cells).
connection_str = f"mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(connection_str)

# Ordered steps of the inflation analysis. Each entry has a display "name", a
# "desc" and one "sql" statement. run_inflation_analysis extracts the target
# table name from the text that follows "CREATE TABLE IF NOT EXISTS ".
INFLATION_QUERIES = [
    # Unpivot: turns the twelve month columns (january..december) of
    # raw_brazil_inflation into one (year, month, inflation_rate) row per
    # month and drops rows whose rate is NULL.
    # NOTE(review): an ORDER BY inside CREATE TABLE ... AS SELECT does not
    # guarantee the order rows come back in later - confirm nothing relies on it.
    {
        "name": "1. Cleaning & Unpivot Data Inflasi",
        "desc": "Unpivot Data Inflasi (Subquery Method)",
        "sql": """
            CREATE TABLE IF NOT EXISTS dim_brazil_inflation AS
            SELECT * FROM (
                SELECT CAST(year AS UNSIGNED) as year, 1 as month, CAST(january AS DECIMAL(10,2)) as inflation_rate FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 2, CAST(february AS DECIMAL(10,2)) FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 3, CAST(march AS DECIMAL(10,2)) FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 4, CAST(april AS DECIMAL(10,2)) FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 5, CAST(may AS DECIMAL(10,2)) FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 6, CAST(june AS DECIMAL(10,2)) FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 7, CAST(july AS DECIMAL(10,2)) FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 8, CAST(august AS DECIMAL(10,2)) FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 9, CAST(september AS DECIMAL(10,2)) FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 10, CAST(october AS DECIMAL(10,2)) FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 11, CAST(november AS DECIMAL(10,2)) FROM raw_brazil_inflation
                UNION ALL
                SELECT CAST(year AS UNSIGNED), 12, CAST(december AS DECIMAL(10,2)) FROM raw_brazil_inflation
            ) AS unpivoted_data
            WHERE inflation_rate IS NOT NULL
            ORDER BY year DESC, month DESC;
        """
    },
    # Monthly sales aggregate from fact_sales: distinct order count, summed
    # price and average price per row, grouped by purchase year and month.
    {
        "name": "2. Agregasi Penjualan Per Bulan",
        "desc": "Menghitung total pembelian customer per bulan.",
        "sql": """
            CREATE TABLE IF NOT EXISTS fact_monthly_purchasing_power AS
            SELECT 
                CAST(YEAR(purchase_date) AS UNSIGNED) as sales_year,
                CAST(MONTH(purchase_date) AS UNSIGNED) as sales_month,
                COUNT(DISTINCT order_id) as total_transactions,
                SUM(price) as total_spending_revenue,
                AVG(price) as avg_spending_per_item
            FROM fact_sales
            WHERE purchase_date IS NOT NULL
            GROUP BY 1, 2;
        """
    },
    # Final mart: monthly sales joined to the inflation rate of the same
    # year and month. period_date is the first day of the month, built from
    # the year and the zero-padded month. The join is an inner join, so months
    # without a matching inflation row are left out.
    {
        "name": "3. Final Table: Korelasi Inflasi vs Daya Beli",
        "desc": "Menggabungkan Data Inflasi dan Data Penjualan (SAFE DATE CONSTRUCT).",
        "sql": """
            CREATE TABLE IF NOT EXISTS mart_inflation_analysis AS
            SELECT 
                s.sales_year,
                s.sales_month,
                
                CAST(
                    CONCAT(
                        CAST(s.sales_year AS CHAR), '-', 
                        LPAD(CAST(s.sales_month AS CHAR), 2, '0'), 
                        '-01'
                    ) 
                AS DATE) as period_date,
                
                s.total_transactions,
                s.total_spending_revenue,
                s.avg_spending_per_item,
                i.inflation_rate
            FROM fact_monthly_purchasing_power s
            JOIN dim_brazil_inflation i 
                ON s.sales_year = i.year 
                AND s.sales_month = i.month
            ORDER BY s.sales_year DESC, s.sales_month DESC;
        """
    }
]

def run_inflation_analysis():
    """Rebuild the inflation-analysis tables listed in ``INFLATION_QUERIES``.

    Each step drops its target table (named by the text following
    ``CREATE TABLE IF NOT EXISTS``), runs the step's SQL through the
    module-level ``engine`` and prints the elapsed wall-clock time. A failing
    step is printed and the next step still runs.
    """
    print("\n--- MULAI PROSES ELT: PHASE 3 (INFLATION ANALYSIS - FINAL FIX) ---")
    create_prefix = "CREATE TABLE IF NOT EXISTS "

    with engine.connect() as db:
        for step in INFLATION_QUERIES:
            print(f"üîÑ Running: {step['name']}...")
            started = time.time()
            try:
                ddl = step['sql']
                # The table name is the first space-delimited token after the prefix.
                target = ddl.split(create_prefix)[1].split(" ")[0]

                db.execute(text(f"DROP TABLE IF EXISTS {target}"))
                db.execute(text(ddl))

                elapsed = time.time() - started
                print(f"   ‚úÖ Selesai! ({elapsed:.2f} detik)")
            except Exception as err:
                print(f"   ‚ùå GAGAL: {err}")

# Run the inflation-analysis phase when this cell is executed directly.
if __name__ == "__main__":
    run_inflation_analysis()


--- MULAI PROSES ELT: PHASE 3 (INFLATION ANALYSIS - FINAL FIX) ---
üîÑ Running: 1. Cleaning & Unpivot Data Inflasi...
   ‚úÖ Selesai! (0.04 detik)
üîÑ Running: 2. Agregasi Penjualan Per Bulan...
   ‚úÖ Selesai! (4.45 detik)
üîÑ Running: 3. Final Table: Korelasi Inflasi vs Daya Beli...
   ‚úÖ Selesai! (0.03 detik)


# **VALIDATION**

In [12]:
def run_data_quality_checks():
    """Run the warehouse data-quality rules and print a PASS/FAIL report.

    Every rule is a SQL query; the first column of its first row is compared
    with the rule's threshold and the rule passes when the value does not
    exceed it. A query or comparison error is printed and counts as a
    failure. Uses the module-level ``engine``.
    """
    print("\n=== MULAI ELT DATA QUALITY ASSURANCE (REVISI) ===\n")

    checks = [
        dict(
            rule="1. Duplicate Row Check (Revisi)",
            desc="Memastikan tidak ada duplikasi data baris yang sama persis (karena order_id boleh berulang)",
            sql="""
                SELECT COUNT(*) - COUNT(DISTINCT order_id, product_id, seller_id, order_status) 
                FROM fact_sales;
            """,
            threshold=0,
        ),
        dict(
            rule="2. Null Check",
            desc="Memastikan tidak ada Revenue/Price yang NULL di Fact Sales",
            sql="""
                SELECT COUNT(*) as null_count 
                FROM fact_sales 
                WHERE price IS NULL OR freight_value IS NULL;
            """,
            threshold=0,
        ),
        dict(
            rule="3. Range Check",
            desc="Memastikan tidak ada harga atau ongkir bernilai negatif",
            sql="""
                SELECT COUNT(*) as negative_values 
                FROM fact_sales 
                WHERE price < 0 OR freight_value < 0;
            """,
            threshold=0,
        ),
        dict(
            rule="4. Datatype Consistency",
            desc="Memastikan konversi tanggal berhasil (Tidak ada NULL)",
            sql="""
                SELECT COUNT(*) as invalid_dates 
                FROM fact_sales 
                WHERE purchase_date IS NULL;
            """,
            threshold=0,
        ),
        dict(
            rule="5. Referential Integrity",
            desc="Memastikan semua Product ID di Sales ada di Dimensi Produk",
            sql="""
                SELECT COUNT(*) as orphan_records
                FROM fact_sales f
                LEFT JOIN dim_products p ON f.product_id = p.product_id
                WHERE p.product_id IS NULL;
            """,
            threshold=0,
        ),
        dict(
            rule="6. Distribusi Data (Distribution Check)",
            desc="Memastikan Rate Inflasi berada dalam rentang wajar (-10% sampai 100%)",
            sql="""
                SELECT COUNT(*) as outlier_count
                FROM dim_brazil_inflation
                WHERE inflation_rate < -10 OR inflation_rate > 100;
            """,
            threshold=0,
        ),
    ]

    any_failure = False
    separator = "-" * 50

    with engine.connect() as db:
        for check in checks:
            print(f"üîé Checking: {check['rule']}")
            print(f"   Context : {check['desc']}")

            try:
                observed = db.execute(text(check['sql'])).fetchone()[0]
                within_limit = observed <= check['threshold']

                if within_limit:
                    print(f"   ‚úÖ PASS (Result: {observed} rows)")
                else:
                    print(f"   ‚ùå FAIL (Result: {observed} rows found, Threshold allowed: {check['threshold']})")
                    any_failure = True

            except Exception as err:
                # An unreadable result is treated the same as a failed rule.
                print(f"   ‚ö†Ô∏è ERROR execution: {err}")
                any_failure = True

            print(separator)

        if any_failure:
            print("\n‚ö†Ô∏è ADA VALIDASI YANG GAGAL. Periksa laporan di atas.")
        else:
            print("\nüéâ SELURUH DATA QUALITY CHECK BERHASIL! Data Warehouse Siap Digunakan.")

# Run the data-quality checks when this cell is executed directly.
if __name__ == "__main__":
    run_data_quality_checks()


=== MULAI ELT DATA QUALITY ASSURANCE (REVISI) ===

üîé Checking: 1. Duplicate Row Check (Revisi)
   Context : Memastikan tidak ada duplikasi data baris yang sama persis (karena order_id boleh berulang)
   ‚úÖ PASS (Result: 0 rows)
--------------------------------------------------
üîé Checking: 2. Null Check
   Context : Memastikan tidak ada Revenue/Price yang NULL di Fact Sales
   ‚úÖ PASS (Result: 0 rows)
--------------------------------------------------
üîé Checking: 3. Range Check
   Context : Memastikan tidak ada harga atau ongkir bernilai negatif
   ‚úÖ PASS (Result: 0 rows)
--------------------------------------------------
üîé Checking: 4. Datatype Consistency
   Context : Memastikan konversi tanggal berhasil (Tidak ada NULL)
   ‚úÖ PASS (Result: 0 rows)
--------------------------------------------------
üîé Checking: 5. Referential Integrity
   Context : Memastikan semua Product ID di Sales ada di Dimensi Produk
   ‚úÖ PASS (Result: 0 rows)
-----------------------------

In [13]:
from IPython.display import display


# Rebuild the engine for the warehouse database (same URL as the earlier cells).
connection_str = f"mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(connection_str)

# Warehouse tables to profile, in the order they are reported.
target_tables = [
    'dim_customers',
    'dim_products',
    'dim_brazil_inflation',
    'fact_sales',
    'fact_monthly_purchasing_power',
    'mart_inflation_analysis'
]

# --- 3. ANALYSIS FUNCTION ---
def inspect_table_quality(table_name, engine):
    """Print a quality profile of one table.

    Reads the whole table into a DataFrame and reports its row and column
    counts, the number of exactly duplicated rows, the per-column NULL
    counts, a three-row preview and the column dtypes. Any error raised while
    reading or reporting is printed instead of propagated.

    Args:
        table_name: Name of the table to read; interpolated into the query.
        engine: Connectable handed to ``pd.read_sql``.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"üìÇ TABEL: {table_name.upper()}")
    print(banner)

    try:
        # Pull the full table so every statistic is computed on all rows.
        frame = pd.read_sql(f"SELECT * FROM {table_name}", engine)

        n_rows, n_cols = frame.shape
        duplicate_rows = frame.duplicated().sum()
        nulls_per_column = frame.isnull().sum()
        has_nulls = nulls_per_column.sum() != 0

        # --- Report ---
        print("üìä STATISTIK UTAMA:")
        print(f"   ‚Ä¢ Total Baris    : {n_rows:,}")
        print(f"   ‚Ä¢ Total Kolom    : {n_cols}")
        print(f"   ‚Ä¢ Data Duplikat  : {duplicate_rows} baris")

        print("\nüîç ANALISIS NULL (MISSING VALUES):")
        if has_nulls:
            print("   ‚ö†Ô∏è Ditemukan kolom dengan NULL:")
            # List only the columns that actually contain NULLs.
            print(nulls_per_column[nulls_per_column > 0].to_string())
        else:
            print("   ‚úÖ CLEAN! Tidak ada data kosong (NULL) di tabel ini.")

        print("\nüëÄ PREVIEW DATA (3 Baris Teratas):")
        display(frame.head(3))

        print("\nüìã TIPE DATA KOLOM:")
        print(frame.dtypes)

    except Exception as err:
        print(f"‚ùå GAGAL MEMBACA TABEL: {err}")

# --- 4. RUN THE INSPECTION LOOP ---
# Profile every table in target_tables when this cell is executed directly.
if __name__ == "__main__":
    print("MENGAMBIL DATA DARI WAREHOUSE... MOHON TUNGGU.")
    for table in target_tables:
        inspect_table_quality(table, engine)
        print("\n" + "-"*60 + "\n") # Separator between tables

MENGAMBIL DATA DARI WAREHOUSE... MOHON TUNGGU.

üìÇ TABEL: DIM_CUSTOMERS
üìä STATISTIK UTAMA:
   ‚Ä¢ Total Baris    : 99,441
   ‚Ä¢ Total Kolom    : 5
   ‚Ä¢ Data Duplikat  : 0 baris

üîç ANALISIS NULL (MISSING VALUES):
   ‚úÖ CLEAN! Tidak ada data kosong (NULL) di tabel ini.

üëÄ PREVIEW DATA (3 Baris Teratas):


Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP



üìã TIPE DATA KOLOM:
customer_id                 object
customer_unique_id          object
customer_zip_code_prefix    object
customer_city               object
customer_state              object
dtype: object

------------------------------------------------------------


üìÇ TABEL: DIM_PRODUCTS
üìä STATISTIK UTAMA:
   ‚Ä¢ Total Baris    : 32,951
   ‚Ä¢ Total Kolom    : 6
   ‚Ä¢ Data Duplikat  : 0 baris

üîç ANALISIS NULL (MISSING VALUES):
   ‚ö†Ô∏è Ditemukan kolom dengan NULL:
product_weight_g     2
product_length_cm    2
product_height_cm    2
product_width_cm     2

üëÄ PREVIEW DATA (3 Baris Teratas):


Unnamed: 0,product_id,category_name,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,e3e020af31d4d89d2602272b315c3f6e,health_beauty,75,21,7,13
1,c5d8079278e912d7e3b6beb48ecb56e8,health_beauty,1300,19,28,17
2,36555a2f528d7b2a255c504191445d39,health_beauty,1467,23,16,20



üìã TIPE DATA KOLOM:
product_id           object
category_name        object
product_weight_g     object
product_length_cm    object
product_height_cm    object
product_width_cm     object
dtype: object

------------------------------------------------------------


üìÇ TABEL: DIM_BRAZIL_INFLATION
üìä STATISTIK UTAMA:
   ‚Ä¢ Total Baris    : 469
   ‚Ä¢ Total Kolom    : 3
   ‚Ä¢ Data Duplikat  : 0 baris

üîç ANALISIS NULL (MISSING VALUES):
   ‚úÖ CLEAN! Tidak ada data kosong (NULL) di tabel ini.

üëÄ PREVIEW DATA (3 Baris Teratas):


Unnamed: 0,year,month,inflation_rate
0,2019,1,0.32
1,2018,12,0.15
2,2018,11,-0.21



üìã TIPE DATA KOLOM:
year                int64
month               int64
inflation_rate    float64
dtype: object

------------------------------------------------------------


üìÇ TABEL: FACT_SALES
üìä STATISTIK UTAMA:
   ‚Ä¢ Total Baris    : 102,425
   ‚Ä¢ Total Kolom    : 12
   ‚Ä¢ Data Duplikat  : 0 baris

üîç ANALISIS NULL (MISSING VALUES):
   ‚ö†Ô∏è Ditemukan kolom dengan NULL:
approved_date       14
carrier_date      1028
delivered_date    2230

üëÄ PREVIEW DATA (3 Baris Teratas):


Unnamed: 0,order_id,customer_id,product_id,seller_id,order_status,purchase_date,approved_date,carrier_date,delivered_date,estimated_date,price,freight_value
0,a41753c6a1d8accb89732e36243432d7,9c2f403519bcb363683a7179f0f94bd4,e3e020af31d4d89d2602272b315c3f6e,94144541854e298c2d976cb893b81343,delivered,2017-05-12 10:51:43,2017-05-12 11:05:13,2017-05-16 09:14:33,2017-05-19 14:13:02,2017-06-05,29.9,15.79
1,6a1594b5f5cfc5bac6dcdc3f48f22b5e,397cbe809e45d41179dcdd64966e4747,c5d8079278e912d7e3b6beb48ecb56e8,abcd2cb37d46c2c8fb1bf071c859fc5b,delivered,2018-03-06 18:14:49,2018-03-06 18:30:16,2018-03-07 22:38:47,2018-03-14 14:38:53,2018-04-12,95.9,27.22
2,f6fbf7907913892ffc12ada3bff286ba,1d7d832199ca5ea415e212d226ae106c,c5d8079278e912d7e3b6beb48ecb56e8,abcd2cb37d46c2c8fb1bf071c859fc5b,delivered,2017-07-31 18:47:11,2017-07-31 19:03:15,2017-08-02 17:49:51,2017-08-10 20:04:29,2017-08-30,89.9,38.18



üìã TIPE DATA KOLOM:
order_id                  object
customer_id               object
product_id                object
seller_id                 object
order_status              object
purchase_date     datetime64[ns]
approved_date     datetime64[ns]
carrier_date      datetime64[ns]
delivered_date    datetime64[ns]
estimated_date    datetime64[ns]
price                    float64
freight_value            float64
dtype: object

------------------------------------------------------------


üìÇ TABEL: FACT_MONTHLY_PURCHASING_POWER
üìä STATISTIK UTAMA:
   ‚Ä¢ Total Baris    : 24
   ‚Ä¢ Total Kolom    : 5
   ‚Ä¢ Data Duplikat  : 0 baris

üîç ANALISIS NULL (MISSING VALUES):
   ‚úÖ CLEAN! Tidak ada data kosong (NULL) di tabel ini.

üëÄ PREVIEW DATA (3 Baris Teratas):


Unnamed: 0,sales_year,sales_month,total_transactions,total_spending_revenue,avg_spending_per_item
0,2016,9,3,177.38,44.345
1,2016,10,308,46276.02,141.516881
2,2016,12,1,10.9,10.9



üìã TIPE DATA KOLOM:
sales_year                  int64
sales_month                 int64
total_transactions          int64
total_spending_revenue    float64
avg_spending_per_item     float64
dtype: object

------------------------------------------------------------


üìÇ TABEL: MART_INFLATION_ANALYSIS
üìä STATISTIK UTAMA:
   ‚Ä¢ Total Baris    : 24
   ‚Ä¢ Total Kolom    : 7
   ‚Ä¢ Data Duplikat  : 0 baris

üîç ANALISIS NULL (MISSING VALUES):
   ‚úÖ CLEAN! Tidak ada data kosong (NULL) di tabel ini.

üëÄ PREVIEW DATA (3 Baris Teratas):


Unnamed: 0,sales_year,sales_month,period_date,total_transactions,total_spending_revenue,avg_spending_per_item,inflation_rate
0,2018,9,2018-09-01,1,145.0,145.0,0.48
1,2018,8,2018-08-01,6452,816588.21,121.371613,-0.09
2,2018,7,2018-07-01,6273,837582.82,129.017686,0.33



üìã TIPE DATA KOLOM:
sales_year                  int64
sales_month                 int64
period_date                object
total_transactions          int64
total_spending_revenue    float64
avg_spending_per_item     float64
inflation_rate            float64
dtype: object

------------------------------------------------------------



In [15]:
# Folder the CSV exports are written to, relative to the working directory.
OUTPUT_DIR = '../warehouse/elt/'

# Rebuild the engine for the warehouse database (same URL as the earlier cells).
connection_str = f"mysql+mysqlconnector://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(connection_str)

# Warehouse tables written out as "<table>.csv" by export_tables.
TABLES_TO_EXPORT = [
    'mart_inflation_analysis',
    'fact_sales',
    'dim_products',
    'dim_customers'
]

def export_tables():
    """Write each table in ``TABLES_TO_EXPORT`` to ``OUTPUT_DIR`` as a CSV file.

    Creates the output folder when it does not exist yet. Every table is read
    in full through the module-level ``engine`` and saved as ``<table>.csv``
    without the index. A failing table is printed and the remaining tables
    are still exported.
    """
    print("--- MULAI EXPORT TABEL KE CSV ---\n")

    folder_missing = not os.path.exists(OUTPUT_DIR)
    if folder_missing:
        os.makedirs(OUTPUT_DIR)
        print(f"üìÅ Membuat folder: {OUTPUT_DIR}")

    for table_name in TABLES_TO_EXPORT:
        # Keep the cursor on the same line so the result follows the table name.
        print(f"üíæ Exporting: {table_name}...", end=" ")

        try:
            frame = pd.read_sql(f"SELECT * FROM {table_name}", engine)
            csv_path = os.path.join(OUTPUT_DIR, f"{table_name}.csv")
            frame.to_csv(csv_path, index=False)
        except Exception as err:
            print(f"‚ùå GAGAL: {err}")
        else:
            print(f"‚úÖ Sukses! ({len(frame)} baris) -> Tersimpan di {csv_path}")

    print("\nüéâ SEMUA PROSES EXPORT SELESAI.")

# Run the CSV export when this cell is executed directly.
if __name__ == "__main__":
    export_tables()

--- MULAI EXPORT TABEL KE CSV ---

üíæ Exporting: mart_inflation_analysis... ‚úÖ Sukses! (24 baris) -> Tersimpan di ../warehouse/elt/mart_inflation_analysis.csv
üíæ Exporting: fact_sales... ‚úÖ Sukses! (102425 baris) -> Tersimpan di ../warehouse/elt/fact_sales.csv
üíæ Exporting: dim_products... ‚úÖ Sukses! (32951 baris) -> Tersimpan di ../warehouse/elt/dim_products.csv
üíæ Exporting: dim_customers... ‚úÖ Sukses! (99441 baris) -> Tersimpan di ../warehouse/elt/dim_customers.csv

üéâ SEMUA PROSES EXPORT SELESAI.
