In [1]:
# Cell 1: Setup and Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from datetime import datetime, timedelta
from dotenv import load_dotenv

# Database connection
from sqlalchemy import create_engine
from google.cloud import bigquery
from google.oauth2 import service_account
import os
from pathlib import Path

# Configuration
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Plot styling
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

✅ All libraries imported successfully!
Pandas version: 2.1.4
Analysis started at: 2025-08-28 16:54:26


In [4]:
# Cell 2: BigQuery Authentication and Connection Setup

# Load environment variables from .env file
load_dotenv()

# Load credentials (path relative to Notebooks folder)
CREDENTIALS_PATH = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
PROJECT_ID = os.getenv('GCP_PROJECT_ID')
DATASET_ID = 'olist_marts'

# Authenticate with Google Cloud
if CREDENTIALS_PATH and os.path.exists(CREDENTIALS_PATH):
    credentials = service_account.Credentials.from_service_account_file(
        CREDENTIALS_PATH,
        scopes=['https://www.googleapis.com/auth/cloud-platform']
    )
    
    # Create BigQuery client
    bq_client = bigquery.Client(credentials=credentials, project=PROJECT_ID)
    
    # Create SQLAlchemy engine for pandas integration
    connection_string = f"bigquery://{PROJECT_ID}/{DATASET_ID}"
    engine = create_engine(connection_string, credentials_path=str(CREDENTIALS_PATH))
    
    print("✅ BigQuery authentication successful!")
    print(f"Project ID: {PROJECT_ID}")
    print(f"Dataset ID: {DATASET_ID}")
    
else:
    print("❌ Credentials file not found!")
    print(f"Expected location: {CREDENTIALS_PATH}")
    if CREDENTIALS_PATH:
        print(f"Resolved path: {os.path.abspath(CREDENTIALS_PATH)}")
    print("Please check your .env file and ensure GOOGLE_APPLICATION_CREDENTIALS is set correctly.")

# Test connection
try:
    test_query = f"SELECT COUNT(*) as table_count FROM `{PROJECT_ID}.{DATASET_ID}.INFORMATION_SCHEMA.TABLES`"
    result = pd.read_sql(test_query, engine)
    print(f"✅ Connection test successful! Found {result.iloc[0,0]} tables in marts dataset.")
except Exception as e:
    print(f"❌ Connection test failed: {e}")

❌ Credentials file not found!
Expected location: ./credentials/sctp-dsai-468313-f5bc3e6b4ebe-innergritx.json
Resolved path: /home/chrisfkh/sctp-ds-ai/mod2/sctp-dsai-mod2-project-team4/Notebooks/credentials/sctp-dsai-468313-f5bc3e6b4ebe-innergritx.json
Please check your .env file and ensure GOOGLE_APPLICATION_CREDENTIALS is set correctly.
❌ Connection test failed: name 'engine' is not defined


In [5]:
# Cell 3: Load Star Schema Tables into DataFrames

print("📊 Loading star schema tables from BigQuery...")
print("-" * 50)

# Define table loading function for reusability
def load_table(table_name, sample_rows=None):
    """Load a table from BigQuery with optional row limiting for initial exploration"""
    try:
        if sample_rows:
            query = f"SELECT * FROM `{PROJECT_ID}.{DATASET_ID}.{table_name}` LIMIT {sample_rows}"
        else:
            query = f"SELECT * FROM `{PROJECT_ID}.{DATASET_ID}.{table_name}`"
        
        df = pd.read_sql(query, engine)
        print(f"✅ {table_name}: {len(df):,} rows × {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"❌ Error loading {table_name}: {e}")
        return None

# Load all dimensions and fact table
# Start with smaller tables first
dim_date = load_table('dim_date')
dim_customers = load_table('dim_customers')
dim_products = load_table('dim_products')
dim_sellers = load_table('dim_sellers') 
dim_orders = load_table('dim_orders')
dim_payments = load_table('dim_payments')
dim_reviews = load_table('dim_reviews')

# Load fact table (this might be large)
print("\n📈 Loading fact table (this may take a moment)...")
fact_sales = load_table('fact_sales')

print(f"\n🎯 Data loading complete!")
print(f"Total fact records: {len(fact_sales):,}")

📊 Loading star schema tables from BigQuery...
--------------------------------------------------
✅ dim_date: 634 rows × 9 columns
✅ dim_customers: 99,441 rows × 8 columns
✅ dim_products: 32,951 rows × 11 columns
✅ dim_sellers: 3,095 rows × 7 columns
✅ dim_orders: 99,441 rows × 10 columns
✅ dim_payments: 99,440 rows × 10 columns
✅ dim_reviews: 98,673 rows × 8 columns

📈 Loading fact table (this may take a moment)...
✅ fact_sales: 112,650 rows × 20 columns

🎯 Data loading complete!
Total fact records: 112,650


In [6]:
# Cell 4: Data Exploration and Schema Validation

print("🔍 STAR SCHEMA EXPLORATION")
print("=" * 60)

# Function to display table info
def explore_table(df, table_name):
    if df is not None:
        print(f"\n📋 {table_name.upper()}")
        print("-" * 30)
        print(f"Shape: {df.shape}")
        print(f"Columns: {', '.join(df.columns.tolist())}")
        print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
        
        # Show sample data
        print(f"\nSample data:")
        display(df.head(3))
        
        # Check for nulls
        null_counts = df.isnull().sum()
        if null_counts.sum() > 0:
            print(f"\nNull values:")
            print(null_counts[null_counts > 0])
        else:
            print("✅ No null values found")
        
        return True
    return False

# Explore each table
tables_info = [
    (fact_sales, 'Fact Sales'),
    (dim_customers, 'Dim Customers'),
    (dim_products, 'Dim Products'),
    (dim_sellers, 'Dim Sellers'),
    (dim_orders, 'Dim Orders'),
    (dim_payments, 'Dim Payments'),
    (dim_reviews, 'Dim Reviews'),
    (dim_date, 'Dim Date')
]

for df, name in tables_info:
    explore_table(df, name)

🔍 STAR SCHEMA EXPLORATION

📋 FACT SALES
------------------------------
Shape: (112650, 20)
Columns: order_item_sk, order_key, customer_key, product_key, seller_key, date_key, payment_key, review_key, item_price, freight_value, total_item_value, quantity, payment_value, total_installments, payment_methods_count, uses_credit_card, uses_boleto, uses_voucher, uses_debit_card, primary_payment_type
Memory usage: 87.92 MB

Sample data:


Unnamed: 0,order_item_sk,order_key,customer_key,product_key,seller_key,date_key,payment_key,review_key,item_price,freight_value,total_item_value,quantity,payment_value,total_installments,payment_methods_count,uses_credit_card,uses_boleto,uses_voucher,uses_debit_card,primary_payment_type
0,0159c6355a4e32f6ac68d838e2228150-1,0159c6355a4e32f6ac68d838e2228150,bf0176bae5facd261148fb881ccb8cc6,a00722035cea70bbf671b758459cde42,25be943a321c8938947bdaabca979a90,2018-06-15,0159c6355a4e32f6ac68d838e2228150,0159c6355a4e32f6ac68d838e2228150,189.0,53.11,242.11,1,242.11,15,1,False,False,False,False,CREDIT_CARD
1,02951078a21a2d9341ea16089a4d5031-1,02951078a21a2d9341ea16089a4d5031,6c51ed8c29ff8bc5f3b649dc7315b787,dd3575a8c5e2139f680a9816a15c8f2a,4869f7a5dfa277a7dca6462dcf3b52b2,2017-03-16,02951078a21a2d9341ea16089a4d5031,02951078a21a2d9341ea16089a4d5031,235.0,22.1,257.1,1,257.1,12,1,False,False,False,False,CREDIT_CARD
2,0332758ed6bf8f4c1ffa5b6bf5b5725d-1,0332758ed6bf8f4c1ffa5b6bf5b5725d,1f09757902b8881d55a29f5380d546de,3eef0cb94ba82de806bb30ab743c7655,7c67e1448b00f6e969d365cea6b010ab,2018-03-01,0332758ed6bf8f4c1ffa5b6bf5b5725d,0332758ed6bf8f4c1ffa5b6bf5b5725d,79.99,26.88,106.87,1,213.74,15,1,False,False,False,False,CREDIT_CARD



Null values:
payment_key      3
review_key     942
dtype: int64

📋 DIM CUSTOMERS
------------------------------
Shape: (99441, 8)
Columns: customer_key, customer_unique_id, customer_city, customer_state, customer_zip_prefix, customer_region, customer_economic_zone, customer_state_name
Memory usage: 54.81 MB

Sample data:


Unnamed: 0,customer_key,customer_unique_id,customer_city,customer_state,customer_zip_prefix,customer_region,customer_economic_zone,customer_state_name
0,005c611fb96be942e75d22129c370268,4619eedf5b092b15ebc4d916f8c99646,mossoro,RN,59610,Northeast,Northeast,Rio Grande do Norte
1,0062859a8f89e25c6b0e8dc905ccc59d,5cc1fbd02e6b7cd1113dd836fbbbcc13,teresina,PI,64073,Northeast,Northeast,Piauí
2,008d7d751e72ae684c0d31bc988b36fb,f3795e38cc5b655e26e94d1cc21263fa,teresina,PI,64078,Northeast,Northeast,Piauí


✅ No null values found

📋 DIM PRODUCTS
------------------------------
Shape: (32951, 11)
Columns: product_key, product_category_portuguese, product_weight_g, product_length_cm, product_height_cm, product_width_cm, product_photos_qty, product_name_length, product_description_length, product_volume_cm3, product_category_english
Memory usage: 9.21 MB

Sample data:


Unnamed: 0,product_key,product_category_portuguese,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_photos_qty,product_name_length,product_description_length,product_volume_cm3,product_category_english
0,003938452c98ff9ab28e9e7b4bfe97ab,moveis_sala,5050.0,50.0,8.0,50.0,2,52,420,20000.0,furniture_living_room
1,003c0b8f6580c850bd2e32044d2ac307,moveis_cozinha_area_de_servico_jantar_e_jardim,21100.0,100.0,20.0,60.0,2,55,705,120000.0,kitchen_dining_laundry_garden_furniture
2,004ffcbfa5aac82212a95bc972ea8a85,moveis_quarto,10000.0,65.0,35.0,35.0,2,44,776,79625.0,furniture_bedroom



Null values:
product_category_portuguese    610
product_category_english       610
dtype: int64

📋 DIM SELLERS
------------------------------
Shape: (3095, 7)
Columns: seller_key, seller_city, seller_state, seller_zip_prefix, seller_region, seller_economic_zone, seller_state_name
Memory usage: 1.45 MB

Sample data:


Unnamed: 0,seller_key,seller_city,seller_state,seller_zip_prefix,seller_region,seller_economic_zone,seller_state_name
0,0015a82c2db000af6aaaf3ae2ecb0532,santo andre,SP,9080,Southeast,Southeast,São Paulo
1,001cca7ae9ae17fb1caed9dfb1094831,cariacica,ES,29156,Southeast,Southeast,Espírito Santo
2,001e6ad469a905060d959994f1b41e4f,sao goncalo,RJ,24754,Southeast,Southeast,Rio de Janeiro


✅ No null values found

📋 DIM ORDERS
------------------------------
Shape: (99441, 10)
Columns: order_key, order_status, order_purchase_timestamp, order_approved_at, order_delivered_carrier_date, order_delivered_customer_date, order_estimated_delivery_date, days_to_delivery, delivery_vs_estimate_days, is_delivered_on_time
Memory usage: 23.39 MB

Sample data:


Unnamed: 0,order_key,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,days_to_delivery,delivery_vs_estimate_days,is_delivered_on_time
0,0017afd5076e074a48f1f1a4c7bac9c5,DELIVERED,2017-04-06 22:16:10+00:00,2017-04-06 22:25:19+00:00,2017-04-17 13:54:37+00:00,2017-05-23 08:32:07+00:00,2017-05-19 00:00:00+00:00,47.0,4.0,False
1,001c85b5f68d2be0cb0797afc9e8ce9a,DELIVERED,2017-11-24 19:19:18+00:00,2017-11-24 22:38:47+00:00,2017-11-27 12:42:15+00:00,2017-12-22 18:37:40+00:00,2017-12-14 00:00:00+00:00,28.0,8.0,False
2,001dbc16dc51075e987543d23a0507c7,DELIVERED,2017-01-28 13:17:57+00:00,2017-01-28 13:32:16+00:00,2017-02-01 15:59:46+00:00,2017-02-13 13:17:47+00:00,2017-03-20 00:00:00+00:00,16.0,-35.0,True



Null values:
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
days_to_delivery                 2965
delivery_vs_estimate_days        2965
is_delivered_on_time             2965
dtype: int64

📋 DIM PAYMENTS
------------------------------
Shape: (99440, 10)
Columns: payment_key, uses_credit_card, uses_boleto, uses_voucher, uses_debit_card, primary_payment_type, total_payment_value, total_installments, payment_methods_count, payment_transactions_count
Memory usage: 18.20 MB

Sample data:


Unnamed: 0,payment_key,uses_credit_card,uses_boleto,uses_voucher,uses_debit_card,primary_payment_type,total_payment_value,total_installments,payment_methods_count,payment_transactions_count
0,0159c6355a4e32f6ac68d838e2228150,False,False,False,False,CREDIT_CARD,242.11,15,1,1
1,02951078a21a2d9341ea16089a4d5031,False,False,False,False,CREDIT_CARD,257.1,12,1,1
2,0332758ed6bf8f4c1ffa5b6bf5b5725d,False,False,False,False,CREDIT_CARD,213.74,15,1,1


✅ No null values found

📋 DIM REVIEWS
------------------------------
Shape: (98673, 8)
Columns: review_key, review_score, review_comment_title, review_comment_message, review_creation_date, has_comment_title, has_comment_message, days_to_review
Memory usage: 20.41 MB

Sample data:


Unnamed: 0,review_key,review_score,review_comment_title,review_comment_message,review_creation_date,has_comment_title,has_comment_message,days_to_review
0,0017afd5076e074a48f1f1a4c7bac9c5,1,,,2017-05-21 00:00:00+00:00,False,False,45
1,00276d5c3491fbf55305e26891040df9,4,,,2018-03-17 00:00:00+00:00,False,False,32
2,0032d07457ae9c806c79368d7d9ce96b,1,,,2018-04-19 00:00:00+00:00,False,False,40



Null values:
review_comment_title      87122
review_comment_message    57898
dtype: int64

📋 DIM DATE
------------------------------
Shape: (634, 9)
Columns: date_key, full_date, year, month, month_name, quarter, day_of_week, day_name, is_weekend
Memory usage: 0.16 MB

Sample data:


Unnamed: 0,date_key,full_date,year,month,month_name,quarter,day_of_week,day_name,is_weekend
0,2016-09-04,2016-09-04,2016,9,September,3,1,Sunday,True
1,2016-09-05,2016-09-05,2016,9,September,3,2,Monday,False
2,2016-09-13,2016-09-13,2016,9,September,3,3,Tuesday,False


✅ No null values found
