# 00 - Setup Validation

This notebook validates the project setup, including:
- Python environment and dependencies
- GCP authentication and connectivity
- Presence of the raw data files
- Configuration validation

In [None]:
# Cell 1: Python Environment Check
# Report the interpreter version first, so it is visible even if one of the
# third-party imports below fails.
import sys
print(f"Python version: {sys.version}")

import pandas as pd
import numpy as np
import sklearn
import xgboost

# (display name, imported module) pairs, printed in this order.
core_packages = (
    ("pandas", pd),
    ("numpy", np),
    ("scikit-learn", sklearn),
    ("xgboost", xgboost),
)
for package_label, package_module in core_packages:
    print(f"{package_label}: {package_module.__version__}")

In [None]:
# Cell 2: GCP Libraries Check
# Confirms the GCP client libraries import and that Application Default
# Credentials can be resolved in this environment.
from google.cloud import aiplatform
from google.cloud import storage
import google.auth

print(f"google-cloud-aiplatform: {aiplatform.__version__}")

# Check authentication: google.auth.default() raises if no credentials are
# found, so reaching the print below means credentials were resolved.
credentials, project = google.auth.default()
print("\n✓ Authenticated")
if project:
    print(f"  Project: {project}")
else:
    # The credentials may carry no default project; make that explicit
    # instead of printing a bare "None" under a success mark.
    print("  Project: None")
    print("  ⚠ No default project detected; later cells use project_id from config.yaml")

In [None]:
# Cell 3: Load and Validate Config
# Loads ../configs/config.yaml into `config` and checks that every setting the
# following cells read is present, so a bad config fails here with one clear
# message rather than later with a bare KeyError/TypeError.
import yaml
from pathlib import Path

config_path = Path("../configs/config.yaml")
# Explicit encoding: the platform default is not UTF-8 everywhere.
with config_path.open(encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# An empty YAML document loads as None; a top-level list or scalar is also unusable.
if not isinstance(config, dict):
    raise ValueError(f"{config_path} must contain a YAML mapping, got {type(config).__name__}")

# section -> keys read by the cells in this notebook.
required_keys = {
    "gcp": ("project_id", "region", "bucket"),
    "vertex_ai": ("experiment_name",),
}
missing_keys = [
    f"{section}.{key}"
    for section, keys in required_keys.items()
    for key in keys
    if not isinstance(config.get(section), dict) or not config[section].get(key)
]
if missing_keys:
    raise KeyError(f"{config_path} is missing required settings: {', '.join(missing_keys)}")

print("Configuration loaded:")
print(f"  GCP Project: {config['gcp']['project_id']}")
print(f"  Region: {config['gcp']['region']}")
print(f"  Bucket: {config['gcp']['bucket']}")
print(f"  Experiment: {config['vertex_ai']['experiment_name']}")

In [None]:
# Cell 4: Verify GCS Bucket Access
# Looks up the configured bucket and prints its location and storage class.
# Any failure is printed rather than raised, so the notebook keeps running.
from google.cloud import storage

gcp_settings = config["gcp"]
client = storage.Client(project=gcp_settings["project_id"])
bucket_name = gcp_settings["bucket"]

try:
    bucket = client.get_bucket(bucket_name)
    bucket_report = [
        f"✓ Bucket '{bucket_name}' accessible",
        f"  Location: {bucket.location}",
        f"  Storage class: {bucket.storage_class}",
    ]
    print("\n".join(bucket_report))
except Exception as bucket_error:
    print(f"✗ Bucket error: {bucket_error}")

In [None]:
# Cell 5: Verify Vertex AI Connection
# Initialises the Vertex AI SDK with the configured project/region, then lists
# experiments as a round-trip check against the service.
from google.cloud import aiplatform

vertex_project = config["gcp"]["project_id"]
vertex_location = config["gcp"]["region"]

aiplatform.init(project=vertex_project, location=vertex_location)

print("✓ Vertex AI initialized")
print(f"  Project: {vertex_project}")
print(f"  Location: {vertex_location}")

# List existing experiments (if any)
experiments = aiplatform.Experiment.list()
experiment_count = len(experiments)
print(f"  Existing experiments: {experiment_count}")

In [None]:
# Cell 6: Create Experiment (if not exists)
# Binds `experiment` to the configured Vertex AI experiment, creating it on
# first run.
from google.cloud import aiplatform

experiment_name = config["vertex_ai"]["experiment_name"]

# get_or_create fetches the experiment when it already exists and creates it
# otherwise. Unlike a blanket `except Exception` around create(), real failures
# (permissions, network, invalid name) surface as errors instead of being
# reported as "already exists".
experiment = aiplatform.Experiment.get_or_create(
    experiment_name=experiment_name,
    description="Olist Customer Intelligence Platform - ML Experiments",
)
print(f"✓ Experiment ready: {experiment_name}")

In [None]:
# Cell 7: Verify Data Files
# Checks that every expected raw CSV exists under data_path and sets
# `all_present`, which the following cells read. Names of absent files are
# collected in `missing_files`.
from pathlib import Path

# Directory holding the raw CSVs, relative to the notebook's working directory.
data_path = Path("../data/raw")
expected_files = [
    "olist_orders_dataset.csv",
    "olist_order_items_dataset.csv",
    "olist_products_dataset.csv",
    "olist_customers_dataset.csv",
    "olist_sellers_dataset.csv",
    "olist_order_payments_dataset.csv",
    "olist_order_reviews_dataset.csv",
    "olist_geolocation_dataset.csv",
    "product_category_name_translation.csv",
]

print("Data files:")
missing_files = []
for filename in expected_files:
    # is_file() rather than exists(): a directory with the same name must not
    # be accepted as the CSV.
    if (data_path / filename).is_file():
        print(f"  ✓ {filename}")
    else:
        print(f"  ✗ {filename} - MISSING")
        missing_files.append(filename)

all_present = not missing_files

if all_present:
    # Count derived from the list so the message cannot drift from expected_files.
    print(f"\n✓ All {len(expected_files)} data files present")
else:
    print("\n✗ Some data files are missing. Please download from Kaggle.")

In [None]:
# Cell 8: Quick Data Overview (only runs if files are present)
# Reads each expected CSV in full and prints its row/column counts and
# in-memory size. `total_memory` (MB) stays 0 when the overview is skipped.
total_memory = 0

if not all_present:
    print("Skipping data overview - files not present")
else:
    print("Dataset Overview:\n")

    for filename in expected_files:
        frame = pd.read_csv(data_path / filename)
        n_rows, n_cols = frame.shape
        # deep=True also counts the contents of object (string) columns.
        size_mb = frame.memory_usage(deep=True).sum() / 1024**2
        total_memory += size_mb
        print(filename)
        print(f"  Rows: {n_rows:,} | Cols: {n_cols} | Memory: {size_mb:.2f} MB\n")

    print(f"Total memory: {total_memory:.2f} MB")

In [None]:
# Cell 9: Setup Summary
# Prints a recap built from values set by earlier cells. Only the two data
# lines and the closing hint depend on `all_present`; the environment, project,
# bucket and experiment lines are always printed with a check mark.
banner = "=" * 50
print(banner)
print("DAY 0 SETUP VALIDATION COMPLETE")
print(banner)

if all_present:
    data_mark, size_mark = "✓", "✓"
    data_status = "All 9 CSVs present"
    next_step = "Ready for Day 1: Data Engineering & EDA"
else:
    data_mark, size_mark = "✗", "○"
    data_status = "MISSING - download from Kaggle"
    next_step = "Please download data before proceeding"

# Leading/trailing empty strings reproduce the blank lines around the recap.
summary_lines = [
    "",
    f"✓ Python environment: {sys.version.split()[0]}",
    f"✓ GCP Project: {config['gcp']['project_id']}",
    f"✓ GCS Bucket: {config['gcp']['bucket']}",
    f"✓ Vertex AI Experiment: {config['vertex_ai']['experiment_name']}",
    f"{data_mark} Data files: {data_status}",
    f"{size_mark} Total data size: {total_memory:.2f} MB",
    "",
    next_step,
    "",
]
print("\n".join(summary_lines))