# Phase 1: Load & Validate Point Cloud
## Alaca Cesmesi Scan-to-HBIM V6 Pipeline

This notebook loads the raw point cloud from GCS and performs initial validation.

**Input:** `gs://alaca-cesme-hbim-v6/raw/pointcloud/alaca_cesme_raw.ply` (a single PLY file; set via `RAW_INPUT_PATH`)  
**Output:** `gs://alaca-cesme-hbim-v6/processed/v{N}/01_raw/`

## 1. Setup & Dependencies

In [None]:
# Install required packages
!pip install -q open3d google-cloud-storage numpy

In [None]:
# Imports
import open3d as o3d
import numpy as np
import json
import os
from datetime import datetime
from google.cloud import storage
from google.colab import auth

# Report the library versions in use (one write, one line per library)
print(
    f"Open3D version: {o3d.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

## 2. GCS Configuration

In [None]:
# Authenticate the Colab session before any GCS access
auth.authenticate_user()

# --- Pipeline configuration ---
BUCKET_NAME = "alaca-cesme-hbim-v6"
PROJECT_ID = "concrete-racer-470219-h8"
VERSION = "v1"  # Change for each run

# --- Object paths inside the bucket ---
RAW_INPUT_PATH = "raw/pointcloud/alaca_cesme_raw.ply"
OUTPUT_BASE = f"processed/{VERSION}/01_raw/"

# --- Local scratch files ---
LOCAL_INPUT = "/content/input.ply"
LOCAL_OUTPUT_PLY = "/content/01_raw_pointcloud.ply"
LOCAL_OUTPUT_JSON = "/content/01_raw_stats.json"

# Echo the effective configuration, one "Label: value" line per setting
print(
    "\n".join(
        f"{label}: {value}"
        for label, value in (
            ("Bucket", BUCKET_NAME),
            ("Version", VERSION),
            ("Input", RAW_INPUT_PATH),
            ("Output", OUTPUT_BASE),
        )
    )
)

## 3. Download from GCS

In [None]:
def download_from_gcs(bucket_name, blob_name, local_path, client=None):
    """
    Download a single object from a GCS bucket to a local file.

    Args:
        bucket_name: Name of the GCS bucket (without the ``gs://`` prefix).
        blob_name: Object path inside the bucket.
        local_path: Destination path on the local filesystem. Missing parent
            directories are created.
        client: Optional storage client to reuse. When omitted, a new
            ``storage.Client`` is created for ``PROJECT_ID``.

    Returns:
        The ``local_path`` that was written.
    """
    if client is None:
        client = storage.Client(project=PROJECT_ID)
    blob = client.bucket(bucket_name).blob(blob_name)

    # The destination's parent directory must exist before the download can
    # write the file; a bare filename has no parent to create.
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    print(f"Downloading: gs://{bucket_name}/{blob_name}")
    blob.download_to_filename(local_path)
    size_mb = os.path.getsize(local_path) / (1024 * 1024)
    print(f"Downloaded: {local_path} ({size_mb:.1f} MB)")
    return local_path

# Fetch the raw scan from the bucket into the local input path
download_from_gcs(
    bucket_name=BUCKET_NAME,
    blob_name=RAW_INPUT_PATH,
    local_path=LOCAL_INPUT,
)

## 4. Load Point Cloud

In [None]:
def load_point_cloud(file_path):
    """
    Load a point cloud from file.

    Open3D's read_point_cloud() automatically detects file format.
    For large files (>100MB), binary formats are significantly faster.

    Args:
        file_path: Path to the point cloud file on the local filesystem.

    Returns:
        The point cloud object returned by ``o3d.io.read_point_cloud``.

    Raises:
        FileNotFoundError: If ``file_path`` does not point to an existing file.
    """
    # read_point_cloud() does not raise for an unreadable path; it logs a
    # warning and returns an empty cloud. Fail loudly here so a missing
    # download is not mistaken for an empty scan.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Point cloud file not found: {file_path}")

    print(f"Loading point cloud: {file_path}")
    pcd = o3d.io.read_point_cloud(file_path)
    print(f"Points loaded: {len(pcd.points):,}")
    return pcd

# Read the downloaded scan into memory and confirm the point count
pcd = load_point_cloud(file_path=LOCAL_INPUT)
print(f"\nPoint cloud loaded with {len(pcd.points):,} points")

## 5. Validate Point Cloud

In [None]:
def validate_point_cloud(pcd, input_filename):
    """
    Validate a point cloud and summarize its basic properties.

    What is checked / recorded:
        1. Point count: must be > 0, and at least one point must be finite.
        2. Non-finite points: number of points with a NaN/Inf coordinate.
        3. Colors / normals: only their presence is recorded.
        4. Bounding box: axis-aligned spatial extent of the data.

    Args:
        pcd: Point cloud exposing ``points``, ``has_colors()``,
            ``has_normals()`` and ``get_axis_aligned_bounding_box()``.
        input_filename: Name stored in the report under ``file_name``.

    Returns:
        A JSON-serializable dict of statistics.

    Raises:
        ValueError: If the cloud is empty or contains no finite points.
    """
    points = np.asarray(pcd.points)

    if len(points) == 0:
        raise ValueError("Point cloud is empty!")

    # A NaN/Inf coordinate would make the extent meaningless and would be
    # written as NaN/Infinity tokens, which are not valid JSON.
    finite_mask = np.isfinite(points).all(axis=1)
    non_finite_count = int(len(points) - np.count_nonzero(finite_mask))
    if non_finite_count == len(points):
        raise ValueError("Point cloud contains no finite points!")

    if non_finite_count:
        print(
            f"WARNING: {non_finite_count:,} points have NaN/Inf coordinates "
            "and are excluded from the bounding box"
        )
        # Fall back to NumPy so the extent reflects only usable points
        finite_points = points[finite_mask]
        min_bound = finite_points.min(axis=0)
        max_bound = finite_points.max(axis=0)
        center = (min_bound + max_bound) / 2.0
    else:
        # Clean cloud: keep using Open3D's own bounding box
        bbox = pcd.get_axis_aligned_bounding_box()
        min_bound = bbox.get_min_bound()
        max_bound = bbox.get_max_bound()
        center = bbox.get_center()
    dimensions = max_bound - min_bound

    # Check attributes
    has_colors = pcd.has_colors()
    has_normals = pcd.has_normals()

    stats = {
        "file_name": input_filename,
        "point_count": len(points),
        "point_count_formatted": f"{len(points):,}",
        "non_finite_point_count": non_finite_count,
        "bounding_box": {
            "min": min_bound.tolist(),
            "max": max_bound.tolist(),
            "dimensions": dimensions.tolist(),
            "center": center.tolist()
        },
        "dimensions_formatted": {
            "width_x": f"{dimensions[0]:.3f}m",
            "depth_y": f"{dimensions[1]:.3f}m",
            "height_z": f"{dimensions[2]:.3f}m"
        },
        "has_colors": has_colors,
        "has_normals": has_normals,
        "timestamp": datetime.now().isoformat(),
        "pipeline_version": "v6",
        "phase": "01_load"
    }

    return stats

# Run validation on the loaded cloud
stats = validate_point_cloud(pcd, input_filename=os.path.basename(RAW_INPUT_PATH))

# Emit the human-readable report in a single write
print(
    "\n".join(
        [
            "\n" + "=" * 60,
            "POINT CLOUD VALIDATION REPORT",
            "=" * 60,
            f"\nFile: {stats['file_name']}",
            f"Points: {stats['point_count_formatted']}",
            "\nBounding Box:",
            f"  Width (X):  {stats['dimensions_formatted']['width_x']}",
            f"  Depth (Y):  {stats['dimensions_formatted']['depth_y']}",
            f"  Height (Z): {stats['dimensions_formatted']['height_z']}",
            "\nAttributes:",
            f"  Has Colors:  {'Yes' if stats['has_colors'] else 'No'}",
            f"  Has Normals: {'Yes' if stats['has_normals'] else 'No (will compute)'}",
            "=" * 60,
        ]
    )
)

## 6. Save Outputs Locally

In [None]:
# Save point cloud (binary, uncompressed).
# write_point_cloud() signals failure through its boolean return value rather
# than raising, so check it instead of reporting "Saved" unconditionally.
if not o3d.io.write_point_cloud(LOCAL_OUTPUT_PLY, pcd, write_ascii=False, compressed=False):
    raise IOError(f"Failed to write point cloud: {LOCAL_OUTPUT_PLY}")
print(f"Saved: {LOCAL_OUTPUT_PLY}")

# Save statistics
with open(LOCAL_OUTPUT_JSON, 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2, ensure_ascii=False)
print(f"Saved: {LOCAL_OUTPUT_JSON}")

## 7. Upload to GCS

In [None]:
def upload_to_gcs(bucket_name, local_path, blob_name, client=None):
    """
    Upload a local file to a GCS bucket.

    Args:
        bucket_name: Name of the GCS bucket (without the ``gs://`` prefix).
        local_path: Path of the local file to upload.
        blob_name: Destination object path inside the bucket.
        client: Optional storage client to reuse. When omitted, a new
            ``storage.Client`` is created for ``PROJECT_ID``.

    Returns:
        The ``gs://`` URI of the uploaded object.

    Raises:
        FileNotFoundError: If ``local_path`` is not an existing file.
    """
    # Fail before creating a client if there is nothing to upload
    if not os.path.isfile(local_path):
        raise FileNotFoundError(f"Local file not found: {local_path}")

    if client is None:
        client = storage.Client(project=PROJECT_ID)
    blob = client.bucket(bucket_name).blob(blob_name)

    print(f"Uploading: {local_path} -> gs://{bucket_name}/{blob_name}")
    blob.upload_from_filename(local_path)
    size_mb = os.path.getsize(local_path) / (1024 * 1024)
    print(f"Uploaded: {size_mb:.1f} MB")
    return f"gs://{bucket_name}/{blob_name}"

# Push both artifacts to the versioned output prefix
ply_gcs_path = upload_to_gcs(
    bucket_name=BUCKET_NAME,
    local_path=LOCAL_OUTPUT_PLY,
    blob_name=f"{OUTPUT_BASE}01_raw_pointcloud.ply",
)
json_gcs_path = upload_to_gcs(
    bucket_name=BUCKET_NAME,
    local_path=LOCAL_OUTPUT_JSON,
    blob_name=f"{OUTPUT_BASE}01_raw_stats.json",
)

# Summarize where the artifacts landed
print(
    "\nOutputs uploaded to GCS:",
    f"  PLY:  {ply_gcs_path}",
    f"  JSON: {json_gcs_path}",
    sep="\n",
)

## 8. Return Status (for n8n)

In [None]:
# Build the status payload reported for the n8n workflow.
# Key order is kept as-is because it determines the order of the printed JSON.
status = dict(
    phase="01_load",
    status="success",
    version=VERSION,
    outputs=dict(
        pointcloud=f"gs://{BUCKET_NAME}/{OUTPUT_BASE}01_raw_pointcloud.ply",
        stats=f"gs://{BUCKET_NAME}/{OUTPUT_BASE}01_raw_stats.json",
    ),
    metrics={
        # Copied straight from the validation stats
        **{key: stats[key] for key in ("point_count", "has_colors", "has_normals")},
        "dimensions": stats["dimensions_formatted"],
    },
    timestamp=datetime.now().isoformat(),
    next_phase="02_preprocess",
)

# Banner followed by the payload as indented JSON
print(
    "\n" + "=" * 60,
    "PHASE 1 COMPLETE",
    "=" * 60,
    json.dumps(status, indent=2),
    sep="\n",
)