# Phase 2: Preprocessing
## Alaca Cesmesi Scan-to-HBIM V6 Pipeline

This notebook preprocesses the raw point cloud:
1. Voxel Downsampling (0.01m)
2. Statistical Outlier Removal
3. Normal Estimation (with consistent normal orientation)

**Input:** `gs://alaca-cesme-hbim-v6/processed/v{N}/01_raw/`  
**Output:** `gs://alaca-cesme-hbim-v6/processed/v{N}/02_preprocessed/`

In [None]:
!pip install -q open3d google-cloud-storage numpy scipy

In [None]:
import open3d as o3d
import numpy as np
import json
import os
import time
from datetime import datetime
from google.cloud import storage
from google.colab import auth

# Authenticate the current Colab user before any GCS access.
# NOTE(review): presumably this supplies the credentials picked up by
# storage.Client in the GCS helpers - confirm against the Colab auth docs.
auth.authenticate_user()

# Configuration
BUCKET_NAME = "alaca-cesme-hbim-v6"  # GCS bucket used for both download and upload
PROJECT_ID = "concrete-racer-470219-h8"  # passed to storage.Client(project=...)
VERSION = "v1"  # interpolated into the processed/{VERSION}/... object paths below

# Preprocessing parameters (from v6 config)
VOXEL_SIZE = 0.01  # 1cm - fixed for heritage detail
SOR_K_NEIGHBORS = 20  # nb_neighbors for statistical outlier removal
SOR_STD_RATIO = 2.0  # std_ratio for statistical outlier removal
NORMAL_RADIUS = 0.03  # search radius (m) for the hybrid KD-tree normal search
NORMAL_MAX_NN = 30  # max_nn for the hybrid KD-tree normal search

# Paths
# Object names inside BUCKET_NAME (no gs:// prefix).
INPUT_PATH = f"processed/{VERSION}/01_raw/01_raw_pointcloud.ply"
OUTPUT_BASE = f"processed/{VERSION}/02_preprocessed/"  # trailing slash: file names are appended directly

# Local scratch files on the Colab VM.
LOCAL_INPUT = "/content/input.ply"
LOCAL_OUTPUT_PLY = "/content/02_preprocessed.ply"
LOCAL_OUTPUT_JSON = "/content/02_preprocessing_stats.json"

In [None]:
# GCS functions
def download_from_gcs(bucket_name, blob_name, local_path):
    """Copy one object from a GCS bucket to a local file.

    Args:
        bucket_name: Name of the bucket holding the object.
        blob_name: Object name (path) inside the bucket.
        local_path: Destination file path on the local disk.

    Returns:
        ``local_path``, unchanged, so the call can be chained.
    """
    # A fresh client is created per call, bound to the notebook's project.
    gcs = storage.Client(project=PROJECT_ID)
    source_blob = gcs.bucket(bucket_name).blob(blob_name)
    source_blob.download_to_filename(local_path)
    print(f"Downloaded: {blob_name}")
    return local_path

def upload_to_gcs(bucket_name, local_path, blob_name):
    """Copy a local file into a GCS bucket.

    Args:
        bucket_name: Name of the destination bucket.
        local_path: Path of the local file to upload.
        blob_name: Object name (path) to create inside the bucket.

    Returns:
        The ``gs://`` URI of the uploaded object.
    """
    # A fresh client is created per call, bound to the notebook's project.
    gcs = storage.Client(project=PROJECT_ID)
    target_blob = gcs.bucket(bucket_name).blob(blob_name)
    target_blob.upload_from_filename(local_path)
    print(f"Uploaded: {blob_name}")
    uri = f"gs://{bucket_name}/{blob_name}"
    return uri

# Download input
download_from_gcs(BUCKET_NAME, INPUT_PATH, LOCAL_INPUT)
pcd = o3d.io.read_point_cloud(LOCAL_INPUT)
original_count = len(pcd.points)

# Open3D signals an unreadable or unsupported file with a warning and an empty
# cloud rather than an exception, and later cells divide by original_count.
# Stop here with a clear message instead of a ZeroDivisionError further down.
if original_count == 0:
    raise ValueError(
        f"No points loaded from gs://{BUCKET_NAME}/{INPUT_PATH} "
        f"(local copy: {LOCAL_INPUT}); the file is empty or could not be parsed."
    )
print(f"Loaded: {original_count:,} points")

In [None]:
# Preprocess the loaded cloud in place: voxel downsample, remove statistical
# outliers, then estimate and orient normals. Leaves the result in `pcd` and the
# wall-clock duration (seconds) in `elapsed_time` for the following cells.

# Fail fast: the reduction percentage in step 1 divides by original_count.
if original_count == 0:
    raise ValueError("Input point cloud is empty; nothing to preprocess.")

# Neighbours used when propagating a consistent normal orientation (step 3).
NORMAL_ORIENT_K = 15

# perf_counter is monotonic, so the measured duration cannot be skewed by
# system clock adjustments while the cell is running.
start_time = time.perf_counter()

# Step 1: Voxel Downsampling
print(f"\n[Step 1] Voxel downsampling (voxel_size={VOXEL_SIZE}m)...")
pcd = pcd.voxel_down_sample(VOXEL_SIZE)
after_voxel = len(pcd.points)
print(f"  Before: {original_count:,} -> After: {after_voxel:,}")
print(f"  Reduction: {(1 - after_voxel/original_count)*100:.1f}%")

# Step 2: Statistical Outlier Removal
print(f"\n[Step 2] Statistical outlier removal (k={SOR_K_NEIGHBORS}, std={SOR_STD_RATIO})...")
# The second return value (indices of the kept points) is not needed here.
pcd, _ = pcd.remove_statistical_outlier(nb_neighbors=SOR_K_NEIGHBORS, std_ratio=SOR_STD_RATIO)
after_sor = len(pcd.points)
print(f"  Removed: {after_voxel - after_sor:,} outliers")
print(f"  Remaining: {after_sor:,} points")

# An empty cloud cannot carry normals and would be useless to the next phase,
# so stop instead of saving and uploading it as a successful result.
if after_sor == 0:
    raise ValueError(
        "No points left after statistical outlier removal; "
        "check SOR_K_NEIGHBORS / SOR_STD_RATIO and the input data."
    )

# Step 3: Normal Estimation
print(f"\n[Step 3] Normal estimation (radius={NORMAL_RADIUS}m, max_nn={NORMAL_MAX_NN})...")
pcd.estimate_normals(
    search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=NORMAL_RADIUS, max_nn=NORMAL_MAX_NN)
)
# estimate_normals leaves the sign of each normal arbitrary; make neighbouring
# normals agree with each other.
pcd.orient_normals_consistent_tangent_plane(k=NORMAL_ORIENT_K)
print(f"  Normals computed: {len(np.asarray(pcd.normals)):,}")

elapsed_time = time.perf_counter() - start_time
print(f"\nTotal processing time: {elapsed_time:.1f} seconds")

In [None]:
# Summarise the preprocessed cloud; `stats` is written to JSON and reused by the
# status report in the upload cell.
bbox = pcd.get_axis_aligned_bounding_box()
bbox_min = bbox.get_min_bound()
bbox_max = bbox.get_max_bound()
dimensions = bbox_max - bbox_min

final_point_count = len(pcd.points)
reduction_pct = (1 - final_point_count / original_count) * 100

bounding_box_stats = {
    "dimensions": dimensions.tolist(),
    "min": bbox_min.tolist(),
    "max": bbox_max.tolist(),
}

# Echo the parameters used so the stats file is self-describing.
params_used = {
    "voxel_size": VOXEL_SIZE,
    "sor_k_neighbors": SOR_K_NEIGHBORS,
    "sor_std_ratio": SOR_STD_RATIO,
    "normal_radius": NORMAL_RADIUS,
    "normal_max_nn": NORMAL_MAX_NN,
}

stats = {
    "phase": "02_preprocess",
    "original_point_count": original_count,
    "preprocessed_point_count": final_point_count,
    "reduction_ratio": f"{reduction_pct:.1f}%",
    "bounding_box": bounding_box_stats,
    "has_normals": pcd.has_normals(),
    "preprocessing_params": params_used,
    "processing_time_sec": elapsed_time,
    "timestamp": datetime.now().isoformat(),
    "pipeline_version": "v6",
}

# Save locally
# write_point_cloud reports failure through its boolean return value rather than
# an exception; check it so a failed write cannot leave a missing or stale file
# for the upload cell to push to GCS.
if not o3d.io.write_point_cloud(LOCAL_OUTPUT_PLY, pcd, write_ascii=False):
    raise OSError(f"Failed to write point cloud to {LOCAL_OUTPUT_PLY}")

# Explicit encoding keeps the stats file independent of the VM's locale.
with open(LOCAL_OUTPUT_JSON, 'w', encoding="utf-8") as f:
    json.dump(stats, f, indent=2)
print("Saved outputs locally")

In [None]:
# Push both artefacts to the bucket, then print a status report for n8n.
ply_blob = f"{OUTPUT_BASE}02_preprocessed.ply"
stats_blob = f"{OUTPUT_BASE}02_preprocessing_stats.json"

upload_to_gcs(BUCKET_NAME, LOCAL_OUTPUT_PLY, ply_blob)
upload_to_gcs(BUCKET_NAME, LOCAL_OUTPUT_JSON, stats_blob)

# Status for n8n
output_uris = {
    "pointcloud": f"gs://{BUCKET_NAME}/{ply_blob}",
    "stats": f"gs://{BUCKET_NAME}/{stats_blob}",
}
run_metrics = {
    "original_points": original_count,
    "final_points": len(pcd.points),
    "reduction": stats["reduction_ratio"],
    "processing_time": f"{elapsed_time:.1f}s",
}
status = {
    "phase": "02_preprocess",
    "status": "success",
    "version": VERSION,
    "outputs": output_uris,
    "metrics": run_metrics,
    "timestamp": datetime.now().isoformat(),
    "next_phase": "03_features",
}

banner = "=" * 60
print("\n" + banner)
print("PHASE 2 COMPLETE")
print(banner)
print(json.dumps(status, indent=2))