# Phase 4: Segmentation (DBSCAN)
## Alaca Cesmesi Scan-to-HBIM V6 Pipeline

Segments the preprocessed point cloud using DBSCAN clustering; clusters smaller than a minimum point count are relabelled as noise.

**Input:** `gs://alaca-cesme-hbim-v6/processed/v{N}/02_preprocessed/`  
**Output:** `gs://alaca-cesme-hbim-v6/processed/v{N}/04_segmentation/`

In [None]:
!pip install -q open3d google-cloud-storage numpy scipy scikit-learn

In [None]:
import open3d as o3d
import numpy as np
from sklearn.cluster import DBSCAN
import json
import os
import time
from datetime import datetime
from google.cloud import storage
from google.colab import auth

# Authenticate via google.colab before any google-cloud-storage client is created
# (presumably this is what supplies the credentials used by storage.Client - confirm).
auth.authenticate_user()

# Configuration
# Bucket / project used by every GCS helper in this notebook; VERSION selects the
# processed/v{N}/ subtree that both the input and the outputs live under.
BUCKET_NAME = "alaca-cesme-hbim-v6"
PROJECT_ID = "concrete-racer-470219-h8"
VERSION = "v1"

# DBSCAN parameters
DBSCAN_EPS = 0.05  # neighborhood radius; 5cm assuming the cloud's coordinates are in metres - TODO confirm
DBSCAN_MIN_SAMPLES = 50  # passed to DBSCAN as min_samples
MIN_SEGMENT_POINTS = 100  # clusters with fewer points than this are relabelled as noise (-1)

# Paths
# Object names inside BUCKET_NAME (no gs:// prefix); OUTPUT_BASE ends with "/" so
# file names can be appended directly.
INPUT_PLY_PATH = f"processed/{VERSION}/02_preprocessed/02_preprocessed.ply"
OUTPUT_BASE = f"processed/{VERSION}/04_segmentation/"

# Local scratch files on the Colab VM: the downloaded input and the three outputs
# (colored cloud, per-point label array, JSON metadata).
LOCAL_INPUT = "/content/input.ply"
LOCAL_OUTPUT_PLY = "/content/04_segmented.ply"
LOCAL_OUTPUT_LABELS = "/content/04_segment_labels.npy"
LOCAL_OUTPUT_JSON = "/content/04_segments_metadata.json"

In [None]:
# GCS functions
def download_from_gcs(bucket_name, blob_name, local_path):
    """Copy a single object out of a GCS bucket into a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        blob_name: Object name (path) inside the bucket.
        local_path: Filesystem path the object's contents are written to.

    Returns:
        ``local_path``, unchanged, so the call can be used inline.
    """
    gcs = storage.Client(project=PROJECT_ID)
    gcs.bucket(bucket_name).blob(blob_name).download_to_filename(local_path)
    print(f"Downloaded: {blob_name}")
    return local_path

def upload_to_gcs(bucket_name, local_path, blob_name):
    """Push a local file into a GCS bucket.

    Args:
        bucket_name: Name of the destination bucket.
        local_path: Path of the local file to upload.
        blob_name: Object name (path) to create inside the bucket.

    Returns:
        The ``gs://bucket/object`` URI of the uploaded object.
    """
    gcs = storage.Client(project=PROJECT_ID)
    target = gcs.bucket(bucket_name).blob(blob_name)
    target.upload_from_filename(local_path)
    print(f"Uploaded: {blob_name}")
    uri = f"gs://{bucket_name}/{blob_name}"
    return uri

# Download input
download_from_gcs(BUCKET_NAME, INPUT_PLY_PATH, LOCAL_INPUT)
pcd = o3d.io.read_point_cloud(LOCAL_INPUT)
points = np.asarray(pcd.points)
n_points = len(points)

# Open3D reports an unreadable or unsupported file by returning an empty cloud
# instead of raising. Stop here with a clear message rather than letting the
# clustering cell fail later (DBSCAN on zero samples / division by n_points).
if n_points == 0:
    raise ValueError(
        f"No points loaded from gs://{BUCKET_NAME}/{INPUT_PLY_PATH} "
        f"(local copy: {LOCAL_INPUT})"
    )
print(f"Loaded: {n_points:,} points")

In [None]:
start_time = time.time()

# Run DBSCAN
print(f"\nRunning DBSCAN (eps={DBSCAN_EPS}, min_samples={DBSCAN_MIN_SAMPLES})...")
clustering = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES, n_jobs=-1)
labels = clustering.fit_predict(points)

# Statistics
# One np.unique pass gives both the distinct labels and their sizes; this avoids
# building a Python set from the (potentially multi-million element) label array.
# Label -1 is DBSCAN's noise label and is never counted as a cluster.
unique_labels, counts = np.unique(labels, return_counts=True)
is_cluster = unique_labels != -1
n_clusters = int(np.count_nonzero(is_cluster))
n_noise = int(counts[~is_cluster].sum())

print("\nDBSCAN Results:")
print(f"  Clusters found: {n_clusters}")
print(f"  Noise points: {n_noise:,} ({100*n_noise/n_points:.1f}%)")

# Remove small segments
# Clusters below MIN_SEGMENT_POINTS are folded into noise with a single mask
# instead of one full-array scan per small cluster. Surviving label ids are
# left untouched, so ids may have gaps afterwards.
small_segments = unique_labels[is_cluster & (counts < MIN_SEGMENT_POINTS)]
labels[np.isin(labels, small_segments)] = -1

n_clusters_final = n_clusters - len(small_segments)
print(f"  After filtering: {n_clusters_final} segments (removed {n_clusters - n_clusters_final} small ones)")

elapsed_time = time.time() - start_time
print(f"\nProcessing time: {elapsed_time:.1f} seconds")

In [None]:
# Color segments for visualization
max_label = labels.max()
np.random.seed(42)  # fixed seed so the same label id gets the same color on every run

# One random RGB triple per label id in 0..max_label. Drawing the whole table in
# a single call consumes the random stream in the same order as drawing one
# triple per label, so the resulting colors match a per-label loop.
palette = np.random.rand(max_label + 1, 3)

# Noise points = gray; every other point looks its color up by label id.
colors = np.full((n_points, 3), 0.5)
is_segment = labels >= 0
colors[is_segment] = palette[labels[is_segment]]

pcd.colors = o3d.utility.Vector3dVector(colors)

# Count the labels that actually occur: ids removed by the small-segment filter
# leave gaps, so max_label + 1 would overstate the number of colored segments.
n_colored = len(np.unique(labels[is_segment]))
print(f"Colored {n_colored} segments + noise")

In [None]:
# Compute segment statistics
# One record per label id that still has points; ids emptied by the
# small-segment filter are skipped.
segment_info = []
for seg_id in range(max_label + 1):
    member_idx = np.flatnonzero(labels == seg_id)
    if member_idx.size == 0:
        continue

    seg_xyz = points[member_idx]
    lower_corner = seg_xyz.min(axis=0)
    upper_corner = seg_xyz.max(axis=0)
    center = seg_xyz.mean(axis=0)

    segment_info.append({
        "id": int(seg_id),
        "point_count": int(member_idx.size),
        "bbox_min": lower_corner.tolist(),
        "bbox_max": upper_corner.tolist(),
        "centroid": center.tolist()
    })

# Metadata written next to the segmented cloud: run parameters, per-segment
# records and timing. numpy scalars are converted to plain ints for JSON.
dbscan_params = {
    "eps": DBSCAN_EPS,
    "min_samples": DBSCAN_MIN_SAMPLES,
    "min_segment_points": MIN_SEGMENT_POINTS
}
noise_total = int(np.sum(labels == -1))

stats = {
    "phase": "04_segmentation",
    "n_points": n_points,
    "n_segments": n_clusters_final,
    "n_noise_points": noise_total,
    "dbscan_params": dbscan_params,
    "segments": segment_info,
    "processing_time_sec": elapsed_time,
    "timestamp": datetime.now().isoformat(),
    "pipeline_version": "v6"
}

# Save locally
# write_point_cloud signals failure through its boolean return value rather
# than raising, so check it; otherwise a failed write would only show up later
# as a missing file during upload.
if not o3d.io.write_point_cloud(LOCAL_OUTPUT_PLY, pcd):
    raise OSError(f"Failed to write point cloud to {LOCAL_OUTPUT_PLY}")

# Per-point label array (noise = -1), aligned with the points of the saved cloud.
np.save(LOCAL_OUTPUT_LABELS, labels)

with open(LOCAL_OUTPUT_JSON, 'w', encoding="utf-8") as f:
    json.dump(stats, f, indent=2)
print("Saved outputs locally")

In [None]:
# Upload to GCS
# upload_to_gcs returns the gs:// URI of each uploaded object; keep those and
# report them in the status instead of rebuilding the same strings by hand, so
# the status cannot drift from what was actually uploaded.
segmented_ply_uri = upload_to_gcs(BUCKET_NAME, LOCAL_OUTPUT_PLY, f"{OUTPUT_BASE}04_segmented.ply")
labels_uri = upload_to_gcs(BUCKET_NAME, LOCAL_OUTPUT_LABELS, f"{OUTPUT_BASE}04_segment_labels.npy")
metadata_uri = upload_to_gcs(BUCKET_NAME, LOCAL_OUTPUT_JSON, f"{OUTPUT_BASE}04_segments_metadata.json")

# Status for n8n
# Printed as JSON at the end of the cell; values are plain Python types so
# json.dumps can serialize them.
status = {
    "phase": "04_segment",
    "status": "success",
    "version": VERSION,
    "outputs": {
        "segmented_ply": segmented_ply_uri,
        "labels": labels_uri,
        "metadata": metadata_uri
    },
    "metrics": {
        "n_segments": n_clusters_final,
        "n_noise_points": int(np.sum(labels == -1)),
        "processing_time": f"{elapsed_time:.1f}s"
    },
    "timestamp": datetime.now().isoformat(),
    "next_phase": "05_classify"
}

print("\n" + "="*60)
print("PHASE 4 COMPLETE")
print("="*60)
print(json.dumps(status, indent=2))