# Multi-city classification

We need to take a classifier that has been tested on one point cloud and evaluate it on a second point cloud to measure its accuracy.

What is difficult to determine is how effective this model will be on new data.

In [None]:
import pdal
import laspy
import numpy as np
import matplotlib.pyplot as plt     
import seaborn as sns
import open3d as o3d
import pandas as pd
from dotenv import load_dotenv
import json
from upath import UPath
import os
# Load environment variables from a local .env file (if one exists) so that the
# DATA_DIR_FSSPEC_* settings read with os.getenv() further down can be supplied
# without hard-coding credentials in the notebook.
load_dotenv()

# Apply seaborn's "whitegrid" theme to the plots produced in this notebook.
sns.set_theme(style="whitegrid")

In [None]:
import json
from pathlib import Path

import pdal

# --- Setup remote B2Drop file path ---
# Connection settings are read from the environment (see load_dotenv() in the
# first cell). Fail early with a readable message when the URI is missing
# instead of handing None to UPath.
data_dir_uri = os.getenv("DATA_DIR_FSSPEC_URI")
if not data_dir_uri:
    raise RuntimeError(
        "DATA_DIR_FSSPEC_URI is not set; define it in the environment or in a .env file"
    )

B2D_DIR = UPath(
    data_dir_uri,
    base_url=os.getenv("DATA_DIR_FSSPEC_BASE_URL"),
    auth=(os.getenv("DATA_DIR_FSSPEC_USER"),
          os.getenv("DATA_DIR_FSSPEC_PASS"))
)
file_path = B2D_DIR / "bologna.laz"

# --- Read remote file as binary stream ---
# NOTE: the whole file is held in memory as `las_bytes`; the tiling cell below
# reads the point cloud from these bytes, so the variable must stay defined.
with file_path.open("rb") as f:
    las_bytes = f.read()

# --- Write the bytes to a local copy that PDAL can read ---
temp_file = Path("../data/bologna.laz")
# On a fresh checkout ../data may not exist yet; both the write below and the
# PDAL writer would otherwise fail with FileNotFoundError.
temp_file.parent.mkdir(parents=True, exist_ok=True)
temp_file.write_bytes(las_bytes)

# --- Run PDAL on the local copy ---
# Pipeline stages: read the LAZ file, run filters.stats, write an
# uncompressed LAS copy next to the input.
pipeline_json = {
    "pipeline": [
        {"type": "readers.las", "filename": str(temp_file)},
        {"type": "filters.stats"},
        {"type": "writers.las", "filename": "../data/bologna_filtered.las"}
    ]
}

pipeline = pdal.Pipeline(json.dumps(pipeline_json))
pipeline.execute()
print("PDAL processed file from disk")


PDAL processed file from disk


In [None]:
import math
from pathlib import Path

import laspy
import numpy as np
import pandas as pd

# --- Settings ---
input_file = las_bytes              # raw LAZ bytes downloaded in the previous cell
output_dir = Path("../data/bologna_tiles/parquet") # output folder
points_per_chunk = 1_000_000                    # max points per tile

output_dir.mkdir(parents=True, exist_ok=True)

# --- Read LAS/LAZ ---
las = laspy.read(input_file)
num_points = len(las.x)
print(f"Loaded {num_points} points")

# --- Calculate number of chunks ---
num_chunks = math.ceil(num_points / points_per_chunk)
print(f"Splitting into {num_chunks} chunks of up to {points_per_chunk} points each")

# --- Process each chunk ---
points_written = 0
for i in range(num_chunks):
    start = i * points_per_chunk
    end = min((i + 1) * points_per_chunk, num_points)

    # Slice the points
    chunk = las[start:end]

    # Convert to DataFrame.
    # laspy exposes several dimensions as lightweight view objects rather than
    # plain ndarrays (scaled coordinates, bit-packed sub-fields). Materialise
    # each column with np.asarray so pandas/pyarrow always receive real NumPy
    # arrays instead of relying on implicit coercion of those views.
    df = pd.DataFrame({
        "x": np.asarray(chunk.x),
        "y": np.asarray(chunk.y),
        "z": np.asarray(chunk.z),
        "intensity": np.asarray(chunk.intensity),
        "return_number": np.asarray(chunk.return_number),
        "classification": np.asarray(chunk.classification)
    })

    # Write to Parquet
    output_file = output_dir / f"chunk_{i:03d}.parquet"
    df.to_parquet(output_file, engine="pyarrow", index=False)
    points_written += len(df)

    print(f"Chunk {i+1}/{num_chunks}: wrote {len(df)} points to {output_file}")

# Sanity check: every input point must have landed in exactly one tile.
assert points_written == num_points, (
    f"wrote {points_written} points but the input has {num_points}"
)
