In [6]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import joblib
import matplotlib.pyplot as plt
import contextily as ctx
import numpy as np
import os

Load Data

In [10]:
# Load the columns needed for mapping, cap the working frame, and build a
# GeoDataFrame of sampled points in WGS84 (EPSG:4326).
parquet_path = "../data/outputs/combined_dataset.parquet"

columns_to_load = [
    "longitude", "latitude",
    "dNBR", "NDVI", "VCI", "SPI"
]
# Columns without which point geometries cannot be built.
coordinate_columns = ["longitude", "latitude"]

max_rows_for_viz = 1_000_000  # upper bound on rows kept in `df`
sample_size = 20_000          # number of rows converted to point geometries
sample_seed = 42              # fixed seed so both subsets are reproducible

# NOTE: this reuses a DataFrame called `data` if one already exists in the
# kernel namespace, so the cell's result depends on prior kernel state. On a
# fresh kernel the parquet file is read instead.
data_in_memory = globals().get('data', None)
if isinstance(data_in_memory, pd.DataFrame):
    print("Using existing DataFrame 'data'.")
    available_columns = [c for c in columns_to_load if c in data_in_memory.columns]
    skipped_columns = [c for c in columns_to_load if c not in available_columns]
    if skipped_columns:
        # Surface the gap instead of silently continuing with fewer columns.
        print(f"Warning: 'data' is missing expected columns: {skipped_columns}")
    df = data_in_memory.loc[:, available_columns]
else:
    print("Loading dataset (single call)...")
    df = pd.read_parquet(parquet_path, columns=columns_to_load, engine="pyarrow")

# Fail early with a clear message rather than an AttributeError further down.
absent_coordinates = [c for c in coordinate_columns if c not in df.columns]
if absent_coordinates:
    raise KeyError(
        f"Cannot build point geometries; missing coordinate columns: {absent_coordinates}"
    )

if len(df) > max_rows_for_viz:
    # Cap with a random subsample rather than the first rows: taking the head
    # would restrict every later sample to whatever happens to come first in
    # the dataset's row order.
    print(f"Limiting rows for visualization (random {max_rows_for_viz:,} rows)...")
    df = df.sample(n=max_rows_for_viz, random_state=sample_seed)

print(f"Loaded {len(df):,} rows.")

sample_n = min(sample_size, len(df))
data_sample = df.sample(n=sample_n, random_state=sample_seed)

# points_from_xy builds the geometry array in one vectorized call instead of
# constructing one shapely Point per row in a Python loop.
gdf = gpd.GeoDataFrame(
    data_sample,
    geometry=gpd.points_from_xy(data_sample["longitude"], data_sample["latitude"]),
    crs="EPSG:4326"
)

Using existing DataFrame 'data'.
Limiting rows for visualization (first 1,000,000 rows)...
Loaded 1,000,000 rows.
