In [None]:
import Data_tools as dt
from config_paths import DATA, OUTPUT, INTERMEDIATE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from sklearn.linear_model import HuberRegressor

In [None]:
# Input parquet holding the SWOT-derived slopes
# (the file name suggests basin 6229 — TODO confirm).
parquet_path = OUTPUT/"Global_22_07_25/parquet/Only_swot_slope/slopes_swot_6229.parquet"
# SWORD v16 node shapefile (the "sa" / "hb62" parts of the name presumably
# identify the region and basin — TODO confirm against the SWORD naming scheme).
shp_path = DATA/"External/SWORD/shp_v16/sa_sword_nodes_hb62_v16.shp"
# Third argument handed to dt.merge_slope_to_nodes; the same location is read
# back into `df` by the loading cell.
out_path = OUTPUT/"Global_22_07_25/parquet/slopes_SWORD_6229.parquet"

In [None]:
# Call the project helper with the slope parquet, the node shapefile and the output path.
# NOTE(review): judging by its name and by the next cell reading the same location,
# this presumably joins slopes onto SWORD nodes and writes the result to out_path —
# confirm in Data_tools.
dt.merge_slope_to_nodes(parquet_path, shp_path, out_path)

In [None]:

# Load the merged table. Reuse out_path instead of repeating the path literal
# so the file written by the merge step and the file read here cannot drift apart.
df = pd.read_parquet(out_path)
df

In [None]:
# =========================
# 1) Prep & filtering
# =========================
# Keep rows whose outlet distance and slope are both strictly positive
# (a positive slope is required for the log10 transform used later), then
# add dist_km = dist_out / 1000 (assumes dist_out is in metres — TODO confirm).
_positive_rows = df['dist_out'].gt(0) & df['slope1'].gt(0)
d = df.loc[_positive_rows].assign(dist_km=lambda t: t['dist_out'] / 1000.0)

# Plain arrays used by the regression and the density scatter.
x, y = d['dist_km'].to_numpy(), d['slope1'].to_numpy()

# =========================
# 2) Helpers
# =========================
def nanpercentile_safe(a, q):
    """Return the q-th percentile of the numeric, non-missing entries of ``a``.

    ``a`` is wrapped in a Series and coerced with
    ``pd.to_numeric(errors='coerce')``, so entries that cannot be parsed as
    numbers become NaN and are discarded together with existing NaNs.

    Parameters
    ----------
    a : array-like
        Values to summarise.
    q : float
        Percentile in the 0-100 range, passed to ``np.percentile``.

    Returns
    -------
    float
        The percentile, or ``np.nan`` when no valid value remains.
    """
    numeric = pd.to_numeric(pd.Series(a), errors='coerce')
    kept = numeric[numeric.notna()].to_numpy()
    return np.percentile(kept, q) if kept.size else np.nan

# =========================
# 3) Binning along distance
#    (equal-width by default, switch to equal-count if wanted)
# =========================
nbins = 160                # tune based on your x-range & sample size
use_equal_count = False    # True -> quantile bins (reduces empty bins)
min_n = 15                 # minimum observations per bin to keep

# Equal-count bins are cut on ranks rather than raw distances so that ties in
# x cannot produce duplicate quantile edges; equal-width bins span the
# observed dist_km range.
d['bin'] = (
    pd.qcut(d['dist_km'].rank(method='first'), q=nbins, duplicates='drop')
    if use_equal_count
    else pd.cut(d['dist_km'], bins=nbins, include_lowest=True)
)

g = d.groupby('bin', observed=True)

# One row per bin: row count, median distance (x position of the bin),
# median slope, and the 10th / 90th slope percentiles.
_bin_stats = {
    'n': ('slope1', 'size'),
    'dist_mid': ('dist_km', 'median'),
    'slope_med': ('slope1', 'median'),
    'slope_low': ('slope1', lambda z: nanpercentile_safe(z, 10)),
    'slope_high': ('slope1', lambda z: nanpercentile_safe(z, 90)),
}
agg = g.agg(**_bin_stats).reset_index(drop=True)

# Keep sufficiently populated bins whose statistics are all finite, sorted by x.
_stat_cols = ['dist_mid', 'slope_med', 'slope_low', 'slope_high']
_all_finite = agg[_stat_cols].replace([np.inf, -np.inf], np.nan).notna().all(axis=1)
agg = agg[(agg['n'] >= min_n) & _all_finite].sort_values('dist_mid')

# =========================
# 4) Semi-log regression: log10(y) ~ x  (Huber robust)
# =========================
# Only finite pairs with a strictly positive slope can enter the log10 fit.
mask_fit = np.isfinite(x) & np.isfinite(y) & (y > 0)
x_fit_in = x[mask_fit][:, np.newaxis]      # sklearn expects a 2-D feature matrix
logy_in = np.log10(y[mask_fit])

model = HuberRegressor().fit(x_fit_in, logy_in)

# Evaluate the fitted line on an even grid over the observed x range and
# map the prediction back from log10 space for plotting.
x_fit = np.linspace(np.nanmin(x), np.nanmax(x), num=200)
logy_pred = model.predict(x_fit[:, np.newaxis])
y_fit_line = np.power(10.0, logy_pred)

# =========================
# 5) Background density scatter (optional)
# =========================
# The density is estimated on (x, log10(y)) because the figure draws y on a
# log axis: a KDE in linear y uses one bandwidth for values spanning orders of
# magnitude, so the colouring would not reflect crowding as it appears on screen.
# Non-positive y cannot be log-transformed (or drawn on a log axis), so it is masked.
mask_scatter = np.isfinite(x) & np.isfinite(y) & (y > 0)
x_sc, y_sc = x[mask_scatter], y[mask_scatter]

xy = np.vstack([x_sc, np.log10(y_sc)])
if xy.shape[1] >= 5:
    z = gaussian_kde(xy)(xy)
    # Sort by density so the densest points are drawn last (on top).
    idx = z.argsort()
    xb, yb, zb = x_sc[idx], y_sc[idx], z[idx]
else:
    # Too few points for a density estimate: fall back to a uniform colour value.
    xb, yb, zb = x_sc, y_sc, np.ones(x_sc.size)

# =========================
# 6) Plot
# =========================
fig, ax = plt.subplots(figsize=(9, 7))

# Faint raw points coloured by density (kept out of the legend)
ax.scatter(xb, yb, c=zb, cmap='viridis', s=8, alpha=0.15, label='_nolegend_')

# Per-bin spread band between the 10th and 90th percentiles
ax.fill_between(
    agg['dist_mid'].to_numpy(),
    agg['slope_low'].to_numpy(),
    agg['slope_high'].to_numpy(),
    alpha=0.18, step='mid', label='10–90% spread'
)

# Bin medians as points
ax.scatter(agg['dist_mid'], agg['slope_med'],
           s=46, edgecolor='k', lw=0.4, alpha=0.95,
           label=f"Binned median (n≥{min_n})")

# Line joining the binned medians
ax.plot(agg['dist_mid'], agg['slope_med'],
        color='black', linewidth=1.6, alpha=0.9,
        zorder=3, label="Binned median trend")

# Regression curve. Drawn in a saturated colour above the other artists: a
# white line disappears against the default white axes background (the
# scatter underneath is only alpha=0.15) and its legend handle is invisible.
ax.plot(x_fit, y_fit_line, linewidth=2.2, color='crimson', zorder=4,
        label="Semi-log Huber fit")

# Axes + scales
ax.set_yscale("log")
ax.set_xlabel("Distance to outlet (km)")
ax.set_ylabel("Bank slope")
# ax.set_xlim(0, 1600)  # optional: restrict the distance range

# Reverse the x-axis so 0 km (the outlet) is on the right
ax.invert_xaxis()

ax.grid(True, which="both", linestyle="--", linewidth=0.35)
ax.set_title("Bank Slope vs Distance (binned medians + semi-log Huber fit)")
ax.legend()
fig.tight_layout()
plt.show()

# =========================
# 7) Print model summary
# =========================
# The fit is log10(slope) = log10(a) + b * distance_km.
a_log10 = model.intercept_
b = model.coef_[0]
a = 10**a_log10
print(f"log10(a) = {a_log10:.3f}")
# b is a change in log10(slope) per kilometre and is typically far below 1;
# fixed three-decimal formatting would round it to about one significant
# digit (or to 0.000), so use significant-digit formatting instead.
print(f"b = {b:.3g}")
print(f"Trend: Slope ≈ {a:.3g} · 10^({b:.3g}·Distance_km)")



In [None]:
# Power-law relation between bank slope and flow accumulation:
# robust (Huber) straight-line fit in log10-log10 space, drawn over a
# density-coloured scatter.
d = df[(df['slope1'] > 0) & (df['facc'] > 0)].copy()

x = d['facc'].to_numpy()
y = d['slope1'].to_numpy()

# The row filter removes NaN and non-positive values but lets +inf through;
# drop non-finite pairs so log10 and the regression only see usable numbers.
finite = np.isfinite(x) & np.isfinite(y)
x, y = x[finite], y[finite]

# === Robust power-law regression in log-log space ===
logx = np.log10(x)
logy = np.log10(y)

model = HuberRegressor()
model.fit(logx.reshape(-1, 1), logy)

# Create smooth trend line over the observed log10(facc) range
logx_fit = np.linspace(logx.min(), logx.max(), 200)
logy_pred = model.predict(logx_fit.reshape(-1, 1))

x_fit = 10**logx_fit
y_fit = 10**logy_pred

# Power-law parameters: slope ≈ a · facc^b
a = 10**model.intercept_
b = model.coef_[0]

# === KDE Density ===
# Estimated in log space because both axes of the figure are logarithmic.
xy = np.vstack([logx, logy])
z = gaussian_kde(xy)(xy)
idx = z.argsort()

# Density-sorted copies for drawing (densest points last / on top); x and y
# themselves stay aligned with logx / logy.
x_plot, y_plot, z_plot = x[idx], y[idx], z[idx]

# === Plot ===
fig, ax = plt.subplots(figsize=(8, 7))
sc = ax.scatter(x_plot, y_plot, c=z_plot, cmap='viridis', s=10, alpha=0.65)

# Power-law trend overlay. The independent variable is flow accumulation, so
# the label says "facc"; a saturated colour keeps the line and its legend
# handle visible on the white background.
ax.plot(x_fit, y_fit, color='crimson', linewidth=2.3,
        label=f"Power-law fit: slope ≈ {a:.2e} · facc^{b:.2f}")

ax.set_xscale("log")
ax.set_yscale("log")

ax.grid(True, which="both", linestyle="--", linewidth=0.35)
fig.colorbar(sc, ax=ax, label="Density (KDE)")

ax.set_xlabel("Flow accumulation (log scale)")
ax.set_ylabel("Bank slope (log scale)")
ax.set_title("Bank Slope vs Flow Accumulation")

ax.legend()
fig.tight_layout()
plt.show()

print("log10(a) =", model.intercept_)
print("b =", model.coef_[0])
print(f"Trend equation:\nSlope ≈ {a:.3g} · Facc^{b:.3f}")


In [None]:
# =========================
# 1) Prep & filtering
# =========================
# Both variables are log-transformed later, so keep only strictly positive rows.
_valid_rows = df['slope1'].gt(0) & df['facc'].gt(0)
d = df.loc[_valid_rows].copy()

# Plain arrays: x = flow accumulation, y = slope.
x, y = (d[col].to_numpy() for col in ('facc', 'slope1'))

# =========================
# 2) Helpers
# =========================
def nanpercentile_safe(a, q):
    """Percentile ``q`` (0-100) of ``a`` after discarding unusable entries.

    Entries are coerced with ``pd.to_numeric(errors='coerce')``; anything that
    is not a number turns into NaN and is dropped along with existing NaNs.
    Returns ``np.nan`` if no value survives, otherwise ``np.percentile`` of
    the remaining values.
    """
    coerced = pd.to_numeric(pd.Series(a), errors='coerce')
    usable = coerced[coerced.notna()].to_numpy()
    if usable.size:
        return np.percentile(usable, q)
    return np.nan

# =========================
# 3) Binning along x (facc)
# =========================
nbins = 160                # number of bins along facc
use_equal_count = False    # True -> equal-count (quantile) bins
min_n = 15                 # minimum observations per bin to keep

if use_equal_count:
    # Rank first so ties in facc cannot produce duplicate quantile edges.
    d['bin'] = pd.qcut(d['facc'].rank(method='first'), q=nbins, duplicates='drop')
else:
    # Equal-width bins in log10(facc). The variable is plotted on a log axis;
    # equal-width bins in linear units would put nearly every row into the
    # first bin and leave most of the others below min_n. facc is strictly
    # positive here (filtered when `d` is built), so log10 is defined.
    d['bin'] = pd.cut(np.log10(d['facc']), bins=nbins, include_lowest=True)

g = d.groupby('bin', observed=True)

# One row per bin: row count, median facc (x position of the bin, in the
# original units), median slope, and the 10th / 90th slope percentiles.
agg = (
    g.agg(
        n=('facc', 'size'),
        x_mid=('facc', 'median'),
        slope_med=('slope1', 'median'),
        slope_low=('slope1', lambda z: nanpercentile_safe(z, 10)),
        slope_high=('slope1', lambda z: nanpercentile_safe(z, 90)),
    )
    .reset_index(drop=True)
)

# Keep sufficiently populated bins whose statistics are all finite, sorted by x.
agg = agg[
    (agg['n'] >= min_n) &
    agg[['x_mid','slope_med','slope_low','slope_high']]
      .replace([np.inf,-np.inf], np.nan)
      .notna()
      .all(axis=1)
].sort_values('x_mid')

# =========================
# 4) LOG-LOG regression: log10(y) ~ log10(x)
# =========================
# Both variables are log-transformed, so both must be finite and positive.
mask_fit = np.isfinite(x) & np.isfinite(y) & (x > 0) & (y > 0)
logx_in, logy_in = np.log10(x[mask_fit]), np.log10(y[mask_fit])

# sklearn expects a 2-D feature matrix, hence the added axis.
model = HuberRegressor().fit(logx_in[:, np.newaxis], logy_in)

# Evaluate the fit on an even grid in log10(x) ...
logx_fit = np.linspace(logx_in.min(), logx_in.max(), num=200)
logy_pred = model.predict(logx_fit[:, np.newaxis])

# ... and map both coordinates back to linear units for plotting.
x_fit = np.power(10.0, logx_fit)
y_fit_line = np.power(10.0, logy_pred)

# =========================
# 5) Background density scatter
# =========================
mask_scatter = np.isfinite(x) & np.isfinite(y)
_x_ok, _y_ok = x[mask_scatter], y[mask_scatter]

# Density is estimated on the log10-transformed coordinates.
xy = np.vstack([np.log10(_x_ok), np.log10(_y_ok)])
if xy.shape[1] < 5:
    # Too few points for a density estimate: use a constant colour value.
    xb, yb, zb = _x_ok, _y_ok, np.ones(mask_scatter.sum())
else:
    z = gaussian_kde(xy)(xy)
    # Ascending density order, so the densest points are drawn last.
    idx = z.argsort()
    xb, yb, zb = _x_ok[idx], _y_ok[idx], z[idx]

# =========================
# 6) Plot
# =========================
fig, ax = plt.subplots(figsize=(9, 7))

# Faint raw points coloured by density
ax.scatter(xb, yb, c=zb, cmap='viridis', s=8, alpha=0.15)

# Per-bin spread band (10th–90th percentile), median points and median line
ax.fill_between(agg['x_mid'], agg['slope_low'], agg['slope_high'],
                alpha=0.18, step='mid', label='10–90% spread')
ax.scatter(agg['x_mid'], agg['slope_med'],
           s=46, edgecolor='k', lw=0.4, alpha=0.95,
           label=f"Binned median (n≥{min_n})")
ax.plot(agg['x_mid'], agg['slope_med'], 'k-', lw=1.6, alpha=0.9,
        zorder=3, label="Binned median trend")

# Regression curve. Drawn in a saturated colour above the other artists: a
# white line disappears against the default white axes background (the
# scatter underneath is only alpha=0.15) and its legend handle is invisible.
ax.plot(x_fit, y_fit_line, '-', color='crimson', lw=2.2, zorder=4,
        label="Log–log Huber fit")

# === BOTH AXES LOG SCALE ===
ax.set_xscale("log")
ax.set_yscale("log")

ax.set_xlabel("Flow accumulation (log scale)")
ax.set_ylabel("Bank slope (log scale)")
ax.grid(True, which="both", linestyle="--", linewidth=0.35)
ax.set_title("Bank Slope vs Flow Accumulation (log–log binned medians + Huber fit)")
ax.legend()
fig.tight_layout()
plt.show()

# =========================
# 7) Print model summary (power-law form)
# =========================
# The fit is log10(slope) = log10(a) + b * log10(facc), i.e. slope = a * facc**b.
a_log10, b = model.intercept_, model.coef_[0]
a = 10**a_log10
print(f"Power law: Slope ≈ {a:.3g} × (Facc)^{b:.3f}")

