### **REGRESSION MODELS**
by J. Daniel Velez

This notebook tests several regression models.

In [None]:
from f_filter_process import get_file, call_file, export_dataframe
from regression_models import*
import pandas as pd
import geopandas as gpd

In [None]:
# Load the input data through the project helper `call_file` (imported from f_filter_process).
# NOTE(review): presumably a dict of node data with 'width' and 'wse' keys, since it is
# passed below as the `river` argument of the regression functions - confirm.
River_dict = call_file()

##### Linear Regression
Args for the 'l_regression' function:
 * river (dict): Dictionary containing node data with 'width' and 'wse' keys.
 * min_spearman (float or None): Minimum Spearman correlation value to include a node in the plot.
                                 If None, no filtering is applied (default: None).
 * min_obs (int): Minimum number of observations required to display a scatter plot for a node (default: 10).
 * show_p_value (bool): If True, displays the p-value on each scatter plot (default: True).

In [None]:
################## 1. Linear Regression ##################
# Run `l_regression` on River_dict with every threshold spelled out explicitly;
# the returned table is kept in `linear_reg` for renaming, export and merging.
linear_reg = l_regression(
    River_dict,
    min_spearman=0.4,
    min_obs=10,
    show_p_value=True,
    min_p_value=0.05,
)

In [None]:
# Rename the result columns in place; 'node_id' is the key column used for the
# merge with the node layer further down.
linear_reg.rename(
    columns={
        'Node': 'node_id',
        'Spearman': 'spearman_corr',
    },
    inplace=True,
)
# Show the renamed table as the cell output.
linear_reg

In [None]:
# Export the linear-regression table, flagged as a non-geodataframe.
export_dataframe(
    linear_reg,
    is_geodataframe=False,
)

In [None]:
# Load the layer that the regression results will be merged into.
# NOTE(review): presumably a GeoDataFrame with 'node_id' and 'geometry' columns,
# given how it is merged and wrapped in gpd.GeoDataFrame below - confirm.
river_gpkg = call_file()

In [None]:
## Define the common column used for merging, e.g., "id" (replace "id" with your actual column name)
common_column = "node_id"
#Merge the geodataframe with the dataframe
l_regression = river_gpkg.merge(linear_reg, on=common_column).T.drop_duplicates().T # Drop duplicate columns

In [None]:
#l_regression['geometry'] = gpd.GeoSeries.from_wkt(l_regression['geometry'])
l_regression = gpd.GeoDataFrame(l_regression, geometry='geometry', crs="EPSG:4326")
# Ensure Slope and Intercept are real numbers and round them to 3 decimals
l_regression['Slope'] = pd.to_numeric(l_regression['Slope'], errors='coerce').round(3).map(lambda x: f"{x:.3f}")
l_regression['Intercept'] = pd.to_numeric(l_regression['Intercept'], errors='coerce').round(3).map(lambda x: f"{x:.3f}")

In [None]:
export_dataframe(l_regression,is_geodataframe=True)

In [None]:
l_regression

### Huber regression:

Because the residuals of the observations are not normally distributed at some nodes, a robust model should be used. Here the epsilon parameter is fixed at 1.345 because it is the canonical default in robust statistics for the Huber loss, and it is widely accepted as the best trade-off between:
* Efficiency under normal errors (≈ 95% of OLS efficiency), and
* Robustness to outliers (by limiting the influence of large residuals).

In [None]:
################## 2. Piecewise and Simple Huber Linear Regression ##################
# Run the Huber regression helper on the same River_dict used for the linear regression.
# NOTE(review): presumably returns one row per node with a 'node_id' column, since the
# result is merged on that column below - confirm against regression_models.
dfe = fixed_huber_piecewise_aic(River_dict)

In [None]:
# Export the Huber regression table, flagged as a non-geodataframe.
export_dataframe(
    dfe,
    is_geodataframe=False,
)

In [None]:
# Load the layer that the Huber regression results will be merged into.
# NOTE(review): presumably a GeoDataFrame with a 'node_id' column - confirm.
river_gpkg = call_file()

In [None]:
## Column shared by the node layer and the Huber results table; used as the merge key.
common_column = "node_id"
# Merge the geodataframe with the Huber results on the key column.
# Unlike the linear-regression merge, no duplicate-column removal is applied here.
Huber_regression = river_gpkg.merge(
    dfe,
    on=common_column,
)

In [None]:
# Export the merged Huber layer, flagged as a geodataframe.
export_dataframe(
    Huber_regression,
    is_geodataframe=True,
)

Plot by node

In [None]:
# Linear regression for a single node, with the Spearman filter disabled (None)
# and the minimum observation count set to 0.
l_regression_node(
    River_dict,
    node_id='61204500050501',
    min_spearman=None,
    min_obs=0,
    show_p_value=True,
    min_p_value=0.05,
)

In [None]:
# Piecewise and simple Huber regression for a single node (based on AIC),
# drawn from the merged Huber layer.
plot_node_reg_2segs_fixed(
    Huber_regression,
    node_id='81247100020271',
)