# Import packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# Read data, select features, and split the data

In [None]:
# Load the preprocessed flood dataset.
df = pd.read_csv('/data/preproessed_collected_dataset_flood.csv')

# Derive the year with vectorized integer division instead of a per-row
# lambda (assumes DATE is a numeric YYYYMMDD value -- TODO confirm).
df['Year'] = df['DATE'] // 10000

# Keep a 1% sample of the rows inside every (LONGITUDE, LATITUDE, Year) group.
# GroupBy.sample (pandas >= 1.1) replaces groupby(...).apply(lambda x: x.sample(...)):
# it keeps the original row index instead of prepending the group keys as a
# MultiIndex. The fixed random_state makes the subsample repeatable, matching
# the seeded train/validation/test split below.
df = df.groupby(['LONGITUDE', 'LATITUDE', 'Year']).sample(frac=0.01, random_state=10)

# One-hot encode the categorical columns; every other column is passed through.
dummies = pd.get_dummies(df, columns=['FLD_ZONE', 'CATEGORY', 'STATE'])

# Model inputs: numeric predictors plus the FLD_ZONE indicator columns
# produced by get_dummies above.
feature_names = ['Elevation', 'Wind_f', 'Evap', 'Tair_f', 'Qair_f', 'Psurf_f',
                 'Streamflow', 'SoilMoist100_200cm', 'SoilTemp100_200cm', 'LC_Type2',
                 'FLD_ELEV', 'Qsb', 'CFLD_RISKS',
                 'RFLD_RISKS', 'HRCN_RISKS', 'Rainf_f_MA30',
                 'FLD_ZONE_A', 'FLD_ZONE_AE', 'FLD_ZONE_AH', 'FLD_ZONE_AO',
                 'FLD_ZONE_Nan', 'FLD_ZONE_VE', 'FLD_ZONE_X',
                 'FLD_ZONE_X PROTECTED BY LEVEE']

X = dummies[feature_names]

# Regression target.
y = dummies['FloodedFrac']

# 80% training / 10% validation / 10% testing split:
# first hold out 20% of the rows, then cut that hold-out set in half.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=10)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=10)

# Set parameters and train the model

In [None]:
# Random Forest hyperparameters
params = {
    "n_estimators": 500,
    "max_features": 8,
    "max_depth": 15,
    "min_samples_split": 2,
    "warm_start": True,
    "oob_score": True,
    "random_state": 42,
    "verbose": 1,
    "n_jobs": -1,
}

reg = RandomForestRegressor(**params)
scaler = StandardScaler()

# Fit the scaler on the training split only, then apply the same transform
# to the validation and test splits so no statistics leak from held-out data.
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train on the scaled matrix. All predictions in this notebook are made on the
# *_scaled arrays, so the model must be fitted in that same feature space;
# fitting on the raw X_train would make every learned split threshold
# meaningless for the standardized inputs.
reg.fit(X_train_scaled, y_train)

# Calculate MSE and RMSE

In [None]:
# Predict on the standardized train / validation / test matrices, in that order.
# NOTE(review): these inputs must be in the same feature space the model was
# fitted on.
rf_train_pred, rf_val_pred, rf_test_pred = (
    reg.predict(features)
    for features in (X_train_scaled, X_val_scaled, X_test_scaled)
)

# Validation error: mean squared error and its square root.
rf_val_mse = mean_squared_error(y_val, rf_val_pred)
rf_val_rmse = np.sqrt(rf_val_mse)

# Test error: mean squared error and its square root.
rf_test_mse = mean_squared_error(y_test, rf_test_pred)
rf_test_rmse = np.sqrt(rf_test_mse)

# Report the MSE pair first, then the RMSE pair.
print("Validation MSE:", rf_val_mse)
print("Test MSE:", rf_test_mse)
print("Validation RMSE:", rf_val_rmse)
print("Test RMSE:", rf_test_rmse)

# Create a scatter plot of actual vs. predicted y values

In [None]:
# Scatter the test-set targets against the model's predictions for them.
plt.scatter(y_test, rf_test_pred, alpha=0.6, color='blue', edgecolors='w')

# Work on the axes the scatter was drawn on.
ax = plt.gca()
ax.set_xlabel("Actual y values")
ax.set_ylabel("Predicted y values")
ax.set_title('RF - Actual vs Predicted')

# Use the same scale on both axes so the diagonal sits at 45 degrees.
ax.set_aspect('equal')

# Shared axis range: the smallest and largest value seen in either the
# actual or the predicted series.
lims = [
    np.minimum(y_test.min(), rf_test_pred.min()),
    np.maximum(y_test.max(), rf_test_pred.max()),
]
ax.set_xlim(lims)
ax.set_ylim(lims)

# Reference diagonal drawn beneath the points (zorder=0); a point on this
# line has predicted value equal to actual value.
ax.plot(lims, lims, 'k-', alpha=0.75, zorder=0)

ax.grid(True)
plt.show()
plt.clf()  # Clear the current figure

# Importance chart

In [None]:
# Importances reported by the fitted forest.
feature_importance = reg.feature_importances_

# Indices that order the features from largest to smallest importance,
# and one bar position per ranked feature.
sorted_idx = np.argsort(feature_importance)[::-1]
pos = np.arange(len(sorted_idx))

plt.figure(figsize=(10, 8))

# One horizontal bar per feature; position 0 holds the largest importance.
plt.barh(pos, np.take(feature_importance, sorted_idx), align="center")

# Label each bar position with the name of the feature it represents.
plt.yticks(pos, [feature_names[i] for i in sorted_idx])
plt.title("Feature Importance (MDI)")
plt.xlabel("Mean decrease in impurity")
plt.tight_layout()
plt.show()