## Model Development & Performance

In [7]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

#### Data Loading

In [2]:
# Read the modeling table from its CSV path (relative to this notebook).
modeling_data_path = '../data/modelingData/modelingDataFrame.csv'
df = pd.read_csv(modeling_data_path)

We will start with classic models to evaluate their effectiveness. As a first step, we will attempt to predict the number of customers without power (the `CustomersOut` column). We first remove columns that are not relevant to this task, such as time-related data and precise location identifiers (e.g., county name and code). The remaining data is then transformed and prepared for modeling.

### XGBRegressor

In [3]:
# Event-type names used to pick out the event columns of the modeling frame.
event_names = [
    'Astronomical Low Tide',
    'Extreme Cold/Wind Chill',
    'Flood',
    'Winter Weather',
    'Wildfire',
    'Heavy Rain',
    'Cold/Wind Chill',
    'Dense Fog',
    'Frost/Freeze',
    'Strong Wind',
    'Lake-Effect Snow',
    'Funnel Cloud',
    'Flash Flood',
    'Heavy Snow',
    'Ice Storm',
    'Thunderstorm Wind',
    'Avalanche',
    'Excessive Heat',
    'Coastal Flood',
    'Storm Surge/Tide',
    'Sleet',
    'Debris Flow',
    'Winter Storm',
    'Tropical Storm',
    'Dust Storm',
    'Drought',
    'Blizzard',
    'Lightning',
    'Tornado',
    'Hail',
    'Rip Current',
    'Heat',
    'Freezing Fog',
    'High Surf',
    'High Wind',
]

In [4]:
# Column the models are trained to predict.
target = 'CustomersOut'

# Model inputs, grouped by how the preprocessing step handles each group.
numeric_features = [
    'Tmin', 'Tmax', 'Tavg', 'Ppt',
    'Lat', 'Lng',
    'Magnitude',
]
categorical_features = ['Season', 'Region', 'Division']
# Only the event names that actually exist as columns in df, in df's column order.
event_features = df.columns[df.columns.isin(event_names)].tolist()

feature_columns = numeric_features + categorical_features + event_features
X = df[feature_columns]
y = df[target]

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode (categories not seen during fit are ignored at transform time).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each feature group to its transformer; event columns pass through as-is.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('event', 'passthrough', event_features),
])

# Gradient-boosted regressor settings, gathered in one place.
xgb_settings = dict(
    n_estimators=500,
    learning_rate=0.1,
    verbosity=1,
    random_state=42,
)
xgb = XGBRegressor(**xgb_settings)

# Full model: preprocessing followed by the regressor. The step names are part
# of the interface, since fit parameters are addressed as `regressor__<name>`.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb),
])



In [None]:
# Pipeline.fit forwards `regressor__*` fit parameters to the final step
# untouched, so XGBoost would receive the raw validation frame (unencoded
# categoricals, unscaled numerics) while training on transformed features.
# Run the validation rows through the preprocessor by hand first.
fitted_preprocessor = model_pipeline.named_steps['preprocessor'].fit(X_train, y_train)
X_val_prepared = fitted_preprocessor.transform(X_val)

# Early stopping is configured on the estimator itself: recent XGBoost releases
# no longer accept `early_stopping_rounds` as a fit() argument.
# NOTE(review): with this set, later refits of the pipeline presumably also
# need an eval_set — confirm against the installed XGBoost version.
model_pipeline.set_params(regressor__early_stopping_rounds=10)

# Pipeline.fit refits the preprocessor on the same training rows, so the
# transformation it learns matches the one applied to X_val_prepared above.
model_pipeline.fit(
    X_train, y_train,
    regressor__eval_set=[(X_val_prepared, y_val)],
    regressor__verbose=True
)