In [88]:
########################################
# STEP 0: import libraries
########################################
import pandas as pd
import sklearn.datasets
import sklearn.decomposition
import sklearn.discriminant_analysis
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neural_network
import sklearn.model_selection
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.preprocessing
import sklearn.random_projection
import sklearn.tree
import sklearn.svm
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBClassifier
import numpy as np
import xgboost as xgb

In [67]:
import matplotlib.pyplot as plt
from pathlib import Path

data_path = Path('./') / 'data'
train = pd.read_csv(data_path / "train.csv")
client = pd.read_csv(data_path / "client.csv")
historical_weather = pd.read_csv(data_path / "historical_weather.csv")
forecast_weather = pd.read_csv(data_path / "forecast_weather.csv")
electricity = pd.read_csv(data_path / "electricity_prices.csv")
gas = pd.read_csv(data_path / "gas_prices.csv")

location = (pd.read_csv(data_path / "county_lon_lats.csv")
            .drop(columns=["Unnamed: 0"])
           )


In [None]:
class FeatureProcessorClass():
    def __init__(self):         
        # Columns to join on for the different datasets
        self.weather_join = ['datetime', 'county', 'data_block_id']
        self.gas_join = ['data_block_id']
        self.electricity_join = ['datetime', 'data_block_id']
        self.client_join = ['county', 'is_business', 'product_type', 'data_block_id']
        
        # Columns of latitude & longitude
        self.lat_lon_columns = ['latitude', 'longitude']
        
        # Aggregate stats 
        self.agg_stats = ['mean'] #, 'min', 'max', 'std', 'median']
        
        # Categorical columns (specify for XGBoost)
        self.category_columns = ['county', 'is_business', 'product_type', 'is_consumption', 'data_block_id']

    def create_new_column_names(self, cons, suffix, columns_no_change):
        '''Change column names by given suffix, keep columns_no_change, and return back the data'''
        cons.columns = [col + suffix 
                      if col not in columns_no_change
                      else col
                      for col in cons.columns
                      ]
        return cons 

    def flatten_multi_index_columns(self, cons):
        cons.columns = ['_'.join([col for col in multi_col if len(col)>0]) 
                      for multi_col in cons.columns]
        return cons
    
    def create_data_features(self, data):
        '''📊Create features for main data (test or train) set📊'''
        # To datetime
        data['datetime'] = pd.to_datetime(data['datetime'])
        
        # Time period features
        data['date'] = data['datetime'].dt.normalize()
        data['year'] = data['datetime'].dt.year
        data['quarter'] = data['datetime'].dt.quarter
        data['month'] = data['datetime'].dt.month
        data['week'] = data['datetime'].dt.isocalendar().week
        data['hour'] = data['datetime'].dt.hour
        
        # Day features
        data['day_of_year'] = data['datetime'].dt.day_of_year
        data['day_of_month']  = data['datetime'].dt.day
        data['day_of_week'] = data['datetime'].dt.day_of_week
        return data

    def create_client_features(self, client):
        '''💼 Create client features 💼'''
        # Modify column names - specify suffix
        client = self.create_new_column_names(client, 
                                           suffix='_client',
                                           columns_no_change = self.client_join
                                          )       
        return client
    
    def create_historical_weather_features(self, historical_weather):
        '''⌛🌤️ Create historical weather features 🌤️⌛'''
        
        # To datetime
        historical_weather['datetime'] = pd.to_datetime(historical_weather['datetime'])
        
        # Add county
        historical_weather[self.lat_lon_columns] = historical_weather[self.lat_lon_columns].astype(float).round(1)
        historical_weather = historical_weather.merge(location, how = 'left', on = self.lat_lon_columns)

        # Modify column names - specify suffix
        historical_weather = self.create_new_column_names(historical_weather,
                                                          suffix='_h',
                                                          columns_no_change = self.lat_lon_columns + self.weather_join
                                                          ) 
        
        # Group by & calculate aggregate stats 
        agg_columns = [col for col in historical_weather.columns if col not in self.lat_lon_columns + self.weather_join]
        agg_dict = {agg_col: self.agg_stats for agg_col in agg_columns}
        historical_weather = historical_weather.groupby(self.weather_join).agg(agg_dict).reset_index() 
        
        # Flatten the multi column aggregates
        historical_weather = self.flatten_multi_index_columns(historical_weather) 
        
        # Test set has 1 day offset for hour<11 and 2 day offset for hour>11
        historical_weather['hour_h'] = historical_weather['datetime'].dt.hour
        historical_weather['datetime'] = (historical_weather
                                               .apply(lambda x: 
                                                      x['datetime'] + pd.DateOffset(1) 
                                                      if x['hour_h']< 11 
                                                      else x['datetime'] + pd.DateOffset(2),
                                                      axis=1)
                                              )
        
        return historical_weather
    
    def create_forecast_weather_features(self, forecast_weather):
        '''🔮🌤️ Create forecast weather features 🌤️🔮'''
        
        # Rename column and drop
        forecast_weather = (forecast_weather
                            .rename(columns = {'forecast_datetime': 'datetime'})
                            .drop(columns = 'origin_datetime') # not needed
                           )
        
        # To datetime
        forecast_weather['datetime'] = (pd.to_datetime(forecast_weather['datetime'])
                                        .dt
                                        .tz_localize(None)
                                       )

        # Add county
        forecast_weather[self.lat_lon_columns] = forecast_weather[self.lat_lon_columns].astype(float).round(1)
        forecast_weather = forecast_weather.merge(location, how = 'left', on = self.lat_lon_columns)
        
        # Modify column names - specify suffix
        forecast_weather = self.create_new_column_names(forecast_weather,
                                                        suffix='_f',
                                                        columns_no_change = self.lat_lon_columns + self.weather_join
                                                        ) 
        
        # Group by & calculate aggregate stats 
        agg_columns = [col for col in forecast_weather.columns if col not in self.lat_lon_columns + self.weather_join]
        agg_dict = {agg_col: self.agg_stats for agg_col in agg_columns}
        forecast_weather = forecast_weather.groupby(self.weather_join).agg(agg_dict).reset_index() 
        
        # Flatten the multi column aggregates
        forecast_weather = self.flatten_multi_index_columns(forecast_weather)     
        return forecast_weather

    def create_electricity_features(self, electricity):
        '''⚡ Create electricity prices features ⚡'''
        # To datetime
        electricity['forecast_date'] = pd.to_datetime(electricity['forecast_date'])
        
        # Test set has 1 day offset
        electricity['datetime'] = electricity['forecast_date'] + pd.DateOffset(1)
        
        # Modify column names - specify suffix
        electricity = self.create_new_column_names(electricity, 
                                                   suffix='_electricity',
                                                   columns_no_change = self.electricity_join
                                                  )             
        return electricity

    def create_gas_features(self, gas):
        '''⛽ Create gas prices features ⛽'''
        # Mean gas price
        gas['mean_price_per_mwh'] = (gas['lowest_price_per_mwh'] + gas['highest_price_per_mwh'])/2
        
        # Modify column names - specify suffix
        gas = self.create_new_column_names(gas, 
                                           suffix='_gas',
                                           columns_no_change = self.gas_join
                                          )       
        return gas
    
    def __call__(self, data, client, historical_weather, forecast_weather, electricity, gas):
        '''Processing of features from all datasets, merge together and return features for dataframe cons '''
        # Create features for relevant dataset
        data = self.create_data_features(data)
        client = self.create_client_features(client)
        historical_weather = self.create_historical_weather_features(historical_weather)
        forecast_weather = self.create_forecast_weather_features(forecast_weather)
        electricity = self.create_electricity_features(electricity)
        gas = self.create_gas_features(gas)
        
        # 🔗 Merge all datasets into one cons 🔗
        cons = data.merge(client, how='left', on = self.client_join)
        cons = cons.merge(historical_weather, how='left', on = self.weather_join)
        cons = cons.merge(forecast_weather, how='left', on = self.weather_join)
        cons = cons.merge(electricity, how='left', on = self.electricity_join)
        cons = cons.merge(gas, how='left', on = self.gas_join)
        
        # Change columns to categorical for XGBoost
        cons[self.category_columns] = cons[self.category_columns].astype('category')
        return cons
    
def create_revealed_targets_train(data, N_day_lags):
    '''🎯 Create past revealed_targets for train set based on number of day lags N_day_lags 🎯 '''    
    original_datetime = data['datetime']
    revealed_targets = data[['datetime', 'prediction_unit_id', 'is_consumption', 'target']].copy()
    
    # Create revealed targets for all day lags
    for day_lag in range(2, N_day_lags+1):
        revealed_targets['datetime'] = original_datetime + pd.DateOffset(day_lag)
        data = data.merge(revealed_targets, 
                          how='left', 
                          on = ['datetime', 'prediction_unit_id', 'is_consumption'],
                          suffixes = ('', f'_{day_lag}_days_ago')
                         )
    return data

In [None]:
# Generate features for training data
# NOTE: this can take a little bit to run
FeatureProcessor = FeatureProcessorClass()
N_day_lags = 15  # incorporate data from historical 15 days


data = FeatureProcessor(data = train.copy(),
                      client = client.copy(),
                      historical_weather = historical_weather.copy(),
                      forecast_weather = forecast_weather.copy(),
                      electricity = electricity.copy(),
                      gas = gas.copy(),
                     )

cons = create_revealed_targets_train(data.copy(), 
                                  N_day_lags = N_day_lags)

In [None]:
# Save features to CSV
cons.to_csv('train_features.csv', index = False)

In [None]:
cons.shape

(2018352, 71)

In [None]:
cons.columns

Index(['county', 'is_business', 'product_type', 'target', 'is_consumption',
       'datetime', 'data_block_id', 'row_id', 'prediction_unit_id', 'date',
       'year', 'quarter', 'month', 'week', 'hour', 'day_of_year',
       'day_of_month', 'day_of_week', 'eic_count_client',
       'installed_capacity_client', 'date_client', 'temperature_h_mean',
       'dewpoint_h_mean', 'rain_h_mean', 'snowfall_h_mean',
       'surface_pressure_h_mean', 'cloudcover_total_h_mean',
       'cloudcover_low_h_mean', 'cloudcover_mid_h_mean',
       'cloudcover_high_h_mean', 'windspeed_10m_h_mean',
       'winddirection_10m_h_mean', 'shortwave_radiation_h_mean',
       'direct_solar_radiation_h_mean', 'diffuse_radiation_h_mean', 'hour_h',
       'hours_ahead_f_mean', 'temperature_f_mean', 'dewpoint_f_mean',
       'cloudcover_high_f_mean', 'cloudcover_low_f_mean',
       'cloudcover_mid_f_mean', 'cloudcover_total_f_mean',
       '10_metre_u_wind_component_f_mean', '10_metre_v_wind_component_f_mean',
       

In [None]:
cons = cons[cons['is_consumption'] == 1].copy()
prod = cons[cons['is_consumption'] == 0].copy()
cons.shape, prod.shape

((1009176, 71), (1009176, 71))

In [91]:
for col in cons.select_dtypes(include=['object', 'category']).columns:
    print(f"{col}: {cons[col].nunique()} unique values")

county: 16 unique values
is_business: 2 unique values
product_type: 4 unique values
is_consumption: 1 unique values
data_block_id: 638 unique values
date_client: 636 unique values
origin_date_electricity: 15286 unique values
forecast_date_gas: 637 unique values
origin_date_gas: 637 unique values


In [92]:
columns_to_drop = ['data_block_id', 'date_client', 'origin_date_electricity', 'forecast_date_gas', 'origin_date_gas']  # Replace with actual column names
cons = cons.drop(columns=columns_to_drop)

In [93]:

########################################
# STEP 2: Apply "non-learned" data transformations
########################################

# Limited to these 4 feature transformations

# one hot encode categorical columns (increases vc dim)
cons = pd.get_dummies(cons)
print(f"cons.shape={cons.shape}")

# convert non-numeric columns to numeric (no effect on vd dim)
# le = sklearn.preprocessing.LabelEncoder()
# cons = cons[cons.columns[:]].apply(le.fit_transform)
# print(f"cons.shape={cons.shape}")

# apply the polynomial feature map (increases vc dim)
# poly = sklearn.preprocessing.PolynomialFeatures(3)
# cons = poly.fit_transform(cons)
# print(f"cons.shape={cons.shape}")

# apply a random projection (decreases vc dim)
# proj = sklearn.random_projection.GaussianRandomProjection(
#     n_components=120, # output dimension
#     random_state=42,
#     )
# cons = proj.fit_transform(cons)
# print(f"cons.shape={cons.shape}")

cons.shape=(1009176, 84)


In [94]:
########################################
# STEP 3: Create train/test sets
########################################

target = cons['target']/cons['installed_capacity_client']
cons = cons.drop(columns=['target', 'installed_capacity_client'])

train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# ensure that the ratios sum to 1.0
epsilon = 1e-10
assert(1 - epsilon <= train_ratio + validation_ratio + test_ratio <= 1 + epsilon)

# create train0/test set
x_train0, x_test, y_train0, y_test = sklearn.model_selection.train_test_split(
    cons,
    target,
    test_size=test_ratio,
    random_state=0,
    )
print(f"len(x_train0)={len(x_train0)}")
print(f"len(x_test)={len(x_test)}")

# create train/validation set
x_train, x_val, y_train, y_val = sklearn.model_selection.train_test_split(
    x_train0,
    y_train0,
    test_size=validation_ratio/(train_ratio + validation_ratio),
    random_state=0,
    )
print(f"len(x_train)={len(x_train)}")
print(f"len(x_val)={len(x_val)}")

KeyError: 'target'

In [None]:
# Preprocess datetime columns
datetime_columns = ['datetime', 'date', 'forecast_date_electricity']  # Replace with actual datetime column names

# Convert datetime columns to numeric features (e.g., year, month, day)
for col in datetime_columns:
    if col in x_train.columns:
        x_train[col + '_year'] = x_train[col].dt.year
        x_train[col + '_month'] = x_train[col].dt.month
        x_train[col + '_day'] = x_train[col].dt.day
        x_train = x_train.drop(columns=[col])  # Drop the original datetime column

        x_val[col + '_year'] = x_val[col].dt.year
        x_val[col + '_month'] = x_val[col].dt.month
        x_val[col + '_day'] = x_val[col].dt.day
        x_val = x_val.drop(columns=[col])

        x_test[col + '_year'] = x_test[col].dt.year
        x_test[col + '_month'] = x_test[col].dt.month
        x_test[col + '_day'] = x_test[col].dt.day
        x_test = x_test.drop(columns=[col])

# Ensure all columns are numeric
x_train = x_train.apply(pd.to_numeric, errors='coerce')
x_val = x_val.apply(pd.to_numeric, errors='coerce')
x_test = x_test.apply(pd.to_numeric, errors='coerce')



In [None]:
# Find indices with NaN in y_train, y_val, y_test
nan_indices_train = y_train[y_train.isna()].index
nan_indices_val = y_val[y_val.isna()].index
nan_indices_test = y_test[y_test.isna()].index

# Drop the corresponding rows from features and target
x_train = x_train.drop(index=nan_indices_train)
y_train = y_train.drop(index=nan_indices_train)

x_val = x_val.drop(index=nan_indices_val)
y_val = y_val.drop(index=nan_indices_val)

x_test = x_test.drop(index=nan_indices_test)
y_test = y_test.drop(index=nan_indices_test)

In [None]:
# Fill NaN with mean/median/mode for each column
for col in x_train.columns:
    # For numerical columns, use mean or median
    if x_train[col].dtype in ['int64', 'float64']:
        # Use median for robustness against outliers
        median_value = x_train[col].median()
        x_train[col] = x_train[col].fillna(median_value)
        x_val[col] = x_val[col].fillna(median_value)
        x_test[col] = x_test[col].fillna(median_value)
    else:
        # For categorical columns, use mode (most frequent value)
        mode_value = x_train[col].mode()[0]
        x_train[col] = x_train[col].fillna(mode_value)
        x_val[col] = x_val[col].fillna(mode_value)
        x_test[col] = x_test[col].fillna(mode_value)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [None]:
########################################
# STEP 5: Train a model
########################################

# NOTE:
# the models below are listed in the order we covered them in class;
# the parameters are listed in the order of the documentation;
# you are responsible for understanding how all specified parameters impact the runtime and/or statistical errors

# Most of our discussions in class is about "error"
# accuracy = 1 - error

model = xgb.XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',  # Objective for regression
    max_depth=9,
    min_child_weight=1,
    colsample_bytree=0.8,
    subsample=0.8,
    n_estimators=400,
    learning_rate=0.01,
    reg_lambda=1,
    reg_alpha=0,
    random_state=42
)


model.fit(x_train, y_train)

# Make predictions
y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)

# Calculate R² scores
validation_r2 = model.score(x_val, y_val)
train_r2 = model.score(x_train, y_train)

# Calculate Mean Absolute Error
train_mae = mean_absolute_error(y_train, y_train_pred)
validation_mae = mean_absolute_error(y_val, y_val_pred)

# Calculate Root Mean Squared Error
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
validation_rmse = mean_squared_error(y_val, y_val_pred, squared=False)

# Print all metrics
print(f"Training metrics:")
print(f"  R² Score: {train_r2:.4f}")
print(f"  MAE: {train_mae:.4f}")
print(f"  RMSE: {train_rmse:.4f}")
print("\nValidation metrics:")
print(f"  R² Score: {validation_r2:.4f}")
print(f"  MAE: {validation_mae:.4f}")
print(f"  RMSE: {validation_rmse:.4f}")


validation_accuracy=0.9579
train_accuracy=0.9734


In [95]:
target_mean = y_train.mean()
target_std = y_train.std()
print(f"Target mean: {target_mean:.4f}, std: {target_std:.4f}")
print(f"MAE: {validation_mae:.4f}")
print(f"MAE as % of mean: {(validation_mae/abs(target_mean))*100:.2f}%")

Target mean: 0.0625, std: 0.1372
MAE: 0.0165
MAE as % of mean: 26.36%


In [79]:
for col in prod.select_dtypes(include=['object', 'category']).columns:
    print(f"{col}: {prod[col].nunique()} unique values")

county: 16 unique values
is_business: 2 unique values
product_type: 4 unique values
is_consumption: 1 unique values
data_block_id: 638 unique values
date_client: 636 unique values
origin_date_electricity: 15286 unique values
forecast_date_gas: 637 unique values
origin_date_gas: 637 unique values


In [80]:
columns_to_drop = ['data_block_id', 'date_client', 'origin_date_electricity', 'forecast_date_gas', 'origin_date_gas']  # Replace with actual column names
prod = prod.drop(columns=columns_to_drop)

In [81]:

########################################
# STEP 2: Apply "non-learned" data transformations
########################################

# Limited to these 4 feature transformations

# one hot encode categorical columns (increases vc dim)
prod = pd.get_dummies(prod)
print(f"cons.shape={prod.shape}")

# convert non-numeric columns to numeric (no effect on vd dim)
# le = sklearn.preprocessing.LabelEncoder()
# cons = cons[cons.columns[:]].apply(le.fit_transform)
# print(f"cons.shape={cons.shape}")

# apply the polynomial feature map (increases vc dim)
# poly = sklearn.preprocessing.PolynomialFeatures(3)
# cons = poly.fit_transform(cons)
# print(f"cons.shape={cons.shape}")

# apply a random projection (decreases vc dim)
# proj = sklearn.random_projection.GaussianRandomProjection(
#     n_components=120, # output dimension
#     random_state=42,
#     )
# cons = proj.fit_transform(cons)
# print(f"cons.shape={cons.shape}")

cons.shape=(1009176, 86)


In [82]:
########################################
# STEP 3: Create train/test sets
########################################

target = prod['target']/prod['installed_capacity_client']
prod = prod.drop(columns=['target', 'installed_capacity_client'])

train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# ensure that the ratios sum to 1.0
epsilon = 1e-10
assert(1 - epsilon <= train_ratio + validation_ratio + test_ratio <= 1 + epsilon)

# create train0/test set
x_train0, x_test, y_train0, y_test = sklearn.model_selection.train_test_split(
    prod,
    target,
    test_size=test_ratio,
    random_state=0,
    )
print(f"len(x_train0)={len(x_train0)}")
print(f"len(x_test)={len(x_test)}")

# create train/validation set
x_train, x_val, y_train, y_val = sklearn.model_selection.train_test_split(
    x_train0,
    y_train0,
    test_size=validation_ratio/(train_ratio + validation_ratio),
    random_state=0,
    )
print(f"len(x_train)={len(x_train)}")
print(f"len(x_val)={len(x_val)}")

len(x_train0)=908258
len(x_test)=100918
len(x_train)=756881
len(x_val)=151377


In [83]:
# Preprocess datetime columns
datetime_columns = ['datetime', 'date', 'forecast_date_electricity']  # Replace with actual datetime column names

# Convert datetime columns to numeric features (e.g., year, month, day)
for col in datetime_columns:
    if col in x_train.columns:
        x_train[col + '_year'] = x_train[col].dt.year
        x_train[col + '_month'] = x_train[col].dt.month
        x_train[col + '_day'] = x_train[col].dt.day
        x_train = x_train.drop(columns=[col])  # Drop the original datetime column

        x_val[col + '_year'] = x_val[col].dt.year
        x_val[col + '_month'] = x_val[col].dt.month
        x_val[col + '_day'] = x_val[col].dt.day
        x_val = x_val.drop(columns=[col])

        x_test[col + '_year'] = x_test[col].dt.year
        x_test[col + '_month'] = x_test[col].dt.month
        x_test[col + '_day'] = x_test[col].dt.day
        x_test = x_test.drop(columns=[col])

# Ensure all columns are numeric
x_train = x_train.apply(pd.to_numeric, errors='coerce')
x_val = x_val.apply(pd.to_numeric, errors='coerce')
x_test = x_test.apply(pd.to_numeric, errors='coerce')



In [84]:
# Find indices with NaN in y_train, y_val, y_test
nan_indices_train = y_train[y_train.isna()].index
nan_indices_val = y_val[y_val.isna()].index
nan_indices_test = y_test[y_test.isna()].index

# Drop the corresponding rows from features and target
x_train = x_train.drop(index=nan_indices_train)
y_train = y_train.drop(index=nan_indices_train)

x_val = x_val.drop(index=nan_indices_val)
y_val = y_val.drop(index=nan_indices_val)

x_test = x_test.drop(index=nan_indices_test)
y_test = y_test.drop(index=nan_indices_test)

In [85]:
# Fill NaN with mean/median/mode for each column
for col in x_train.columns:
    # For numerical columns, use mean or median
    if x_train[col].dtype in ['int64', 'float64']:
        # Use median for robustness against outliers
        median_value = x_train[col].median()
        x_train[col] = x_train[col].fillna(median_value)
        x_val[col] = x_val[col].fillna(median_value)
        x_test[col] = x_test[col].fillna(median_value)
    else:
        # For categorical columns, use mode (most frequent value)
        mode_value = x_train[col].mode()[0]
        x_train[col] = x_train[col].fillna(mode_value)
        x_val[col] = x_val[col].fillna(mode_value)
        x_test[col] = x_test[col].fillna(mode_value)

In [89]:
########################################
# STEP 5: Train a model
########################################

# NOTE:
# the models below are listed in the order we covered them in class;
# the parameters are listed in the order of the documentation;
# you are responsible for understanding how all specified parameters impact the runtime and/or statistical errors

# Most of our discussions in class is about "error"
# accuracy = 1 - error

model = xgb.XGBRegressor(
    booster='gbtree',
    objective='reg:squarederror',  # Objective for regression
    max_depth=9,
    min_child_weight=1,
    colsample_bytree=0.8,
    subsample=0.8,
    n_estimators=400,
    learning_rate=0.01,
    reg_lambda=1,
    reg_alpha=0,
    random_state=42
)


model.fit(x_train, y_train)

# Make predictions
y_train_pred = model.predict(x_train)
y_val_pred = model.predict(x_val)

# Calculate R² scores
validation_r2 = model.score(x_val, y_val)
train_r2 = model.score(x_train, y_train)

# Calculate Mean Absolute Error
train_mae = mean_absolute_error(y_train, y_train_pred)
validation_mae = mean_absolute_error(y_val, y_val_pred)

# Calculate Root Mean Squared Error
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)
validation_rmse = mean_squared_error(y_val, y_val_pred, squared=False)

# Print all metrics
print(f"Training metrics:")
print(f"  R² Score: {train_r2:.4f}")
print(f"  MAE: {train_mae:.4f}")
print(f"  RMSE: {train_rmse:.4f}")
print("\nValidation metrics:")
print(f"  R² Score: {validation_r2:.4f}")
print(f"  MAE: {validation_mae:.4f}")
print(f"  RMSE: {validation_rmse:.4f}")

Training metrics:
  R² Score: 0.9267
  MAE: 0.0157
  RMSE: 0.0372

Validation metrics:
  R² Score: 0.9109
  MAE: 0.0165
  RMSE: 0.0410




In [90]:
target_mean = y_train.mean()
target_std = y_train.std()
print(f"Target mean: {target_mean:.4f}, std: {target_std:.4f}")
print(f"MAE: {validation_mae:.4f}")
print(f"MAE as % of mean: {(validation_mae/abs(target_mean))*100:.2f}%")

Target mean: 0.0625, std: 0.1372
MAE: 0.0165
MAE as % of mean: 26.36%


In [None]:
########################################
# STEP 6: Evaluate on test set (Won't be run on test)
########################################

# WARNING:
# this code should be run only once;
# after the hyperparameters have been decided based on the validation performance,
# then the False can be changed to True to run this code
if False:
    model.fit(x_train0, y_train0)
    test_accuracy = model.score(x_test, y_test)
    print(f"test_accuracy={test_accuracy}")