In [49]:
#!pip install xgboost
#!pip install hyperopt

In [50]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from baeysianOptimization import BayesianOptimization



pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [51]:
x_train = pd.read_csv('cleaned_and_combined_data/x_train_combined.csv')
y_train = pd.read_csv('cleaned_and_combined_data/y_train_combined.csv')
x_test = pd.read_csv('cleaned_and_combined_data/x_test_combined.csv')


In [52]:
# Add observed and estimated columns
x_train['observed'] = x_train['calc_year'].isna().astype(int)
x_train['estimated'] = x_train['calc_year'].notna().astype(int)
x_test['observed'] = x_test['calc_year'].isna().astype(int)
x_test['estimated'] = x_test['calc_year'].notna().astype(int)

# Split the data based on null values in the calc_year column
x_train_df, x_valid_df, y_train_df, y_valid_df = train_test_split(x_train, y_train, test_size = 0.3, random_state = 0)


# Drop the specified columns
columns_to_drop = ['calc_year', 'calc_month', 'calc_day', 'calc_hour','date_forecast']
#columns_to_drop+=(['forecast_day','forecast_month','forecast_hour'])
columns_to_drop+=(['hour_sin','hour_cos','month_sin','month_cos','day_of_year_sin','day_of_year_cos'])
x_train_df.drop(columns=columns_to_drop, inplace=True)
x_valid_df.drop(columns=columns_to_drop, inplace=True)
x_test.drop(columns=columns_to_drop, inplace=True)

# For cross-validation and training after validation
combined_xdf = pd.concat([x_train_df, x_valid_df], axis=0).reset_index(drop=True)
combined_ydf = pd.concat([y_train_df, y_valid_df], axis=0).reset_index(drop=True)

# Check the head of the test data for verification
x_test.head()


Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,day_of_year,direct_rad_3h_roll_avg,diffuse_rad_3h_roll_avg,direct_rad_6h_roll_avg,diffuse_rad_6h_roll_avg,direct_rad_x_sun_elevation,location_A,location_B,location_C,observed,estimated
0,4.325,1.28675,912.7,0.0,0.0,1061.55,0.0,271.65002,0.0,0.0,0.0,0.0,74.95,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1013.675,0.0,0.0,1000.55,1006.8,0.0,0.0,80.275,1013.1,,0.0,0.0,0.0,0.0,16.0265,-10.541,0.0,273.8,74.95,29907.5,3.95,2.1,3.35,0.0,2023,5,1,0,121,0.0,0.0,0.0,0.0,-0.0,1,0,0,0,1
1,4.275,1.286,1482.1,0.0,0.0,1075.1001,0.0,271.45,0.0,0.0,0.0,0.0,77.475,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1013.15,0.0,0.0,1000.05,1006.3,0.0,0.0,79.825,1012.6,,0.0,0.0,0.0,0.0,30.49725,-7.8945,0.0,273.8,77.475,29519.074,3.825,1.925,3.3,0.0,2023,5,1,1,121,0.0,0.0,0.0,0.0,-0.0,1,0,0,0,1
2,4.15,1.28375,1791.3,0.0,0.0,1200.4,0.0,271.05,0.0,0.0,0.0,0.0,88.1,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1012.675,0.0,0.0,999.5,1005.8,0.0,0.0,78.0,1012.05,,0.0,0.0,0.0,0.0,44.51725,-3.8155,0.0,273.84998,88.1,31009.125,3.65,1.75,3.2,0.0,2023,5,1,2,121,0.0,0.0,0.0,0.0,-0.0,1,0,0,0,1
3,4.025,1.282,2312.875,40497.7,11.675,1179.85,0.0,270.65,9.375,67380.91,2.1,15061.4,68.6,6.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1012.175,0.0,0.0,998.975,1005.225,0.0,0.0,75.625,1011.525,,0.0,0.0,0.0,0.0,58.083,1.4125,0.0,273.9,68.6,34552.5,3.5,1.45,3.15,0.0,2023,5,1,3,121,0.7,3.125,0.525,2.34375,2.96625,1,0,0,0,1
4,3.9,1.281,2198.2998,566994.4,76.875,920.05,0.0,270.375,47.4,408838.8,25.45,198284.8,66.3,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1011.725,0.0,0.0,998.55,1004.75,0.0,0.0,74.225,1011.05,,0.0,0.0,0.0,0.0,71.341,7.4685,0.0,273.925,66.3,35483.875,3.325,1.3,3.05,0.0,2023,5,1,4,121,9.183334,18.925001,5.51,11.355,190.07333,1,0,0,0,1


In [53]:
class BayesianOptimization:
    def __init__(self, x_train, y_train, x_valid, y_valid, space):
        self.x_train = x_train
        self.y_train = y_train
        self.x_valid = x_valid
        self.y_valid = y_valid
        self.space = space
        self.trials = Trials()

    def objective(self, params):
        model = xgb.XGBRegressor(**params, early_stopping_rounds=50)
        model.fit(self.x_train, self.y_train, eval_set=[(self.x_valid, self.y_valid)], verbose=False)
        pred = model.predict(self.x_valid)
        mae = mean_absolute_error(self.y_valid, pred)
        return {'loss': mae, 'status': STATUS_OK}

    def optimize(self, n_evals=100):
        best = fmin(fn=self.objective,
                    space=self.space,
                    algo=tpe.suggest,
                    max_evals=n_evals,
                    trials=self.trials)
        return best

    def get_best_params(self):
        return space_eval(self.space, self.trials.argmin)


In [54]:
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5),
    'max_depth': hp.choice('max_depth', range(1, 15, 1)),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'alpha': hp.loguniform('alpha', -5, 2),  # exponential distribution over [0.01, 7.4]
    'lambda': hp.loguniform('lambda', -5, 2), 
    'n_estimators': hp.choice('n_estimators', range(100, 1000, 50))
}


# # Create an instance of the optimizer
# optimizer = BayesianOptimization(x_train_df, y_train_df['pv_measurement'], x_valid_df, y_valid_df['pv_measurement'], space)

# # Optimize
# best_params = optimizer.optimize()

# optimizer.get_best_params()

In [55]:
from sklearn.preprocessing import MinMaxScaler

# Exclude boolean columns (location_A, location_B, location_C)
columns_to_scale = x_train.columns.difference(['location_A', 'location_B', 'location_C'])

scaler = MinMaxScaler()
x_train[columns_to_scale] = scaler.fit_transform(x_train[columns_to_scale])
x_test[columns_to_scale] = scaler.transform(x_test[columns_to_scale])


# Cross-Validation

# Lag features

In [59]:
dtrain_whole = xgb.DMatrix(combined_xdf,label = combined_ydf['pv_measurement'])
dtrain = xgb.DMatrix(x_train_df,label = y_train_df['pv_measurement'])
dval = xgb.DMatrix(x_valid_df,label = y_valid_df['pv_measurement'])
test = xgb.DMatrix(x_test)

In [60]:
fixed_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'mae',
        'booster': 'gbtree'
}
variable_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'alpha': 0.1,
    'lambda': 1,
}

all_params = {**fixed_params,**variable_params} 
#  Train the model
num_boost_round = 10000  # Maximum number of boosting rounds
early_stopping_rounds = 50  # Stop if validation score doesn't improve for 50 rounds

evals = [(dtrain, 'train'), (dval, 'eval')]
progress = {}


In [61]:
bst = xgb.train(
    all_params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    evals_result=progress
)

print("Best MAE: {:.2f} with {} rounds".format(
         bst.best_score,
         bst.best_iteration+1))


[0]	train-mae:430.21176	eval-mae:430.81288
[1]	train-mae:410.44635	eval-mae:411.03265
[2]	train-mae:391.73151	eval-mae:392.37869
[3]	train-mae:374.12450	eval-mae:374.75271
[4]	train-mae:357.58574	eval-mae:358.28632
[5]	train-mae:342.00300	eval-mae:342.72109
[6]	train-mae:327.33480	eval-mae:328.08792
[7]	train-mae:313.63025	eval-mae:314.43627
[8]	train-mae:300.73169	eval-mae:301.66094
[9]	train-mae:288.57162	eval-mae:289.61382


[10]	train-mae:277.16297	eval-mae:278.30952
[11]	train-mae:266.49147	eval-mae:267.64566
[12]	train-mae:256.46329	eval-mae:257.69400
[13]	train-mae:246.98128	eval-mae:248.32944
[14]	train-mae:238.07724	eval-mae:239.51394
[15]	train-mae:229.73398	eval-mae:231.32974
[16]	train-mae:221.88423	eval-mae:223.63578
[17]	train-mae:214.46645	eval-mae:216.38419
[18]	train-mae:207.48662	eval-mae:209.49479
[19]	train-mae:200.90722	eval-mae:203.06514
[20]	train-mae:194.74904	eval-mae:196.98856
[21]	train-mae:188.93914	eval-mae:191.32253
[22]	train-mae:183.49187	eval-mae:185.94101
[23]	train-mae:178.31923	eval-mae:180.87316
[24]	train-mae:173.45151	eval-mae:176.15235
[25]	train-mae:168.89981	eval-mae:171.79050
[26]	train-mae:164.58590	eval-mae:167.60887
[27]	train-mae:160.57217	eval-mae:163.66078
[28]	train-mae:156.72819	eval-mae:159.93829
[29]	train-mae:153.11405	eval-mae:156.42346
[30]	train-mae:149.65997	eval-mae:153.07246
[31]	train-mae:146.43122	eval-mae:149.96321
[32]	train-mae:143.37314	eval-ma

In [62]:
# Using the best iteration from previous training
best_iteration = bst.best_iteration + 1

# Retrain the model on the entire dataset
bst_whole = xgb.train(
    all_params,
    dtrain_whole,
    num_boost_round=best_iteration
)


In [63]:
predictions = bst_whole.predict(test)
predictions

array([-1.5106149 , -0.6375855 , -0.41436076, ...,  9.555201  ,
        2.2275333 , -1.4605165 ], dtype=float32)

In [64]:
sample_submission = pd.read_csv('sample_submission.csv')

# Convert the numpy array to a DataFrame
predictions_df = pd.DataFrame(predictions, columns=['prediction'])

# Convert all negative predictions to 0
predictions_df.loc[predictions_df['prediction'] < 0, 'prediction'] = 0

# Join the 'id' column from sample_submission with the predictions
sample_submission['prediction'] = predictions_df['prediction']

# Save to CSV
sample_submission.to_csv('xgboost.csv', index=False)
predictions_df

Unnamed: 0,prediction
0,0.000000
1,0.000000
2,0.000000
3,79.418022
4,352.832825
...,...
2155,48.357288
2156,23.512861
2157,9.555201
2158,2.227533
