In [153]:
#!pip install xgboost
#!pip install hyperopt

In [154]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error
from baeysianOptimization import BayesianOptimization

# # Exclude boolean columns (location_A, location_B, location_C)
# columns_to_scale = x_train.columns.difference(['location_A', 'location_B', 'location_C'])

# scaler = MinMaxScaler()
# x_train[columns_to_scale] = scaler.fit_transform(x_train[columns_to_scale])
# x_test[columns_to_scale] = scaler.transform(x_test[columns_to_scale])


# Raise both pandas display limits to 200 so long/wide frames are not truncated
# when they are rendered as cell output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 200)

In [155]:
# Load the pre-cleaned feature/target tables (train features, train targets,
# test features) in that order.
x_train, y_train, x_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'cleaned_and_combined_data/x_train_combined.csv',
        'cleaned_and_combined_data/y_train_combined.csv',
        'cleaned_and_combined_data/x_test_combined.csv',
    )
)


In [156]:
def add_source_flags(features):
    """Return a copy of `features` with complementary 0/1 `observed`/`estimated` columns.

    A row is flagged `estimated` when its `calc_year` value is present and
    `observed` when it is missing. The input frame is not modified.
    """
    has_calc_time = features['calc_year'].notna()
    return features.assign(
        observed=(~has_calc_time).astype(int),
        estimated=has_calc_time.astype(int),
    )


# Columns removed from every feature frame before modelling
columns_to_drop = ['calc_year', 'calc_month', 'calc_day', 'calc_hour', 'date_forecast']

# Add observed and estimated columns
x_train = add_source_flags(x_train)

# x_test is rebound without its calc_* columns here, so guard the step: on a
# second run of this cell `calc_year` is already gone and the frame is final.
if 'calc_year' in x_test.columns:
    x_test = add_source_flags(x_test).drop(columns=columns_to_drop)

# The row masks below are applied positionally to both features and targets,
# so the two tables must line up row for row.
assert len(x_train) == len(y_train), 'x_train and y_train must have the same number of rows'
observed_rows = x_train['observed'].eq(1).to_numpy()
estimated_rows = x_train['estimated'].eq(1).to_numpy()

# Split the data based on null values in the calc_year column:
# observed rows form the training split, estimated rows the validation split.
x_train_df = x_train.loc[observed_rows].drop(columns=columns_to_drop)
y_train_df = y_train.loc[observed_rows].copy()

x_valid_df = x_train.loc[estimated_rows].drop(columns=columns_to_drop)
y_valid_df = y_train.loc[estimated_rows].copy()

# For cross-validation and training after validation
combined_xdf = pd.concat([x_train_df, x_valid_df], axis=0).reset_index(drop=True)
combined_ydf = pd.concat([y_train_df, y_valid_df], axis=0).reset_index(drop=True)

# Check the head of the test data for verification
x_test.head()


Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,elevation:m,fresh_snow_12h:cm,fresh_snow_1h:cm,fresh_snow_24h:cm,fresh_snow_3h:cm,fresh_snow_6h:cm,is_day:idx,is_in_shadow:idx,msl_pressure:hPa,precip_5min:mm,precip_type_5min:idx,pressure_100m:hPa,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sfc_pressure:hPa,snow_density:kgm3,snow_depth:cm,snow_drift:idx,snow_melt_10min:mm,snow_water:kgm2,sun_azimuth:d,sun_elevation:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,lag_1,lag_2,lag_3,location_A,location_B,location_C,observed,estimated
0,4.325,1.28675,912.7,0.0,0.0,1061.55,0.0,271.65002,0.0,0.0,0.0,0.0,74.95,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1013.675,0.0,0.0,1000.55,1006.8,0.0,0.0,80.275,1013.1,,0.0,0.0,0.0,0.0,16.0265,-10.541,0.0,273.8,74.95,29907.5,3.95,2.1,3.35,0.0,2023,5,1,0,0.0,0.0,0.0,1,0,0,0,1
1,4.275,1.286,1482.1,0.0,0.0,1075.1001,0.0,271.45,0.0,0.0,0.0,0.0,77.475,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1013.15,0.0,0.0,1000.05,1006.3,0.0,0.0,79.825,1012.6,,0.0,0.0,0.0,0.0,30.49725,-7.8945,0.0,273.8,77.475,29519.074,3.825,1.925,3.3,0.0,2023,5,1,1,0.0,0.0,0.0,1,0,0,0,1
2,4.15,1.28375,1791.3,0.0,0.0,1200.4,0.0,271.05,0.0,0.0,0.0,0.0,88.1,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1012.675,0.0,0.0,999.5,1005.8,0.0,0.0,78.0,1012.05,,0.0,0.0,0.0,0.0,44.51725,-3.8155,0.0,273.84998,88.1,31009.125,3.65,1.75,3.2,0.0,2023,5,1,2,0.0,0.0,0.0,1,0,0,0,1
3,4.025,1.282,2312.875,40497.7,11.675,1179.85,0.0,270.65,9.375,67380.91,2.1,15061.4,68.6,6.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1012.175,0.0,0.0,998.975,1005.225,0.0,0.0,75.625,1011.525,,0.0,0.0,0.0,0.0,58.083,1.4125,0.0,273.9,68.6,34552.5,3.5,1.45,3.15,0.0,2023,5,1,3,21.78,55.22,18.92,1,0,0,0,1
4,3.9,1.281,2198.2998,566994.4,76.875,920.05,0.0,270.375,47.4,408838.8,25.45,198284.8,66.3,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1011.725,0.0,0.0,998.55,1004.75,0.0,0.0,74.225,1011.05,,0.0,0.0,0.0,0.0,71.341,7.4685,0.0,273.925,66.3,35483.875,3.325,1.3,3.05,0.0,2023,5,1,4,82.72,176.44,92.84,1,0,0,0,1


In [157]:
class BayesianOptimization:
    """Search XGBRegressor hyperparameters with hyperopt's TPE algorithm.

    Every trial fits a regressor on the training split (with early stopping
    monitored on the validation split) and is scored by its validation MAE.

    NOTE: this definition shadows the `BayesianOptimization` name imported
    from `baeysianOptimization` in the imports cell.
    """

    def __init__(self, x_train, y_train, x_valid, y_valid, space):
        """Keep the data splits and search space, and start an empty trial log.

        Args:
            x_train, y_train: features and target used to fit each trial model.
            x_valid, y_valid: features and target used for early stopping and scoring.
            space: hyperopt search space whose samples are passed as keyword
                arguments to `xgb.XGBRegressor`.
        """
        self.x_train = x_train
        self.y_train = y_train
        self.x_valid = x_valid
        self.y_valid = y_valid
        self.space = space
        self.trials = Trials()

    def objective(self, params):
        """Fit one model with `params` and return its validation MAE as a hyperopt result dict."""
        model = xgb.XGBRegressor(**params, early_stopping_rounds=50)
        model.fit(self.x_train, self.y_train, eval_set=[(self.x_valid, self.y_valid)], verbose=False)
        pred = model.predict(self.x_valid)
        mae = mean_absolute_error(self.y_valid, pred)
        return {'loss': mae, 'status': STATUS_OK}

    def optimize(self, n_evals=100):
        """Run the TPE search and return the best hyperparameter values found.

        Args:
            n_evals: value passed to `fmin` as `max_evals`.

        Returns:
            dict mapping each parameter name in the search space to its best value.
        """
        best = fmin(fn=self.objective,
                    space=self.space,
                    algo=tpe.suggest,
                    max_evals=n_evals,
                    trials=self.trials)
        # fmin reports hp.choice parameters as indices into their option lists
        # (e.g. max_depth=5 would mean "the 6th option"), so decode them into the
        # actual values before handing the result to callers.
        return space_eval(self.space, best)

    def get_best_params(self):
        """Return the decoded hyperparameters of the lowest-loss trial recorded so far."""
        return space_eval(self.space, self.trials.argmin)


In [158]:
# Hyperopt search space; each sampled dict is meant to be unpacked as keyword
# arguments of xgb.XGBRegressor (see BayesianOptimization.objective).
# NOTE: hyperopt's fmin reports hp.choice parameters as indices into their option
# list rather than as the option values; decode results with space_eval.
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5),
    'max_depth': hp.choice('max_depth', range(1, 15, 1)),  # integer options 1..14
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'alpha': hp.loguniform('alpha', -5, 2),  # log-uniform over [exp(-5), exp(2)], i.e. about [0.0067, 7.39]
    'lambda': hp.loguniform('lambda', -5, 2),  # same log-uniform range as alpha
    'n_estimators': hp.choice('n_estimators', range(100, 1000, 50))  # options 100, 150, ..., 950
}


# # Create an instance of the optimizer
# optimizer = BayesianOptimization(x_train_df, y_train_df['pv_measurement'], x_valid_df, y_valid_df['pv_measurement'], space)

# # Optimize
# best_params = optimizer.optimize()

# optimizer.get_best_params()

In [159]:
from sklearn.preprocessing import MinMaxScaler

# Exclude boolean columns (location_A, location_B, location_C)
# columns_to_scale = x_train.columns.difference(['location_A', 'location_B', 'location_C'])

# scaler = MinMaxScaler()
# x_train[columns_to_scale] = scaler.fit_transform(x_train[columns_to_scale])
# x_test[columns_to_scale] = scaler.transform(x_test[columns_to_scale])


# Cross-Validation

In [160]:
from sklearn.model_selection import TimeSeriesSplit

In [161]:
# Five time-ordered splits with 24*30 rows per test fold
# (30 days if the rows are hourly -- TODO confirm the sampling rate).
tss = TimeSeriesSplit(test_size=24 * 30, n_splits=5)

In [162]:
# Walk the time-series folds. Each pass only slices the fold frames, so once the
# loop finishes `train` / `val` hold the last split; no model is fitted here.
for train_idx, val_idx in tss.split(combined_xdf):
    train, val = combined_xdf.iloc[train_idx], combined_xdf.iloc[val_idx]
    

# Lag features

In [163]:
# Wrap every split in an xgboost DMatrix; labelled splits use the
# `pv_measurement` column as target, the test matrix carries features only.
dtrain = xgb.DMatrix(data=x_train_df, label=y_train_df['pv_measurement'])
dval = xgb.DMatrix(data=x_valid_df, label=y_valid_df['pv_measurement'])
dtrain_whole = xgb.DMatrix(data=combined_xdf, label=combined_ydf['pv_measurement'])
test = xgb.DMatrix(data=x_test)

In [164]:
# Training budget: upper bound on boosting rounds, and how many rounds without
# improvement on the validation score are tolerated before stopping.
num_boost_round = 1000
early_stopping_rounds = 50

# Settings that stay constant across experiments
fixed_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'booster': 'gbtree',
}

# Settings that are candidates for tuning
variable_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'alpha': 0.1,
    'lambda': 1,
}

# Merge both groups into the single dict handed to xgb.train
# (fixed keys first, then the tunable ones).
all_params = dict(fixed_params)
all_params.update(variable_params)

# `progress` is handed to xgb.train as `evals_result`; `evals` pairs each
# DMatrix with the name it is reported under in the training log.
progress = dict()
evals = list(zip((dtrain, dval), ('train', 'eval')))


In [165]:
# Fit on the training split while tracking MAE on both entries of `evals`;
# training stops early once the score has not improved for
# `early_stopping_rounds` rounds.
bst = xgb.train(
    params=all_params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=evals,
    early_stopping_rounds=early_stopping_rounds,
    evals_result=progress,
)

# best_iteration is zero-based, so add one to report a round count
best_mae = bst.best_score
best_round_count = bst.best_iteration + 1
print("Best MAE: {:.2f} with {} rounds".format(best_mae, best_round_count))


[0]	train-mae:454.51620	eval-mae:377.31091
[1]	train-mae:433.63266	eval-mae:360.39286
[2]	train-mae:414.05230	eval-mae:344.33296
[3]	train-mae:395.48269	eval-mae:329.25884
[4]	train-mae:377.94002	eval-mae:315.02515
[5]	train-mae:363.38177	eval-mae:302.21025
[6]	train-mae:347.79171	eval-mae:289.51127
[7]	train-mae:333.17591	eval-mae:277.70328
[8]	train-mae:319.47568	eval-mae:266.45531
[9]	train-mae:306.68519	eval-mae:255.89660
[10]	train-mae:294.50026	eval-mae:245.87561


[11]	train-mae:283.11946	eval-mae:236.45928
[12]	train-mae:272.53418	eval-mae:227.44739
[13]	train-mae:262.45756	eval-mae:218.92428
[14]	train-mae:252.98248	eval-mae:210.83073
[15]	train-mae:243.97426	eval-mae:202.97143
[16]	train-mae:235.66058	eval-mae:195.71724
[17]	train-mae:227.76404	eval-mae:188.76877
[18]	train-mae:221.14591	eval-mae:182.29576
[19]	train-mae:214.09304	eval-mae:176.06238
[20]	train-mae:207.48795	eval-mae:170.15669
[21]	train-mae:201.27018	eval-mae:164.70881
[22]	train-mae:195.36214	eval-mae:159.43555
[23]	train-mae:189.85070	eval-mae:154.33487
[24]	train-mae:184.65912	eval-mae:149.51849
[25]	train-mae:179.79862	eval-mae:145.03669
[26]	train-mae:175.19727	eval-mae:140.59681
[27]	train-mae:170.83927	eval-mae:136.47958
[28]	train-mae:166.80316	eval-mae:132.51308
[29]	train-mae:162.87350	eval-mae:128.90401
[30]	train-mae:159.23118	eval-mae:125.45848
[31]	train-mae:155.76228	eval-mae:122.26901
[32]	train-mae:152.42340	eval-mae:119.24920
[33]	train-mae:149.34876	eval-ma

In [166]:
# Reuse the round count selected by early stopping (best_iteration is zero-based)
best_iteration = 1 + bst.best_iteration

# Refit on the combined train + validation data for exactly that many rounds;
# no evaluation set or early stopping is used for this final fit.
bst_whole = xgb.train(params=all_params, dtrain=dtrain_whole, num_boost_round=best_iteration)


In [167]:
# Predict the test DMatrix with the model refit on all labelled rows,
# then display the raw (unclipped) predictions.
predictions = bst_whole.predict(data=test)
predictions

array([-0.34900874, -0.34900874, -0.5380868 , ..., -2.5781448 ,
       -2.5963187 , -3.549993  ], dtype=float32)

In [168]:
sample_submission = pd.read_csv('sample_submission.csv')

# Fail loudly on a row-count mismatch: a plain column assignment would align on
# the index and silently write NaN (or drop rows) into the submission instead.
if len(predictions) != len(sample_submission):
    raise ValueError(
        f"Expected {len(sample_submission)} predictions to match sample_submission.csv, "
        f"got {len(predictions)}"
    )

# Convert the numpy array to a DataFrame, with all negative predictions set to 0
predictions_df = pd.DataFrame(np.clip(predictions, 0, None), columns=['prediction'])

# Fill the template's 'prediction' column row by row (positional, not index-aligned)
sample_submission['prediction'] = predictions_df['prediction'].to_numpy()

# Save to CSV
sample_submission.to_csv('my_first_submission.csv', index=False)
predictions_df

Unnamed: 0,prediction
0,0.000000
1,0.000000
2,0.000000
3,73.614899
4,345.666046
...,...
2155,37.235603
2156,11.913455
2157,0.000000
2158,0.000000
