In [642]:
#!pip install lightgbm

In [643]:
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [644]:
# Dataset A: training features, the table later merged in on 'time', and test features.
x_train_a, y_train_a, x_test_a = (
    pd.read_csv('cleaned_data/A/x_train_a.csv'),
    pd.read_csv('cleaned_data/A/train_a.csv'),
    pd.read_csv('cleaned_data/A/x_test_a.csv'),
)

In [645]:
# Dataset B: training features, the table later merged in on 'time', and test features.
x_train_b, y_train_b, x_test_b = (
    pd.read_csv('cleaned_data/B/x_train_b.csv'),
    pd.read_csv('cleaned_data/B/train_b.csv'),
    pd.read_csv('cleaned_data/B/x_test_b.csv'),
)

In [646]:
# Dataset C: training features, the table later merged in on 'time', and test features.
x_train_c, y_train_c, x_test_c = (
    pd.read_csv('cleaned_data/C/x_train_c.csv'),
    pd.read_csv('cleaned_data/C/train_c.csv'),
    pd.read_csv('cleaned_data/C/x_test_c.csv'),
)

In [647]:
def prepare_dataset(x_train, y_train, x_test, merge_on, drop_columns):
    """Build model-ready train and test frames without modifying the inputs.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; must contain ``merge_on[0]``, ``'calc_year'`` and
        every name in ``drop_columns``.
    y_train : pandas.DataFrame
        Table joined onto ``x_train``; must contain ``merge_on[1]``.
    x_test : pandas.DataFrame
        Test features; must contain ``'calc_year'``, ``'location'`` and every
        name in ``drop_columns``. It is left untouched by this function.
    merge_on : sequence of two str
        ``(left_key, right_key)`` used for the left join of ``x_train`` with
        ``y_train``.
    drop_columns : list of str
        Columns removed from both the merged training frame and the test frame.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        ``train_data`` is the merged training frame with an ``'observed'`` flag,
        minus ``drop_columns`` and ``'time'``. ``test_data`` is the test frame
        with the same flag, minus ``drop_columns`` and ``'location'``.

    Raises
    ------
    KeyError
        If any of the required columns listed above is missing.
    """
    # Left join: every feature row is kept, even if no row of y_train matches it.
    x_train_combined = x_train.merge(
        y_train, left_on=merge_on[0], right_on=merge_on[1], how='left'
    )
    # 'observed' is 1 where 'calc_year' is missing and 0 otherwise.
    x_train_combined['observed'] = x_train_combined['calc_year'].isna().astype(int)
    train_data = x_train_combined.drop(drop_columns, axis=1).drop(['time'], axis=1)

    # assign() returns a new frame, so the caller's x_test does not silently
    # gain an 'observed' column (the previous in-place write mutated it).
    x_test_flagged = x_test.assign(observed=x_test['calc_year'].isna().astype(int))
    test_data = x_test_flagged.drop(drop_columns, axis=1).drop(['location'], axis=1)

    return train_data, test_data

# Columns stripped from both the training and the test frames by prepare_dataset.
drop_columns = [
    'calc_year', 'calc_month', 'calc_day', 'calc_hour',
    'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
    'day_of_year_sin', 'day_of_year_cos',
    'direct_rad_3h_roll_avg', 'diffuse_rad_3h_roll_avg',
    'direct_rad_6h_roll_avg', 'diffuse_rad_6h_roll_avg',
    'day_of_year', 'date_forecast',
]

# Every dataset is joined the same way: features' 'date_forecast' against 'time'.
train_data_a, test_data_a = prepare_dataset(
    x_train_a, y_train_a, x_test_a,
    merge_on=['date_forecast', 'time'],
    drop_columns=drop_columns,
)
train_data_b, test_data_b = prepare_dataset(
    x_train_b, y_train_b, x_test_b,
    merge_on=['date_forecast', 'time'],
    drop_columns=drop_columns,
)
train_data_c, test_data_c = prepare_dataset(
    x_train_c, y_train_c, x_test_c,
    merge_on=['date_forecast', 'time'],
    drop_columns=drop_columns,
)



In [648]:
# Inspect the prepared test features for dataset B (the rendered table is truncated).
test_data_b

Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,direct_rad_x_sun_elevation,observed
0,4.300,1.28300,912.3000,0.0,0.000,1059.750,0.0,271.65002,0.000,0.000,...,3.950,2.100,3.375,0.0,2023,5,1,0,-0.000000,0
1,4.250,1.28300,1482.8002,0.0,0.000,1073.700,0.0,271.45000,0.000,0.000,...,3.825,1.925,3.300,0.0,2023,5,1,1,-0.000000,0
2,4.150,1.28275,1765.9000,0.0,0.000,1200.100,0.0,271.05000,0.000,0.000,...,3.650,1.750,3.225,0.0,2023,5,1,2,-0.000000,0
3,4.025,1.28225,2269.7500,40510.2,11.675,1179.000,0.0,270.65000,9.375,67382.305,...,3.500,1.475,3.150,0.0,2023,5,1,3,2.967300,0
4,3.900,1.28200,2198.2250,567057.1,76.900,919.150,0.0,270.37500,47.400,408812.200,...,3.325,1.300,3.075,0.0,2023,5,1,4,190.086060,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,8.350,1.19800,3640.1250,1908360.9,85.100,2015.750,0.0,281.57500,33.625,675011.000,...,2.475,2.075,-1.350,0.0,2023,7,3,19,109.454025,0
716,8.525,1.20075,3351.1000,737351.8,24.800,1613.375,0.0,281.85000,14.350,345239.800,...,2.450,2.100,-1.275,0.0,2023,7,3,20,11.602287,0
717,8.800,1.20375,2753.0250,149728.8,1.275,1624.450,0.0,282.30000,1.300,112669.700,...,2.575,2.150,-1.400,0.0,2023,7,3,21,-0.000000,0
718,9.000,1.20600,2204.5000,1440.5,0.000,1768.325,0.0,282.67502,0.000,9413.900,...,2.250,1.800,-1.350,0.0,2023,7,3,22,-0.000000,0


In [649]:
def clean_column_names(dataframe):
    """Return a copy of ``dataframe`` whose column names are free of JSON punctuation.

    Each of the characters ``: , { } [ ]`` in a column name is replaced by an
    underscore; all other characters are kept. The input frame is not modified.
    """
    # One translation table maps every offending character to '_'.
    to_underscore = str.maketrans({char: '_' for char in ':,{}[]'})
    renamed = {}
    for name in dataframe.columns:
        renamed[name] = name.translate(to_underscore)
    return dataframe.rename(columns=renamed)

# Sanitise column names on the train/test pair of every dataset.
train_data_a, test_data_a = clean_column_names(train_data_a), clean_column_names(test_data_a)
train_data_b, test_data_b = clean_column_names(train_data_b), clean_column_names(test_data_b)
train_data_c, test_data_c = clean_column_names(train_data_c), clean_column_names(test_data_c)

In [650]:
# Split each prepared training frame into features (X_*) and the
# 'pv_measurement' target (y_*).
X_a, y_a = train_data_a.drop(columns='pv_measurement'), train_data_a['pv_measurement']
X_b, y_b = train_data_b.drop(columns='pv_measurement'), train_data_b['pv_measurement']
X_c, y_c = train_data_c.drop(columns='pv_measurement'), train_data_c['pv_measurement']

In [651]:
# Fraction of rows (taken from the start) that goes to the training part.
split_ratio = 0.875


def train_val_split(x, y, split_ratio):
    """Cut ``x`` and ``y`` positionally into a leading and a trailing part.

    The first ``int(len(seq) * split_ratio)`` items of each sequence form the
    training part and the remainder forms the validation part. No shuffling is
    done, and the cut point is computed separately for ``x`` and ``y``.

    Returns ``(x_train, x_val, y_train, y_val)``.
    """
    x_cut = int(len(x) * split_ratio)
    y_cut = int(len(y) * split_ratio)
    return x[:x_cut], x[x_cut:], y[:y_cut], y[y_cut:]

# NOTE: y_train_a / y_train_b / y_train_c previously held the frames read from
# train_*.csv; they are rebound here to the training slice of the target, so the
# loading cells must be re-run before prepare_dataset can be called again.
X_train_a, X_val_a, y_train_a, y_val_a = train_val_split(
    X_a, y_a, split_ratio=split_ratio
)
X_train_b, X_val_b, y_train_b, y_val_b = train_val_split(
    X_b, y_b, split_ratio=split_ratio
)
X_train_c, X_val_c, y_train_c, y_val_c = train_val_split(
    X_c, y_c, split_ratio=split_ratio
)


In [652]:
# Wrap each train/validation split in LightGBM Dataset objects. Each validation
# set passes its training set as `reference`.
# NOTE: train_data_a / _b / _c held pandas DataFrames up to this point and are
# rebound to lgb.Dataset objects here; the earlier cells that read columns from
# them cannot be re-run after this cell without re-running the preparation first.
train_data_a = lgb.Dataset(X_train_a, label=y_train_a)
val_data_a = lgb.Dataset(X_val_a, label=y_val_a, reference=train_data_a)
train_data_b = lgb.Dataset(X_train_b, label=y_train_b)
val_data_b = lgb.Dataset(X_val_b, label=y_val_b, reference=train_data_b)
train_data_c = lgb.Dataset(X_train_c, label=y_train_c)
val_data_c = lgb.Dataset(X_val_c, label=y_val_c, reference=train_data_c)

In [653]:

# Define the parameters for the LightGBM model
# Hyper-parameters shared by the three LightGBM models.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='mae',  # validation is scored with mean absolute error
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)


In [654]:
# Train the model for dataset A: at most 10000 boosting rounds, scored on
# val_data_a, with early stopping once the validation metric has not improved
# for 10 consecutive rounds.
gbm_a = lgb.train(params,
                train_data_a,
                num_boost_round=10000,
                valid_sets=val_data_a,
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[153]	valid_0's l1: 118.296


In [655]:
# Train the model for dataset B: at most 10000 boosting rounds, scored on
# val_data_b, with early stopping once the validation metric has not improved
# for 10 consecutive rounds.
gbm_b = lgb.train(params,
                train_data_b,
                num_boost_round=10000,
                valid_sets=val_data_b,
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[149]	valid_0's l1: 16.8359


In [656]:
# Train the model for dataset C: at most 10000 boosting rounds, scored on
# val_data_c, with early stopping once the validation metric has not improved
# for 10 consecutive rounds.
gbm_c = lgb.train(params,
                train_data_c,
                num_boost_round=10000,
                valid_sets=val_data_c,
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[143]	valid_0's l1: 23.4166


In [657]:
# Predict on each test set using the iteration selected by early stopping.
y_pred_a = gbm_a.predict(test_data_a, num_iteration=gbm_a.best_iteration)
y_pred_b = gbm_b.predict(test_data_b, num_iteration=gbm_b.best_iteration)
y_pred_c = gbm_c.predict(test_data_c, num_iteration=gbm_c.best_iteration)

# Render every prediction as a string with five decimal places.
formatted_predictions_a = [f'{value:.5f}' for value in y_pred_a]
formatted_predictions_b = [f'{value:.5f}' for value in y_pred_b]
formatted_predictions_c = [f'{value:.5f}' for value in y_pred_c]

formatted_predictions_a

['3.11523',
 '3.11523',
 '3.11523',
 '63.09452',
 '400.21175',
 '974.02988',
 '1993.07625',
 '2558.81081',
 '2905.81077',
 '3153.79540',
 '3082.04548',
 '3299.14206',
 '2796.85897',
 '2700.50100',
 '2592.68968',
 '1843.10734',
 '1152.51633',
 '604.22115',
 '312.31186',
 '16.00389',
 '2.53479',
 '2.53479',
 '2.61991',
 '3.00301',
 '4.33145',
 '4.33145',
 '4.33145',
 '157.07204',
 '667.82348',
 '1280.03357',
 '1785.61333',
 '2628.92280',
 '3908.28511',
 '4099.35093',
 '4113.86294',
 '4336.87680',
 '3827.81291',
 '3636.64022',
 '3333.67180',
 '2342.58826',
 '1733.58513',
 '844.29824',
 '434.89139',
 '81.97118',
 '3.33636',
 '3.33636',
 '3.18584',
 '0.05706',
 '2.29932',
 '3.23310',
 '13.46898',
 '266.39376',
 '614.64204',
 '1475.27790',
 '2805.90461',
 '3769.78845',
 '4407.64762',
 '4779.92741',
 '4963.59830',
 '4624.91396',
 '4409.42897',
 '3787.99395',
 '3050.10354',
 '2075.97642',
 '1221.50524',
 '675.20103',
 '437.48681',
 '141.34846',
 '1.78131',
 '2.11179',
 '2.19690',
 '2.49489',
 

In [658]:
# Spot-check the formatted predictions for dataset C.
formatted_predictions_c

['0.37525',
 '0.37525',
 '0.37525',
 '5.68009',
 '32.32349',
 '89.18459',
 '209.63758',
 '350.84327',
 '356.40661',
 '348.75852',
 '436.64214',
 '487.96151',
 '353.28845',
 '363.69717',
 '409.48458',
 '311.93361',
 '208.43335',
 '109.35706',
 '35.20130',
 '-0.77475',
 '0.43666',
 '0.43666',
 '0.43666',
 '0.37525',
 '0.28200',
 '0.28200',
 '0.28200',
 '10.93169',
 '57.09393',
 '117.72598',
 '220.88403',
 '320.27978',
 '422.23789',
 '603.84047',
 '585.20526',
 '611.61205',
 '572.79484',
 '522.48598',
 '446.72719',
 '375.75798',
 '252.29099',
 '132.00609',
 '51.42660',
 '4.41111',
 '1.77166',
 '0.34341',
 '0.34341',
 '0.28200',
 '0.28200',
 '0.28200',
 '0.28200',
 '28.86517',
 '85.20689',
 '184.84135',
 '347.02014',
 '520.30000',
 '655.83237',
 '732.25675',
 '793.40552',
 '744.65369',
 '744.51293',
 '638.61556',
 '501.57714',
 '364.20726',
 '232.77263',
 '135.79655',
 '70.97149',
 '16.83879',
 '0.70295',
 '0.20967',
 '0.20967',
 '0.14825',
 '0.37227',
 '0.37227',
 '0.98495',
 '24.19344',


In [659]:
# Concatenate the per-dataset predictions in A, B, C order.
predictions = [
    *formatted_predictions_a,
    *formatted_predictions_b,
    *formatted_predictions_c,
]


In [660]:
sample_submission = pd.read_csv('sample_submission.csv')

# `predictions` is a list of formatted strings; put it in a one-column frame
# and parse the values back to floats.
predictions_df = pd.DataFrame(predictions, columns=['prediction'])
predictions_df['prediction'] = predictions_df['prediction'].astype(float)

# Convert all negative predictions to 0
predictions_df.loc[predictions_df['prediction'] < 0, 'prediction'] = 0.0

# Column assignment between frames aligns on the index, so a row-count mismatch
# would silently produce NaN (or drop predictions) instead of failing. Check
# the lengths explicitly and then assign by position.
if len(predictions_df) != len(sample_submission):
    raise ValueError(
        f"Got {len(predictions_df)} predictions but sample_submission.csv "
        f"has {len(sample_submission)} rows"
    )
sample_submission['prediction'] = predictions_df['prediction'].to_numpy()

# Save to CSV
sample_submission.to_csv('lightgbm.csv', index=False)
predictions_df

Unnamed: 0,prediction
0,3.11523
1,3.11523
2,3.11523
3,63.09452
4,400.21175
...,...
2155,38.99164
2156,12.44847
2157,0.37067
2158,0.20967
