In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Read combined.csv; the first CSV column is parsed as dates and becomes the index.
combined = pd.read_csv(
    "./data/combined.csv",
    index_col=0,
    parse_dates=[0],
)
combined

Unnamed: 0,solar_capacity,solar_generation_actual,wind_capacity,wind_generation_actual,windspeed_10m,radiation_direct_horizontal,radiation_diffuse_horizontal,precipitation,temperature,irradiance_surface,irradiance_toa,snowfall,snow_mass,cloud_cover,air_density
2014-01-01 00:00:00+00:00,36012,0.0,32425,8086.0,4.981596,0.0,0.0,0.0364,-1.146,0.0,0.0,0.0056,0.2378,0.9562,1.2539
2014-01-01 01:00:00+00:00,36012,0.0,32425,8803.0,4.932105,0.0,0.0,0.0449,-1.104,0.0,0.0,0.0074,0.2428,0.9672,1.2535
2014-01-01 02:00:00+00:00,36012,0.0,32425,8970.0,4.865019,0.0,0.0,0.0513,-1.059,0.0,0.0,0.0129,0.2521,0.9733,1.2530
2014-01-01 03:00:00+00:00,36012,0.0,32425,8510.0,4.842338,0.0,0.0,0.0544,-0.926,0.0,0.0,0.0140,0.2642,0.9651,1.2523
2014-01-01 04:00:00+00:00,36012,0.0,32425,8360.0,4.772505,0.0,0.0,0.0512,-0.746,0.0,0.0,0.0144,0.2764,0.9370,1.2516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00+00:00,40634,0.0,49021,15303.0,4.212916,0.0,0.0,0.0098,-1.970,0.0,0.0,0.0017,0.0931,0.5102,1.2751
2016-12-31 20:00:00+00:00,40634,0.0,49021,15354.0,4.287475,0.0,0.0,0.0099,-2.130,0.0,0.0,0.0020,0.0944,0.5235,1.2756
2016-12-31 21:00:00+00:00,40634,0.0,49021,14848.0,4.296530,0.0,0.0,0.0093,-2.293,0.0,0.0,0.0020,0.0959,0.5248,1.2759
2016-12-31 22:00:00+00:00,40634,0.0,49021,15056.0,4.419977,0.0,0.0,0.0093,-2.443,0.0,0.0,0.0021,0.0973,0.5319,1.2760


In [3]:
# Wind modelling table: the target column first, followed by the predictors.
wind_target = 'wind_generation_actual'
wind_features = [
    'wind_capacity',
    'windspeed_10m',
    'radiation_direct_horizontal',
    'radiation_diffuse_horizontal',
    'irradiance_surface',
    'irradiance_toa',
    'precipitation',
    'snowfall',
    'cloud_cover',
    'air_density',
]

wind = combined[[wind_target] + wind_features]
X_wind = wind[wind_features]  # predictor matrix
y_wind = wind[wind_target]    # target series
wind

Unnamed: 0,wind_generation_actual,wind_capacity,windspeed_10m,radiation_direct_horizontal,radiation_diffuse_horizontal,irradiance_surface,irradiance_toa,precipitation,snowfall,cloud_cover,air_density
2014-01-01 00:00:00+00:00,8086.0,32425,4.981596,0.0,0.0,0.0,0.0,0.0364,0.0056,0.9562,1.2539
2014-01-01 01:00:00+00:00,8803.0,32425,4.932105,0.0,0.0,0.0,0.0,0.0449,0.0074,0.9672,1.2535
2014-01-01 02:00:00+00:00,8970.0,32425,4.865019,0.0,0.0,0.0,0.0,0.0513,0.0129,0.9733,1.2530
2014-01-01 03:00:00+00:00,8510.0,32425,4.842338,0.0,0.0,0.0,0.0,0.0544,0.0140,0.9651,1.2523
2014-01-01 04:00:00+00:00,8360.0,32425,4.772505,0.0,0.0,0.0,0.0,0.0512,0.0144,0.9370,1.2516
...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00+00:00,15303.0,49021,4.212916,0.0,0.0,0.0,0.0,0.0098,0.0017,0.5102,1.2751
2016-12-31 20:00:00+00:00,15354.0,49021,4.287475,0.0,0.0,0.0,0.0,0.0099,0.0020,0.5235,1.2756
2016-12-31 21:00:00+00:00,14848.0,49021,4.296530,0.0,0.0,0.0,0.0,0.0093,0.0020,0.5248,1.2759
2016-12-31 22:00:00+00:00,15056.0,49021,4.419977,0.0,0.0,0.0,0.0,0.0093,0.0021,0.5319,1.2760


In [4]:
# Training set for the wind models: rows timestamped 2014-01-01 00:00 through
# 2015-12-31 23:00 (both bounds inclusive). X and y share the same index, so
# one boolean mask selects matching rows from both.
wind_train_mask = (X_wind.index >= "2014-01-01 00:00:00") & (X_wind.index <= "2015-12-31 23:00:00")
X_wind_train = X_wind.loc[wind_train_mask]
y_wind_train = y_wind.loc[wind_train_mask]

In [5]:
# Held-out set for the wind models: rows timestamped 2016-01-01 00:00 through
# 2016-12-31 23:00 (both bounds inclusive), selected with a single shared mask.
wind_test_mask = (X_wind.index >= "2016-01-01 00:00:00") & (X_wind.index <= "2016-12-31 23:00:00")
X_wind_test = X_wind.loc[wind_test_mask]
y_wind_test = y_wind.loc[wind_test_mask]

In [6]:
# Solar modelling table, built the same way as the wind one: the target column
# first, followed by the predictors.
solar_target = 'solar_generation_actual'
solar_features = [
    'solar_capacity',
    'windspeed_10m',
    'radiation_direct_horizontal',
    'radiation_diffuse_horizontal',
    'irradiance_surface',
    'irradiance_toa',
    'precipitation',
    'snowfall',
    'cloud_cover',
    'air_density',
]

solar = combined[[solar_target] + solar_features]
X_solar = solar[solar_features]  # predictor matrix
y_solar = solar[solar_target]    # target series
solar

Unnamed: 0,solar_generation_actual,solar_capacity,windspeed_10m,radiation_direct_horizontal,radiation_diffuse_horizontal,irradiance_surface,irradiance_toa,precipitation,snowfall,cloud_cover,air_density
2014-01-01 00:00:00+00:00,0.0,36012,4.981596,0.0,0.0,0.0,0.0,0.0364,0.0056,0.9562,1.2539
2014-01-01 01:00:00+00:00,0.0,36012,4.932105,0.0,0.0,0.0,0.0,0.0449,0.0074,0.9672,1.2535
2014-01-01 02:00:00+00:00,0.0,36012,4.865019,0.0,0.0,0.0,0.0,0.0513,0.0129,0.9733,1.2530
2014-01-01 03:00:00+00:00,0.0,36012,4.842338,0.0,0.0,0.0,0.0,0.0544,0.0140,0.9651,1.2523
2014-01-01 04:00:00+00:00,0.0,36012,4.772505,0.0,0.0,0.0,0.0,0.0512,0.0144,0.9370,1.2516
...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:00:00+00:00,0.0,40634,4.212916,0.0,0.0,0.0,0.0,0.0098,0.0017,0.5102,1.2751
2016-12-31 20:00:00+00:00,0.0,40634,4.287475,0.0,0.0,0.0,0.0,0.0099,0.0020,0.5235,1.2756
2016-12-31 21:00:00+00:00,0.0,40634,4.296530,0.0,0.0,0.0,0.0,0.0093,0.0020,0.5248,1.2759
2016-12-31 22:00:00+00:00,0.0,40634,4.419977,0.0,0.0,0.0,0.0,0.0093,0.0021,0.5319,1.2760


In [7]:
# Chronological split of the solar data (bounds inclusive on both ends):
#   training -> 2014-01-01 00:00 .. 2015-12-31 23:00
#   testing  -> 2016-01-01 00:00 .. 2016-12-31 23:00
# X_solar and y_solar share one index, so each mask is computed once and reused.
solar_train_mask = (X_solar.index >= "2014-01-01 00:00:00") & (X_solar.index <= "2015-12-31 23:00:00")
solar_test_mask = (X_solar.index >= "2016-01-01 00:00:00") & (X_solar.index <= "2016-12-31 23:00:00")

X_solar_train = X_solar.loc[solar_train_mask]
y_solar_train = y_solar.loc[solar_train_mask]

X_solar_test = X_solar.loc[solar_test_mask]
y_solar_test = y_solar.loc[solar_test_mask]

In [10]:
# 5-fold cross-validation of a linear regression on the solar training data.
# `scoring` is left at its default, i.e. the estimator's own score method.
lr = LinearRegression()
scores_solar = cross_val_score(estimator=lr, X=X_solar_train, y=y_solar_train, cv=5)
print(scores_solar, "\naverage =", scores_solar.mean())

[0.92022942 0.94526696 0.89305223 0.94928705 0.93194901] 
average = 0.927956935710218


In [11]:
# Fit the linear model on the solar training data and display its parameters:
# the intercept, then the coefficient array (one entry per feature column).
lr.fit(X=X_solar_train, y=y_solar_train)
(lr.intercept_, lr.coef_)

(-3776.972889151058,
 array([ 8.85484115e-02,  4.42633986e+00,  1.64625752e+00, -5.03135295e-01,
         2.66524792e+01, -8.25565970e-01, -1.48210563e+03, -9.71281996e+02,
         8.82987154e+02,  9.01959433e+00]))

In [12]:
# Refit on the training data, predict the held-out test rows, and report R2.
predictions_lr = lr.fit(X_solar_train, y_solar_train).predict(X_solar_test)
r2 = r2_score(y_true=y_solar_test, y_pred=predictions_lr)
print(f"The R2 score of the model is {r2}")

The R2 score of the model is 0.9554751324719775


In [16]:
# 5-fold cross-validation of a decision tree on the solar training data.
# random_state is pinned so repeated runs give the same scores.
dt = DecisionTreeRegressor(random_state=0)
scores_solar = cross_val_score(estimator=dt, X=X_solar_train, y=y_solar_train, cv=5)
print(scores_solar, "\naverage =", scores_solar.mean())

[0.89772249 0.92806966 0.85832778 0.9343828  0.91221965] 
average = 0.9061444781542078


In [17]:
# Fit the tree on the training data, predict the held-out test rows, and report R2.
predictions_dt = dt.fit(X_solar_train, y_solar_train).predict(X_solar_test)
r2 = r2_score(y_true=y_solar_test, y_pred=predictions_dt)
print(f"The R2 score of the model is {r2}")

The R2 score of the model is 0.9209429209161143


In [20]:
# 5-fold cross-validation of a 200-tree random forest on the solar training data.
# random_state is pinned (as for the decision tree) so the bootstrap sampling is
# reproducible and re-running the notebook yields the same scores.
rf = RandomForestRegressor(n_estimators=200, random_state=0)
scores_solar = cross_val_score(rf, X_solar_train, y_solar_train, cv=5)
print(scores_solar, "\naverage =", np.mean(scores_solar))

[0.93862668 0.96171983 0.90975115 0.9626873  0.94278864] 
average = 0.9431147203036963


In [21]:
# Fit the forest on the training data, predict the held-out test rows, and report R2.
predictions_rf = rf.fit(X_solar_train, y_solar_train).predict(X_solar_test)
r2 = r2_score(y_true=y_solar_test, y_pred=predictions_rf)
print(f"The R2 score of the model is {r2}")

The R2 score of the model is 0.9585270235788436


In [22]:
# Same cross-validation with a smaller, 100-tree forest for comparison.
# random_state is pinned so the scores are reproducible across runs.
# Note: cross_val_score fits clones of `rf`; `rf` itself is still unfitted
# after this cell.
rf = RandomForestRegressor(n_estimators=100, random_state=0)
scores_solar = cross_val_score(rf, X_solar_train, y_solar_train, cv=5)
print(scores_solar, "\naverage =", np.mean(scores_solar))

[0.93715733 0.96167035 0.90992225 0.96305269 0.94270146] 
average = 0.9429008141371806


In [25]:
# https://blog.datadive.net/selecting-good-features-part-iii-random-forests/
# The forest must be fitted before feature_importances_ can be read. `rf` was
# re-created in the cross-validation cell and cross_val_score only fits clones,
# so fit it here explicitly to keep the notebook runnable top to bottom.
rf.fit(X_solar_train, y_solar_train)

# Pair each importance (rounded to 4 decimals) with its column name and list
# them from most to least important.
solar_importances = sorted(
    ((round(score, 4), column) for score, column in zip(rf.feature_importances_, X_solar_train.columns)),
    reverse=True,
)
print("Features sorted by their score for the random forest regression model:")
print(solar_importances)

Features sorted by their score:
[(0.9479, 'irradiance_surface'), (0.0158, 'precipitation'), (0.0079, 'solar_capacity'), (0.0075, 'air_density'), (0.0046, 'windspeed_10m'), (0.0042, 'radiation_direct_horizontal'), (0.0042, 'radiation_diffuse_horizontal'), (0.0041, 'cloud_cover'), (0.0028, 'irradiance_toa'), (0.0011, 'snowfall')]


In [32]:
# Refit the forest on the wind training data before reading its importances.
# Without this fit, `rf` would either be unfitted or still hold a model trained
# on the solar data, whose importances would be paired with the wind column
# names below. After this cell `rf` is the wind model.
rf.fit(X_wind_train, y_wind_train)

# Pair each importance (rounded to 4 decimals) with its column name and list
# them from most to least important.
wind_importances = sorted(
    ((round(score, 4), column) for score, column in zip(rf.feature_importances_, X_wind_train.columns)),
    reverse=True,
)
print("Features sorted by their score for the random forest regression model:")
print(wind_importances)

Features sorted by their score for the random forest regression model:
[(0.7915, 'windspeed_10m'), (0.0855, 'wind_capacity'), (0.0475, 'radiation_diffuse_horizontal'), (0.0164, 'precipitation'), (0.0154, 'air_density'), (0.0111, 'cloud_cover'), (0.0106, 'radiation_direct_horizontal'), (0.0093, 'irradiance_toa'), (0.0069, 'snowfall'), (0.0059, 'irradiance_surface')]
