# Models
In this notebook, we'll start to explore what models we can use to predict the output of solar panels based on the weather data. We'll start with simple models such as linear regression and nearest neighbors, decision trees and then move on to more complex ones such as random forests, gradient boosting and SVMs.

## Imports

In [21]:
# Loop printing
from tqdm import tqdm

# Data management
import pandas as pd

# Test and train split and mean squared error metric
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Cross validation grid search
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

## Data Loading

In [22]:
df = pd.read_csv('../../data/no_outliers.csv', sep=';', index_col=1)
df = df.rename(columns={'Unnamed: 0': 'Timestamp'})
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

df['Month'] = df['Timestamp'].dt.month
df['Day'] = df['Timestamp'].dt.day
df['Hour'] = df['Timestamp'].dt.hour + df['Timestamp'].dt.minute / 60

df = df.drop(df.columns[9:19], axis=1)
df = df.drop(columns=['Timestamp'])
df

Unnamed: 0_level_0,Power_1,Power_2,Power_3,Power_4,Power_5,Power_6,Power_7,Power_8,SWD,SWDtop,...,TT2M,ST,RH2m,WS10m,WS100m,PREC,SNOW,Month,Day,Hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
577035.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.38,8.38,93.34,3.46,9.10,0.0,0.0,1,1,2.25
577038.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.52,8.52,92.18,3.81,9.52,0.0,0.0,1,1,2.50
577041.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.66,8.66,90.91,4.15,9.66,0.0,0.0,1,1,2.75
577044.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.76,8.76,90.18,4.36,9.50,0.0,0.0,1,1,3.00
577047.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.74,8.74,90.39,4.38,9.16,0.0,0.0,1,1,3.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667201.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.14,13.14,72.48,7.39,13.79,0.0,0.0,12,31,21.50
667204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.13,13.13,72.07,7.46,13.93,0.0,0.0,12,31,21.75
667207.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.12,13.12,71.67,7.52,14.04,0.0,0.0,12,31,22.00
667210.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.10,13.10,71.36,7.55,14.10,0.0,0.0,12,31,22.25


In [23]:
target_cols = ['Power_1', 'Power_2', 'Power_3', 'Power_4', 'Power_5', 'Power_6', 'Power_7', 'Power_8']
target = df[target_cols]
target

Unnamed: 0_level_0,Power_1,Power_2,Power_3,Power_4,Power_5,Power_6,Power_7,Power_8
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
577035.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
577038.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
577041.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
577044.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
577047.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
667201.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667207.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667210.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
features = df.drop(columns=target_cols, axis=1)
features

Unnamed: 0_level_0,SWD,SWDtop,CU,CM,CD,TT2M,ST,RH2m,WS10m,WS100m,PREC,SNOW,Month,Day,Hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
577035.0,0.0,0.0,0.00,0.0,0.0,8.38,8.38,93.34,3.46,9.10,0.0,0.0,1,1,2.25
577038.0,0.0,0.0,0.00,0.0,0.0,8.52,8.52,92.18,3.81,9.52,0.0,0.0,1,1,2.50
577041.0,0.0,0.0,0.00,0.0,0.0,8.66,8.66,90.91,4.15,9.66,0.0,0.0,1,1,2.75
577044.0,0.0,0.0,0.00,0.0,0.0,8.76,8.76,90.18,4.36,9.50,0.0,0.0,1,1,3.00
577047.0,0.0,0.0,0.00,0.0,0.0,8.74,8.74,90.39,4.38,9.16,0.0,0.0,1,1,3.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667201.0,0.0,0.0,0.76,0.0,0.0,13.14,13.14,72.48,7.39,13.79,0.0,0.0,12,31,21.50
667204.0,0.0,0.0,1.00,0.0,0.0,13.13,13.13,72.07,7.46,13.93,0.0,0.0,12,31,21.75
667207.0,0.0,0.0,1.00,0.0,0.0,13.12,13.12,71.67,7.52,14.04,0.0,0.0,12,31,22.00
667210.0,0.0,0.0,1.00,0.0,0.0,13.10,13.10,71.36,7.55,14.10,0.0,0.0,12,31,22.25


## Splitting into Train, Validation and Test Sets

In [25]:
features_train, features_temp, target_train, target_temp = train_test_split(features, target, test_size=0.25, random_state=42)
features_val, features_test, target_val, target_test = train_test_split(features_temp, target_temp, test_size=0.5, random_state=42)

print('Training features shape:', features_train.shape)
print('Validation features shape:', features_val.shape)
print('Testing features shape:', features_test.shape)
print('Training target shape:', target_train.shape)
print('Validation target shape:', target_val.shape)
print('Testing target shape:', target_test.shape)

Training features shape: (21621, 15)
Validation features shape: (3603, 15)
Testing features shape: (3604, 15)
Training target shape: (21621, 8)
Validation target shape: (3603, 8)
Testing target shape: (3604, 8)


## Models Development

### Linear Regression

In [26]:
lr = []
predictions = []
mse = []

for i in tqdm(range(1, 9)):
    lr.append(LinearRegression().fit(features_train, target_train['Power_' + str(i)]))
    predictions.append(lr[i-1].predict(features_val))
    mse.append(mean_squared_error(target_val['Power_' + str(i)], predictions[i-1]))

lr_result = pd.DataFrame({'Model': 'Linear Regression', 'Power': range(1, 9), 'MSE': mse, 'Params': 'None'})
lr_result

100%|██████████| 8/8 [00:00<00:00, 38.42it/s]


Unnamed: 0,Model,Power,MSE,Params
0,Linear Regression,1,78274350.0,
1,Linear Regression,2,81645540.0,
2,Linear Regression,3,72152370.0,
3,Linear Regression,4,66577470.0,
4,Linear Regression,5,13143940.0,
5,Linear Regression,6,13363990.0,
6,Linear Regression,7,13879000.0,
7,Linear Regression,8,13747680.0,


### Nearest Neighbors

In [27]:
knn = []
knn_params = []
predictions = []
mse = []

params = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49]}

for i in tqdm(range(1, 9)):
    knn.append(GridSearchCV(KNeighborsRegressor(), param_grid=params, cv=5).fit(features_train, target_train['Power_' + str(i)]))
    knn_params.append(knn[i-1].best_params_)
    predictions.append(knn[i-1].predict(features_val))
    mse.append(mean_squared_error(target_val['Power_' + str(i)], predictions[i-1]))

knn_result = pd.DataFrame({'Model': 'KNN', 'Power': range(1, 9), 'MSE': mse, 'Params': knn_params})
knn_result

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [02:37<00:00, 19.64s/it]


Unnamed: 0,Model,Power,MSE,Params
0,KNN,1,68618730.0,{'n_neighbors': 19}
1,KNN,2,72129580.0,{'n_neighbors': 19}
2,KNN,3,62883490.0,{'n_neighbors': 19}
3,KNN,4,58112160.0,{'n_neighbors': 19}
4,KNN,5,11739570.0,{'n_neighbors': 19}
5,KNN,6,11931510.0,{'n_neighbors': 19}
6,KNN,7,12358320.0,{'n_neighbors': 19}
7,KNN,8,12242170.0,{'n_neighbors': 19}


### Decision Trees

In [28]:
dt = []
dt_params = []
predictions = []
mse = []

param_grid = {
    'max_depth': [None, 10, 20, 30], # Maximum depth of the tree
    'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],   # Minimum number of samples required at each leaf node
}

for i in tqdm(range(1, 9)):
    dt.append(GridSearchCV(DecisionTreeRegressor(), param_grid=param_grid, cv=5).fit(features_train, target_train['Power_' + str(i)]))
    dt_params.append(dt[i-1].get_params())
    predictions.append(dt[i-1].predict(features_val))
    mse.append(mean_squared_error(target_val['Power_' + str(i)], predictions[i-1]))

dt_result = pd.DataFrame({'Model': 'Decision Tree', 'Power': range(1, 9), 'MSE': mse, 'Params': dt_params})
dt_result

100%|██████████| 8/8 [03:00<00:00, 22.61s/it]


Unnamed: 0,Model,Power,MSE,Params
0,Decision Tree,1,57665340.0,"{'cv': 5, 'error_score': nan, 'estimator__ccp_..."
1,Decision Tree,2,58760030.0,"{'cv': 5, 'error_score': nan, 'estimator__ccp_..."
2,Decision Tree,3,54790400.0,"{'cv': 5, 'error_score': nan, 'estimator__ccp_..."
3,Decision Tree,4,50619350.0,"{'cv': 5, 'error_score': nan, 'estimator__ccp_..."
4,Decision Tree,5,9955541.0,"{'cv': 5, 'error_score': nan, 'estimator__ccp_..."
5,Decision Tree,6,9808800.0,"{'cv': 5, 'error_score': nan, 'estimator__ccp_..."
6,Decision Tree,7,9924122.0,"{'cv': 5, 'error_score': nan, 'estimator__ccp_..."
7,Decision Tree,8,9355827.0,"{'cv': 5, 'error_score': nan, 'estimator__ccp_..."


### Random Forests

In [29]:
rf = []
rf_params = []
predictions = []
mse = []

for i in tqdm(range(1, 9)):
    rf.append(RandomForestRegressor(random_state=42).fit(features_train, target_train['Power_' + str(i)]))
    rf_params.append(rf[i-1].get_params())
    predictions.append(rf[i-1].predict(features_val))
    mse.append(mean_squared_error(target_val['Power_' + str(i)], predictions[i-1]))

rf_result = pd.DataFrame({'Model': 'Random Forest', 'Power': range(1, 9), 'MSE': mse, 'Params': rf_params})
rf_result

100%|██████████| 8/8 [01:37<00:00, 12.13s/it]


Unnamed: 0,Model,Power,MSE,Params
0,Random Forest,1,39713510.0,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
1,Random Forest,2,40293740.0,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
2,Random Forest,3,38531570.0,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
3,Random Forest,4,34131910.0,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
4,Random Forest,5,5681094.0,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
5,Random Forest,6,5751748.0,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
6,Random Forest,7,5713195.0,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
7,Random Forest,8,5731205.0,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."


### Gradient Boosting

In [30]:
gb = []
gb_params = []
predictions = []
mse = []

for i in tqdm(range(1, 9)):
    gb.append(GradientBoostingRegressor().fit(features_train, target_train['Power_' + str(i)]))
    gb_params.append(gb[i-1].get_params)
    predictions.append(gb[i-1].predict(features_val))
    mse.append(mean_squared_error(target_val['Power_' + str(i)], predictions[i-1]))

gb_result = pd.DataFrame({'Model': 'Gradient Boosting', 'Power': range(1, 9), 'MSE': mse, 'Params': gb_params})
gb_result

100%|██████████| 8/8 [00:31<00:00,  3.98s/it]


Unnamed: 0,Model,Power,MSE,Params
0,Gradient Boosting,1,57074350.0,<bound method BaseEstimator.get_params of Grad...
1,Gradient Boosting,2,60082440.0,<bound method BaseEstimator.get_params of Grad...
2,Gradient Boosting,3,53094170.0,<bound method BaseEstimator.get_params of Grad...
3,Gradient Boosting,4,48787160.0,<bound method BaseEstimator.get_params of Grad...
4,Gradient Boosting,5,9377617.0,<bound method BaseEstimator.get_params of Grad...
5,Gradient Boosting,6,9529992.0,<bound method BaseEstimator.get_params of Grad...
6,Gradient Boosting,7,9663547.0,<bound method BaseEstimator.get_params of Grad...
7,Gradient Boosting,8,9618469.0,<bound method BaseEstimator.get_params of Grad...


### SVMs

In [31]:
svm = []
svm_params = []
predictions = []
mse = []

for i in tqdm(range(1, 9)):
    svm.append(SVR().fit(features_train, target_train['Power_' + str(i)]))
    svm_params.append(svm[i-1].get_params)
    predictions.append(svm[i-1].predict(features_val))
    mse.append(mean_squared_error(target_val['Power_' + str(i)], predictions[i-1]))

svm_result = pd.DataFrame({'Model': 'Support Vector Machine', 'Power': range(1, 9), 'MSE': mse, 'Params': svm_params})
svm_result

100%|██████████| 8/8 [04:42<00:00, 35.28s/it]


Unnamed: 0,Model,Power,MSE,Params
0,Support Vector Machine,1,305399700.0,<bound method BaseEstimator.get_params of SVR()>
1,Support Vector Machine,2,307984300.0,<bound method BaseEstimator.get_params of SVR()>
2,Support Vector Machine,3,293360200.0,<bound method BaseEstimator.get_params of SVR()>
3,Support Vector Machine,4,259636200.0,<bound method BaseEstimator.get_params of SVR()>
4,Support Vector Machine,5,30020090.0,<bound method BaseEstimator.get_params of SVR()>
5,Support Vector Machine,6,30456250.0,<bound method BaseEstimator.get_params of SVR()>
6,Support Vector Machine,7,30601810.0,<bound method BaseEstimator.get_params of SVR()>
7,Support Vector Machine,8,30961050.0,<bound method BaseEstimator.get_params of SVR()>


### Choosing best models

In [32]:
best_models = pd.DataFrame({'Power': range(1,9)})
best_models['Model'] = None
best_models['MSE'] = None
best_models['Params'] = None

models = [lr_result, knn_result, dt_result, rf_result, gb_result, svm_result]
for i in range(1,9):
    best_model = min(models, key=lambda x: x['MSE'][i-1])
    best_models['Model'][i-1] = best_model['Model'][0]
    best_models['MSE'][i-1] = best_model['MSE'][i-1]
    best_models['Params'][i-1] = best_model['Params'][i-1]

best_models = best_models.set_index('Power')

best_models

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_models['Model'][i-1] = best_model['Model'][0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_models['MSE'][i-1] = best_model['MSE'][i-1]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_models['Params'][i-1] = best_model['Params'][i-1]


Unnamed: 0_level_0,Model,MSE,Params
Power,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Random Forest,39713511.708409,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
2,Random Forest,40293743.440072,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
3,Random Forest,38531572.989163,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
4,Random Forest,34131912.849011,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
5,Random Forest,5681094.186281,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
6,Random Forest,5751747.979102,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
7,Random Forest,5713195.231311,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."
8,Random Forest,5731204.658512,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri..."


### Reporting the results with the test set

In [33]:
models = []
predictions = []
mse = []

for i in tqdm(range(1, 9)):
    if best_models['Model'][i] == 'Linear Regression':
        models.append(LinearRegression().fit(features_train, target_train['Power_' + str(i)]))
        predictions.append(models[i-1].predict(features_test))
        mse.append(mean_squared_error(target_test['Power_' + str(i)], predictions[i-1]))
    elif best_models['Model'][i] == 'KNN':
        models.append(KNeighborsRegressor(n_neighbors=best_models['Params'][i]['n_neighbors']).fit(features_train, target_train['Power_' + str(i)]))
        predictions.append(models[i-1].predict(features_test))
        mse.append(mean_squared_error(target_test['Power_' + str(i)], predictions[i-1]))
    elif best_models['Model'][i] == 'Decision Tree':
        models.append(DecisionTreeRegressor(max_depth=best_models['Params'][i]['max_depth'], min_samples_split=best_models['Params'][i]['min_samples_split'], min_samples_leaf=best_models['Params'][i]['min_samples_leaf']).fit(features_train, target_train['Power_' + str(i)]))
        predictions.append(models[i-1].predict(features_test))
        mse.append(mean_squared_error(target_test['Power_' + str(i)], predictions[i-1]))
    elif best_models['Model'][i] == 'Random Forest':
        models.append(RandomForestRegressor(random_state=42).fit(features_train, target_train['Power_' + str(i)]))
        predictions.append(models[i-1].predict(features_test))
        mse.append(mean_squared_error(target_test['Power_' + str(i)], predictions[i-1]))
    elif best_models['Model'][i] == 'Gradient Boosting':
        models.append(GradientBoostingRegressor().fit(features_train, target_train['Power_' + str(i)]))
        predictions.append(models[i-1].predict(features_test))
        mse.append(mean_squared_error(target_test['Power_' + str(i)], predictions[i-1]))
    elif best_models['Model'][i] == 'Support Vector Machine':
        models.append(SVR().fit(features_train, target_train['Power_' + str(i)]))
        predictions.append(models[i-1].predict(features_test))
        mse.append(mean_squared_error(target_test['Power_' + str(i)], predictions[i-1]))

100%|██████████| 8/8 [01:37<00:00, 12.16s/it]


In [34]:
for i in range(1, 9):
    print('Power', i)
    print('Model:', models[i-1])
    print('MSE:', mse[i-1], '\n')

Power 1
Model: RandomForestRegressor(random_state=42)
MSE: 37773626.81246615 

Power 2
Model: RandomForestRegressor(random_state=42)
MSE: 37800683.61297932 

Power 3
Model: RandomForestRegressor(random_state=42)
MSE: 36279486.25152606 

Power 4
Model: RandomForestRegressor(random_state=42)
MSE: 32713946.347253554 

Power 5
Model: RandomForestRegressor(random_state=42)
MSE: 5404401.459159212 

Power 6
Model: RandomForestRegressor(random_state=42)
MSE: 5457250.523247696 

Power 7
Model: RandomForestRegressor(random_state=42)
MSE: 5570287.713306326 

Power 8
Model: RandomForestRegressor(random_state=42)
MSE: 5471145.417032574 



In [35]:
# print the mse in exponential notation
print('MSE in exponential notation')
for i in range(1, 9):
    print(f'Power {i}: {mse[i-1]:.2e}')

MSE in exponential notation
Power 1: 3.78e+07
Power 2: 3.78e+07
Power 3: 3.63e+07
Power 4: 3.27e+07
Power 5: 5.40e+06
Power 6: 5.46e+06
Power 7: 5.57e+06
Power 8: 5.47e+06
