In [5]:
import pandas as pd
import numpy as np
import time
import math
import random

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor   
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

import matplotlib.pyplot as plt
from scipy.stats import norm

from SimulatedAnnealing import SimulatedAny
from BayersOptimlization import Bayes_Optimalizator

from sklearn.gaussian_process import GaussianProcessRegressor

import warnings
warnings.filterwarnings("ignore")

## Loading the data + preprocessing (a detailed version can be found in the TPOT notebook)

In [7]:

# Load the raw data and separate the features from the regression target.
TARGET = 'windmill_generated_power(kW/h)'
path = 'Dataset/wind_train_data.csv'
dataset = pd.read_csv(path)
y = dataset[TARGET]
X = dataset.drop([TARGET], axis=1)

# Train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# ---------------------------------------------------------------------------
# Training partition: every statistic used below (imputation values, label
# encodings, scaling mean/std) is learned here and only *applied* to the test
# partition, so both partitions end up in the same feature space.
# ---------------------------------------------------------------------------
combi = pd.concat([X_train, y_train], axis=1)

# Drop rows whose target is missing.
combi1 = combi.dropna(subset=[TARGET])

train_target = combi1[TARGET]
train_data = combi1.drop([TARGET], axis=1)

# Columns stored as Python objects are treated as categorical, the rest as numerical.
categorical = [col for col in train_data.columns if train_data[col].dtype == 'object']
numerical = [col for col in train_data.columns if train_data[col].dtype != 'object']

# Missing values: most frequent value for categorical columns, median for numerical ones.
# Assigning the result of fillna (instead of a chained inplace call on a column)
# keeps this working when pandas copy-on-write is active.
fill_values = {col: train_data[col].mode()[0] for col in categorical}
fill_values.update({col: train_data[col].median() for col in numerical})
train_data = train_data.fillna(value=fill_values)

# One encoder per categorical column so the exact same mapping can be reused on the test data.
label_encoders = {}
for col in categorical:
    label_encoders[col] = LabelEncoder().fit(train_data[col])
    train_data[col] = label_encoders[col].transform(train_data[col])

# Scale the features (fit on the training partition only).
scaler = StandardScaler()
train_data_s = pd.DataFrame(scaler.fit_transform(train_data), columns=train_data.columns)

# ---------------------------------------------------------------------------
# Test partition: reuse the imputation values, encoders and scaler fitted above.
# ---------------------------------------------------------------------------
combi2 = pd.concat([X_test, y_test], axis=1)
combi3 = combi2.dropna(subset=[TARGET])

test_target = combi3[TARGET]
test_data = combi3.drop([TARGET], axis=1).fillna(value=fill_values)

for col in categorical:
    # Same integer codes as the fitted LabelEncoder (position in classes_);
    # categories that never occurred in the training partition become -1
    # instead of raising an error.
    test_data[col] = pd.Categorical(test_data[col], categories=label_encoders[col].classes_).codes

# transform (not fit_transform): the test features must be scaled with the training mean/std.
test_data_s = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)

# Drop the first two columns from the model inputs.
# NOTE(review): presumably identifier-like columns - confirm against the dataset.
train_data_s = train_data_s.iloc[:, 2:]
test_data_s = test_data_s.iloc[:, 2:]

# Targets as single-column DataFrames with a fresh 0..n-1 index, matching the
# positional index of the scaled feature frames.
train_target = train_target.reset_index(drop=True).to_frame()
test_target = test_target.reset_index(drop=True).to_frame()
test_target


Unnamed: 0,windmill_generated_power(kW/h)
0,4.261228
1,10.338474
2,2.773405
3,5.949011
4,9.549057
...,...
8400,5.585221
8401,5.722610
8402,5.423440
8403,5.621037


# MODELS

## Parameter ranges

In [3]:
# Search space for the support-vector regressor.
SVR_parameters_ranges = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    gamma=['scale', 'auto'],
)

# Search space for the random-forest regressor.
# NOTE(review): 'mae'/'mse' are the older scikit-learn criterion names; newer
# releases may only accept 'absolute_error'/'squared_error' - confirm against
# the installed version.
RFR_parameters_ranges = dict(
    max_depth=np.arange(1, 70),
    n_estimators=np.arange(1, 70),
    criterion=['mae', 'mse'],
    max_features=["sqrt", "log2"],
)

# Search space for the k-nearest-neighbours regressor.
KNNR_parameters_ranges = dict(
    n_neighbors=np.arange(1, 50),
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    leaf_size=np.arange(1, 50),
    metric=['euclidean', 'manhattan', 'minkowski'],
)

### Objective functions

In [4]:
def objective_function_random_forest(parameters,pd_train_data_X, pd_train_data_Y):
    """Score one random-forest configuration with 3-fold cross-validation.

    parameters       -- keyword arguments forwarded to ``RandomForestRegressor``
                        (``random_state=0`` is always added).
    pd_train_data_X  -- training features.
    pd_train_data_Y  -- training target; its ``.values`` are flattened to 1-D.

    Returns the mean ``neg_root_mean_squared_error`` over the folds.
    """
    regressor = RandomForestRegressor(random_state=0, **parameters)
    flat_target = pd_train_data_Y.values.ravel()
    fold_scores = cross_val_score(
        regressor,
        pd_train_data_X,
        flat_target,
        cv=3,
        scoring='neg_root_mean_squared_error',
    )
    return fold_scores.mean()

def objective_function_SVC(parameters,pd_train_data_X, pd_train_data_Y):
    """Score one support-vector regressor configuration with 3-fold cross-validation.

    parameters       -- keyword arguments forwarded to ``SVR``.
    pd_train_data_X  -- training features.
    pd_train_data_Y  -- training target; its ``.values`` are flattened to 1-D.

    Returns the mean ``neg_root_mean_squared_error`` over the folds.
    """
    regressor = SVR(**parameters)
    flat_target = pd_train_data_Y.values.ravel()
    fold_scores = cross_val_score(
        regressor,
        pd_train_data_X,
        flat_target,
        cv=3,
        scoring='neg_root_mean_squared_error',
    )
    return fold_scores.mean()

def objective_function_KNN(parameters,pd_train_data_X, pd_train_data_Y):
    """Score one k-nearest-neighbours regressor configuration with 3-fold cross-validation.

    parameters       -- keyword arguments forwarded to ``KNeighborsRegressor``.
    pd_train_data_X  -- training features.
    pd_train_data_Y  -- training target; its ``.values`` are flattened to 1-D.

    Returns the mean ``neg_root_mean_squared_error`` over the folds.
    """
    regressor = KNeighborsRegressor(**parameters)
    flat_target = pd_train_data_Y.values.ravel()
    fold_scores = cross_val_score(
        regressor,
        pd_train_data_X,
        flat_target,
        cv=3,
        scoring='neg_root_mean_squared_error',
    )
    return fold_scores.mean()

### Bayesian optimization extras

In [5]:
def acquisition_Function(random_param,real_X,real_Y,surrogate):
    """Return the candidate with the highest probability of improvement.

    Parameters
    ----------
    random_param : indexable collection of candidate points
        Passed as-is to ``surrogate.predict``; the selected element is returned.
    real_X : any
        Not used by this function; kept so the call signature stays unchanged.
    real_Y : array-like of numbers
        Scores observed so far. Their maximum is the incumbent to beat.
    surrogate : object
        Must provide ``predict(X, return_std=True)`` returning ``(mean, std)``.

    Returns
    -------
    The element of ``random_param`` whose predicted score has the largest
    probability of exceeding the best observed score.
    """
    # Best score observed so far (scalar, whatever the nesting of real_Y).
    best = np.asarray(real_Y, dtype=float).max()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        mu, std = surrogate.predict(random_param, return_std=True)

        # The surrogate may return a column vector or a flat vector depending
        # on how it was fitted; work on flat vectors in both cases.
        mu = np.asarray(mu, dtype=float)
        std = np.asarray(std, dtype=float)
        if mu.ndim > 1:
            mu = mu[:, 0]
        if std.ndim > 1:
            std = std[:, 0]

        # The small constant avoids 0/0 -> NaN (which would win np.argmax)
        # at points where the surrogate reports zero uncertainty.
        probs = norm.cdf((mu - best) / (std + 1e-9))

    ix = np.argmax(probs)

    return random_param[ix]

# Gaussian-process surrogate for the Bayesian optimisation; this single instance
# is the one handed to every Bayes_Optimalizator created in the optimisation loop.
pr_model = GaussianProcessRegressor(random_state=0)

## Optimization

In [6]:
# Simulated-annealing results; the optimisation loop appends one entry per run:
# last candidate, full score history, third value returned by run(), wall time,
# and last score of the history.
best_candidates_SA, best_scores_SA, best_metro_SA, times_SA, best_SA = [], [], [], [], []

# Bayesian-optimisation results; one entry per run: last parameter set, history
# of best scores, wall time, and last entry of that history.
best_candidates_Bay, best_score_Bay, times_Bay, best_Bay = [], [], [], []

In [8]:
# Names used by the optimisation and evaluation cells.
# Features: the scaled frames produced by the preprocessing cell.
Train_scaled_X, Test_scaled_X = train_data_s, test_data_s
# Targets: y_train / y_test are rebound here to the cleaned single-column target frames.
y_train, y_test = train_target, test_target

In [None]:
def _save_history_plot(history, label, elapsed, best_score, best_params, call_no, plot_no):
    """Plot one score history, save it as a PNG, display it and release the figure."""
    fig = plt.figure()
    plt.plot(history)
    plt.xlabel('Iterations')
    plt.ylabel('n-RMSe')
    plt.grid()
    plt.title('{}__TIME-{}__BEST-SCORE-{}\n{}'.format(label, elapsed, best_score, best_params))
    plt.savefig('{}-call_{}.Model_{}_score={}_time{}.png'.format(call_no, plot_no, label, best_score, elapsed))
    # Show the figure right away and close it: the loop below produces many
    # figures and would otherwise keep all of them open until the cell ends.
    plt.show()
    plt.close(fig)


# Plot/file labels, in the same order as the optimisers are created below.
SA_LABELS = ['SA-RFR','SA-SVR','SA-KNN']
BAY_LABELS = ['Bay-RFR','BA-SVR','BA-KNN']

# Repeat the whole comparison five times. Within each repetition the optimisers
# always run in the order RFR, SVR, KNN, so the position of an entry in the
# result lists modulo 3 identifies the model family.
for callss in range(5):
    SAs = [SimulatedAny(objective_function_random_forest,RFR_parameters_ranges),
           SimulatedAny(objective_function_SVC, SVR_parameters_ranges),
           SimulatedAny(objective_function_KNN,KNNR_parameters_ranges)]

    Bays = [Bayes_Optimalizator(objective_function_random_forest,pr_model,acquisition_Function,RFR_parameters_ranges),
            Bayes_Optimalizator(objective_function_SVC,pr_model,acquisition_Function,SVR_parameters_ranges),
            Bayes_Optimalizator(objective_function_KNN,pr_model,acquisition_Function,KNNR_parameters_ranges)]

    # Running number of the plot within this repetition (used in the file name).
    ids = 1

    # --- Simulated annealing ---
    for id_x,x in enumerate(SAs):
        st_time = time.time()
        # NOTE(review): the three leading integers are settings of
        # SimulatedAny.run, defined in the external module - their meaning is
        # not visible from this notebook.
        scor,candi,metro = x.run(100,5,100,Train_scaled_X,y_train)
        so_time = time.time()

        times_SA.append(so_time - st_time)

        best_candidates_SA.append(candi[-1])
        best_SA.append(scor[-1])
        best_scores_SA.append(scor)
        best_metro_SA.append(metro)

        _save_history_plot(best_scores_SA[-1], SA_LABELS[id_x], times_SA[-1],
                           best_SA[-1], best_candidates_SA[-1], callss, ids)
        ids += 1

    # --- Bayesian optimisation ---
    for id_x,x in enumerate(Bays):
        st_time = time.time()
        score,par,best = x.optimaze(100,Train_scaled_X,y_train)
        so_time = time.time()

        times_Bay.append(so_time - st_time)

        best_candidates_Bay.append(par[-1])
        best_score_Bay.append(best)
        best_Bay.append(best[-1])

        _save_history_plot(best_score_Bay[-1], BAY_LABELS[id_x], times_Bay[-1],
                           best_Bay[-1], best_candidates_Bay[-1], callss, ids)
        ids += 1



## Evaluation

In [None]:
# Each repetition of the optimisation loop appends its results in the fixed
# order RFR, SVR, KNN, so a position in best_SA / best_Bay maps to a model
# family through ``position % N_MODEL_FAMILIES`` (0 = RFR, 1 = SVR, 2 = KNN).
N_MODEL_FAMILIES = 3

# Position of the best (highest) final score found by each optimiser type.
select_SA_BEST = np.argmax(best_SA)
select_BA_BEST = np.argmax(best_Bay)

# Keep the overall winner: ``model`` is the model-family id expected by
# evaluate_best, ``param`` the matching hyper-parameter set.
if best_SA[select_SA_BEST] > best_Bay[select_BA_BEST]:
    model = int(select_SA_BEST) % N_MODEL_FAMILIES
    param = best_candidates_SA[select_SA_BEST]
else:
    model = int(select_BA_BEST) % N_MODEL_FAMILIES
    param = best_candidates_Bay[select_BA_BEST]

In [12]:
def evaluate_best(id_mod, pd_train_X,pd_train_Y,pd_test_X,pd_test_Y,parameters):
    """Fit the selected model on the training data and score it on the test data.

    Parameters
    ----------
    id_mod : int
        Model family: 0 = RandomForestRegressor (with ``random_state=0``),
        1 = SVR, 2 = KNeighborsRegressor.
    pd_train_X, pd_train_Y : training features and target.
    pd_test_X, pd_test_Y : test features and target.
        Both targets are flattened with ``.values.ravel()``.
    parameters : dict
        Keyword arguments for the model constructor.

    Returns
    -------
    The negated root-mean-squared error on the test data.

    Raises
    ------
    ValueError
        If ``id_mod`` is not 0, 1 or 2.
    """
    # Fail early with a clear message instead of hitting an unbound ``model`` below.
    if id_mod not in (0, 1, 2):
        raise ValueError(
            'id_mod must be 0 (RandomForestRegressor), 1 (SVR) or 2 (KNeighborsRegressor), got {!r}'.format(id_mod)
        )

    if id_mod == 0:
        model = RandomForestRegressor(**parameters, random_state=0)
    elif id_mod == 1:
        model = SVR(**parameters)
    else:
        model = KNeighborsRegressor(**parameters)

    model.fit(pd_train_X, pd_train_Y.values.ravel())
    pred = model.predict(pd_test_X)

    return -np.sqrt(mean_squared_error(pd_test_Y.values.ravel(), pred))

In [None]:
# Refit the selected model with its best parameters on the training split and
# print its negated RMSE on the held-out test split.
print(evaluate_best(model,Train_scaled_X,y_train,Test_scaled_X,y_test,param))