In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import io
import random
import requests

### Preprocessamento

In [3]:
df = pd.read_csv('Bias_correction_ucl.csv')

In [4]:
df.head()

Unnamed: 0,station,Date,Present_Tmax,Present_Tmin,LDAPS_RHmin,LDAPS_RHmax,LDAPS_Tmax_lapse,LDAPS_Tmin_lapse,LDAPS_WS,LDAPS_LH,...,LDAPS_PPT2,LDAPS_PPT3,LDAPS_PPT4,lat,lon,DEM,Slope,Solar radiation,Next_Tmax,Next_Tmin
0,1.0,2013-06-30,28.7,21.4,58.255688,91.116364,28.074101,23.006936,6.818887,69.451805,...,0.0,0.0,0.0,37.6046,126.991,212.335,2.785,5992.895996,29.1,21.2
1,2.0,2013-06-30,31.9,21.6,52.263397,90.604721,29.850689,24.035009,5.69189,51.937448,...,0.0,0.0,0.0,37.6046,127.032,44.7624,0.5141,5869.3125,30.5,22.5
2,3.0,2013-06-30,31.6,23.3,48.690479,83.973587,30.091292,24.565633,6.138224,20.57305,...,0.0,0.0,0.0,37.5776,127.058,33.3068,0.2661,5863.555664,31.1,23.9
3,4.0,2013-06-30,32.0,23.4,58.239788,96.483688,29.704629,23.326177,5.65005,65.727144,...,0.0,0.0,0.0,37.645,127.022,45.716,2.5348,5856.964844,31.7,24.3
4,5.0,2013-06-30,31.4,21.9,56.174095,90.155128,29.113934,23.48648,5.735004,107.965535,...,0.0,0.0,0.0,37.5507,127.135,35.038,0.5055,5859.552246,31.2,22.5


In [5]:
random.seed(42)

df.drop(columns=['Next_Tmin', 'Date'], inplace=True)
df.dropna(inplace=True)

SC = StandardScaler()
targets = df.iloc[:, 22:23]
X = df.iloc[:, 0:22].values
X = SC.fit_transform(X)


print('X:', X.shape)
print('targets:', targets.shape)


X: (7588, 22)
targets: (7588, 1)


In [6]:
X

array([[-1.66460654, -0.35331763, -0.74802891, ...,  2.76909065,
         1.11116193,  1.51056517],
       [-1.52605214,  0.72513836, -0.66472063, ..., -0.31582755,
        -0.54321979,  1.22299734],
       [-1.38749775,  0.62403311,  0.0433998 , ..., -0.52671876,
        -0.72389118,  1.20960169],
       ...,
       [ 1.38359018, -2.17321212, -2.53915704, ..., -0.85291936,
        -0.80453765, -2.09517477],
       [ 1.52214457, -2.17321212, -2.28923218, ..., -0.82147603,
        -0.75580008, -2.10667071],
       [ 1.66069897, -2.20691387, -2.41419461, ..., -0.7793405 ,
        -0.72010291, -2.07648676]])

In [7]:
targets

Unnamed: 0,Next_Tmax
0,29.1
1,30.5
2,31.1
3,31.7
4,31.2
...,...
7745,27.6
7746,28.0
7747,28.3
7748,28.6


### Linear 

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [9]:
validacao_Linear = cross_val_score(LinearRegression(), X, targets, scoring='neg_root_mean_squared_error')


In [10]:
validacao_Linear

array([-1.45728827, -1.63874781, -1.45413013, -1.62441096, -1.7131539 ])

In [11]:
# Menor valor
np.round(np.min(-validacao_Linear ),3)

1.454

In [12]:
# Media
np.round(np.mean(-validacao_Linear ),3)

1.578

In [13]:
resultados =pd.DataFrame({'resultados': ['Melhor alfa', 'Melhor score','default_score'], 'Linear': [0, 0, np.round(np.min(-validacao_Linear),3)] })

### Linear com regularização L1

In [14]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, uniform
from sklearn.linear_model import Lasso

In [15]:
param = {
    'alpha':loguniform(10e-3, 10e3)
}

In [16]:
rnd_search = RandomizedSearchCV(Lasso(), param, n_iter =10)
search = rnd_search .fit(X, targets)

validacao_L1 = cross_val_score(Lasso(), X, targets, scoring='neg_root_mean_squared_error')


In [17]:
print("Melhor alfa", rnd_search.best_params_['alpha'])
print("Melhor score", rnd_search.best_score_)
print("default_score", np.round(np.min(-validacao_L1 ),3))

Melhor alfa 0.010046123069050877
Melhor score 0.72369932727955
default_score 1.736


In [18]:
resultados['L1'] =pd.DataFrame({ 'L1': [rnd_search.best_params_['alpha'], rnd_search.best_score_, np.round(np.min(-validacao_L1 ),3)] })

In [19]:
resultados

Unnamed: 0,resultados,Linear,L1
0,Melhor alfa,0.0,0.010046
1,Melhor score,0.0,0.723699
2,default_score,1.454,1.736


### Linear com regularização L2

In [20]:
from sklearn.linear_model import Ridge

In [21]:
param = {
    'alpha':loguniform(10e-3, 10e3)
}

In [22]:
rnd_search = RandomizedSearchCV(Ridge(), param, n_iter =10)
search = rnd_search .fit(X, targets)

validacao_L2 = cross_val_score(Ridge(), X, targets, scoring='neg_root_mean_squared_error')


In [23]:
print("Melhor alfa", rnd_search.best_params_['alpha'])
print("Melhor score", rnd_search.best_score_)
print("default_score", np.round(np.min(-validacao_L2 ),3))

Melhor alfa 26.034788500141026
Melhor score 0.7220155596138834
default_score 1.454


In [24]:
resultados['L2'] = pd.DataFrame({ 'L2': [rnd_search.best_params_['alpha'], rnd_search.best_score_, np.round(np.min(-validacao_L2 ),3)] })

In [25]:
resultados

Unnamed: 0,resultados,Linear,L1,L2
0,Melhor alfa,0.0,0.010046,26.034789
1,Melhor score,0.0,0.723699,0.722016
2,default_score,1.454,1.736,1.454


### SVM Linear

In [26]:
from sklearn.svm import LinearSVR
import warnings
warnings.filterwarnings('ignore')

In [27]:
param = {
    'epsilon':[0.1, 0.3],
    'C': loguniform(2e-15,2e15)
    }

In [28]:
rnd_search = RandomizedSearchCV(LinearSVR(), param, n_iter =10)
search = rnd_search .fit(X, targets)

validacao_LinearSVR = cross_val_score(LinearSVR(), X, targets, scoring='neg_root_mean_squared_error')


In [29]:
print("Melhor parametros", rnd_search.best_params_)
print("Melhor score", rnd_search.best_score_)
print("default_score", np.round(np.min(-validacao_LinearSVR ),3))

Melhor parametros {'C': 0.16410600069924242, 'epsilon': 0.1}
Melhor score 0.7264640501873735
default_score 1.424


In [30]:
resultados['LinearSVR'] = pd.DataFrame({ 'LinearSVR': [rnd_search.best_params_, rnd_search.best_score_, np.round(np.min(-validacao_LinearSVR ),3)] })

In [31]:
resultados

Unnamed: 0,resultados,Linear,L1,L2,LinearSVR
0,Melhor alfa,0.0,0.010046,26.034789,"{'C': 0.16410600069924242, 'epsilon': 0.1}"
1,Melhor score,0.0,0.723699,0.722016,0.726464
2,default_score,1.454,1.736,1.454,1.424


### SVM com kernel RBF

In [65]:
from sklearn.svm import SVR

In [66]:
param = {
    'epsilon':[0.1, 0.3],
    'C':loguniform(2e-15,2e15),
    'gamma':loguniform(2e-9,2e3)
    }

In [67]:
rnd_search = RandomizedSearchCV(SVR(), param, n_iter =10)
search = rnd_search .fit(X, targets)

validacao_SVR = cross_val_score(SVR(), X, targets, scoring='neg_root_mean_squared_error')


In [68]:
print("Melhor parametros", rnd_search.best_params_)
print("Melhor score", rnd_search.best_score_)
print("default_score", np.round(np.min(-validacao_SVR ),3))

Melhor parametros {'C': 3.753556948446938, 'epsilon': 0.1, 'gamma': 1.871747481086105e-05}
Melhor score 0.5764670468972979
default_score 1.502


In [69]:
resultados['SVR'] = pd.DataFrame({ 'SVR': [rnd_search.best_params_, rnd_search.best_score_, np.round(np.min(-validacao_SVR ),3)] })

In [70]:
resultados

Unnamed: 0,resultados,Linear,L1,L2,LinearSVR,KNN,MLP,DT,RF,GBM,SVR
0,Melhor alfa,0.0,0.010046,26.034789,"{'C': 0.16410600069924242, 'epsilon': 0.1}",{'n_neighbors': 24},{'hidden_layer_sizes': 23},{'ccp_alpha': 0.034775533051766463},"{'n_estimators': 1000, 'max_features': 10}","{'n_estimators': 37, 'max_features': 0.2875917...","{'C': 3.753556948446938, 'epsilon': 0.1, 'gamm..."
1,Melhor score,0.0,0.723699,0.722016,0.726464,0.622196,0.383365,0.617192,0.703717,0.69252,0.576467
2,default_score,1.454,1.736,1.454,1.424,1.634,1.854,2.153,1.429,1.438,1.502


### KNN

In [33]:
from sklearn.neighbors import KNeighborsRegressor

In [34]:
knn_params = {
    'n_neighbors':np.random.randint(1, 1000, 10)
    }

In [36]:
rnd_search = RandomizedSearchCV(KNeighborsRegressor(), knn_params, n_iter =1000)
search = rnd_search .fit(X, targets)

validacao_KNN = cross_val_score(KNeighborsRegressor(), X, targets, scoring='neg_root_mean_squared_error')


In [37]:
print("Melhor k", rnd_search.best_params_)
print("Melhor score", rnd_search.best_score_)
print("default_score", np.round(np.min(-validacao_KNN),3))

Melhor k {'n_neighbors': 24}
Melhor score 0.6221955777470624
default_score 1.634


In [38]:
resultados['KNN'] = pd.DataFrame({ 'KNN': [rnd_search.best_params_, rnd_search.best_score_, np.round(np.min(-validacao_KNN),3)] })

In [39]:
resultados

Unnamed: 0,resultados,Linear,L1,L2,LinearSVR,KNN
0,Melhor alfa,0.0,0.010046,26.034789,"{'C': 0.16410600069924242, 'epsilon': 0.1}",{'n_neighbors': 24}
1,Melhor score,0.0,0.723699,0.722016,0.726464,0.622196
2,default_score,1.454,1.736,1.454,1.424,1.634


### MLP

In [40]:
from sklearn.neural_network import MLPRegressor

In [41]:
MLP_params = {
    'hidden_layer_sizes':(5,8,11,14,17,20,23)
    }

In [42]:
rnd_search = RandomizedSearchCV(MLPRegressor(), MLP_params, n_iter =10)
search = rnd_search .fit(X, targets)

validacao_MLP = cross_val_score(MLPRegressor(), X, targets, scoring='neg_root_mean_squared_error')


In [43]:
print("Melhor k", rnd_search.best_params_)
print("Melhor score", rnd_search.best_score_)
print("default_score", np.round(np.min(-validacao_MLP ),3))

Melhor k {'hidden_layer_sizes': 23}
Melhor score 0.3833650415444818
default_score 1.854


In [44]:
resultados['MLP'] = pd.DataFrame({ 'MLP': [rnd_search.best_params_, rnd_search.best_score_, np.round(np.min(-validacao_MLP),3)] })

In [45]:
resultados

Unnamed: 0,resultados,Linear,L1,L2,LinearSVR,KNN,MLP
0,Melhor alfa,0.0,0.010046,26.034789,"{'C': 0.16410600069924242, 'epsilon': 0.1}",{'n_neighbors': 24},{'hidden_layer_sizes': 23}
1,Melhor score,0.0,0.723699,0.722016,0.726464,0.622196,0.383365
2,default_score,1.454,1.736,1.454,1.424,1.634,1.854


### Arvore de decisão

In [46]:
from sklearn.tree import DecisionTreeRegressor

In [47]:
dtree_params = {
    'ccp_alpha':[random.uniform(0.0, 0.4) for i in range(10)]
    }

In [48]:
rnd_search = RandomizedSearchCV(DecisionTreeRegressor(), dtree_params, n_iter =10)
search = rnd_search.fit(X, targets)

validacao_DT = cross_val_score(DecisionTreeRegressor(), X, targets, scoring='neg_root_mean_squared_error')


In [49]:
print("Melhor k", rnd_search.best_params_)
print("Melhor score", rnd_search.best_score_)
print("default_score", np.round(np.min(-validacao_DT ),3))

Melhor k {'ccp_alpha': 0.034775533051766463}
Melhor score 0.6171919580564855
default_score 2.153


In [50]:
resultados['DT'] = pd.DataFrame({ 'DT': [rnd_search.best_params_, rnd_search.best_score_, np.round(np.min(-validacao_DT),3)] })

### Random Forest

In [51]:
from sklearn.ensemble import RandomForestRegressor

In [52]:
rf_params = {
    'n_estimators':[10, 100, 1000],
    'max_features':[5, 10, 22]
    }

In [53]:
rnd_search = RandomizedSearchCV(RandomForestRegressor(), rf_params, n_iter =10)
search = rnd_search.fit(X, targets)

validacao_RF = cross_val_score(RandomForestRegressor(), X, targets, scoring='neg_root_mean_squared_error')


In [55]:
print("Melhor k", rnd_search.best_params_)
print("Melhor score", rnd_search.best_score_)
print("default_score", np.round(np.min(-validacao_RF),3))

Melhor k {'n_estimators': 1000, 'max_features': 10}
Melhor score 0.7037171823137351
default_score 1.429


In [56]:
resultados['RF'] = pd.DataFrame({ 'RF': [rnd_search.best_params_, rnd_search.best_score_, np.round(np.min(-validacao_RF),3)] })

In [57]:
resultados

Unnamed: 0,resultados,Linear,L1,L2,LinearSVR,KNN,MLP,DT,RF
0,Melhor alfa,0.0,0.010046,26.034789,"{'C': 0.16410600069924242, 'epsilon': 0.1}",{'n_neighbors': 24},{'hidden_layer_sizes': 23},{'ccp_alpha': 0.034775533051766463},"{'n_estimators': 1000, 'max_features': 10}"
1,Melhor score,0.0,0.723699,0.722016,0.726464,0.622196,0.383365,0.617192,0.703717
2,default_score,1.454,1.736,1.454,1.424,1.634,1.854,2.153,1.429


### GBM

In [58]:
from sklearn.ensemble import GradientBoostingRegressor

In [60]:
gbm_params = {
    'n_estimators': np.random.randint(5, 100, 10),
    'max_features':[random.uniform(0.01, 0.3) for i in range(10)],
    'max_depth':[2, 3]
    }

In [61]:
rnd_search = RandomizedSearchCV(GradientBoostingRegressor(), gbm_params , n_iter =10)
search = rnd_search.fit(X, targets)

validacao_GBM = cross_val_score(GradientBoostingRegressor(), X, targets, scoring='neg_root_mean_squared_error')


In [62]:
print("Melhor k", rnd_search.best_params_)
print("Melhor score", rnd_search.best_score_)
print("default_score", np.round(np.min(-validacao_GBM),3))

Melhor k {'n_estimators': 37, 'max_features': 0.28759179093996656, 'max_depth': 3}
Melhor score 0.6925203005825169
default_score 1.438


In [63]:
resultados['GBM'] = pd.DataFrame({ 'GBM': [rnd_search.best_params_, rnd_search.best_score_, np.round(np.min(-validacao_GBM),3)] })

In [71]:
resultados

Unnamed: 0,resultados,Linear,L1,L2,LinearSVR,KNN,MLP,DT,RF,GBM,SVR
0,Melhor alfa,0.0,0.010046,26.034789,"{'C': 0.16410600069924242, 'epsilon': 0.1}",{'n_neighbors': 24},{'hidden_layer_sizes': 23},{'ccp_alpha': 0.034775533051766463},"{'n_estimators': 1000, 'max_features': 10}","{'n_estimators': 37, 'max_features': 0.2875917...","{'C': 3.753556948446938, 'epsilon': 0.1, 'gamm..."
1,Melhor score,0.0,0.723699,0.722016,0.726464,0.622196,0.383365,0.617192,0.703717,0.69252,0.576467
2,default_score,1.454,1.736,1.454,1.424,1.634,1.854,2.153,1.429,1.438,1.502
