# Modelo

## Librerias

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Importar widgets interactivos para Jupyter Notebook
import ipywidgets as widgets

# Importar funciones de pycaret para regresión
from pycaret.regression import *

import warnings
warnings.filterwarnings('ignore')

## Cargar base de datos limpia

In [2]:
df = pd.read_csv('../database/clean/tetuan-pc.csv')
df.head()

Unnamed: 0,datetime,temp,humi,ws,gdf,df,pc
0,2017-01-01 00:00:00,6.559,73.8,0.083,0.051,0.119,70425.53544
1,2017-01-01 00:10:00,6.414,74.5,0.083,0.07,0.085,69320.84387
2,2017-01-01 00:20:00,6.313,74.5,0.08,0.062,0.1,67803.22193
3,2017-01-01 00:30:00,6.121,75.0,0.083,0.091,0.096,65489.23209
4,2017-01-01 00:40:00,5.921,75.7,0.081,0.048,0.085,63650.44627


## Comparación de modelos

### Intento 1

In [3]:
data_train = df.sample(frac=0.8, random_state=42)
data_test = df.drop(data_train.index)

data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

print('Data para entrenamiento: ' + str(data_train.shape))
print('Data para test: ' + str(data_test.shape))

Data para entrenamiento: (41933, 7)
Data para test: (10483, 7)


In [4]:
s = setup(data = data_train, target = 'pc', session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,pc
2,Target type,Regression
3,Original data shape,"(41933, 7)"
4,Transformed data shape,"(41933, 7)"
5,Transformed train set shape,"(29353, 7)"
6,Transformed test set shape,"(12580, 7)"
7,Numeric features,5
8,Categorical features,1
9,Preprocess,True


In [5]:
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,12244.4622,233264962.8207,15270.677,0.2134,0.2106,0.1743,0.039
et,Extra Trees Regressor,13961.6443,296692404.4188,17223.3082,-0.0001,0.2414,0.2069,0.481
lightgbm,Light Gradient Boosting Machine,13964.4339,296741645.9791,17224.7387,-0.0002,0.2415,0.207,0.433
lr,Linear Regression,13962.5941,296758100.5186,17225.2127,-0.0003,0.2414,0.2069,0.183
omp,Orthogonal Matching Pursuit,13962.5941,296758100.5186,17225.2127,-0.0003,0.2414,0.2069,0.023
gbr,Gradient Boosting Regressor,13959.6647,296754273.3563,17225.0968,-0.0003,0.2414,0.2068,0.491
rf,Random Forest Regressor,13962.6718,296759230.1665,17225.2456,-0.0003,0.2414,0.2069,0.976
dt,Decision Tree Regressor,13962.6608,296760785.9724,17225.2923,-0.0003,0.2414,0.2069,0.037
br,Bayesian Ridge,13962.5941,296758100.5186,17225.2127,-0.0003,0.2414,0.2069,0.025
llar,Lasso Least Angle Regression,13962.5941,296758100.5186,17225.2127,-0.0003,0.2414,0.2069,0.023


In [6]:
print(best)

HuberRegressor()


In [7]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [8]:
predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,12059.4098,228730883.8643,15123.8515,0.2216,0.2083,0.1712


Unnamed: 0,datetime,temp,humi,ws,gdf,df,pc,prediction_label
40902,2017-12-02 02:00:00,17.379999,55.660000,0.080,0.029000,0.119000,48193.578125,72690.375819
6053,2017-05-08 07:50:00,14.330000,82.000000,0.085,268.799988,260.000000,57167.246094,59403.920831
12784,2017-03-29 04:40:00,15.100000,61.330002,4.921,0.059000,0.085000,53386.851562,70071.324410
21624,2017-09-20 08:30:00,21.090000,93.800003,4.917,36.970001,31.440001,61012.296875,65558.308856
1,2017-12-27 17:10:00,15.920000,54.439999,0.085,19.120001,19.340000,78957.937500,71501.304802
...,...,...,...,...,...,...,...,...
37453,2017-06-03 16:30:00,23.059999,66.360001,0.069,608.500000,104.199997,76517.000000,71725.548289
7271,2017-05-15 22:30:00,18.459999,85.699997,4.920,0.055000,0.126000,85952.210938,65819.530642
4290,2017-09-17 22:00:00,21.330000,76.300003,4.914,0.048000,0.089000,83338.164062,71447.270854
11269,2017-06-21 14:50:00,26.799999,55.900002,0.067,828.000000,81.400002,82411.687500,77644.355899


In [9]:
prediction = predict_model(best, data = data_test)
prediction.head()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,12035.6055,223225098.4807,14940.7195,0.2188,0.2077,0.1727


Unnamed: 0,datetime,temp,humi,ws,gdf,df,pc,prediction_label
0,2017-01-01 00:20:00,6.313,74.5,0.08,0.062,0.1,67803.21875,56318.6544
1,2017-01-01 01:30:00,5.491,77.300003,0.082,0.062,0.111,57012.921875,54673.869967
2,2017-01-01 01:40:00,5.516,77.5,0.081,0.051,0.108,55970.648438,54636.306964
3,2017-01-01 02:00:00,5.059,78.599998,0.081,0.07,0.096,54290.738281,53861.532589
4,2017-01-01 02:40:00,5.02,79.699997,0.081,0.051,0.134,51161.382812,53486.982585


### Intento 2

In [10]:
data_train = df[['temp', 'humi', 'ws','pc']].sample(frac=0.8, random_state=42)
data_test = df[['temp', 'humi', 'ws','pc']].drop(data_train.index)

data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

print('Data para entrenamiento: ' + str(data_train.shape))
print('Data para test: ' + str(data_test.shape))

Data para entrenamiento: (41933, 4)
Data para test: (10483, 4)


In [11]:
s = setup(data = data_train, target = 'pc', session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,pc
2,Target type,Regression
3,Original data shape,"(41933, 4)"
4,Transformed data shape,"(41933, 4)"
5,Transformed train set shape,"(29353, 4)"
6,Transformed test set shape,"(12580, 4)"
7,Numeric features,3
8,Preprocess,True
9,Imputation type,simple


In [12]:
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,9553.7292,164276645.7794,12815.1397,0.4463,0.1796,0.1384,0.617
et,Extra Trees Regressor,9558.8662,167363204.4917,12935.3162,0.4358,0.181,0.1383,0.354
lightgbm,Light Gradient Boosting Machine,10307.4627,173093121.6282,13155.0721,0.4165,0.185,0.1499,0.384
gbr,Gradient Boosting Regressor,10861.5154,187687233.934,13698.8491,0.3672,0.1924,0.1581,0.268
knn,K Neighbors Regressor,10931.1774,205533496.0,14335.1111,0.3072,0.2002,0.1578,0.012
ada,AdaBoost Regressor,12431.1749,219704387.7707,14822.1324,0.259,0.2132,0.1893,0.183
lar,Least Angle Regression,12110.2559,222847830.4,14926.7746,0.2488,0.208,0.1756,0.006
br,Bayesian Ridge,12110.3394,222847803.2,14926.7736,0.2488,0.208,0.1756,0.008
llar,Lasso Least Angle Regression,12110.2525,222847600.0,14926.7668,0.2488,0.208,0.1756,0.007
ridge,Ridge Regression,12110.2565,222847825.6,14926.7744,0.2488,0.208,0.1756,0.008


In [13]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [14]:
predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,9457.5063,161954493.1227,12726.1343,0.4489,0.1781,0.1369


Unnamed: 0,temp,humi,ws,pc,prediction_label
40902,17.379999,55.660000,0.080,48193.578125,66646.665664
6053,14.330000,82.000000,0.085,57167.246094,52976.649023
12784,15.100000,61.330002,4.921,53386.851562,64394.278711
21624,21.090000,93.800003,4.917,61012.296875,55693.754321
1,15.920000,54.439999,0.085,78957.937500,77690.502461
...,...,...,...,...,...
37453,23.059999,66.360001,0.069,76517.000000,80867.574961
7271,18.459999,85.699997,4.920,85952.210938,76990.545547
4290,21.330000,76.300003,4.914,83338.164062,65278.692695
11269,26.799999,55.900002,0.067,82411.687500,88095.433281


In [15]:
prediction = predict_model(best, data = data_test)
prediction.head()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,9358.0126,156555656.9486,12512.2203,0.4521,0.1765,0.1364


Unnamed: 0,temp,humi,ws,pc,prediction_label
0,6.313,74.5,0.08,67803.21875,63411.428359
1,5.491,77.300003,0.082,57012.921875,55419.465039
2,5.516,77.5,0.081,55970.648438,56275.499336
3,5.059,78.599998,0.081,54290.738281,52240.666289
4,5.02,79.699997,0.081,51161.382812,52079.960508


In [16]:
#save_model(best, '../model/modelo_tetuan_pc')

In [17]:
prediction.to_csv('../database/predictions/prediction.csv', index=False)

### Intento 3

In [18]:
data_train = df[['temp', 'humi','pc']].sample(frac=0.8, random_state=42)
data_test = df[['temp', 'humi', 'pc']].drop(data_train.index)

data_train = data_train.reset_index(drop=True)
data_test = data_test.reset_index(drop=True)

print('Data para entrenamiento: ' + str(data_train.shape))
print('Data para test: ' + str(data_test.shape))

Data para entrenamiento: (41933, 3)
Data para test: (10483, 3)


In [19]:
s = setup(data = data_train, target = 'pc', session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,pc
2,Target type,Regression
3,Original data shape,"(41933, 3)"
4,Transformed data shape,"(41933, 3)"
5,Transformed train set shape,"(29353, 3)"
6,Transformed test set shape,"(12580, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


In [20]:
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,11107.1182,194893087.6025,13959.3533,0.3429,0.1955,0.1614,0.419
gbr,Gradient Boosting Regressor,11326.4875,201071421.9318,14178.9142,0.3221,0.1987,0.1648,0.252
rf,Random Forest Regressor,11363.7909,219183612.4512,14803.5245,0.2611,0.2067,0.1645,0.487
knn,K Neighbors Regressor,11457.0505,220139323.2,14835.4372,0.2579,0.2073,0.1655,0.009
ada,AdaBoost Regressor,12480.51,222727903.3441,14923.3923,0.2489,0.2135,0.1892,0.185
en,Elastic Net,12113.2092,222847795.2,14926.768,0.2488,0.208,0.1757,0.007
lar,Least Angle Regression,12109.7986,222832641.6,14926.2624,0.2488,0.208,0.1756,0.008
br,Bayesian Ridge,12109.8596,222832662.4,14926.2631,0.2488,0.208,0.1756,0.008
llar,Lasso Least Angle Regression,12109.8033,222832652.8,14926.2627,0.2488,0.208,0.1756,0.007
ridge,Ridge Regression,12109.7986,222832646.4,14926.2627,0.2488,0.208,0.1756,0.007


In [21]:
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [22]:
predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,11083.9298,194717220.4639,13954.1112,0.3374,0.1952,0.1611


Unnamed: 0,temp,humi,pc,prediction_label
40902,17.379999,55.660000,48193.578125,69271.516120
6053,14.330000,82.000000,57167.246094,68339.805022
12784,15.100000,61.330002,53386.851562,70513.423846
21624,21.090000,93.800003,61012.296875,58219.700352
1,15.920000,54.439999,78957.937500,75086.244834
...,...,...,...,...
37453,23.059999,66.360001,76517.000000,75485.352590
7271,18.459999,85.699997,85952.210938,64614.262040
4290,21.330000,76.300003,83338.164062,75651.486680
11269,26.799999,55.900002,82411.687500,80608.628881


In [23]:
prediction = predict_model(best, data = data_test)
prediction.head()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,10999.4741,189452091.6313,13764.1597,0.337,0.1946,0.1615


Unnamed: 0,temp,humi,pc,prediction_label
0,6.313,74.5,67803.21875,53587.433437
1,5.491,77.300003,57012.921875,52944.234137
2,5.516,77.5,55970.648438,54268.325675
3,5.059,78.599998,54290.738281,52944.234137
4,5.02,79.699997,51161.382812,51730.839989


Segun el analisis de las metricas, el modelo 2: Random Forest Regressor parece ser el mejor de los tres. Tiene e MAE, MSE, RMSE, R2, RMSLE y MAPE mas bajos, lo que indica un mejor rendimiento general en terminos de precisión y capacidad para explicar la variabilidad en los datos.