In [1]:
# Librerias y configuración

import pandas as pd
import numpy as np
from tqdm import tqdm
from pycaret.regression import *
from sklearn.preprocessing import StandardScaler

In [2]:
# Cargar datos

df = pd.read_csv('../data/complete_data.csv')

df.head()

Unnamed: 0,fecha,intensidad,estacion,magnitud,unidad_medida,valores
0,2022-12-01 00:00:00,222.189757,Pza. de España,Dióxido de Azufre,ug/m3,1.0
1,2022-12-01 00:00:00,222.189757,Arturo Soria,Óxidos de Nitrógeno,ug/m3,18.0
2,2022-12-01 00:00:00,222.189757,Barajas,Dióxido de Nitrógeno,ug/m3,26.0
3,2022-12-01 00:00:00,222.189757,Barrio del Pilar,Óxidos de Nitrógeno,ug/m3,17.0
4,2022-12-01 00:00:00,222.189757,Pza. Castilla,Óxidos de Nitrógeno,ug/m3,16.0


In [3]:
# Se hace una lista con un df por cada contaminante para poder hacer un modelo por cada uno

# Primero se crea una columna para cada contaminante con su valor

df_magnitudes = df.pivot_table(index='fecha', columns='magnitud', values='valores')

df_magnitudes = df_magnitudes.fillna(method='ffill')

# Se elimminan las columnas magnitud y valores del df original porque ya no se necesitan

df = df.drop(columns=['magnitud', 'valores'])

# Se hace un merge con el df original

df = pd.merge(df, df_magnitudes, on='fecha')

# Se reduce el df para ajustar el número de datos a la capacidad de la máquina, en este caso 0.1% de los datos puesto que por ahora solo se dispone de datos de tráfico de Madrid y es una prueba, pero se debe cambiar en caso de ser posible

muestra_aleatoria = np.random.choice(df.index, size=int(len(df) * 0.00001), replace=False)

df = df.loc[muestra_aleatoria]

# Se hace un df con la media de contaminación en toda la ciudad, no por estación

df = df.groupby('fecha').mean().reset_index()

# Se crea la lista donde se guardarán los dataframes

dfs = []

# Se quiere hacer un ciclo for con cada columna de contaminante

for e in tqdm(df[['Benceno',
       'Dióxido de Azufre', 'Dióxido de Nitrógeno', 'Etilbenceno',
       'Hidrocarburos no metánicos', 'Hidrocarburos totales', 'Metano',
       'Monóxido de Carbono', 'Monóxido de Nitrógeno', 'Ozono',
       'Óxidos de Nitrógeno', 'Partículas < 10 um', 'Partículas < 2.5 um',
       'Tolueno']].columns):

    # Después se hace un df con la media de contaminación en la ciudad por cada contaminante

    variables = ['intensidad', e]   # Posibles variables explicativas + los valores del contaminante. Añafir aqui nuevas columnas si se añaden nuevas variables/datos

    df_magnitudes = df[variables]

    # Se añade el df a la lista de dataframes

    dfs.append(df_magnitudes)

100%|██████████| 14/14 [00:00<00:00, 858.01it/s]


In [4]:
dfs[0].head()

Unnamed: 0,intensidad,Benceno
0,873.370061,0.466667
1,772.634227,0.616667
2,750.996914,0.675
3,347.174069,0.25
4,77.778486,0.5


In [7]:
# Se usa pycaret para comparar los distintos modelos de regresión y conocer sus resultados

# Se crea un objeto de la clase StandardScaler para normalizar los datos

scaler = StandardScaler() 

for e in tqdm(dfs):

    print(f'CONTAMINANTE: {e.columns[1]}')   # Cambiar el número de columna en caso de añadir nuevas variables/datos

    # Se normalizan los datos

    e = pd.DataFrame(scaler.fit_transform(e))
    
    reg = setup(data=e, target=f'{e.columns[1]}', session_id=1)  # Cambiar el número de columna en caso de añadir nuevas variables/datos

    compare_models()

    print('\n\n\n\n\n\n') # Unos saltos de linea para que sea más legible

  0%|          | 0/14 [00:00<?, ?it/s]

CONTAMINANTE: Benceno


Unnamed: 0,Description,Value
0,Session id,1
1,Target,1
2,Target type,Regression
3,Original data shape,"(98, 2)"
4,Transformed data shape,"(98, 2)"
5,Transformed train set shape,"(68, 2)"
6,Transformed test set shape,"(30, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.7136,1.4081,1.0007,-0.4311,0.4099,1.7677,0.012
llar,Lasso Least Angle Regression,0.8069,1.3237,0.9944,-0.5007,0.5001,1.3546,0.011
lasso,Lasso Regression,0.8069,1.3237,0.9944,-0.5007,0.5001,1.3546,0.019
dummy,Dummy Regressor,0.8069,1.3237,0.9944,-0.5007,0.5001,1.3546,0.011
en,Elastic Net,0.8069,1.3237,0.9944,-0.5007,0.5001,1.3546,0.015
lightgbm,Light Gradient Boosting Machine,0.7724,1.2612,0.9658,-0.5178,0.4051,2.1562,0.013
knn,K Neighbors Regressor,0.8171,1.3444,1.0283,-0.6703,0.3891,2.1593,0.013
br,Bayesian Ridge,0.8107,1.3305,1.0073,-0.7598,0.5101,1.7354,0.012
ridge,Ridge Regression,0.7983,1.3241,1.0035,-0.8785,0.4881,1.9297,0.015
lr,Linear Regression,0.7984,1.3256,1.0043,-0.8921,0.4876,1.9409,0.014


  7%|▋         | 1/14 [00:07<01:39,  7.68s/it]








CONTAMINANTE: Dióxido de Azufre


Unnamed: 0,Description,Value
0,Session id,1
1,Target,1
2,Target type,Regression
3,Original data shape,"(98, 2)"
4,Transformed data shape,"(98, 2)"
5,Transformed train set shape,"(68, 2)"
6,Transformed test set shape,"(30, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,0.7865,1.2572,1.0059,-0.157,0.5569,1.5787,0.012
en,Elastic Net,0.7865,1.2572,1.0059,-0.157,0.5569,1.5787,0.012
ridge,Ridge Regression,0.7989,1.2801,1.018,-0.2192,0.4981,2.587,0.019
lr,Linear Regression,0.7992,1.2812,1.0185,-0.2218,0.4972,2.6183,0.016


 14%|█▍        | 2/14 [00:15<01:36,  8.03s/it]








CONTAMINANTE: Dióxido de Nitrógeno


Unnamed: 0,Description,Value
0,Session id,1
1,Target,1
2,Target type,Regression
3,Original data shape,"(98, 2)"
4,Transformed data shape,"(98, 2)"
5,Transformed train set shape,"(68, 2)"
6,Transformed test set shape,"(30, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,0.8264,1.2185,0.9995,-0.377,0.5356,1.1367,0.016
en,Elastic Net,0.8264,1.2185,0.9995,-0.377,0.5356,1.1367,0.013
dummy,Dummy Regressor,0.8264,1.2185,0.9995,-0.377,0.5356,1.1367,0.012
llar,Lasso Least Angle Regression,0.8264,1.2185,0.9995,-0.377,0.5356,1.1367,0.012
lightgbm,Light Gradient Boosting Machine,0.8053,1.1606,0.9747,-0.9109,0.3846,2.4224,0.02
br,Bayesian Ridge,0.8522,1.2719,1.0391,-1.3162,0.5088,1.6224,0.012
knn,K Neighbors Regressor,0.8326,1.2427,1.0223,-1.4472,0.4,3.6938,0.013
huber,Huber Regressor,0.801,1.3248,1.0565,-1.4513,0.4581,1.5091,0.014
ridge,Ridge Regression,0.8429,1.2584,1.0305,-1.4639,0.4844,1.7712,0.012
lr,Linear Regression,0.8435,1.2615,1.0318,-1.4937,0.4837,1.7835,0.018


 21%|██▏       | 3/14 [00:24<01:28,  8.04s/it]








CONTAMINANTE: Etilbenceno


Unnamed: 0,Description,Value
0,Session id,1
1,Target,1
2,Target type,Regression
3,Original data shape,"(98, 2)"
4,Transformed data shape,"(98, 2)"
5,Transformed train set shape,"(68, 2)"
6,Transformed test set shape,"(30, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.3592,1.3855,0.7045,-0.9361,0.2408,1.5991,0.015
lasso,Lasso Regression,0.4699,1.3507,0.7643,-9.4403,0.3193,1.3762,0.011
en,Elastic Net,0.4699,1.3507,0.7643,-9.4403,0.3193,1.3762,0.013
dummy,Dummy Regressor,0.4699,1.3507,0.7643,-9.4403,0.3193,1.3762,0.012
llar,Lasso Least Angle Regression,0.4699,1.3507,0.7643,-9.4403,0.3193,1.3762,0.013
br,Bayesian Ridge,0.4734,1.3508,0.7643,-9.4567,0.3196,1.3933,0.013
ridge,Ridge Regression,0.4735,1.3575,0.7705,-12.2586,0.3249,1.5003,0.013
lar,Least Angle Regression,0.4736,1.3578,0.7707,-12.3411,0.3249,1.5037,0.012
omp,Orthogonal Matching Pursuit,0.4736,1.3578,0.7707,-12.3411,0.3249,1.5037,0.011
lr,Linear Regression,0.4736,1.3578,0.7707,-12.3411,0.3249,1.5037,0.016


 29%|██▊       | 4/14 [00:31<01:19,  7.97s/it]








CONTAMINANTE: Hidrocarburos no metánicos


Unnamed: 0,Description,Value
0,Session id,1
1,Target,1
2,Target type,Regression
3,Original data shape,"(98, 2)"
4,Transformed data shape,"(98, 2)"
5,Transformed train set shape,"(68, 2)"
6,Transformed test set shape,"(30, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.4276,1.6062,0.6802,-0.256,0.2699,2.5589,0.013
llar,Lasso Least Angle Regression,0.4799,1.6275,0.7373,-1.2209,0.2666,2.4461,0.012
lasso,Lasso Regression,0.4799,1.6275,0.7373,-1.2209,0.2666,2.4461,0.012
dummy,Dummy Regressor,0.4799,1.6275,0.7373,-1.2209,0.2666,2.4461,0.012
en,Elastic Net,0.4799,1.6275,0.7373,-1.2209,0.2666,2.4461,0.012
br,Bayesian Ridge,0.4985,1.6484,0.7718,-2.2869,0.3025,6.9538,0.013
lightgbm,Light Gradient Boosting Machine,0.5285,1.644,0.8024,-3.2797,0.2994,5.16,0.015
ridge,Ridge Regression,0.5135,1.6616,0.7996,-3.3356,0.305,9.3534,0.016
lr,Linear Regression,0.5147,1.6635,0.8015,-3.4055,0.3056,9.4956,0.014
lar,Least Angle Regression,0.5147,1.6635,0.8015,-3.4055,0.3056,9.4956,0.013


 36%|███▌      | 5/14 [00:39<01:11,  7.90s/it]








CONTAMINANTE: Hidrocarburos totales


Unnamed: 0,Description,Value
0,Session id,1
1,Target,1
2,Target type,Regression
3,Original data shape,"(98, 2)"
4,Transformed data shape,"(98, 2)"
5,Transformed train set shape,"(68, 2)"
6,Transformed test set shape,"(30, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.6957,1.2235,1.0121,-0.3985,0.4027,0.6797,0.013
lightgbm,Light Gradient Boosting Machine,0.7734,1.1447,0.9773,-0.9425,0.4154,1.0821,0.015
lasso,Lasso Regression,0.7977,1.1939,1.0003,-0.9791,0.5746,0.9749,0.013
dummy,Dummy Regressor,0.7977,1.1939,1.0003,-0.9791,0.5746,0.9749,0.012
llar,Lasso Least Angle Regression,0.7977,1.1939,1.0003,-0.9791,0.5746,0.9749,0.011
en,Elastic Net,0.7977,1.1939,1.0003,-0.9791,0.5746,0.9749,0.012
br,Bayesian Ridge,0.7991,1.1863,0.9987,-1.0909,0.513,1.0386,0.013
ridge,Ridge Regression,0.8041,1.1745,0.9985,-1.174,0.4868,1.0904,0.014
lr,Linear Regression,0.8049,1.1755,0.9991,-1.1804,0.4857,1.0938,0.015
lar,Least Angle Regression,0.8049,1.1755,0.9991,-1.1804,0.4857,1.0938,0.012


 43%|████▎     | 6/14 [00:47<01:03,  7.92s/it]








CONTAMINANTE: Metano


Unnamed: 0,Description,Value
0,Session id,1
1,Target,1
2,Target type,Regression
3,Original data shape,"(98, 2)"
4,Transformed data shape,"(98, 2)"
5,Transformed train set shape,"(68, 2)"
6,Transformed test set shape,"(30, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.7257,1.0968,1.0052,-1.2958,0.4181,1.151,0.012
knn,K Neighbors Regressor,0.7805,1.0053,0.9846,-3.0781,0.4391,1.5407,0.015
lightgbm,Light Gradient Boosting Machine,0.8091,0.9829,0.9673,-3.5494,0.4506,1.2669,0.013
lasso,Lasso Regression,0.8331,1.0047,0.9837,-4.0094,0.5797,1.0374,0.016
dummy,Dummy Regressor,0.8331,1.0047,0.9837,-4.0094,0.5797,1.0374,0.013
llar,Lasso Least Angle Regression,0.8331,1.0047,0.9837,-4.0094,0.5797,1.0374,0.012
en,Elastic Net,0.8331,1.0047,0.9837,-4.0094,0.5797,1.0374,0.012
br,Bayesian Ridge,0.8265,0.9994,0.9798,-4.0835,0.5443,1.1035,0.014
ridge,Ridge Regression,0.8179,0.9843,0.9711,-4.1857,0.5131,1.1677,0.012
lr,Linear Regression,0.8184,0.9846,0.9712,-4.1916,0.5121,1.1709,0.014


 50%|█████     | 7/14 [00:55<00:55,  7.89s/it]








CONTAMINANTE: Monóxido de Carbono


Unnamed: 0,Description,Value
0,Session id,1
1,Target,1
2,Target type,Regression
3,Original data shape,"(98, 2)"
4,Transformed data shape,"(98, 2)"
5,Transformed train set shape,"(68, 2)"
6,Transformed test set shape,"(30, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.6752,1.4616,1.0226,-0.5795,0.3813,1.1581,0.018
llar,Lasso Least Angle Regression,0.7884,1.3749,1.0254,-0.7164,0.4953,1.3466,0.012
lasso,Lasso Regression,0.7884,1.3749,1.0254,-0.7164,0.4953,1.3466,0.013
dummy,Dummy Regressor,0.7884,1.3749,1.0254,-0.7164,0.4953,1.3466,0.012
en,Elastic Net,0.7884,1.3749,1.0254,-0.7164,0.4953,1.3466,0.012
br,Bayesian Ridge,0.8001,1.4073,1.0523,-1.2988,0.5051,1.6978,0.012
lightgbm,Light Gradient Boosting Machine,0.7965,1.4495,1.0638,-1.4607,0.3796,2.2457,0.016
ridge,Ridge Regression,0.7829,1.3943,1.047,-1.5156,0.4831,1.8545,0.014
lr,Linear Regression,0.783,1.3962,1.048,-1.539,0.4826,1.8631,0.013
lar,Least Angle Regression,0.783,1.3962,1.048,-1.539,0.4826,1.8631,0.013


Processing:   0%|          | 0/77 [00:00<?, ?it/s]

 57%|█████▋    | 8/14 [01:03<00:47,  7.98s/it]








CONTAMINANTE: Monóxido de Nitrógeno


Unnamed: 0,Description,Value
0,Session id,1
1,Target,1
2,Target type,Regression
3,Original data shape,"(98, 2)"
4,Transformed data shape,"(98, 2)"
5,Transformed train set shape,"(68, 2)"
6,Transformed test set shape,"(30, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.4852,1.4993,0.9106,-0.6577,0.3092,0.6293,0.013
llar,Lasso Least Angle Regression,0.7129,1.4256,0.9545,-3.7639,0.4281,1.259,0.012
lasso,Lasso Regression,0.7129,1.4256,0.9545,-3.7639,0.4281,1.259,0.012
dummy,Dummy Regressor,0.7129,1.4256,0.9545,-3.7639,0.4281,1.259,0.012
en,Elastic Net,0.7129,1.4256,0.9545,-3.7639,0.4281,1.259,0.013
lightgbm,Light Gradient Boosting Machine,0.6516,1.4034,0.9542,-6.5863,0.3372,1.4539,0.016
knn,K Neighbors Regressor,0.6759,1.5062,1.0001,-8.568,0.3469,1.6487,0.014
br,Bayesian Ridge,0.715,1.4496,0.984,-10.0307,0.4456,1.3708,0.013
et,Extra Trees Regressor,0.7884,2.1955,1.3035,-12.259,0.4378,2.0442,0.046
ridge,Ridge Regression,0.7004,1.4408,0.9837,-12.7641,0.4234,1.418,0.013


 64%|██████▍   | 9/14 [01:11<00:39,  7.92s/it]








CONTAMINANTE: Ozono


Unnamed: 0,Description,Value
0,Session id,1
1,Target,1
2,Target type,Regression
3,Original data shape,"(98, 2)"
4,Transformed data shape,"(98, 2)"
5,Transformed train set shape,"(68, 2)"
6,Transformed test set shape,"(30, 2)"
7,Numeric features,1
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,0.8112,1.0536,0.9774,-0.3324,0.5798,1.1224,0.012
en,Elastic Net,0.8112,1.0536,0.9774,-0.3324,0.5798,1.1224,0.013
dummy,Dummy Regressor,0.8112,1.0536,0.9774,-0.3324,0.5798,1.1224,0.012
llar,Lasso Least Angle Regression,0.8112,1.0536,0.9774,-0.3324,0.5798,1.1224,0.013
br,Bayesian Ridge,0.8113,1.0538,0.9775,-0.3328,0.5797,1.121,0.012
ridge,Ridge Regression,0.8162,1.0667,0.9869,-0.3808,0.5698,1.0468,0.013
lr,Linear Regression,0.8163,1.067,0.9871,-0.3819,0.5695,1.0458,0.014
lar,Least Angle Regression,0.8163,1.067,0.9871,-0.3819,0.5695,1.0458,0.013
omp,Orthogonal Matching Pursuit,0.8163,1.067,0.9871,-0.3819,0.5695,1.0458,0.012
huber,Huber Regressor,0.8221,1.0693,0.9892,-0.3937,0.5651,1.4202,0.012


 71%|███████▏  | 10/14 [01:19<00:31,  7.91s/it]








CONTAMINANTE: Óxidos de Nitrógeno


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.5938,1.4395,0.9703,-1.2674,0.352,1.048,0.012
llar,Lasso Least Angle Regression,0.7421,1.3736,0.9721,-1.3758,0.457,1.2339,0.012
lasso,Lasso Regression,0.7421,1.3736,0.9721,-1.3758,0.457,1.2339,0.012
dummy,Dummy Regressor,0.7421,1.3736,0.9721,-1.3758,0.457,1.2339,0.012
en,Elastic Net,0.7421,1.3736,0.9721,-1.3758,0.457,1.2339,0.015
lightgbm,Light Gradient Boosting Machine,0.7141,1.3482,0.9687,-3.1153,0.3589,1.7557,0.015
br,Bayesian Ridge,0.7546,1.407,1.0084,-4.4096,0.4651,1.4258,0.013
knn,K Neighbors Regressor,0.7493,1.4415,1.015,-4.6228,0.3674,1.9853,0.013
ridge,Ridge Regression,0.7427,1.397,1.0044,-5.3273,0.4424,1.4942,0.012
lr,Linear Regression,0.7431,1.3995,1.0057,-5.4358,0.442,1.4993,0.013


 79%|███████▊  | 11/14 [01:27<00:23,  7.90s/it]








CONTAMINANTE: Partículas < 10 um


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,0.705,1.3956,1.0482,-0.2917,0.4342,3.1709,0.013
lasso,Lasso Regression,0.7599,1.2964,1.0346,-0.3234,0.4874,1.5254,0.012
en,Elastic Net,0.7599,1.2964,1.0346,-0.3234,0.4874,1.5254,0.014
dummy,Dummy Regressor,0.7599,1.2964,1.0346,-0.3234,0.4874,1.5254,0.011
llar,Lasso Least Angle Regression,0.7599,1.2964,1.0346,-0.3234,0.4874,1.5254,0.012
br,Bayesian Ridge,0.774,1.3313,1.0498,-0.3799,0.4869,1.4943,0.015
ridge,Ridge Regression,0.7637,1.3257,1.0487,-0.4496,0.4895,1.5882,0.012
lar,Least Angle Regression,0.764,1.3269,1.0493,-0.4541,0.4893,1.5911,0.017
omp,Orthogonal Matching Pursuit,0.764,1.3269,1.0493,-0.4541,0.4893,1.5911,0.012
lr,Linear Regression,0.764,1.3269,1.0493,-0.4541,0.4893,1.5911,0.013


100%|██████████| 14/14 [01:51<00:00,  7.94s/it]











