In [1]:
# Tratamiento de datos
import pandas as pd
# Almacenar en caché los resultados de funciones en el disco
import joblib
# Metricas para evaluación del modelo
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
# Creación de modelo de maquinas de soporte vectorial
from sklearn.svm import SVR
# Separar los datos entrenamiento y prueba
from sklearn.model_selection import train_test_split
# Optimización de hiperparametros
from sklearn.model_selection import GridSearchCV
# Uso de validacion cruzada
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [2]:
# Exportamos los datos del archivo dataSetLimpio4 dado que tiene algunas manipulaciones de las columnas en una clasificacion categorica, eliminamos valores nulos, contaminación 0.5 y quitamos registros atipicos
limpiaDataSetCalidadAire = pd.read_csv('data/stage/dataSetLimpio4.csv')
limpiaDataSetCalidadAire.shape

(19447, 25)

In [3]:
# Revisamos el comportamiento por toma de muestras de los datos y usando variación entre columnas.
# Nota: Se estaba presentando buen comportamiento entre algunos meses del dataset, por ende,
# revisamos algunos registros por meses para identificar diferenciaciones.
# Finalmente, dejamos el dataset completo con la línea siguiente:
limpiaDataSetCalidadAire =limpiaDataSetCalidadAire.sample(n = 19447, random_state=123).iloc[:,:]
limpiaDataSetCalidadAire

Unnamed: 0,anio,mes,dia,hora,pm25,festivo,temperatura,humedad,presion,p1,...,dia_semana_Inicio Semana,dia_semana_Media semana,estacion_Estacion Aranjuez,estacion_Estacion Belen,estacion_Estacion Caldas,estacion_Estacion Itagui,franja_horaria_Madrugada,franja_horaria_Mañana,franja_horaria_Noche,franja_horaria_Tarde
11216,2024,2,27,1,30,0,19,80,852,0,...,1,0,1,0,0,0,1,0,0,0
8084,2024,7,8,4,9,0,15,86,825,0,...,1,0,0,0,1,0,1,0,0,0
1474,2024,3,14,1,31,0,22,84,0,0,...,0,1,0,0,0,1,1,0,0,0
12353,2024,4,24,11,15,0,27,51,851,0,...,1,0,1,0,0,0,0,1,0,0
3915,2024,8,15,1,25,0,20,84,0,0,...,0,1,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13435,2024,6,12,19,10,0,23,72,851,0,...,1,0,1,0,0,0,0,0,1,0
7763,2024,6,23,6,11,0,15,88,827,0,...,0,0,0,0,1,0,0,1,0,0
15377,2024,9,6,21,27,0,23,67,850,0,...,0,1,1,0,0,0,0,0,1,0
17730,2024,5,22,10,29,0,27,61,852,0,...,1,0,0,1,0,0,0,1,0,0


In [4]:
# Separamos los datos de variables independientes y la variable dependiente
X = limpiaDataSetCalidadAire.drop(columns ='pm25')
y = limpiaDataSetCalidadAire['pm25']
# Separamos las columnas de las variables independientes de la dependiente
x_cols = X.columns.tolist()

In [5]:
# División de los datos en datos de entrenamiento del modelo, y datos para el testeo
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7, # 70 % de nuestros datos en entrenamiento y 30% de los datos para el testeo
    random_state=1234,
    shuffle=True
)

In [6]:
#Escalar Variables númericas
pd.set_option('display.float_format', lambda x: '%.4f' % x)
scaler = joblib.load('data/stage/estandarizacionDataSet4.pkl')

# Se escalan los valores del dataset entrenamiento y prueba
X_train[x_cols] = scaler.fit_transform(X_train[x_cols])
X_test[x_cols] = scaler.transform(X_test[x_cols])
# Probamos diferentes kernel con hiperparametros para revisar sus evaluaciones
SVMR_linear = SVR(kernel='linear', C = 1000)
SVMR_Pol2 = SVR(kernel='poly', C = 100, degree = 2)
SVMR_Pol3 = SVR(kernel='poly', C = 100, degree = 3)
SVMR_rbf = SVR(kernel='rbf', C = 100)  # Disminuimos el parametro de regularizacion C para que no se tenga tanta diferencia entre las metricas de evaluación de datos de entrenamiento y testeo
# SVMR_sig = SVR(kernel='sigmoid', C = 1000)  # Observamos que el kernel sigmoide no refleja resultados esperados y lo excluimos para el analisis
# Entrenamos los modelos, dado que tenemos gran cantidad de datos toma un tiempo para la ejecución
SVMR_linear.fit(X_train, y_train)
SVMR_Pol2.fit(X_train, y_train)
SVMR_Pol3.fit(X_train, y_train)
SVMR_rbf.fit(X_train, y_train)
#SVMR_sig.fit(X_train, y_train)
print('Linear   Training: ', SVMR_linear.score(X_train,y_train), 'Test:',r2_score(y_test,SVMR_linear.predict(X_test)))
print('Poly 2   Training: ', SVMR_Pol2.score(X_train,y_train), 'Test:',r2_score(y_test,SVMR_Pol2.predict(X_test)))
print('Poly 3   Training: ', SVMR_Pol3.score(X_train,y_train), 'Test:',r2_score(y_test,SVMR_Pol3.predict(X_test)))
print('rbf      Training: ', SVMR_rbf.score(X_train,y_train), 'Test:',r2_score(y_test,SVMR_rbf.predict(X_test)))
#print('sigmoide Training: ', SVMR_sig.score(X_train,y_train), 'Test:',r2_score(y_test,SVMR_sig.predict(X_test)))

Linear   Training:  0.1773573465251853 Test: 0.18930319587891553
Poly 2   Training:  0.3047927153853718 Test: 0.2998748858889463
Poly 3   Training:  0.4229661581884916 Test: 0.38931370249483677
rbf      Training:  0.4465240185131295 Test: 0.376663326638621


In [9]:
# luego de la ejecución de los modelos con los hiperparametros predeterminados usamos validación cruzada en 5 pliegues
cross_val_score(SVMR_rbf, X, y, cv=5, scoring='r2')

array([0.05387946, 0.04918886, 0.06417908, 0.07200873, 0.0563894 ,
       0.0724363 , 0.06993513, 0.05193737, 0.04847333, 0.06387322])

Tunning Afinación del modelo

In [10]:
# Nuevamente se asigna a 'X_Completo' y 'y_Completo' los datos entrada y salida respectivamente
X_completo = X
y_completo = y

# Se escalan los datos numéricos de entrada
X_completo[x_cols] = scaler.transform(X_completo[x_cols])
X_completo

Unnamed: 0,anio,mes,dia,hora,festivo,temperatura,humedad,presion,p1,velocidad_prom,...,dia_semana_Inicio Semana,dia_semana_Media semana,estacion_Estacion Aranjuez,estacion_Estacion Belen,estacion_Estacion Caldas,estacion_Estacion Itagui,franja_horaria_Madrugada,franja_horaria_Mañana,franja_horaria_Noche,franja_horaria_Tarde
11216,0.0000,0.1250,0.8667,0.0435,0.0000,0.3182,0.7791,0.9953,0.0000,0.0000,...,1.0000,0.0000,1.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000
8084,0.0000,0.7500,0.2333,0.1739,0.0000,0.1364,0.8488,0.9638,0.0000,0.0000,...,1.0000,0.0000,0.0000,0.0000,1.0000,0.0000,1.0000,0.0000,0.0000,0.0000
1474,0.0000,0.2500,0.4333,0.0435,0.0000,0.4545,0.8256,0.0000,0.0000,0.1667,...,0.0000,1.0000,0.0000,0.0000,0.0000,1.0000,1.0000,0.0000,0.0000,0.0000
12353,0.0000,0.3750,0.7667,0.4783,0.0000,0.6818,0.4419,0.9942,0.0000,0.1667,...,1.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000
3915,0.0000,0.8750,0.4667,0.0435,0.0000,0.3636,0.8256,0.0000,0.0000,0.1667,...,0.0000,1.0000,0.0000,0.0000,0.0000,1.0000,1.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13435,0.0000,0.6250,0.3667,0.8261,0.0000,0.5000,0.6860,0.9942,0.0000,0.3333,...,1.0000,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000
7763,0.0000,0.6250,0.7333,0.2609,0.0000,0.1364,0.8721,0.9661,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000,1.0000,0.0000,0.0000
15377,0.0000,1.0000,0.1667,0.9130,0.0000,0.5000,0.6279,0.9930,0.0000,0.3333,...,0.0000,1.0000,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000,1.0000,0.0000
17730,0.0000,0.5000,0.7000,0.4348,0.0000,0.6818,0.5581,0.9953,0.0000,0.0000,...,1.0000,0.0000,0.0000,1.0000,0.0000,0.0000,0.0000,1.0000,0.0000,0.0000


In [9]:
#Se entena el modelo con múltiples hiperparámetros para encontrar el mejor que se adapte

#Se crea una instancia de una SVR Regression
modelsvr = SVR()

# Usamos 5 pliegues
CV = 5

#Se establecen los hiperparámetros para el Kernel lineal
svm_linear = {'C': [0.1, 1],
              'kernel': ['linear']}

#Se establecen los hiperparámetros para el Kernel polinomico
svm_poly = {'C': [0.1, 1],
              'gamma': [1, 0.1, 'auto', 'scale'],
              'degree': [2, 3],
              'kernel': ['poly']}

#Se establecen los hiperparámetros para los kernels rbf
svm_others = {'C': [0.1, 1],
              'gamma': [1, 0.1,'auto', 'scale'],
              'kernel': [ 'rbf']}

#Se unen todos los hiperparametros en una lista de diccionarios
parameters = [svm_linear, svm_poly, svm_others]

# Se define un GridSearchCV con una validacion cruzada de 5 pliegues y con todos los
#hierparametros establecidos anteriormente
grid_svr = GridSearchCV(modelsvr, param_grid=parameters, cv=CV, verbose=3)

#Se entrena el GridSearchCV anterior (2 min. aprox.) con los datos completos
grid_svr.fit(X_completo,y_completo)

Fitting 5 folds for each of 26 candidates, totalling 130 fits
[CV 1/5] END ..............C=0.1, kernel=linear;, score=0.162 total time=  16.0s
[CV 2/5] END ..............C=0.1, kernel=linear;, score=0.185 total time=  16.0s
[CV 3/5] END ..............C=0.1, kernel=linear;, score=0.186 total time=  16.2s
[CV 4/5] END ..............C=0.1, kernel=linear;, score=0.189 total time=  16.5s
[CV 5/5] END ..............C=0.1, kernel=linear;, score=0.165 total time=  15.8s
[CV 1/5] END ................C=1, kernel=linear;, score=0.163 total time=  15.9s
[CV 2/5] END ................C=1, kernel=linear;, score=0.186 total time=  18.3s
[CV 3/5] END ................C=1, kernel=linear;, score=0.187 total time=  16.0s
[CV 4/5] END ................C=1, kernel=linear;, score=0.190 total time=  15.8s
[CV 5/5] END ................C=1, kernel=linear;, score=0.166 total time=  16.1s
[CV 1/5] END C=0.1, degree=2, gamma=1, kernel=poly;, score=0.235 total time=  17.1s
[CV 2/5] END C=0.1, degree=2, gamma=1, kerne

In [10]:
# Obtener los resultados del grid search en un dataframe para manipular
results_grid_svr = pd.DataFrame(grid_svr.cv_results_)

# Seleccionar las columnas deseadas
columns_grid_svr = ['param_C']  + \
                    ['param_degree']  + \
                    ['param_kernel']  + \
                    ['param_gamma']   + \
                    ['mean_test_score', 'std_test_score']  + \
                    [f'split{i}_test_score' for i in range(CV)]

# Filtrar y mostrar los resultados
results_grid_svr_filtered = results_grid_svr[columns_grid_svr]

# Crear la columna scoreWithStd: f1 / std
results_grid_svr_filtered['scoreWithStd'] = results_grid_svr_filtered.apply(
    lambda row: row['mean_test_score'] / row['std_test_score'] if row['std_test_score'] != 0 else 0,
    axis=1
)

# Encuentra el índice del máximo valor en la columna scoreWithStd
indice_max_scoreWithStd = results_grid_svr_filtered['scoreWithStd'].idxmax()

# Mostrar los scores promedios por cada parámetro
print(results_grid_svr_filtered[['param_C', 'param_degree', 'mean_test_score', 'std_test_score', 'scoreWithStd']])

    param_C  param_degree  mean_test_score  std_test_score  scoreWithStd
0    0.1000           NaN           0.1775          0.0113       15.7298
1    1.0000           NaN           0.1783          0.0114       15.6482
2    0.1000        2.0000           0.2535          0.0134       18.8535
3    0.1000        2.0000           0.1551          0.0092       16.8080
4    0.1000        2.0000           0.0634          0.0076        8.3689
5    0.1000        2.0000           0.2095          0.0110       18.9986
6    0.1000        3.0000           0.3173          0.0152       20.9040
7    0.1000        3.0000           0.1297          0.0087       14.9089
8    0.1000        3.0000           0.0088          0.0068        1.3019
9    0.1000        3.0000           0.2285          0.0116       19.7519
10   1.0000        2.0000           0.2827          0.0153       18.5327
11   1.0000        2.0000           0.2156          0.0114       18.9777
12   1.0000        2.0000           0.1750         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_grid_svr_filtered['scoreWithStd'] = results_grid_svr_filtered.apply(


In [11]:
# Se identifican los mejores hiperparámetros
best_params = grid_svr.best_params_
print('Best Parameters : ',best_params)

Best Parameters :  {'C': 1, 'degree': 3, 'gamma': 1, 'kernel': 'poly'}


In [12]:
# Top 3 de los mejores scores
results_grid_svr_filtered.sort_values(by='scoreWithStd', ascending=False).head(3)[
    ['param_C', 'param_degree', 'param_kernel','param_gamma','mean_test_score', 'std_test_score', 'scoreWithStd']]

Unnamed: 0,param_C,param_degree,param_kernel,param_gamma,mean_test_score,std_test_score,scoreWithStd
14,1.0,3.0,poly,1,0.372,0.013,28.5417
22,1.0,,rbf,1,0.2727,0.0116,23.55
6,0.1,3.0,poly,1,0.3173,0.0152,20.904


In [13]:
# Seleccionar el registro correspondiente al índice máximo
registro_max_scoreWithStd = results_grid_svr_filtered.loc[indice_max_scoreWithStd]

registro_max_scoreWithStd.transpose()

Unnamed: 0,14
param_C,1.0000
param_degree,3.0000
param_kernel,poly
param_gamma,1
mean_test_score,0.3720
std_test_score,0.0130
split0_test_score,0.3549
split1_test_score,0.3935
split2_test_score,0.3781
split3_test_score,0.3674


In [14]:
# Se crea un modelo SVR con los mejores hiperparámetros
modelsvr = SVR( kernel = best_params["kernel"]
                    , gamma = best_params["gamma"]
                    , C= best_params["C"]
                    , degree = best_params["degree"])


#Se entrena el modelo con los datos completos
modelsvr.fit(X_completo, y_completo)

In [15]:
#Se guardan los modelos de SVC

joblib.dump(modelsvr, 'data/analytics/modelosPM25/SVR_CV_varios_cal_aire_pm25.pkl')