In [1]:
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np



# Almacenar en caché los resultados de funciones en el disco
# ==============================================================================
import joblib


# Gestion de librerias
# ==============================================================================
from importlib import reload


# Preprocesado y modelado
# ==============================================================================
from scipy.stats import pearsonr
from scipy import stats
import math

#Separar los datos entrenamiento y prueba
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate


#Escalar Variables
from sklearn.preprocessing import MinMaxScaler

#Evaluación del modelo
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error

#Creación de modelo
from sklearn.svm import SVR

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

from funciones import multiple_plot


# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the dataset CSV into the working DataFrame.
d = pd.read_csv(filepath_or_buffer='3_Fuel_Consumption_2000-2022_Prep.csv')

In [3]:
# Names of the text / categorical columns.
catCols = list(d.select_dtypes(include=["object", "category"]).columns)

# Names of the numeric columns, excluding the target column EMISSIONS.
numeric_frame = d.select_dtypes(include=['float64', 'int32', 'int64'])
numCols = list(numeric_frame.columns)
numCols.remove('EMISSIONS')

In [4]:
# Show floats without decimals while inspecting the target distribution.
pd.set_option('display.float_format', lambda value: '%.0f' % value)

# Summary statistics of EMISSIONS, including the upper-tail percentiles.
tail_percentiles = [0.25, 0.50, 0.75, 0.85, 0.90, 0.95, 0.975, 1]
emissions_summary = d['EMISSIONS'].describe(percentiles=tail_percentiles)
print(emissions_summary)

count   14093
mean      246
std        56
min        96
25%       207
50%       239
75%       281
85%       308
90%       324
95%       347
97.5%     365
100%      404
max       404
Name: EMISSIONS, dtype: float64


In [5]:
# One-hot encode the non-numeric columns, dropping the first level of each one.
d = pd.get_dummies(d, drop_first=True)

# Target vector and feature matrix (every column except the target).
y = d['EMISSIONS']
X = d.drop(columns=['EMISSIONS'])

In [6]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# The target is passed as a 1-D array of shape (n_samples,), which is the shape
# SVR.fit expects. A (n_samples, 1) column vector only works through an implicit
# conversion that raises a DataConversionWarning (hidden by the global warnings filter).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.to_numpy(),
    train_size=0.8,
    random_state=1234,
    shuffle=True,
)

In [7]:
# Numeric feature columns to be rescaled.
num_vars = numCols
pd.set_option('display.float_format', lambda value: '%.4f' % value)

# Learn the min/max of each numeric column from the training rows only ...
scaler = MinMaxScaler()
scaler.fit(X_train[num_vars])

# ... and apply that same mapping to both the training and the test rows.
X_train[num_vars] = scaler.transform(X_train[num_vars])
X_test[num_vars] = scaler.transform(X_test[num_vars])

X_train[num_vars].head(2)

Unnamed: 0,YEAR,ENGINE SIZE,CYLINDERS,FUEL CONSUMPTION,KMXGALON
2582,0.5909,0.3649,0.4,0.3629,0.2281
2320,0.6364,0.3514,0.4,0.3544,0.2807


In [8]:
# One SVR per kernel, all with the same regularisation strength C = 1000.
SVMR_linear = SVR(kernel='linear', C=1000)
SVMR_Pol2 = SVR(kernel='poly', C=1000, degree=2)
SVMR_Pol3 = SVR(kernel='poly', C=1000, degree=3)
SVMR_rbf = SVR(kernel='rbf', C=1000)
SVMR_sig = SVR(kernel='sigmoid', C=1000)

# Fit every model on the training split.
for svr_model in (SVMR_linear, SVMR_Pol2, SVMR_Pol3, SVMR_rbf):
    svr_model.fit(X_train, y_train)

# Kept as the last expression so the cell still displays the sigmoid model.
SVMR_sig.fit(X_train, y_train)


0,1,2
,kernel,'sigmoid'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1000
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [9]:
# Report R^2 on the training split (estimator.score) and on the test split (r2_score)
# for each fitted kernel, in the same order as they were trained.
fitted_models = [
    ('Linear   Training: ', SVMR_linear),
    ('Poly 2   Training: ', SVMR_Pol2),
    ('Poly 3   Training: ', SVMR_Pol3),
    ('rbf      Training: ', SVMR_rbf),
    ('sigmoide Training: ', SVMR_sig),
]

for label, fitted in fitted_models:
    train_r2 = fitted.score(X_train, y_train)
    test_r2 = r2_score(y_test, fitted.predict(X_test))
    print(label, train_r2, 'Test:', test_r2)


Linear   Training:  0.9409428101025018 Test: 0.9213221356188092
Poly 2   Training:  0.9921760220659768 Test: 0.9919820107334475
Poly 3   Training:  0.995850780374826 Test: 0.9947791239134242
rbf      Training:  0.9965682163517631 Test: 0.9957319679119846
sigmoide Training:  -18384886.09561539 Test: -19226020.38013874


In [10]:
# Build the full-dataset inputs and targets used by the grid search and the final fit.
# Copies are taken on purpose: with a plain alias (X_Completo = X) the scaling below
# would overwrite X in place, and re-running this cell would refit the scaler on
# already-scaled values (turning it into an identity mapping).
X_Completo = X.copy()
y_Completo = y.copy()

# Refit the scaler on the numeric columns of the *raw* full dataset and store the
# scaled values in the copy only.
# NOTE(review): the scaler sees every row before cross-validation, so each CV fold is
# scaled with statistics that include its validation rows — confirm this is acceptable.
X_Completo[num_vars] = scaler.fit_transform(X[num_vars])

In [11]:
# Persist the current scaler object to disk.
joblib.dump(value=scaler, filename='scaler/minmaxSVM.pkl')

['scaler/minmaxSVM.pkl']

In [12]:
# Train the model over multiple hyperparameter combinations, looking for the best one.

# Base SVR estimator explored by the grid search.
modelsvr = SVR()

# Number of cross-validation folds.
CV = 10

# Candidate values shared by the kernel-specific grids below.
c_values = [0.1, 1, 10, 100, 1000]
gamma_values = [1, 0.1, 0.01, 0.001, 'auto', 'scale']

# Linear kernel: only C is tuned.
svm_linear = dict(C=list(c_values), kernel=['linear'])

# Polynomial kernel: C, gamma and the polynomial degree are tuned.
svm_poly = dict(C=list(c_values), gamma=list(gamma_values), degree=[2, 3], kernel=['poly'])

# RBF and sigmoid kernels: C and gamma are tuned.
svm_others = dict(C=list(c_values), gamma=list(gamma_values), kernel=['rbf', 'sigmoid'])

# The search accepts a list of grids; each dictionary is explored independently.
parameters = [svm_linear, svm_poly, svm_others]

# Exhaustive search with 10-fold cross-validation over all the grids defined above.
grid_svr = GridSearchCV(
    estimator=modelsvr,
    param_grid=parameters,
    cv=CV,
    verbose=3,
)

# Fit the search on the full dataset. The original note estimated about 2 minutes;
# the captured log shows roughly 4 s per fit for the first candidates, so the full
# run of 1250 fits presumably takes considerably longer.
grid_svr.fit(X_Completo, y_Completo)

Fitting 10 folds for each of 125 candidates, totalling 1250 fits
[CV 1/10] END .............C=0.1, kernel=linear;, score=0.859 total time=   4.0s
[CV 2/10] END .............C=0.1, kernel=linear;, score=0.861 total time=   4.0s
[CV 3/10] END .............C=0.1, kernel=linear;, score=0.866 total time=   4.0s
[CV 4/10] END .............C=0.1, kernel=linear;, score=0.861 total time=   4.0s
[CV 5/10] END .............C=0.1, kernel=linear;, score=0.860 total time=   4.0s
[CV 6/10] END .............C=0.1, kernel=linear;, score=0.855 total time=   4.0s
[CV 7/10] END .............C=0.1, kernel=linear;, score=0.850 total time=   4.0s
[CV 8/10] END .............C=0.1, kernel=linear;, score=0.857 total time=   3.9s
[CV 9/10] END .............C=0.1, kernel=linear;, score=0.867 total time=   4.0s
[CV 10/10] END ............C=0.1, kernel=linear;, score=0.865 total time=   4.0s
[CV 1/10] END ...............C=1, kernel=linear;, score=0.942 total time=   4.0s
[CV 2/10] END ...............C=1, kernel=lin

0,1,2
,estimator,SVR()
,param_grid,"[{'C': [0.1, 1, ...], 'kernel': ['linear']}, {'C': [0.1, 1, ...], 'degree': [2, 3], 'gamma': [1, 0.1, ...], 'kernel': ['poly']}, ...]"
,scoring,
,n_jobs,
,refit,True
,cv,10
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,kernel,'rbf'
,degree,3
,gamma,1
,coef0,0.0
,tol,0.001
,C,1000
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [13]:
# Collect the grid-search results into a DataFrame (one row per candidate).
results_grid_svr = pd.DataFrame(grid_svr.cv_results_)

# Columns of interest: the tuned parameters, the aggregated test score and the
# per-fold test scores.
columns_grid_svr = (
    ['param_C', 'param_degree', 'param_kernel', 'param_gamma']
    + ['mean_test_score', 'std_test_score']
    + [f'split{i}_test_score' for i in range(CV)]
)

# Take an explicit copy: a new column is added below, and assigning onto a plain
# column subset of results_grid_svr is a chained assignment (SettingWithCopy hazard).
results_grid_svr_filtered = results_grid_svr[columns_grid_svr].copy()

# scoreWithStd = mean R^2 / std of R^2 across folds; defined as 0 when the std is 0.
# Computed column-wise instead of with a row-by-row apply. pandas yields inf/NaN
# (without raising) for the zero-std rows, and np.where then replaces them with 0.
mean_scores = results_grid_svr_filtered['mean_test_score']
std_scores = results_grid_svr_filtered['std_test_score']
results_grid_svr_filtered['scoreWithStd'] = np.where(
    std_scores != 0,
    mean_scores / std_scores,
    0.0,
)

# Index label of the candidate with the highest scoreWithStd.
indice_max_scoreWithStd = results_grid_svr_filtered['scoreWithStd'].idxmax()

# Show the average scores for each candidate.
print(results_grid_svr_filtered[['param_C', 'param_degree', 'mean_test_score', 'std_test_score', 'scoreWithStd']])


      param_C  param_degree  mean_test_score  std_test_score  scoreWithStd
0      0.1000           NaN           0.8602          0.0049      176.0359
1      1.0000           NaN           0.9465          0.0060      157.9630
2     10.0000           NaN           0.9397          0.0103       91.0094
3    100.0000           NaN           0.9368          0.0113       82.6355
4   1000.0000           NaN           0.9365          0.0114       81.8437
..        ...           ...              ...             ...           ...
120 1000.0000           NaN           0.9465          0.0060      157.9670
121 1000.0000           NaN           0.9906          0.0023      428.1268
122 1000.0000           NaN       -2559.4596        217.2934      -11.7788
123 1000.0000           NaN           0.9960          0.0004     2665.1072
124 1000.0000           NaN   -23607345.3251    1159905.0814      -20.3528

[125 rows x 5 columns]


In [14]:
# Hyperparameters reported by the grid search itself as the best candidate.
best_params = grid_svr.best_params_
print('Best Parameters : ', best_params)

Best Parameters :  {'C': 1000, 'gamma': 1, 'kernel': 'rbf'}


In [15]:
# Full results row of the candidate with the highest scoreWithStd.
registro_max_scoreWithStd = results_grid_svr_filtered.loc[indice_max_scoreWithStd, :]


In [17]:
# Build the final model from the candidate with the highest scoreWithStd
# (mean R^2 / std R^2). This is not necessarily the same candidate as
# grid_svr.best_params_.
#
# The candidate's own `params` dictionary is used instead of the param_* columns:
#   * it holds only the parameters that apply to that kernel, so a linear candidate
#     does not pass gamma=NaN and a polynomial candidate keeps its degree;
#   * the values keep their original Python types (e.g. C=1000, not np.float64(1000.0)).
selected_params = results_grid_svr.loc[indice_max_scoreWithStd, 'params']

# A fresh estimator is created rather than calling set_params on the instance that
# was handed to GridSearchCV, so re-running this cell never inherits stale settings.
modelsvr = SVR(**selected_params)

# Train the selected configuration on the full (scaled) dataset.
modelsvr.fit(X_Completo, y_Completo)

0,1,2
,kernel,'rbf'
,degree,3
,gamma,1
,coef0,0.0
,tol,0.001
,C,np.float64(1000.0)
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [19]:
# Quick look at the first two rows of the full input matrix.
print(X_Completo.iloc[:2])

    YEAR  ENGINE SIZE  CYLINDERS  FUEL CONSUMPTION  KMXGALON  \
0 0.3182       0.5811     0.6000            0.7089    0.0526   
1 0.9091       0.5811     0.6000            0.4937    0.1404   

   RANGE_CATEGORY_LOW_RANGE  RANGE_CATEGORY_MID_RANGE  \
0                     False                      True   
1                     False                      True   

   VEHICLE CLASS_SEDAN_COMPACT  VEHICLE CLASS_SUV_CROSSOVER  \
0                        False                        False   
1                        False                        False   

   VEHICLE CLASS_TRUCK_VAN  VEHICLE CLASS_WAGON_SPECIALTY  FUEL_X  FUEL_Z  
0                     True                          False   False   False  
1                     True                          False    True   False  


In [18]:
# Persist the fitted SVR model to disk.
joblib.dump(value=modelsvr, filename='modelos/SVRModel.pkl')

['modelos/SVRModel.pkl']