In [38]:
import pandas as pd
import numpy as np
# Modelos y métricas
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.decomposition import PCA
# Preprocsamiento de datos
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
# Figuras
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

# Creación de funciones
from typing import Optional, Tuple, Union

In [39]:
def procesar_categoricas(df, cat_features, drop_first=True):
    """
    Procesa variables categóricas con OneHotEncoder de sklearn.

    Parámetros
    ----------
    df : pd.DataFrame
        DataFrame original con variables numéricas y categóricas.
    cat_features : list
        Lista con los nombres de las columnas categóricas a codificar.
    drop_first : bool, opcional
        Si True, elimina una categoría base para evitar multicolinealidad.
        Por defecto True.

    Retorna
    -------
    X_encoded_df : pd.DataFrame
        DataFrame transformado con variables categóricas codificadas.
    preprocessor : ColumnTransformer
        Objeto ColumnTransformer que contiene el OneHotEncoder (útil para nuevos datos).
    """

    # Configurar OneHotEncoder
    encoder = OneHotEncoder(
        drop='first' if drop_first else None,
        sparse_output=False
    )

    # Crear ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', encoder, cat_features)
        ],
        remainder='passthrough'  # mantiene las columnas numéricas sin tocar
    )

    # Aplicar la transformación
    X_encoded = preprocessor.fit_transform(df[cat_features])

    # Obtener los nombres de las nuevas columnas
    encoded_cols = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_features)
    all_cols = list(encoded_cols)

    # Crear DataFrame final
    X_encoded_df = pd.DataFrame(X_encoded, columns=all_cols, index=df.index)

    return X_encoded_df, preprocessor

In [40]:
def procesar_numericas(df, num_features):
    """
    Escala variables numéricas usando StandardScaler de sklearn.

    Parámetros
    ----------
    df : pd.DataFrame
        DataFrame original con variables numéricas.
    num_features : list
        Lista con los nombres de las columnas numéricas a escalar.

    Retorna
    -------
    X_scaled_df : pd.DataFrame
        DataFrame transformado con las variables numéricas escaladas.
    preprocessor : ColumnTransformer
        Objeto ColumnTransformer con el escalador (útil para nuevos datos).
    """

    # Configurar el escalador
    scaler = StandardScaler()

    # Crear el transformador
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', scaler, num_features)
        ],
        remainder='passthrough'  # deja otras columnas sin modificar
    )

    # Aplicar transformación
    X_scaled = preprocessor.fit_transform(df[num_features])

    # Obtener nombres de columnas finales (mantiene el orden)
    scaled_cols = [f"{col}_scaled" for col in num_features]

    # Crear DataFrame transformado
    X_scaled_df = pd.DataFrame(X_scaled, columns=scaled_cols, index=df.index)

    return X_scaled_df, preprocessor

In [41]:
def pareto_categoria(data, column, top_n=None, cumulative_threshold=0.8, figsize=(10,6), show=True, return_table=False):
    """
    Calcula y (opcionalmente) grafica un diagrama de Pareto para una variable categórica.

    Parámetros:
        data (pd.DataFrame): DataFrame que contiene la columna a analizar.
        column (str): Nombre de la columna categórica.
        top_n (int, opcional): Si se especifica, muestra/usa solo las `top_n` categorías y agrupa el resto como 'Otros'.
        cumulative_threshold (float, opcional): umbral entre 0 y 1 para identificar cuántas
            categorías explican ese porcentaje acumulado. Solo usado informativamente si top_n es None.
        figsize (tuple): tamaño de la figura (ancho, alto).
        show (bool): si True muestra la gráfica; si False solo devuelve la tabla.
        return_table (bool): si True devuelve el DataFrame con conteos, porcentajes y acumulados.

    Retorna:
        Si return_table=True devuelve un pd.DataFrame con columnas ['category','count','percent','cumulative_percent'].
        Si return_table=False devuelve None (pero muestra la gráfica si show=True).

    Ejemplo:
        pareto_categoria(df, 'marca', top_n=10)
    """
    if column not in data.columns:
        raise ValueError(f"La columna '{column}' no existe en el DataFrame")

    # Contar incluyendo NaNs como categoría explícita
    ser = data[column].fillna('NaN').astype(str)
    counts = ser.value_counts(dropna=False)
    total = counts.sum()

    pareto_df = pd.DataFrame({
        'category': counts.index.astype(str),
        'count': counts.values
    })
    pareto_df['percent'] = pareto_df['count'] / total
    pareto_df['cumulative_percent'] = pareto_df['percent'].cumsum()

    # Información sobre el número de categorías necesarias para alcanzar el umbral
    if top_n is None and cumulative_threshold is not None:
        reached = pareto_df[pareto_df['cumulative_percent'] <= cumulative_threshold]
        # Si ningún valor está estrictamente <= threshold (por ej. primer > threshold), incluir el primero
        if reached.empty:
            needed = 1
        else:
            needed = len(reached)
    else:
        needed = top_n

    # Si se pide agrupar por top_n, crear una tabla para graficar
    if top_n is not None:
        top = pareto_df.head(top_n).copy()
        others_count = pareto_df['count'].iloc[top_n:].sum()
        if others_count > 0:
            others_percent = others_count / total
            others_cum = top['cumulative_percent'].iloc[-1] + others_percent
            top = top.append({'category': 'Otros', 'count': others_count, 'percent': others_percent, 'cumulative_percent': others_cum}, ignore_index=True)
        plot_df = top
    else:
        plot_df = pareto_df

    if show:
        fig, ax = plt.subplots(figsize=figsize)
        sns.barplot(x='category', y='count', data=plot_df, ax=ax, palette='Blues_d')
        ax.set_ylabel('Count')
        ax.set_xlabel(column)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

        # Eje secundario para porcentaje acumulado
        ax2 = ax.twinx()
        ax2.plot(range(len(plot_df)), plot_df['cumulative_percent'], color='red', marker='o')
        ax2.set_ylim(0, 1.05)
        ax2.set_ylabel('Cumulative percent')
        ax2.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

        # Anotar barras con porcentaje
        for i, row in plot_df.reset_index().iterrows():
            ax.text(i, row['count'], f"{row['percent']:.1%}", ha='center', va='bottom', fontsize=9)

        plt.title(f'Pareto - {column}')
        plt.tight_layout()
        plt.show()

    if return_table:
        return pareto_df.reset_index(drop=True)

    return None

In [42]:
def aplicar_pca(
    X: Union[pd.DataFrame, np.ndarray],
    n_components: Optional[int] = None,
    variance_threshold: float = 0.95,
    random_state: Optional[int] = 42,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, PCA]]:
    """
    Aplica PCA sobre X y devuelve el DataFrame transformado (componentes principales).
    
    Parámetros
    ----------
    X : pd.DataFrame | np.ndarray
        Datos de entrada (filas = muestras). Si es DataFrame se preservan índices.
    n_components : int | None
        Número de componentes a conservar. Si None, se calcula a partir de `variance_threshold`.
    variance_threshold : float
        Fracción de varianza explicada acumulada deseada si n_components es None (default 0.95).
    standardize : bool
        Si True se aplica StandardScaler antes de PCA (recomendado si X no está escalado).
    random_state : int | None
        Semilla para reproducibilidad.
    plot : bool
        Si True muestra gráfico de varianza explicada (scree + acumulada).
    return_model : bool
        Si True devuelve (df_pca, objeto_pca). Si False devuelve solo df_pca.

    Retorna
    -------
    pd.DataFrame o (pd.DataFrame, PCA)
        DataFrame con columnas ['PC1','PC2',...]. Si return_model=True también devuelve el objeto PCA ajustado.
    """
    # Preparar matriz de entrada
    if isinstance(X, pd.DataFrame):
        idx = X.index
        X_mat = X.values
    else:
        X_mat = np.asarray(X)
        idx = None


    # Determinar n_components si no se especifica
    if n_components is None:
        # Ajuste temporal para obtener varianza explicada
        pca_tmp = PCA(random_state=random_state)
        pca_tmp.fit(X_mat)
        cum_var = np.cumsum(pca_tmp.explained_variance_ratio_)
        # buscar el número mínimo de componentes que alcance el umbral
        n_components = int(np.searchsorted(cum_var, variance_threshold) + 1)

    # Ajustar PCA definitivo
    pca = PCA(n_components=n_components, random_state=random_state)
    X_pca = pca.fit_transform(X_mat)

    # Construir DataFrame resultado
    cols = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    df_pca = pd.DataFrame(X_pca, columns=cols, index=idx)
    
    return df_pca, pca
    

In [43]:
df = pd.read_csv('3_Fuel_Consumption_2000-2022_Prep.csv')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14093 entries, 0 to 14092
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   YEAR              14093 non-null  int64  
 1   RANGE_CATEGORY    14093 non-null  object 
 2   VEHICLE CLASS     14093 non-null  object 
 3   ENGINE SIZE       14093 non-null  float64
 4   CYLINDERS         14093 non-null  int64  
 5   FUEL              14093 non-null  object 
 6   FUEL CONSUMPTION  14093 non-null  float64
 7   KMXGALON          14093 non-null  float64
 8   EMISSIONS         14093 non-null  int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 991.0+ KB


In [44]:
catCols = df.select_dtypes(include = ["object", 'category']).columns.tolist()
df[catCols].head()

Unnamed: 0,RANGE_CATEGORY,VEHICLE CLASS,FUEL
0,MID_RANGE,TRUCK_VAN,O
1,MID_RANGE,TRUCK_VAN,X
2,MID_RANGE,SEDAN_COMPACT,X
3,MID_RANGE,MIDSIZE_FULLSIZE,Z
4,MID_RANGE,TRUCK_VAN,X


In [45]:
for col in catCols:
    print(col, df[col].nunique())

RANGE_CATEGORY 3
VEHICLE CLASS 5
FUEL 3


In [47]:
""" px.bar(
    df['MODEL'].value_counts().reset_index(),
    x='MODEL',
    y='count',
    title='Distribución de la variable MODEL',
    labels={'count':'Cantidad de vehículos', 'MODEL':'Modelo'},
    height=600
).update_layout(template = 'plotly_white') """

" px.bar(\n    df['MODEL'].value_counts().reset_index(),\n    x='MODEL',\n    y='count',\n    title='Distribución de la variable MODEL',\n    labels={'count':'Cantidad de vehículos', 'MODEL':'Modelo'},\n    height=600\n).update_layout(template = 'plotly_white') "

In [48]:
numCols=df.select_dtypes(include = ['float64','int32','int64']).columns.tolist()
df[numCols].head()

Unnamed: 0,YEAR,ENGINE SIZE,CYLINDERS,FUEL CONSUMPTION,KMXGALON,EMISSIONS
0,2007,5.3,8,21.0,24.1401,298
1,2020,5.3,8,15.9,32.1868,326
2,2001,2.0,4,9.6,53.10822,198
3,2018,2.3,4,12.2,41.84284,252
4,2005,5.6,8,17.4,30.57746,345


In [49]:
df[numCols].describe()

Unnamed: 0,YEAR,ENGINE SIZE,CYLINDERS,FUEL CONSUMPTION,KMXGALON,EMISSIONS
count,14093.0,14093.0,14093.0,14093.0,14093.0,14093.0
mean,2011.776485,3.225899,5.690272,12.450004,44.756164,245.756546
std,6.348677,1.268252,1.713495,3.192742,11.27517,55.560312
min,2000.0,1.0,2.0,4.2,19.31208,96.0
25%,2007.0,2.0,4.0,10.3,37.01482,207.0
50%,2012.0,3.0,6.0,12.0,43.45218,239.0
75%,2017.0,3.9,6.0,14.3,49.88954,281.0
max,2022.0,8.4,12.0,27.9,111.04446,404.0


In [50]:
px.scatter_matrix(df, dimensions=numCols).update_layout(
    title="Matriz de dispersión de variables numéricas",              
    height=1200
)

In [51]:

df[numCols].corr()

Unnamed: 0,YEAR,ENGINE SIZE,CYLINDERS,FUEL CONSUMPTION,KMXGALON,EMISSIONS
YEAR,1.0,-0.082838,-0.093893,-0.106884,0.094868,-0.054903
ENGINE SIZE,-0.082838,1.0,0.91844,0.834071,-0.761322,0.832865
CYLINDERS,-0.093893,0.91844,1.0,0.816396,-0.735515,0.820202
FUEL CONSUMPTION,-0.106884,0.834071,0.816396,1.0,-0.935042,0.930299
KMXGALON,0.094868,-0.761322,-0.735515,-0.935042,1.0,-0.921085
EMISSIONS,-0.054903,0.832865,0.820202,0.930299,-0.921085,1.0


In [52]:
df_cat, model_cat = procesar_categoricas(df, catCols)
df_num, model_num = procesar_numericas(df, numCols)

df_cat.shape, df_num.shape

((14093, 8), (14093, 6))

In [53]:
df_num.corr()

Unnamed: 0,YEAR_scaled,ENGINE SIZE_scaled,CYLINDERS_scaled,FUEL CONSUMPTION_scaled,KMXGALON_scaled,EMISSIONS_scaled
YEAR_scaled,1.0,-0.082838,-0.093893,-0.106884,0.094868,-0.054903
ENGINE SIZE_scaled,-0.082838,1.0,0.91844,0.834071,-0.761322,0.832865
CYLINDERS_scaled,-0.093893,0.91844,1.0,0.816396,-0.735515,0.820202
FUEL CONSUMPTION_scaled,-0.106884,0.834071,0.816396,1.0,-0.935042,0.930299
KMXGALON_scaled,0.094868,-0.761322,-0.735515,-0.935042,1.0,-0.921085
EMISSIONS_scaled,-0.054903,0.832865,0.820202,0.930299,-0.921085,1.0


In [54]:
df_pca_cat, model_pca_cat = aplicar_pca(df_cat, variance_threshold=0.9)
df_pca_num, model_pca_num = aplicar_pca(df_num.drop(["EMISSIONS_scaled"], axis=1), variance_threshold=0.9)

df_pca_cat.shape, df_pca_num.shape

((14093, 5), (14093, 2))

In [55]:
y = df_num['EMISSIONS_scaled']
X = pd.concat([df_pca_cat, df_pca_num], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [56]:
# Entrenar LinearRegression con X_train / y_train y evaluar sobre X_test / y_test

model = LinearRegression().fit(X_train, y_train)

# Predicciones
y_pred = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Métricas
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_pred_train)

mse_test = mean_squared_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred)
mae_test = np.mean(np.abs(y_test - y_pred))

# Mostrar resultados
print(f"Train R2: {r2_train:.4f} | Test R2: {r2_test:.4f}")
print(f"Train RMSE: {rmse_train:.4f} | Test RMSE: {rmse_test:.4f}")
print(f"Test MAE: {mae_test:.4f}")
print(f"Test MSE: {mse_test:.4f}")

# Gráfico True vs Predicho (interactividad con plotly)
xmin = min(y_test.min(), y_pred.min())
xmax = max(y_test.max(), y_pred.max())

fig = px.scatter(x=y_test, y=y_pred, labels={'x':'y_true','y':'y_pred'}, title='LinearRegression - True vs Predicted')
fig.update_layout(shapes=[dict(type='line', x0=xmin, x1=xmax, y0=xmin, y1=xmax, line=dict(color='red', dash='dash'))])


Train R2: 0.8858 | Test R2: 0.8915
Train RMSE: 0.3379 | Test RMSE: 0.3296
Test MAE: 0.2439
Test MSE: 0.1086


In [57]:
def evaluar_regresores_holdout(X_train, X_test, y_train, y_test, modelos=None):
    """
    Entrena y evalúa varios REGRESORES en un holdout (ya tienes el split).
    No aplica ningún escalado/transformación.
    Devuelve:
      - DataFrame con métricas por modelo (R2, MAE, RMSE)
      - dict de modelos entrenados
      - dict de predicciones en X_test por modelo
    """
    if modelos is None:
        modelos = {
            "RandomForest": RandomForestRegressor(n_estimators=400, n_jobs=-1, random_state=42),
            "SVR-RBF": SVR(kernel="rbf", C=1.0, gamma="scale", epsilon=0.1),
            "GradientBoosting": GradientBoostingRegressor(random_state=42),
            "Lasso": Lasso(alpha=0.1, random_state=42),
            "LinearRegression": LinearRegression(),
        }

    filas, entrenados, preds = [], {}, {}
    for nombre, est in modelos.items():
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)

        entrenados[nombre] = est
        preds[nombre] = y_pred

        filas.append({
            "modelo": nombre,
            "r2": r2_score(y_test, y_pred),
            "mae": mean_absolute_error(y_test, y_pred),
            "rmse": mean_squared_error(y_test, y_pred),
        })

    resultados = pd.DataFrame(filas).sort_values("rmse").reset_index(drop=True)
    return resultados, entrenados, preds

In [58]:
results, entrenados, preds = evaluar_regresores_holdout(X_train, X_test, y_train, y_test)
results

Unnamed: 0,modelo,r2,mae,rmse
0,RandomForest,0.979858,0.094923,0.02017
1,GradientBoosting,0.957909,0.14434,0.042149
2,SVR-RBF,0.954549,0.153508,0.045514
3,LinearRegression,0.89152,0.243911,0.10863
4,Lasso,0.872881,0.273124,0.127295
