In [1]:
!pip install python-dotenv
!pip install snowflake-connector-python



In [17]:
from dotenv import load_dotenv
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row 
import requests
import json
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import time

In [18]:
#Carga de datos obt + filtrados
import os
import requests
from dotenv import load_dotenv
load_dotenv()

print(f"PORT_POSTGRES: {os.getenv('PORT_POSTGRES')}")
print(f"POSTGRES_DB: {os.getenv('POSTGRES_DB')}")
print(f"POSTGRES_USER: {os.getenv('POSTGRES_USER')}")
print(f"POSTGRES_PASSWORD set: {bool(os.getenv('POSTGRES_PASSWORD'))}")


PORT_POSTGRES: 5432
POSTGRES_DB: ny_taxi
POSTGRES_USER: usuario_spark
POSTGRES_PASSWORD set: True


In [19]:
jar_path = "/home/jovyan/work/postgresql-42.2.5.jar"

spark = SparkSession.builder \
    .appName("ML_Desde_Postgres") \
    .master("local[*]") \
    .config("spark.jars", jar_path) \
    .config("spark.driver.extraClassPath", jar_path) \
    .config("spark.executor.extraClassPath", jar_path) \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

In [20]:
df_obt = spark.read.format("jdbc") \
    .option("url", f"jdbc:postgresql://warehouses:5432/{os.getenv('POSTGRES_DB')}") \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "(SELECT * FROM analytics.obt_trips LIMIT 1000000) AS t1") \ #Si hacia con mas datos moria el kernel
    .option("user", os.getenv('POSTGRES_USER')) \
    .option("password", os.getenv('POSTGRES_PASSWORD')) \
    .option("fetchsize", "100000").load()

In [21]:
def generar_tabla_con_validaciones(df_obt):

    print("Iniciando proceso de carga de datos de la OBT desde Schema Analytics para su validacion")

    try:
        df_obt_sin_nulos = df_obt.filter(F.col("DO_LOCATION_ID").isNotNull() & F.col("PASSENGER_COUNT").isNotNull() & F.col("PAYMENT_TYPE").isNotNull() & F.col("PU_LOCATION_ID").isNotNull() & F.col("RATE_CODE_ID").isNotNull() & F.col("DROPOFF_DATETIME").isNotNull() & F.col("PICKUP_DATETIME").isNotNull() & F.col("TRIP_DISTANCE").isNotNull() & F.col("VENDOR_ID").isNotNull())

        df_con_datos_coherentes = df_obt_sin_nulos.filter((F.col("PASSENGER_COUNT")>0) & (F.col("PASSENGER_COUNT")<10) & (F.col("EXTRA")>=0) & (F.col("FARE_AMOUNT")>=0) & (F.col("TIP_AMOUNT")>=0) & (F.col("TOLLS_AMOUNT")>=0) & (F.col("TOTAL_AMOUNT")>=0) & (F.col("TRIP_DISTANCE")>0) & (F.col("TRIP_DURATION_MIN")>1) & (F.col("TRIP_DURATION_MIN")<180) & (F.col("AVG_SPEED_MPH")>0) & (F.col("AVG_SPEED_MPH")<100) & (F.col("TIP_PCT")>=0) & (F.col("PU_LOCATION_ID").between(1, 265)) & (F.col("DO_LOCATION_ID").between(1, 265)))
        
        df_con_fechas_coherentes= df_con_datos_coherentes.filter((F.col("MONTH")>0) & (F.col("MONTH")<13) & (F.col("YEAR")>=2022) & (F.col("YEAR")<=2024))

        print("Tabla OBT con validaciones generada correctamente")
        
        return df_con_fechas_coherentes
        
    except Exception as e:
        print(f"No se pudo generar la tabla OBT de Taxis con validaciones: {e}")
        raise e

In [22]:
df_obt_validado= generar_tabla_con_validaciones(df_obt)

Iniciando proceso de carga de datos de la OBT desde Schema Analytics para su validacion
Tabla OBT con validaciones generada correctamente


In [23]:
from pyspark.sql import functions as F

feature_columns = [
    'pickup_datetime', 'pickup_hour', 'pickup_dow', 'month', 'year',
    'pu_location_id', 'pu_zone', 'pu_borough',
    'service_type', 'vendor_id', 'vendor_name', 'rate_code_id', 'rate_code_desc',
    'payment_type', 'payment_type_desc', 'trip_type',
    'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 
    'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 
    'airport_fee', 'store_and_fwd_flag'
]

target_column = 'total_amount'

In [24]:
df_obt_preparado = df_obt_validado.select(feature_columns + [target_column])

In [25]:
from pyspark.sql.types import IntegerType

final_numeric_features = [
    'trip_distance', 
    'passenger_count', 
    'pickup_hour', 
    'pickup_dow', 
    'month', 
    'year',
    'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'congestion_surcharge', 'airport_fee'
]

final_categorical_features = [
    'service_type',
    'vendor_name', 
    'rate_code_desc',
    'pu_borough',
    'payment_type_desc'
]

print("Features numéricas:", final_numeric_features)
print("Features categóricas:", final_categorical_features)

Features numéricas: ['trip_distance', 'passenger_count', 'pickup_hour', 'pickup_dow', 'month', 'year', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee']
Features categóricas: ['service_type', 'vendor_name', 'rate_code_desc', 'pu_borough', 'payment_type_desc']


In [26]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

df_obt_preparado_pd= df_obt_preparado.toPandas()

df_clean = df_obt_preparado_pd.dropna(subset=final_numeric_features + final_categorical_features)

print(f"Después de eliminar nulos:")
print(f"DF Shape: {df_clean.shape}")

percentage1 = 0.60
percentage2 = 0.20
percentage3 = 0.20

train_clean = df_clean.sample(frac=percentage1, random_state=42)
remaining_df = df_clean.drop(train_clean.index)

val_clean = remaining_df.sample(frac=(percentage2 / (1 - percentage1)), random_state=42)
test_clean = remaining_df.drop(val_clean.index)

print(f"Shape of train_clean (60%): {train_clean.shape}")
print(f"Shape of val_clean (20%): {val_clean.shape}")
print(f"Shape of test_clean (20%): {test_clean.shape}")

X_train = train_clean[final_numeric_features + final_categorical_features]
y_train = train_clean[target_column]

X_val = val_clean[final_numeric_features + final_categorical_features]
y_val = val_clean[target_column]

X_test = test_clean[final_numeric_features + final_categorical_features]
y_test = test_clean[target_column]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), final_numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), final_categorical_features)
])

X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

feature_names = (
    final_numeric_features + 
    list(preprocessor.named_transformers_['cat'].get_feature_names_out(final_categorical_features))
)

X_train_processed = pd.DataFrame(X_train_processed, columns=feature_names)
X_val_processed = pd.DataFrame(X_val_processed, columns=feature_names)
X_test_processed = pd.DataFrame(X_test_processed, columns=feature_names)

print(f"\nDespués del preprocesamiento:")
print(f"X_train: {X_train_processed.shape}")
print(f"X_val: {X_val_processed.shape}")
print(f"X_test: {X_test_processed.shape}")

print(X_train_processed.head())

Después de eliminar nulos:
DF Shape: (92476, 28)
Shape of train_clean (60%): (55486, 28)
Shape of val_clean (20%): (18495, 28)
Shape of test_clean (20%): (18495, 28)

Después del preprocesamiento:
X_train: (55486, 34)
X_val: (18495, 34)
X_test: (18495, 34)
   trip_distance  passenger_count  pickup_hour  pickup_dow   month  year  \
0       0.115957        -0.454585     0.965147   -1.536966 -0.0315   0.0   
1      -0.243912        -0.454585     0.447490    0.434042 -0.0315   0.0   
2      -0.643766        -0.454585    -0.932929   -1.044214 -0.0315   0.0   
3       3.288915        -0.454585     0.102385   -1.044214 -0.0315   0.0   
4      -0.547330         0.563378     1.137699    0.926794 -0.0315   0.0   

   fare_amount     extra   mta_tax  tip_amount  ...  pu_borough_Brooklyn  \
0     0.107199 -0.455420  0.037035   -0.916759  ...                  0.0   
1    -0.152263 -0.050186  0.037035    0.270296  ...                  0.0   
2    -0.671186  1.165517  0.037035   -0.916759  ...       

In [27]:
y_train = y_train.astype(np.float32)
y_val = y_val.astype(np.float32)
y_test = y_test.astype(np.float32)

print(f"Distribución del target:")
print(f"Train - Media: {y_train.mean():.2f}, Std: {y_train.std():.2f}")
print(f"Val - Media: {y_val.mean():.2f}, Std: {y_val.std():.2f}")
print(f"Test - Media: {y_test.mean():.2f}, Std: {y_test.std():.2f}")

Distribución del target:
Train - Media: 20.86, Std: 14.42
Val - Media: 20.87, Std: 14.32
Test - Media: 20.80, Std: 14.31


In [28]:
X_train_np = X_train_processed.values
X_val_np = X_val_processed.values
X_test_np = X_test_processed.values

y_train_np = y_train.values
y_val_np = y_val.values  
y_test_np = y_test.values

In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def evaluate_model(y_true, y_pred, model_name=""):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    print(f"{model_name}:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R cuadrado: {r2:.4f}")
    return rmse, mae, r2

y_train_mean = np.mean(y_train_np)
y_val_baseline = np.full_like(y_val_np, y_train_mean)
y_test_baseline = np.full_like(y_test_np, y_train_mean)

print("Baseline")
rmse_base_val, mae_base_val, r2_base_val = evaluate_model(y_val_np, y_val_baseline, "Val Baseline")
rmse_base_test, mae_base_test, r2_base_test = evaluate_model(y_test_np, y_test_baseline, "Test Baseline")

Baseline
Val Baseline:
RMSE: 14.32
MAE: 9.76
R cuadrado: -0.0000
Test Baseline:
RMSE: 14.31
MAE: 9.78
R cuadrado: -0.0000


In [33]:
import time
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import numpy as np
from itertools import product

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_train_poly = poly.fit_transform(X_train_np)
X_val_poly = poly.transform(X_val_np)
X_test_poly = poly.transform(X_test_np)

print(f"Con Polynomial Features: {X_train_poly.shape}")

scaler_poly = StandardScaler()
X_train_poly = scaler_poly.fit_transform(X_train_poly)
X_val_poly = scaler_poly.transform(X_val_poly)
X_test_poly = scaler_poly.transform(X_test_poly)

class SGDRegressorScratch:
    def __init__(self, learning_rate=1e-4, alpha=0.01, max_iter=100, tol=1e-4, batch_size=64, clip_value=1e3, random_state=42):
        self.lr = learning_rate
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol
        self.batch_size = batch_size
        self.clip_value = clip_value
        self.random_state = random_state

    def fit(self, X, y):
        np.random.seed(self.random_state)
        n_samples, n_features = X.shape
        self.weights = np.random.normal(0, 0.01, n_features)
        self.bias = 0.0
        prev_loss = np.inf

        for epoch in range(self.max_iter):
            indices = np.random.permutation(n_samples)
            X, y = X[indices], y[indices]
            total_loss = 0.0

            for start in range(0, n_samples, self.batch_size):
                end = start + self.batch_size
                X_batch, y_batch = X[start:end], y[start:end]

                y_pred = np.dot(X_batch, self.weights) + self.bias
                errors = y_pred - y_batch

                dw = np.dot(X_batch.T, errors) / len(X_batch) + self.alpha * self.weights
                db = np.mean(errors)

                dw = np.clip(dw, -self.clip_value, self.clip_value)
                db = np.clip(db, -self.clip_value, self.clip_value)

                self.weights -= self.lr * dw
                self.bias -= self.lr * db

                total_loss += np.mean(errors ** 2)

            avg_loss = total_loss / (n_samples // self.batch_size)
            if not np.isfinite(avg_loss):
                print(f"Overflow detectado en epoch {epoch}, deteniendo.")
                break

            if abs(prev_loss - avg_loss) < self.tol:
                print(f"Convergió en epoch {epoch+1} con pérdida {avg_loss:.6f}")
                break

            prev_loss = avg_loss

        return self

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias


class RidgeRegressionScratch:
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        n_features = X.shape[1]
        A = np.dot(X.T, X) + self.alpha * np.eye(n_features)
        b = np.dot(X.T, y)
        self.weights = np.linalg.solve(A, b)
        return self

    def predict(self, X):
        return np.dot(X, self.weights)

class LassoRegressionScratch:
    def __init__(self, alpha=1.0, max_iter=100, tol=1e-4):
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol

    def soft_threshold(self, rho, lamda):
        return np.sign(rho) * np.maximum(np.abs(rho) - lamda, 0.0)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)

        X_T = X.T
        X_sq_sum = np.sum(X ** 2, axis=0)

        for iteration in range(self.max_iter):
            weights_old = self.weights.copy()
            y_pred = X @ self.weights

            for j in range(n_features):
                residual = y - (y_pred - X[:, j] * self.weights[j])
                rho = np.dot(X[:, j], residual)
                if X_sq_sum[j] != 0:
                    self.weights[j] = self.soft_threshold(rho, self.alpha * n_samples) / X_sq_sum[j]
                    # Actualizar solo el cambio local en y_pred
                    y_pred += X[:, j] * (self.weights[j] - weights_old[j])

            if np.max(np.abs(self.weights - weights_old)) < self.tol:
                print(f"Lasso convergió en {iteration+1} iteraciones.")
                break
        return self

    def predict(self, X):
        return X @ self.weights


class ElasticNetScratch:
    def __init__(self, alpha=1.0, l1_ratio=0.5, max_iter=100, tol=1e-4, lr=1e-3):
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.max_iter = max_iter
        self.tol = tol
        self.lr = lr

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        prev_loss = np.inf

        for it in range(self.max_iter):
            y_pred = X @ self.weights + self.bias
            errors = y_pred - y

            grad_w = (X.T @ errors) / n_samples
            grad_b = np.mean(errors)

            l1_grad = self.l1_ratio * np.sign(self.weights)
            l2_grad = (1 - self.l1_ratio) * self.weights
            grad_total = grad_w + self.alpha * (l1_grad + l2_grad)

            self.weights -= self.lr * grad_total
            self.bias -= self.lr * grad_b

            loss = np.mean(errors ** 2)
            if abs(prev_loss - loss) < self.tol:
                print(f"ElasticNet convergió en {it+1} iteraciones.")
                break
            prev_loss = loss
        return self

    def predict(self, X):
        return X @ self.weights + self.bias

def grid_search(models, param_grids, X_train, y_train, X_val, y_val):
    results = []
    for model_name, model_class in models.items():
        grid = param_grids[model_name]
        for params in product(*grid.values()):
            params_dict = dict(zip(grid.keys(), params))
            model = model_class(**params_dict)

            start = time.time()
            model.fit(X_train, y_train)
            end = time.time()

            preds = model.predict(X_val)
            mse = np.mean((y_val - preds) ** 2)

            results.append({
                'model': model_name,
                **params_dict,
                'mse': mse,
                'time_sec': round(end - start, 4)
            })
    return results


print("Modelos Scratch")

X_train_poly = X_train_poly.astype(np.float32)
X_val_poly = X_val_poly.astype(np.float32)

X_train = X_train_poly.values if hasattr(X_train_poly, "values") else X_train_poly
X_val = X_val_poly.values if hasattr(X_val_poly, "values") else X_val_poly

models = {
    'SGD': SGDRegressorScratch,
    'Ridge': RidgeRegressionScratch,
    'Lasso': LassoRegressionScratch,
    'ElasticNet': ElasticNetScratch
}

param_grids = {
    'SGD': {'learning_rate': [1e-3, 1e-4], 'alpha': [0.0, 0.01], 'max_iter': [100]},
    'Ridge': {'alpha': [0.1, 1.0]},
    'Lasso': {'alpha': [0.001, 0.01]},
    'ElasticNet': {'alpha': [0.01, 0.1], 'l1_ratio': [0.3, 0.7]}
}

results = grid_search(models, param_grids, X_train, y_train_np, X_val, y_val_np)
df_results = pd.DataFrame(results).sort_values(by="mse")
print(df_results)

Con Polynomial Features: (55486, 595)
Modelos Scratch
         model  learning_rate  alpha  max_iter          mse  time_sec  \
3          SGD         0.0001  0.010     100.0     1.853734  335.0014   
2          SGD         0.0001  0.000     100.0     2.065233  396.7648   
1          SGD         0.0010  0.010     100.0    96.210672  367.5094   
11  ElasticNet            NaN  0.100       NaN   369.790232   46.9151   
9   ElasticNet            NaN  0.010       NaN   369.861856   43.8928   
10  ElasticNet            NaN  0.100       NaN   369.866887   50.4890   
8   ElasticNet            NaN  0.010       NaN   369.877099   43.7915   
0          SGD         0.0010  0.000     100.0   414.112553  420.1475   
7        Lasso            NaN  0.010       NaN   435.021673  362.4172   
6        Lasso            NaN  0.001       NaN   435.812490  372.4333   
5        Ridge            NaN  1.000       NaN   436.331922    1.2097   
4        Ridge            NaN  0.100       NaN  2008.447950    1.6891 

In [34]:
from sklearn.linear_model import SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

print("Modelos Scikit")

# Definir pipelines equivalentes
pipelines = {
    'SGD_Sklearn': Pipeline([
        ('regressor', SGDRegressor(random_state=42))
    ]),
    'Ridge_Sklearn': Pipeline([
        ('regressor', Ridge(random_state=42))
    ]),
    'Lasso_Sklearn': Pipeline([
        ('regressor', Lasso(random_state=42))
    ]),
    'ElasticNet_Sklearn': Pipeline([
        ('regressor', ElasticNet(random_state=42))
    ])
}

# Grid search parameters
param_grids = {
    'SGD_Sklearn': {
        'regressor__alpha': [0.001, 0.01, 0.1],
        'regressor__learning_rate': ['constant', 'adaptive'],
        'regressor__eta0': [0.001, 0.01],
        'regressor__max_iter': [1000, 2000]
    },
    'Ridge_Sklearn': {
        'regressor__alpha': [0.1, 1.0, 10.0, 100.0]
    },
    'Lasso_Sklearn': {
        'regressor__alpha': [0.001, 0.01, 0.1, 1.0]
    },
    'ElasticNet_Sklearn': {
        'regressor__alpha': [0.001, 0.01, 0.1],
        'regressor__l1_ratio': [0.2, 0.5, 0.8]
    }
}

results_sklearn = {}

for name, pipeline in pipelines.items():
    print(f"\nEntrenando {name}...")
    start_time = time.time()
    
    grid_search = GridSearchCV(
        pipeline, param_grids[name], 
        cv=3, scoring='neg_mean_squared_error', 
        n_jobs=-1, verbose=0
    )
    
    grid_search.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    best_model = grid_search.best_estimator_
    y_val_pred = best_model.predict(X_val)

    X_test_poly = X_test_poly.astype(float)
    X_test = X_test_poly.values if hasattr(X_test_poly, "values") else X_test_poly
    y_test_pred = best_model.predict(X_test)
    
    rmse_val, mae_val, r2_val = evaluate_model(y_val_np, y_val_pred, f"Val {name}")
    rmse_test, mae_test, r2_test = evaluate_model(y_test_np, y_test_pred, f"Test {name}")
    
    results_sklearn[name] = {
        'val_rmse': rmse_val, 'val_mae': mae_val, 'val_r2': r2_val,
        'test_rmse': rmse_test, 'test_mae': mae_test, 'test_r2': r2_test,
        'train_time': train_time,
        'best_params': grid_search.best_params_,
        'n_coef_nonzero': np.sum(np.abs(best_model.named_steps['regressor'].coef_) > 1e-6)
    }
    
    print(f"Mejores parámetros: {grid_search.best_params_}")

Modelos Scikit

Entrenando SGD_Sklearn...
Val SGD_Sklearn:
RMSE: 1.47
MAE: 0.39
R cuadrado: 0.9895
Test SGD_Sklearn:
RMSE: 0.83
MAE: 0.38
R cuadrado: 0.9966
Mejores parámetros: {'regressor__alpha': 0.1, 'regressor__eta0': 0.001, 'regressor__learning_rate': 'adaptive', 'regressor__max_iter': 2000}

Entrenando Ridge_Sklearn...
Val Ridge_Sklearn:
RMSE: 0.13
MAE: 0.00
R cuadrado: 0.9999
Test Ridge_Sklearn:
RMSE: 0.13
MAE: 0.00
R cuadrado: 0.9999
Mejores parámetros: {'regressor__alpha': 0.1}

Entrenando Lasso_Sklearn...
Val Lasso_Sklearn:
RMSE: 0.02
MAE: 0.01
R cuadrado: 1.0000
Test Lasso_Sklearn:
RMSE: 0.02
MAE: 0.01
R cuadrado: 1.0000
Mejores parámetros: {'regressor__alpha': 0.001}

Entrenando ElasticNet_Sklearn...


  model = cd_fast.enet_coordinate_descent(


Val ElasticNet_Sklearn:
RMSE: 0.02
MAE: 0.01
R cuadrado: 1.0000
Test ElasticNet_Sklearn:
RMSE: 0.02
MAE: 0.01
R cuadrado: 1.0000
Mejores parámetros: {'regressor__alpha': 0.001, 'regressor__l1_ratio': 0.8}


In [38]:
import pandas as pd

print("Comparación de Modelos")

comparison_data = []

comparison_data.append({
    'Model': 'Baseline_Mean',
    'Type': 'Baseline',
    'Val_RMSE': rmse_base_val,
    'Test_RMSE': rmse_base_test,
    'Val_MAE': mae_base_val,
    'Test_MAE': mae_base_test,
    'Val_R2': r2_base_val,
    'Test_R2': r2_base_test,
    'Train_Time': 0,
    'Nonzero_Coeffs': 0
})

for name, results in df_results.items():
    comparison_data.append({
        'Model': name,
        'Type': 'From_Scratch',
        'Val_RMSE': results.get('val_rmse', None),
        'Test_RMSE': results.get('test_rmse', None),
        'Val_MAE': results.get('val_mae', None),
        'Test_MAE': results.get('test_mae', None),
        'Val_R2': results.get('val_r2', None),
        'Test_R2': results.get('test_r2', None),
        'Train_Time': results.get('train_time', None),
        'Nonzero_Coeffs': results.get('n_coef_nonzero', results.get('n_features', 0))
    })

for name, results in results_sklearn.items():
    comparison_data.append({
        'Model': name,
        'Type': 'Sklearn',
        'Val_RMSE': results.get('val_rmse', None),
        'Test_RMSE': results.get('test_rmse', None),
        'Val_MAE': results.get('val_mae', None),
        'Test_MAE': results.get('test_mae', None),
        'Val_R2': results.get('val_r2', None),
        'Test_R2': results.get('test_r2', None),
        'Train_Time': results.get('train_time', None),
        'Nonzero_Coeffs': results.get('n_coef_nonzero', 0)
    })

comparison_df = pd.DataFrame(comparison_data).round(4)

print("\nResultados ordenados por RMSE de Validación:")
display(comparison_df.sort_values(by='Val_RMSE', ascending=True))

if not comparison_df.empty:
    best_model_row = comparison_df.loc[comparison_df['Val_RMSE'].idxmin()]
    print(f"\nMejor Modelo: {best_model_row['Model']}")
    print(f"RMSE Validación: {best_model_row['Val_RMSE']:.2f}")
    print(f"RMSE Test: {best_model_row['Test_RMSE']:.2f}")
    print(f"R cuadrado Test: {best_model_row['Test_R2']:.4f}")
else:
    print("No hay resultados disponibles para comparar.")

Comparación de Modelos

Resultados ordenados por RMSE de Validación:


Unnamed: 0,Model,Type,Val_RMSE,Test_RMSE,Val_MAE,Test_MAE,Val_R2,Test_R2,Train_Time,Nonzero_Coeffs
10,Lasso_Sklearn,Sklearn,0.0233,0.0155,0.007,0.0068,1.0,1.0,571.4129,64
11,ElasticNet_Sklearn,Sklearn,0.0234,0.0152,0.0063,0.006,1.0,1.0,648.0391,92
9,Ridge_Sklearn,Sklearn,0.1299,0.1255,0.0022,0.0019,0.9999,0.9999,18.1259,556
8,SGD_Sklearn,Sklearn,1.4699,0.8344,0.3864,0.3835,0.9895,0.9966,653.0991,470
0,Baseline_Mean,Baseline,14.3174,14.3088,9.7574,9.7779,-0.0,-0.0,0.0,0
1,model,From_Scratch,,,,,,,,0
2,learning_rate,From_Scratch,,,,,,,,0
3,alpha,From_Scratch,,,,,,,,0
4,max_iter,From_Scratch,,,,,,,,0
5,mse,From_Scratch,,,,,,,,0



Mejor Modelo: Lasso_Sklearn
RMSE Validación: 0.02
RMSE Test: 0.02
R cuadrado Test: 1.0000
