In [1]:
!pip install python-dotenv
!pip install snowflake-connector-python

Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Downloading python_dotenv-1.2.1-py3-none-any.whl (21 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.2.1
Collecting snowflake-connector-python
  Downloading snowflake_connector_python-4.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting asn1crypto<2.0.0,>0.24.0 (from snowflake-connector-python)
  Downloading asn1crypto-1.5.1-py2.py3-none-any.whl.metadata (13 kB)
Collecting filelock<4,>=3.5 (from snowflake-connector-python)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting tomlkit (from snowflake-connector-python)
  Downloading tomlkit-0.13.3-py3-none-any.whl.metadata (2.8 kB)
Collecting boto3>=1.24 (from snowflake-connector-python)
  Downloading bot

In [2]:
from dotenv import load_dotenv
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import Row 
import requests
import json
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import time

In [3]:
#Carga de datos obt + filtrados
import os
import requests
from dotenv import load_dotenv
load_dotenv()

print(f"PORT_POSTGRES: {os.getenv('PORT_POSTGRES')}")
print(f"POSTGRES_DB: {os.getenv('POSTGRES_DB')}")
print(f"POSTGRES_USER: {os.getenv('POSTGRES_USER')}")
print(f"POSTGRES_PASSWORD set: {bool(os.getenv('POSTGRES_PASSWORD'))}")


PORT_POSTGRES: 5432
POSTGRES_DB: ny_taxi
POSTGRES_USER: usuario_spark
POSTGRES_PASSWORD set: True


In [4]:
jar_path = "/home/jovyan/work/postgresql-42.2.5.jar"

spark = SparkSession.builder \
    .appName("ML_Desde_Postgres") \
    .master("local[*]") \
    .config("spark.jars", jar_path) \
    .config("spark.driver.extraClassPath", jar_path) \
    .config("spark.executor.extraClassPath", jar_path) \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

In [6]:
df_training = spark.read.format("jdbc") \
    .option("url", f"jdbc:postgresql://warehouses:5432/{os.getenv('POSTGRES_DB')}") \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "(SELECT * FROM analytics.obt_trips WHERE year = 2022 LIMIT 1000000) AS t1") \
    .option("user", os.getenv('POSTGRES_USER')) \
    .option("password", os.getenv('POSTGRES_PASSWORD')) \
    .option("fetchsize", "100000").load()

df_validation = spark.read.format("jdbc") \
    .option("url", f"jdbc:postgresql://warehouses:5432/{os.getenv('POSTGRES_DB')}") \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "(SELECT * FROM analytics.obt_trips WHERE year = 2023 LIMIT 200000) AS t2") \
    .option("user", os.getenv('POSTGRES_USER')) \
    .option("password", os.getenv('POSTGRES_PASSWORD')) \
    .option("fetchsize", "100000").load()

df_testing = spark.read.format("jdbc") \
    .option("url", f"jdbc:postgresql://warehouses:5432/{os.getenv('POSTGRES_DB')}") \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "(SELECT * FROM analytics.obt_trips WHERE year = 2024 LIMIT 200000) AS t3") \
    .option("user", os.getenv('POSTGRES_USER')) \
    .option("password", os.getenv('POSTGRES_PASSWORD')) \
    .option("fetchsize", "100000").load()

In [7]:
def generar_tabla_con_validaciones(df_obt):

    print("Iniciando proceso de carga de datos de la OBT desde Schema Analytics para su validacion")

    try:

        df_obt_sin_nulos = df_obt.filter(F.col("DO_LOCATION_ID").isNotNull() & F.col("PASSENGER_COUNT").isNotNull() & F.col("PAYMENT_TYPE").isNotNull() & F.col("PU_LOCATION_ID").isNotNull() & F.col("RATE_CODE_ID").isNotNull() & F.col("DROPOFF_DATETIME").isNotNull() & F.col("PICKUP_DATETIME").isNotNull() & F.col("TRIP_DISTANCE").isNotNull() & F.col("VENDOR_ID").isNotNull())

        df_con_datos_coherentes = df_obt_sin_nulos.filter((F.col("PASSENGER_COUNT")>0) & (F.col("PASSENGER_COUNT")<10) & (F.col("EXTRA")>=0) & (F.col("FARE_AMOUNT")>=0) & (F.col("TIP_AMOUNT")>=0) & (F.col("TOLLS_AMOUNT")>=0) & (F.col("TOTAL_AMOUNT")>=0) & (F.col("TRIP_DISTANCE")>0) & (F.col("TRIP_DURATION_MIN")>1) & (F.col("TRIP_DURATION_MIN")<180) & (F.col("AVG_SPEED_MPH")>0) & (F.col("AVG_SPEED_MPH")<100) & (F.col("TIP_PCT")>=0) & (F.col("PU_LOCATION_ID").between(1, 265)) & (F.col("DO_LOCATION_ID").between(1, 265)))
        
        df_con_fechas_coherentes= df_con_datos_coherentes.filter((F.col("MONTH")>0) & (F.col("MONTH")<13) & (F.col("YEAR")>=2022) & (F.col("YEAR")<=2024))

        print("Tabla OBT con validaciones generada correctamente")
        
        return df_con_fechas_coherentes
        
    except Exception as e:
        print(f"No se pudo generar la tabla OBT de Taxis con validaciones: {e}")
        raise e

In [8]:
df_obt_training= generar_tabla_con_validaciones(df_training)
df_obt_validation= generar_tabla_con_validaciones(df_validation)
df_obt_testing= generar_tabla_con_validaciones(df_testing)

Iniciando proceso de carga de datos de la OBT desde Schema Analytics para su validacion
Tabla OBT con validaciones generada correctamente
Iniciando proceso de carga de datos de la OBT desde Schema Analytics para su validacion
Tabla OBT con validaciones generada correctamente
Iniciando proceso de carga de datos de la OBT desde Schema Analytics para su validacion
Tabla OBT con validaciones generada correctamente


In [9]:
from pyspark.sql import functions as F

feature_columns = [
    'pickup_datetime', 'pickup_hour', 'pickup_dow', 'month', 'year',
    'pu_location_id', 'pu_zone', 'pu_borough',
    'service_type', 'vendor_id', 'vendor_name', 'rate_code_id', 'rate_code_desc',
    'payment_type', 'payment_type_desc', 'trip_type',
    'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 
    'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 
    'airport_fee', 'store_and_fwd_flag'
]

target_column = 'total_amount'

In [10]:
df_training_prepared = df_obt_training.select(feature_columns + [target_column])
df_validation_prepared = df_obt_validation.select(feature_columns + [target_column])
df_testing_prepared = df_obt_testing.select(feature_columns + [target_column])

In [11]:
from pyspark.sql.types import IntegerType

final_numeric_features = [
    'trip_distance', 
    'passenger_count', 
    'pickup_hour', 
    'pickup_dow', 
    'month', 
    'year',
    'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'congestion_surcharge', 'airport_fee'
]

final_categorical_features = [
    'service_type',
    'vendor_name', 
    'rate_code_desc',
    'pu_borough',
    'payment_type_desc'
]

print("Features numéricas:", final_numeric_features)
print("Features categóricas:", final_categorical_features)

Features numéricas: ['trip_distance', 'passenger_count', 'pickup_hour', 'pickup_dow', 'month', 'year', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee']
Features categóricas: ['service_type', 'vendor_name', 'rate_code_desc', 'pu_borough', 'payment_type_desc']


In [12]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd

train_pd= df_training_prepared.toPandas()
val_pd= df_validation_prepared.toPandas()
test_pd= df_testing_prepared.toPandas()

train_clean = train_pd.dropna(subset=final_numeric_features + final_categorical_features)
val_clean = val_pd.dropna(subset=final_numeric_features + final_categorical_features)
test_clean = test_pd.dropna(subset=final_numeric_features + final_categorical_features)

print(f"Después de eliminar nulos:")
print(f"Train: {train_clean.shape}, Val: {val_clean.shape}, Test: {test_clean.shape}")

X_train = train_clean[final_numeric_features + final_categorical_features]
y_train = train_clean[target_column]

X_val = val_clean[final_numeric_features + final_categorical_features]
y_val = val_clean[target_column]

X_test = test_clean[final_numeric_features + final_categorical_features]
y_test = test_clean[target_column]

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), final_numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), final_categorical_features)
])

X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

feature_names = (
    final_numeric_features + 
    list(preprocessor.named_transformers_['cat'].get_feature_names_out(final_categorical_features))
)

X_train_processed = pd.DataFrame(X_train_processed, columns=feature_names)
X_val_processed = pd.DataFrame(X_val_processed, columns=feature_names)
X_test_processed = pd.DataFrame(X_test_processed, columns=feature_names)

print(f"\nDespués del preprocesamiento:")
print(f"X_train: {X_train_processed.shape}")
print(f"X_val: {X_val_processed.shape}")
print(f"X_test: {X_test_processed.shape}")

Después de eliminar nulos:
Train: (27, 28), Val: (187469, 28), Test: (180168, 28)

Después del preprocesamiento:
X_train: (27, 23)
X_val: (187469, 23)
X_test: (180168, 23)


In [13]:
print(f"Distribución del target:")
print(f"Train - Media: {y_train.mean():.2f}, Std: {y_train.std():.2f}")
print(f"Val - Media: {y_val.mean():.2f}, Std: {y_val.std():.2f}")
print(f"Test - Media: {y_test.mean():.2f}, Std: {y_test.std():.2f}")

Distribución del target:


TypeError: unsupported operand type(s) for -: 'float' and 'decimal.Decimal'

In [None]:
X_train_np = X_train_processed.values
X_val_np = X_val_processed.values
X_test_np = X_test_processed.values

y_train_np = y_train.values
y_val_np = y_val.values  
y_test_np = y_test.values

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def evaluate_model(y_true, y_pred, model_name=""):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    print(f"{model_name}:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2: {r2:.4f}")
    return rmse, mae, r2

y_train_mean = np.mean(y_train_np)
y_val_baseline = np.full_like(y_val_np, y_train_mean)
y_test_baseline = np.full_like(y_test_np, y_train_mean)

print("Baseline")
rmse_base_val, mae_base_val, r2_base_val = evaluate_model(y_val_np, y_val_baseline, "Val Baseline")
rmse_base_test, mae_base_test, r2_base_test = evaluate_model(y_test_np, y_test_baseline, "Test Baseline")

In [None]:
import time
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_train_poly = poly.fit_transform(X_train_np)
X_val_poly = poly.transform(X_val_np)
X_test_poly = poly.transform(X_test_np)

print(f"Con Polynomial Features: {X_train_poly.shape}")

class SGDRegressorScratch:
    def __init__(self, learning_rate=0.01, max_iter=1000, alpha=0.0, tol=1e-4, random_state=42):
        self.lr = learning_rate
        self.max_iter = max_iter
        self.alpha = alpha
        self.tol = tol
        self.random_state = random_state
        
    def fit(self, X, y):
        np.random.seed(self.random_state)
        n_samples, n_features = X.shape
        self.weights = np.random.normal(0, 0.01, n_features)
        self.bias = 0
        
        for i in range(self.max_iter):
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]
            
            total_loss = 0
            for j in range(n_samples):
                y_pred = np.dot(X_shuffled[j], self.weights) + self.bias
                
                error = y_pred - y_shuffled[j]
                dw = error * X_shuffled[j] + self.alpha * self.weights
                db = error
                
                self.weights -= self.lr * dw
                self.bias -= self.lr * db
                
                total_loss += error ** 2
            
            avg_loss = total_loss / n_samples
            if i > 0 and abs(avg_loss - prev_loss) < self.tol:
                print(f"Converge en la iteracion {i}")
                break
            prev_loss = avg_loss
            
        return self
    
    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

class RidgeRegressionScratch:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        A = np.dot(X.T, X) + self.alpha * np.eye(n_features)
        b = np.dot(X.T, y)
        self.weights = np.linalg.solve(A, b)
        return self
    
    def predict(self, X):
        return np.dot(X, self.weights)

class LassoRegressionScratch:
    def __init__(self, alpha=1.0, max_iter=1000, tol=1e-4):
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol
        
    def soft_threshold(self, rho, lamda):
        if rho < -lamda:
            return rho + lamda
        elif rho > lamda:
            return rho - lamda
        else:
            return 0
            
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        
        for iteration in range(self.max_iter):
            weights_old = self.weights.copy()
            
            for j in range(n_features):
                y_pred = np.dot(X, self.weights)
                rho_j = np.dot(X[:, j], y - y_pred + self.weights[j] * X[:, j])
                
                self.weights[j] = self.soft_threshold(rho_j, self.alpha * n_samples) / np.dot(X[:, j], X[:, j])
            
            if np.max(np.abs(self.weights - weights_old)) < self.tol:
                print(f"Converge en la iteracion {iteration}")
                break
                
        return self
    
    def predict(self, X):
        return np.dot(X, self.weights)


print("Modelos Scratch")

models_scratch = {
    'SGD_Scratch': SGDRegressorScratch(learning_rate=0.001, max_iter=1000, alpha=0.01),
    'Ridge_Scratch': RidgeRegressionScratch(alpha=1.0),
    'Lasso_Scratch': LassoRegressionScratch(alpha=0.1, max_iter=1000)
}

results_scratch = {}

for name, model in models_scratch.items():
    print(f"\nEntrenando {name}...")
    start_time = time.time()
    
    model.fit(X_train_poly, y_train_np)
    train_time = time.time() - start_time
    
    y_val_pred = model.predict(X_val_poly)
    y_test_pred = model.predict(X_test_poly)
    
    rmse_val, mae_val, r2_val = evaluate_model(y_val_np, y_val_pred, f"Val {name}")
    rmse_test, mae_test, r2_test = evaluate_model(y_test_np, y_test_pred, f"Test {name}")
    
    results_scratch[name] = {
        'val_rmse': rmse_val, 'val_mae': mae_val, 'val_r2': r2_val,
        'test_rmse': rmse_test, 'test_mae': mae_test, 'test_r2': r2_test,
        'train_time': train_time,
        'n_features': X_train_poly.shape[1],
        'n_coef_nonzero': np.sum(np.abs(model.weights) > 1e-6) if hasattr(model, 'weights') else None
    }

In [None]:
from sklearn.linear_model import SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

print("Modelos Scikit")

# Definir pipelines equivalentes
pipelines = {
    'SGD_Sklearn': Pipeline([
        ('regressor', SGDRegressor(random_state=42))
    ]),
    'Ridge_Sklearn': Pipeline([
        ('regressor', Ridge(random_state=42))
    ]),
    'Lasso_Sklearn': Pipeline([
        ('regressor', Lasso(random_state=42))
    ]),
    'ElasticNet_Sklearn': Pipeline([
        ('regressor', ElasticNet(random_state=42))
    ])
}

# Grid search parameters
param_grids = {
    'SGD_Sklearn': {
        'regressor__alpha': [0.001, 0.01, 0.1],
        'regressor__learning_rate': ['constant', 'adaptive'],
        'regressor__eta0': [0.001, 0.01],
        'regressor__max_iter': [1000, 2000]
    },
    'Ridge_Sklearn': {
        'regressor__alpha': [0.1, 1.0, 10.0, 100.0]
    },
    'Lasso_Sklearn': {
        'regressor__alpha': [0.001, 0.01, 0.1, 1.0]
    },
    'ElasticNet_Sklearn': {
        'regressor__alpha': [0.001, 0.01, 0.1],
        'regressor__l1_ratio': [0.2, 0.5, 0.8]
    }
}

results_sklearn = {}

for name, pipeline in pipelines.items():
    print(f"\nEntrenando {name}...")
    start_time = time.time()
    
    grid_search = GridSearchCV(
        pipeline, param_grids[name], 
        cv=3, scoring='neg_mean_squared_error', 
        n_jobs=-1, verbose=0
    )
    
    grid_search.fit(X_train_poly, y_train_np)
    train_time = time.time() - start_time
    
    best_model = grid_search.best_estimator_
    y_val_pred = best_model.predict(X_val_poly)
    y_test_pred = best_model.predict(X_test_poly)
    
    rmse_val, mae_val, r2_val = evaluate_model(y_val_np, y_val_pred, f"Val {name}")
    rmse_test, mae_test, r2_test = evaluate_model(y_test_np, y_test_pred, f"Test {name}")
    
    results_sklearn[name] = {
        'val_rmse': rmse_val, 'val_mae': mae_val, 'val_r2': r2_val,
        'test_rmse': rmse_test, 'test_mae': mae_test, 'test_r2': r2_test,
        'train_time': train_time,
        'best_params': grid_search.best_params_,
        'n_coef_nonzero': np.sum(np.abs(best_model.named_steps['regressor'].coef_) > 1e-6)
    }
    
    print(f"Mejores parámetros: {grid_search.best_params_}")

In [None]:
import pandas as pd

print("Comparación Modelos")

comparison_data = []

comparison_data.append({
    'Model': 'Baseline_Mean',
    'Type': 'Baseline',
    'Val_RMSE': rmse_base_val,
    'Test_RMSE': rmse_base_test,
    'Val_MAE': mae_base_val, 
    'Test_MAE': mae_base_test,
    'Val_R2': r2_base_val,
    'Test_R2': r2_base_test,
    'Train_Time': 0,
    'Nonzero_Coeffs': 0
})

for name, results in results_scratch.items():
    comparison_data.append({
        'Model': name,
        'Type': 'From_Scratch',
        'Val_RMSE': results['val_rmse'],
        'Test_RMSE': results['test_rmse'],
        'Val_MAE': results['val_mae'],
        'Test_MAE': results['test_mae'],
        'Val_R2': results['val_r2'],
        'Test_R2': results['test_r2'],
        'Train_Time': results['train_time'],
        'Nonzero_Coeffs': results['n_coef_nonzero'] or results['n_features']
    })

for name, results in results_sklearn.items():
    comparison_data.append({
        'Model': name,
        'Type': 'Sklearn',
        'Val_RMSE': results['val_rmse'],
        'Test_RMSE': results['test_rmse'],
        'Val_MAE': results['val_mae'],
        'Test_MAE': results['test_mae'],
        'Val_R2': results['val_r2'],
        'Test_R2': results['test_r2'],
        'Train_Time': results['train_time'],
        'Nonzero_Coeffs': results['n_coef_nonzero']
    })

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.round(4)

print("Resultados ordenados por RMSE de Validación:")
display(comparison_df.sort_values('Val_RMSE'))

# Mejor modelo
best_model_row = comparison_df.loc[comparison_df['Val_RMSE'].idxmin()]
print(f"Mejor Modelo: {best_model_row['Model']}")
print(f"RMSE Validación: {best_model_row['Val_RMSE']:.2f}")
print(f"RMSE Test: {best_model_row['Test_RMSE']:.2f}")
print(f"R cuadrado Test: {best_model_row['Test_R2']:.4f}")

In [None]:
print("Conclusiones")

print(f"• Mejor modelo: {best_model_row['Model']}")
print(f"• Mejor RMSE test: {best_model_row['Test_RMSE']:.2f}")
print(f"• Mejor R cuadrado test: {best_model_row['Test_R2']:.4f}")
print(f"• Improvement vs baseline: {((rmse_base_test - best_model_row['Test_RMSE']) / rmse_base_test * 100):.1f}%")

print(f"Observaciones:")
scratch_models = comparison_df[comparison_df['Type'] == 'From_Scratch']
sklearn_models = comparison_df[comparison_df['Type'] == 'Sklearn']

if not scratch_models.empty and not sklearn_models.empty:
    avg_scratch_rmse = scratch_models['Test_RMSE'].mean()
    avg_sklearn_rmse = sklearn_models['Test_RMSE'].mean()
    print(f"RMSE promedio From-Scratch: {avg_scratch_rmse:.2f}")
    print(f"RMSE promedio Scikit-learn: {avg_sklearn_rmse:.2f}")
    print(f"Diferencia: {avg_scratch_rmse - avg_sklearn_rmse:.2f}")

print(f"Recomendaciones")
print("1. Reentrenar mensualmente con datos nuevos")
print("2. Monitorear drift de datos y performance")
print("3. Considerar agregar features externas (clima, eventos)")
print("4. Implementar sistema de logging de predicciones")
print("5. Establecer thresholds de alerta para degradación del modelo")