In [None]:
# Mount Google Drive (Colab only) so the paths under /content/drive used by the
# later cells (dataset CSV, model checkpoints, training history) resolve.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Location of the source CSV on the mounted Drive.
ruta_csv = "/content/drive/MyDrive/3 - Master Computacion y Sistemas Inteligentes/PFM/Datasets/dataset_final_convariablesdedias.csv"
df = pd.read_csv(ruta_csv, low_memory=False)

# Parse the date columns; values that cannot be parsed become NaT (errors="coerce").
for columna_fecha in ("con_fecha", "con_dateticket", "fecha_embarque"):
    df[columna_fecha] = pd.to_datetime(df[columna_fecha], errors="coerce")

In [None]:
# Columns kept for modelling: the boarding date, the grouping flag used to split
# the data, and the calendar/holiday features. Order is preserved as-is.
columnas_a_mantener = [
    # keys
    'fecha_embarque',
    'tipo_agrupacion',
    # cyclic encodings
    'dia_del_anio_sin',
    'dia_del_anio_cos',
    'dia_embarque_sin',
    'dia_embarque_cos',
    'hora_embarque_sin',
    'hora_embarque_cos',
    'is_weekend',
    'mes_embarque_sin',
    'mes_embarque_cos',
    'week_of_year_sin',
    'week_of_year_cos',
    # season one-hot
    'season_1',
    'season_2',
    'season_3',
    'season_4',
    # holiday flags
    'is_festivo_nacional',
    'is_festivo_local',
    'is_eid_aladha',
    'is_eid_aladha_prev',
    'is_eid_aladha_post',
    'is_eid_alfitr',
    'is_eid_alfitr_prev',
    'is_eid_alfitr_post',
    'is_mawlid_nabi',
    # weekday one-hot
    'is_monday',
    'is_tuesday',
    'is_wednesday',
    'is_thursday',
    'is_friday',
    'is_saturday',
    'is_sunday',
    'weekday_sin',
    'weekday_cos',
]
df = df.loc[:, columnas_a_mantener]

In [None]:
# Rows with tipo_agrupacion == 1 form the passenger subset; the cell displays its size.
es_pasajero = df["tipo_agrupacion"].eq(1)
df_pasajeros = df.loc[es_pasajero].copy()
len(df_pasajeros)

In [None]:
# Rows with tipo_agrupacion == 0 form the vehicle subset; the cell displays its size.
es_vehiculo = df["tipo_agrupacion"].eq(0)
df_vehiculos = df.loc[es_vehiculo].copy()
len(df_vehiculos)

In [None]:
print("\n=== Verificacion inicial de dias faltantes ===")
# Compare calendar days on both sides: the expected range and the observed dates
# are both normalised to midnight, and NaT values (produced by errors="coerce"
# when parsing) are dropped so they are not counted as a "present" day.
fechas_dia = df['fecha_embarque'].dropna().dt.normalize()
rango_esperado = pd.date_range(
    start=fechas_dia.min(),
    end=fechas_dia.max(),
    freq='D'
)
fechas_presentes = pd.DatetimeIndex(fechas_dia.unique())
fechas_faltantes = rango_esperado.difference(fechas_presentes)
print(f"Rango temporal total: {rango_esperado.min().date()} a {rango_esperado.max().date()}")
print(f"Dias esperados: {len(rango_esperado)} | Dias presentes: {len(fechas_presentes)}")
print(f"Dias faltantes: {len(fechas_faltantes)}")
if len(fechas_faltantes) > 0:
    print("Ejemplo de fechas faltantes:", [f.date() for f in fechas_faltantes[:5]])

In [None]:
def procesar_dataset_completo(df_original, df_filtrado, nombre_target):
    """Build a gap-free daily series with calendar features and a scaled target.

    The daily target is the number of rows of ``df_filtrado`` per calendar day.
    Calendar features are taken from the first row of ``df_original`` for each
    day; days with no rows are inserted so the result has exactly one row per
    day between the first and last date of ``df_original``.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Full dataset. Must contain ``fecha_embarque``, ``tipo_agrupacion``,
        ``weekday_sin``/``weekday_cos`` and the flag columns listed in
        ``cols_categoricas`` below.
    df_filtrado : pandas.DataFrame
        Subset of ``df_original`` whose rows are counted per day.
    nombre_target : str
        Name of the daily count column; the scaled copy is ``<nombre_target>_norm``.

    Returns
    -------
    (pandas.DataFrame, MinMaxScaler)
        The daily frame sorted by date and the scaler fitted on the target.

    Raises
    ------
    ValueError
        If ``df_original`` has no valid boarding date.
    AssertionError
        If consecutive rows of the result are not exactly one day apart.
    """
    # Work on calendar days. Without normalising, any time-of-day component in
    # fecha_embarque would make the groupby keys miss the daily reindex below and
    # silently turn real counts into zeros.
    fechas_original = pd.to_datetime(df_original['fecha_embarque']).dt.normalize()
    fechas_filtrado = pd.to_datetime(df_filtrado['fecha_embarque']).dt.normalize()
    if not fechas_original.notna().any():
        raise ValueError("df_original no contiene fechas de embarque validas")

    rango_fechas = pd.date_range(
        start=fechas_original.min(),
        end=fechas_original.max(),
        freq='D'
    )

    # Daily row count of the filtered subset; days without rows count as 0.
    df_daily = (
        df_filtrado.groupby(fechas_filtrado)
        .size()
        .reindex(rango_fechas, fill_value=0)
        .rename_axis('fecha_embarque')
        .reset_index(name=nombre_target)
    )

    # One feature row per day (first occurrence wins), extended to the full range.
    df_temp = (
        df_original.drop(columns=['tipo_agrupacion'])
        .assign(fecha_embarque=fechas_original)
        .dropna(subset=['fecha_embarque'])
        .drop_duplicates('fecha_embarque')
        .set_index('fecha_embarque')
        .reindex(rango_fechas)
        .rename_axis('fecha_embarque')
        .reset_index()
    )

    # Days that did not exist in the source and were inserted by the reindex.
    filas_insertadas = ~df_temp['fecha_embarque'].isin(fechas_original.dropna().unique())

    cols_categoricas = ['is_weekend', 'is_festivo_nacional', 'is_festivo_local',
                        'is_eid_aladha', 'is_eid_aladha_prev', 'is_eid_aladha_post',
                        'is_eid_alfitr', 'is_eid_alfitr_prev', 'is_eid_alfitr_post',
                        'is_mawlid_nabi', 'season_1', 'season_2', 'season_3', 'season_4',
                        'is_monday', 'is_tuesday', 'is_wednesday', 'is_thursday',
                        'is_friday', 'is_saturday', 'is_sunday']

    # Inserted days inherit the flags of the previous available day (0 if none).
    # .ffill() replaces the deprecated fillna(method='ffill').
    df_temp[cols_categoricas] = df_temp[cols_categoricas].ffill().fillna(0)

    # A forward-filled weekday flag is wrong by construction (the day after a
    # Monday would still be flagged as Monday), so recompute the weekday one-hot
    # from the date itself on inserted rows only.
    # NOTE(review): is_weekend, season_* and holiday flags on inserted days are
    # still forward-filled approximations - confirm this is acceptable.
    dias_semana = ['is_monday', 'is_tuesday', 'is_wednesday', 'is_thursday',
                   'is_friday', 'is_saturday', 'is_sunday']
    if filas_insertadas.any():
        weekday_insertado = df_temp.loc[filas_insertadas, 'fecha_embarque'].dt.weekday
        for numero_dia, columna in enumerate(dias_semana):
            df_temp.loc[filas_insertadas, columna] = (weekday_insertado == numero_dia).astype(int)

    # Recompute the cyclic encodings for every row from the date.
    fechas = df_temp['fecha_embarque']
    ciclos = {
        'dia_del_anio': (fechas.dt.dayofyear, 365),
        'dia_embarque': (fechas.dt.day, 31),
        'mes_embarque': (fechas.dt.month, 12),
        'week_of_year': (fechas.dt.isocalendar().week, 52),
    }
    for prefijo, (valores, periodo) in ciclos.items():
        angulo = 2 * np.pi * valores / periodo
        df_temp[f'{prefijo}_sin'] = np.sin(angulo)
        df_temp[f'{prefijo}_cos'] = np.cos(angulo)

    # The weekday encoding is only rebuilt when the reindex left gaps in it.
    if df_temp['weekday_sin'].isna().any() or df_temp['weekday_cos'].isna().any():
        angulo_semana = 2 * np.pi * fechas.dt.weekday / 7
        df_temp['weekday_sin'] = np.sin(angulo_semana)
        df_temp['weekday_cos'] = np.cos(angulo_semana)

    df_final = pd.merge(
        df_temp,
        df_daily,
        on='fecha_embarque',
        how='left'
    ).fillna({nombre_target: 0})

    df_final = df_final.sort_values('fecha_embarque')

    # Every consecutive pair of rows must be exactly one day apart.
    saltos = df_final['fecha_embarque'].diff().iloc[1:]
    assert (saltos == pd.Timedelta(days=1)).all(), "Error: Aun hay huecos temporales"

    # NOTE(review): the scaler is fitted on the whole series, i.e. before any
    # train/validation/test split, so later partitions influence the scaling.
    scaler = MinMaxScaler()
    df_final[f'{nombre_target}_norm'] = scaler.fit_transform(df_final[[nombre_target]]).ravel()

    # Any feature still missing (e.g. columns not rebuilt above on inserted days) becomes 0.
    df_final = df_final.fillna(0)

    return df_final, scaler

In [None]:
# Split by tipo_agrupacion (1 -> df_pasajeros, 0 -> df_vehiculos) and build one
# daily frame plus target scaler for each subset.
tipo = df["tipo_agrupacion"]
df_pasajeros = df.loc[tipo.eq(1)].copy()
df_vehiculos = df.loc[tipo.eq(0)].copy()

df_pasajeros_daily, scaler_pasajeros = procesar_dataset_completo(
    df_original=df, df_filtrado=df_pasajeros, nombre_target='total_pasajeros'
)
df_vehiculos_daily, scaler_vehiculos = procesar_dataset_completo(
    df_original=df, df_filtrado=df_vehiculos, nombre_target='total_vehiculos'
)

In [None]:
def crear_secuencias(df, target_col, lookback=7):
    """Turn a daily frame into sliding-window samples.

    Sample ``k`` uses the feature rows ``k .. k+lookback-1`` as input and the
    value of ``<target_col>_norm`` at row ``k+lookback`` as label. The feature
    set is every column except ``fecha_embarque``, ``target_col`` and
    ``<target_col>_norm`` (in the sorted order returned by ``Index.difference``).

    Parameters
    ----------
    df : pandas.DataFrame
        Daily frame, already sorted by date.
    target_col : str
        Name of the raw target column.
    lookback : int
        Number of consecutive rows per input window (must be >= 0).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n_samples, lookback, n_features)`` and ``y`` with
        shape ``(n_samples,)``, both float32. When the frame has no more than
        ``lookback`` rows, ``n_samples`` is 0 and ``X`` still keeps its 3-D shape.

    Raises
    ------
    ValueError
        If ``lookback`` is negative.
    """
    if lookback < 0:
        raise ValueError("lookback must be non-negative")

    features = df.columns.difference(['fecha_embarque', target_col, f'{target_col}_norm'])

    # Convert once to plain arrays instead of slicing the DataFrame on every step.
    matriz_features = df[features].to_numpy(dtype='float32')
    objetivo = df[f'{target_col}_norm'].to_numpy(dtype='float32')

    n_muestras = max(len(df) - lookback, 0)
    X = np.empty((n_muestras, lookback, matriz_features.shape[1]), dtype='float32')
    for k in range(n_muestras):
        X[k] = matriz_features[k:k + lookback]
    y = objetivo[lookback:]

    return X, y

# Number of past days fed to the model for each prediction.
LOOKBACK = 7

X_pasajeros, y_pasajeros = crear_secuencias(
    df_pasajeros_daily, target_col='total_pasajeros', lookback=LOOKBACK
)
X_vehiculos, y_vehiculos = crear_secuencias(
    df_vehiculos_daily, target_col='total_vehiculos', lookback=LOOKBACK
)

print("Tipo de X_pasajeros:", X_pasajeros.dtype)

In [None]:
def split_temporal(X, y, ratios=(0.7, 0.15, 0.15)):
    """Split X and y chronologically into train, validation and test parts.

    The first ``ratios[0]`` share of the samples goes to train, the next
    ``ratios[1]`` share to validation and everything that remains to test
    (``ratios[2]`` is not read). Cut points are truncated to integers.

    Returns a tuple of three ``(X_part, y_part)`` pairs in that order.
    """
    n_total = len(X)
    corte_train = int(n_total * ratios[0])
    corte_val = corte_train + int(n_total * ratios[1])

    tramos = (
        slice(None, corte_train),
        slice(corte_train, corte_val),
        slice(corte_val, None),
    )
    return tuple((X[tramo], y[tramo]) for tramo in tramos)

# Chronological train / validation / test partitions for both targets
# (each partition is an (X, y) pair).
train_p, val_p, test_p = split_temporal(X_pasajeros, y_pasajeros)
train_v, val_v, test_v = split_temporal(X_vehiculos, y_vehiculos)

In [None]:
print("\n=== Verificacion de Integridad ===")
# Count of distinct boarding dates in the source (NaT, if present, counts as one value).
n_fechas_original = df['fecha_embarque'].nunique(dropna=False)
print(f"Fechas unicas en original: {n_fechas_original}")
for etiqueta, df_diario in (("pasajeros", df_pasajeros_daily), ("vehiculos", df_vehiculos_daily)):
    print(f"Fechas en {etiqueta}: {len(df_diario)} (deberian ser iguales)")

def print_date_ranges(df_daily, split_points, name=""):
    """Print the first and last date of the train, validation and test spans.

    ``split_points`` holds two row positions of ``df_daily``: the first row of
    the validation span and the first row of the test span. Train starts at
    row 0 and test ends at the last row.
    """
    fechas = df_daily['fecha_embarque']
    fin_train, fin_val = split_points[0], split_points[1]
    tramos = (
        ("Train", 0, fin_train - 1),
        ("Val", fin_train, fin_val - 1),
        ("Test", fin_val, -1),
    )

    print(f"\n=== Rangos temporales {name} ===")
    for etiqueta, inicio, fin in tramos:
        print(f"{etiqueta}: {fechas.iloc[inicio].date()} a {fechas.iloc[fin].date()}")

# The split indices below are positions in the sequence arrays. Sample i of X_* is
# built from rows i .. i+LOOKBACK-1 of the daily frame and its target is row
# i+LOOKBACK, so the indices are shifted by LOOKBACK before being used to look up
# dates. Without the shift the printed Val/Test boundaries are LOOKBACK days early.
# train_end_* / val_end_* themselves are left as sequence positions.
train_end_p = int(len(X_pasajeros) * 0.7)
val_end_p = train_end_p + int(len(X_pasajeros) * 0.15)
print_date_ranges(df_pasajeros_daily, [train_end_p + LOOKBACK, val_end_p + LOOKBACK], "Pasajeros")

train_end_v = int(len(X_vehiculos) * 0.7)
val_end_v = train_end_v + int(len(X_vehiculos) * 0.15)
print_date_ranges(df_vehiculos_daily, [train_end_v + LOOKBACK, val_end_v + LOOKBACK], "Vehiculos")

def check_temporal_continuity(df_daily, train_end, val_end):
    """Print the day gap between the rows on each side of both split points.

    For each of ``train_end`` and ``val_end`` the gap is the difference, in
    days, between the date at that row and the date at the row before it.
    """
    fechas = df_daily['fecha_embarque']

    def salto_en(posicion):
        # Days between the row at `posicion` and the one immediately before it.
        return (fechas.iloc[posicion] - fechas.iloc[posicion - 1]).days

    gap_train_val = salto_en(train_end)
    gap_val_test = salto_en(val_end)
    print(f"\nSeparación Train-Val: {gap_train_val} días (deberia ser 1 o 0)")
    print(f"Separación Val-Test: {gap_val_test} días (deberia ser 1 o 0)")

check_temporal_continuity(df_pasajeros_daily, train_end=train_end_p, val_end=val_end_p)

print("\n=== Shapes finales ===")
# Same report line for both targets.
for etiqueta, X_arr, y_arr in (
    ("Pasajeros", X_pasajeros, y_pasajeros),
    ("Vehículos", X_vehiculos, y_vehiculos),
):
    print(f"{etiqueta} - X: {X_arr.shape}, y: {y_arr.shape}")

def check_normalization(y_train, y_val, y_test):
    """Print the min/max of each target partition and assert all lie in [0, 1].

    Parameters
    ----------
    y_train, y_val, y_test : array-like
        Normalised targets of each partition (must be non-empty).

    Raises
    ------
    AssertionError
        If any value of any partition is outside ``[0, 1]`` (NaN counts as
        outside). The test partition is checked as well as train and validation.
    """
    particiones = [
        ("Train", np.asarray(y_train)),
        ("Val", np.asarray(y_val)),
        ("Test", np.asarray(y_test)),
    ]

    print("\n=== Verificacion de Normalizacion ===")
    for nombre, valores in particiones:
        print(f"{nombre} Y - Min: {valores.min():.2f}, Max: {valores.max():.2f}")

    # All three ranges are printed first so the report is complete even when a check fails.
    for nombre, valores in particiones:
        dentro_de_rango = bool(np.all((valores >= 0.0) & (valores <= 1.0)))
        assert dentro_de_rango, f"Error: {nombre} Y no esta normalizado"

# Element [1] of each partition is its target array.
check_normalization(*(particion[1] for particion in (train_p, val_p, test_p)))

print("\nProceso completado Dataset listo para el modelo LSTM.")

def check_sequence_shapes(X, y, lookback=7, n_features=33):
    """Print the shapes of X and y and assert X has ``n_features`` features.

    ``lookback`` is only used in the printed expected shape; the assertion
    covers the last axis of ``X`` alone.
    """
    forma_esperada_X = f"(n_samples, {lookback}, {n_features})"
    print(f"Shape de X: {X.shape} (esperado: {forma_esperada_X})")
    print(f"Shape de y: {y.shape} (esperado: (n_samples,))")

    n_observadas = X.shape[2]
    assert n_observadas == n_features, f"Error: X tiene {n_observadas} features, pero se esperaban {n_features}"

# Pass the window length actually used to build the sequences instead of relying
# on the function's hard-coded default, so the printed expectation stays correct
# if LOOKBACK is changed.
check_sequence_shapes(X_pasajeros, y_pasajeros, lookback=LOOKBACK)

def check_lookback_coverage(df_daily, train_end, val_end, lookback=7):
    """Report whether the ``lookback`` days before row ``train_end`` are in train.

    The required dates are the ``lookback`` calendar days immediately before
    the date at row ``train_end``; they are looked up among the dates of rows
    ``0 .. train_end-1``. ``val_end`` is accepted but not used.
    """
    fechas = df_daily['fecha_embarque']
    first_val_date = fechas.iloc[train_end]

    ventana_requerida = pd.date_range(
        start=first_val_date - pd.Timedelta(days=lookback),
        end=first_val_date - pd.Timedelta(days=1),
    )
    fechas_train = pd.DatetimeIndex(fechas.iloc[:train_end])
    n_faltantes = int((~ventana_requerida.isin(fechas_train)).sum())

    if n_faltantes == 0:
        print("Lookback correcto: Val comienza con contexto suficiente")
    else:
        print(f"Error: Faltan {n_faltantes} dias para el lookback de Val")
# Use the same window length the sequences were built with rather than the
# function's hard-coded default.
check_lookback_coverage(df_pasajeros_daily, train_end_p, val_end_p, lookback=LOOKBACK)


# **Aquí empieza la parte de generación del modelo**


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def crear_modelo_lstm(input_shape, dropout_rate=0.1, lstm_units=64, bidirectional=True):
    """Assemble the two-layer (optionally bidirectional) LSTM regressor.

    Layer stack, in order: LSTM(lstm_units, return_sequences=True) ->
    BatchNormalization -> Dropout -> LSTM(lstm_units // 2) ->
    BatchNormalization -> Dropout -> Dense(64, relu) -> BatchNormalization ->
    Dense(1, sigmoid). With ``bidirectional=True`` both LSTM layers are wrapped
    in ``Bidirectional``. The model is returned uncompiled.
    """
    if bidirectional:
        capa_secuencia = Bidirectional(LSTM(lstm_units, return_sequences=True), input_shape=input_shape)
        capa_resumen = Bidirectional(LSTM(lstm_units // 2))
    else:
        capa_secuencia = LSTM(lstm_units, return_sequences=True, input_shape=input_shape)
        capa_resumen = LSTM(lstm_units // 2)

    capas = [
        capa_secuencia,
        BatchNormalization(),
        Dropout(dropout_rate),
        capa_resumen,
        BatchNormalization(),
        Dropout(dropout_rate),
        Dense(64, activation='relu'),
        BatchNormalization(),
        # Sigmoid keeps the output in (0, 1).
        Dense(1, activation='sigmoid'),
    ]

    model = Sequential()
    for capa in capas:
        model.add(capa)

    return model

In [None]:
def compilar_modelo(model):
    """Compile ``model`` with Adam (lr=0.001), MSE loss and MAE metric; return it."""
    optimizador = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizador, loss='mean_squared_error', metrics=['mae'])
    return model
# Axes 1 and 2 of the training inputs give the per-sample input shape.
forma_train = train_p[0].shape
input_shape = (forma_train[1], forma_train[2])
print(f"Input shape para el modelo: {input_shape}")

modelo = compilar_modelo(crear_modelo_lstm(input_shape))

modelo.summary()

In [None]:
def crear_callbacks(model_name):
    """Return the training callbacks, all monitoring ``val_loss``.

    In order: early stopping (patience 10, restores best weights), a
    best-only checkpoint written to the Drive models folder as
    ``<model_name>.h5``, and learning-rate reduction on plateau
    (factor 0.2, patience 5, floor 1e-6).
    """
    metrica = 'val_loss'
    ruta_checkpoint = f'/content/drive/MyDrive/3 - Master Computacion y Sistemas Inteligentes/PFM/Modelos/{model_name}.h5'

    return [
        EarlyStopping(monitor=metrica, patience=10, restore_best_weights=True, verbose=1),
        ModelCheckpoint(ruta_checkpoint, monitor=metrica, save_best_only=True, verbose=1),
        ReduceLROnPlateau(monitor=metrica, factor=0.2, patience=5, min_lr=1e-6, verbose=1),
    ]

callbacks = crear_callbacks("lstm_model_pasajeros")

# Quick sanity check of the training inputs before fitting.
X_train_p = train_p[0]
print("NaN en train_p[0]:", np.isnan(X_train_p).any())
print("Forma de train_p[0]:", X_train_p.shape)

In [None]:
nan_mask = np.isnan(train_p[0])
print("Shape de nan_mask:", nan_mask.shape)
print("Numero total de NaN:", nan_mask.sum())

# Show only the first training window that contains a NaN, if any.
ejes_internos = tuple(range(1, nan_mask.ndim))
secuencias_con_nan = np.flatnonzero(nan_mask.any(axis=ejes_internos))
if secuencias_con_nan.size > 0:
    i = int(secuencias_con_nan[0])
    print(f"Secuencia {i} contiene NaN")
    print(train_p[0][i])

In [None]:
# Report the dtype of every array that is passed to fit().
for etiqueta, arreglo in (
    ("train_p[0]", train_p[0]),
    ("train_p[1]", train_p[1]),
    ("val_p[0]", val_p[0]),
    ("val_p[1]", val_p[1]),
):
    print(f"Tipo de datos de {etiqueta}:", arreglo.dtype)

In [None]:
# Train on the passenger partitions; validation data drives the callbacks.
X_train, y_train = train_p
X_val, y_val = val_p
history = modelo.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=callbacks,
    verbose=1,
)

# Persist the per-epoch metrics dictionary next to the saved models.
import pickle
ruta_history = '/content/drive/MyDrive/3 - Master Computacion y Sistemas Inteligentes/PFM/Modelos/history_lstm_pasajeros.pkl'
with open(ruta_history, 'wb') as f:
    pickle.dump(history.history, f)

In [None]:
# Loss (left) and MAE (right) per epoch, train vs. validation.
fig, (ax_loss, ax_mae) = plt.subplots(1, 2, figsize=(15, 5))

paneles = (
    (ax_loss, 'loss', 'Perdida durante entrenamiento - Modelo LSTM', 'Época', 'Loss'),
    (ax_mae, 'mae', 'MAE durante entrenamiento - Modelo LSTM', 'Epoca', 'MAE'),
)
for ax, clave, titulo, etiqueta_x, etiqueta_y in paneles:
    ax.plot(history.history[clave], label='Train')
    ax.plot(history.history[f'val_{clave}'], label='Validation')
    ax.set_title(titulo)
    ax.set_xlabel(etiqueta_x)
    ax.set_ylabel(etiqueta_y)
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
y_pred_test_norm = modelo.predict(test_p[0])

# Undo the min-max scaling so predictions and actuals are in original units.
pred_columna = y_pred_test_norm.reshape(-1, 1)
real_columna = test_p[1].reshape(-1, 1)
y_pred_test = scaler_pasajeros.inverse_transform(pred_columna).flatten()
y_test_real = scaler_pasajeros.inverse_transform(real_columna).flatten()

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(y_test_real, label='Real', linewidth=2)
ax.plot(y_pred_test, label='Predicho', linestyle='--')
ax.set_title('Prediccion vs Real - Conjunto de Test')
ax.set_xlabel('Dias')
ax.set_ylabel('Total Pasajeros')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


# Aquí empieza la evaluación del modelo:

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates
from math import sqrt

In [None]:
def calculate_metrics(y_true, y_pred):
    """Compute RMSE, MAE, MAPE and R² between actual and predicted values.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. Both are flattened, so a ``(n, 1)``
        prediction array is compared element-wise with an ``(n,)`` target
        instead of being broadcast against it.

    Returns
    -------
    dict
        Keys ``'RMSE'``, ``'MAE'``, ``'MAPE'`` (in percent) and ``'R²'``.
        MAPE is averaged over the samples whose actual value is non-zero and
        is NaN when there are none; R² is NaN when ``y_true`` is constant.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size == 0:
        raise ValueError("y_true and y_pred must not be empty")
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements "
            f"({y_true.size} != {y_pred.size})"
        )

    residuos = y_true - y_pred

    rmse = float(np.sqrt(np.mean(residuos ** 2)))
    mae = float(np.mean(np.abs(residuos)))

    # Percentage error is undefined for an actual value of 0. Dividing by
    # (y_true + epsilon) would turn each such day into an error of ~1e12 %
    # and dominate the mean, so those samples are left out.
    no_cero = y_true != 0
    if no_cero.any():
        mape = float(np.mean(np.abs(residuos[no_cero] / y_true[no_cero])) * 100)
    else:
        mape = float('nan')

    ss_total = float(np.sum((y_true - np.mean(y_true)) ** 2))
    ss_residual = float(np.sum(residuos ** 2))
    r2 = 1 - (ss_residual / ss_total) if ss_total > 0 else float('nan')

    return {
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'R²': r2
    }

In [None]:
metrics = calculate_metrics(y_test_real, y_pred_test)

print("\n===== Model Performance Metrics =====")
# One "name: value" line per metric, four decimals.
print("\n".join(f"{metric_name}: {metric_value:.4f}" for metric_name, metric_value in metrics.items()))

In [None]:
def plot_predictions(y_true, y_pred, title="Predictions vs Actual Values",
                     dates=None, window_size=None, metrics=None):
    """Plot actual vs. predicted values and annotate the figure with metrics.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of the same length.
    title : str
        Figure title.
    dates : array-like, optional
        Dates for the x axis; when omitted the sample position is used.
    window_size : int, optional
        When set and smaller than the series, only the last ``window_size``
        points are drawn.
    metrics : dict, optional
        Metrics to display; must contain ``'RMSE'``, ``'MAE'`` and ``'MAPE'``.
        When omitted they are computed with ``calculate_metrics`` from the
        full ``y_true``/``y_pred`` passed in (before any windowing), so the
        function no longer depends on a notebook-level ``metrics`` variable.

    Returns
    -------
    module
        ``matplotlib.pyplot``, with the new figure as the current figure.
    """
    if metrics is None:
        metrics = calculate_metrics(y_true, y_pred)

    plt.figure(figsize=(16, 8))

    # Optionally keep only the most recent `window_size` points.
    if window_size and len(y_true) > window_size:
        start_idx = len(y_true) - window_size
        y_true = y_true[start_idx:]
        y_pred = y_pred[start_idx:]
        if dates is not None:
            dates = dates[start_idx:]

    if dates is not None:
        plt.plot(dates, y_true, 'b-', label='Actual', linewidth=2)
        plt.plot(dates, y_pred, 'r--', label='Predicted', linewidth=2)
        plt.gcf().autofmt_xdate()
        plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator(interval=2))
    else:
        plt.plot(y_true, 'b-', label='Actual', linewidth=2)
        plt.plot(y_pred, 'r--', label='Predicted', linewidth=2)

    # Metrics box inside the axes.
    metrics_text = "\n".join([f"{k}: {v:.4f}" for k, v in metrics.items()])
    plt.annotate(metrics_text, xy=(0.02, 0.85), xycoords='axes fraction',
                 bbox=dict(boxstyle="round,pad=0.5", fc="lightyellow", alpha=0.8))

    plt.title(title, fontsize=16)
    plt.xlabel('Time', fontsize=12)
    plt.ylabel('Passengers', fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.legend(loc='best', fontsize=12)
    plt.tight_layout()

    # Summary line at the bottom of the figure.
    plt.figtext(0.5, 0.01, f"Model Performance: RMSE={metrics['RMSE']:.2f}, MAE={metrics['MAE']:.2f}, MAPE={metrics['MAPE']:.2f}%",
                ha="center", fontsize=12, bbox={"facecolor":"orange", "alpha":0.2, "pad":5})

    return plt

# Dates of the test targets: the test samples are the last ones, so they match the
# last len(y_test_real) rows of the daily frame. tail() is used instead of
# iloc[-n:] because a slice of -0 would return the whole column for an empty test set.
test_dates = df_pasajeros_daily['fecha_embarque'].tail(len(y_test_real)).values

In [None]:
# Whole test period.
plot_predictions(
    y_true=y_test_real,
    y_pred=y_pred_test,
    title="LSTM Model: Passenger Predictions vs Actual Values (Full Test Set)",
    dates=test_dates,
)
plt.show()

In [None]:
# Zoom on the last 30 points of the test period.
plot_predictions(
    y_true=y_test_real,
    y_pred=y_pred_test,
    title="Prediccion de pasajeros vs Valores actuales (30 dias)",
    dates=test_dates,
    window_size=30,
)
plt.show()

In [None]:
daily_error = y_test_real - y_pred_test
# Percentage error is undefined when the actual value is 0. Dividing by
# (actual + 1e-10) would produce values around 1e12 % that swamp every plot and
# average, so those days are marked as NaN instead.
with np.errstate(divide='ignore', invalid='ignore'):
    daily_pct_error = np.where(y_test_real != 0, daily_error / y_test_real * 100, np.nan)

In [None]:
# Three stacked panels on one figure: error over time, error histogram and
# percentage error over time.
plt.figure(figsize=(16, 12))

# Panel 1: signed error (actual - predicted) per test date, with a zero reference line.
plt.subplot(3, 1, 1)
plt.plot(test_dates, daily_error, 'g-', linewidth=1.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Prediction Error Over Time', fontsize=14)
plt.xlabel('Date')
plt.ylabel('Error (Actual - Predicted)')
plt.grid(True, linestyle='--', alpha=0.7)
# NOTE(review): autofmt_xdate() acts on the whole current figure, not only this
# panel; it is called again after panel 3 - confirm the resulting tick labels and
# x labels of the upper panels are as intended.
plt.gcf().autofmt_xdate()

# Panel 2: distribution of the signed error (30 bins), with a zero reference line.
plt.subplot(3, 1, 2)
plt.hist(daily_error, bins=30, color='teal', alpha=0.7)
plt.axvline(x=0, color='r', linestyle='--')
plt.title('Error Distribution Histogram', fontsize=14)
plt.xlabel('Prediction Error')
plt.ylabel('Frequency')
plt.grid(True, linestyle='--', alpha=0.7)

# Panel 3: percentage error per test date, with a zero reference line.
plt.subplot(3, 1, 3)
plt.plot(test_dates, daily_pct_error, 'purple', linewidth=1.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Percentage Error Over Time', fontsize=14)
plt.xlabel('Date')
plt.ylabel('Percentage Error %')
plt.grid(True, linestyle='--', alpha=0.7)
plt.gcf().autofmt_xdate()

plt.tight_layout()
plt.show()

In [None]:
# Predicted vs. actual values; the dashed diagonal marks a perfect prediction.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(y_test_real, y_pred_test, alpha=0.6)

extremos = [y_test_real.min(), y_test_real.max()]
ax.plot(extremos, extremos, 'r--')

ax.set_xlabel('Actual Values', fontsize=12)
ax.set_ylabel('Predicted Values', fontsize=12)
ax.set_title('LSTM Model: Scatter Plot of Predictions vs Actual Values', fontsize=14)
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
def permutation_importance(model, X, y, n_repeats=10, feature_names=None, random_state=None):
    """Estimate feature importance as the MSE increase after permuting a feature.

    For each feature, its values are shuffled across samples (every sample
    receives the complete window of that feature from another, randomly chosen
    sample) and the model is re-evaluated. Shuffling across samples breaks the
    link between the feature and the target; shuffling only inside each window
    would leave any order-insensitive or window-constant feature with an
    importance of exactly zero.

    Parameters
    ----------
    model : object
        Anything with ``predict(X, verbose=0)`` returning one value per sample.
    X : array-like, shape (n_samples, timesteps, n_features)
        Input windows.
    y : array-like, shape (n_samples,)
        Targets on the same scale as the model output.
    n_repeats : int
        Number of random permutations averaged per feature.
    feature_names : sequence of str, optional
        Names used as dictionary keys, one per feature. Defaults to
        ``Feature_0 .. Feature_{n_features-1}``.
    random_state : int or numpy.random.Generator, optional
        Seed for reproducible permutations. Note that the global
        ``np.random`` state is not used.

    Returns
    -------
    dict
        ``{feature_name: mean(permuted MSE - baseline MSE)}`` in feature order.

    Raises
    ------
    ValueError
        If ``X`` is not 3-D or ``feature_names`` has the wrong length.
    """
    X = np.asarray(X)
    if X.ndim != 3:
        raise ValueError(f"X must be 3-D (n_samples, timesteps, n_features), got shape {X.shape}")
    y_plano = np.asarray(y, dtype=float).reshape(-1)

    n_samples, _, n_features = X.shape
    if feature_names is None:
        feature_names = [f'Feature_{i}' for i in range(n_features)]
    elif len(feature_names) != n_features:
        raise ValueError(f"feature_names has {len(feature_names)} entries, expected {n_features}")

    rng = np.random.default_rng(random_state)

    def _mse(prediccion):
        # Flatten so an (n, 1) model output lines up with the (n,) target.
        prediccion = np.asarray(prediccion, dtype=float).reshape(-1)
        return float(np.mean((y_plano - prediccion) ** 2))

    baseline_score = _mse(model.predict(X, verbose=0))

    importance_scores = {}
    for feature_idx, nombre in enumerate(feature_names):
        incremento_total = 0.0

        for _ in range(n_repeats):
            X_permuted = X.copy()
            # One vectorised assignment replaces the per-sample Python loop.
            X_permuted[:, :, feature_idx] = X[rng.permutation(n_samples), :, feature_idx]
            incremento_total += _mse(model.predict(X_permuted, verbose=0)) - baseline_score

        importance_scores[nombre] = incremento_total / n_repeats

    return importance_scores

In [None]:
# Residuals against the predicted value, with a zero reference line.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(y_pred_test, daily_error, alpha=0.6)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_title('Residual Analysis: Error vs Predicted Value', fontsize=14)
ax.set_xlabel('Predicted Value', fontsize=12)
ax.set_ylabel('Error (Actual - Predicted)', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Single-row table with one column per metric.
results_df = pd.DataFrame.from_records([metrics])
print("\nResumen Model Performance:")
print(results_df)

In [None]:
# Per-day test results, tagged with weekday and month names.
test_df = (
    pd.DataFrame({
        'date': test_dates,
        'actual': y_test_real,
        'predicted': y_pred_test,
        'error': daily_error,
        'pct_error': daily_pct_error
    })
    .assign(date=lambda tabla: pd.to_datetime(tabla['date']))
    .assign(
        weekday=lambda tabla: tabla['date'].dt.day_name(),
        month=lambda tabla: tabla['date'].dt.month_name(),
    )
)

# Same aggregation for both groupings: mean/std of the errors and mean actual value.
resumen_error = {
    'error': ['mean', 'std'],
    'pct_error': ['mean', 'std'],
    'actual': 'mean'
}
weekday_metrics = test_df.groupby('weekday').agg(resumen_error)
monthly_metrics = test_df.groupby('month').agg(resumen_error)

print("\nError Metrics by Weekday:")
print(weekday_metrics)

print("\nError Metrics by Month:")
print(monthly_metrics)

# Mean percentage error per weekday, in calendar order.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_data = test_df.groupby('weekday')['pct_error'].mean().reindex(weekday_order)

fig, ax = plt.subplots(figsize=(14, 6))
ax.bar(weekday_data.index, weekday_data.values, alpha=0.7)
ax.set_title('Average Percentage Error by Weekday', fontsize=14)
ax.set_xlabel('Weekday', fontsize=12)
ax.set_ylabel('Mean Percentage Error (%)', fontsize=12)
ax.grid(True, axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Final per-day table of the test predictions and their errors.
columnas_resultado = {
    'date': test_dates,
    'actual': y_test_real,
    'predicted': y_pred_test,
    'error': daily_error,
    'percentage_error': daily_pct_error
}
prediction_results = pd.DataFrame(columnas_resultado)

print("\nEvaluacion completada")