FORECASTING FROM https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets

In [1]:
import pandas as pd

data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])

data.tail(5)

Unnamed: 0,date,HUFL,HULL,MUFL,MULL,LUFL,LULL,OT
17415,2018-06-26 15:00:00,39.202999,11.392,49.644001,11.929,-10.331,-1.258,47.084999
17416,2018-06-26 16:00:00,38.113998,10.974,48.759998,11.366,-10.331,-1.29,48.183498
17417,2018-06-26 17:00:00,39.622002,10.974,50.609001,11.661,-11.557,-1.418,48.183498
17418,2018-06-26 18:00:00,43.643002,13.403,54.737,13.778,-10.299,-1.418,46.865501
17419,2018-06-26 19:00:00,38.868,10.052,49.859001,10.669,-11.525,-1.418,45.9865


In [2]:
Resultado = {
    'Metrics': ['Seasonality', 'p-value (adfuller)', 'Modelo', 'RMSE', 'MAE', 'RMSE naive', 'MAE naive', 'Min', 'Max', 'alpha (SES)', 'Forecast 07/07 00:00 (24 hrs)'],
    'HUFL': ['Yes', '0.0307', 'Arima', '6.03', '5.38', '3.90', '3.15', '48.08', '30.11', '---', '38.4 ± 3.6'],
    'HULL': ['Yes', '0.0433', 'Arima', '1.88', '1.57', '1.48', '1.08', '16.13', '6.95', '---', '35.5 ± 7.1'],
    'MUFL': ['No', '0.0577', 'Arima', '5.67', '5.09', '3.31', '2.66', '43.99', '27.11', '---', '35.5 ± 4.0'],
    'MULL': ['No', '0.0518', 'Arima', '1.26', '0.96', '1.11', '0.80', '12.64', '5.31', '---', '9.7 ± 1.7'],
    'LUFL': ['Yes', '0.0072', 'Arima', '0.51', '0.44', '0.61', '0.48', '4.45', '0.00', '---', '3.1 ± 1.0'],
    'LULL': ['Yes', '0.0000', 'SES', '0.37', '0.26', '0.42', '0.19', '1.33', '0.00', '0.071', '0.13 ± 0.6'],
    'OT': ['No', '0.8257', 'Arima', '5.53', '4.03', '5.36', '4.06', '25.59', '48.99', '---', '35.5 ± 7.2'],
    'OT-RF': ['No', '0.8257', 'Random Forest', '7.35', '5.26', '7.51', '5.39', '25.59', '48.99', '---', '34.2 ± 10.2'],
    'OT-PR': ['No', '0.8257', 'Prophet', '6.98', '5.82', '7.02', '5.54', '25.59', '48.99', '---', '30.46 ± 1.7']
}

# Create DataFrame
Resultado = pd.DataFrame(Resultado)
Resultado

Unnamed: 0,Metrics,HUFL,HULL,MUFL,MULL,LUFL,LULL,OT,OT-RF,OT-PR
0,Seasonality,Yes,Yes,No,No,Yes,Yes,No,No,No
1,p-value (adfuller),0.0307,0.0433,0.0577,0.0518,0.0072,0.0000,0.8257,0.8257,0.8257
2,Modelo,Arima,Arima,Arima,Arima,Arima,SES,Arima,Random Forest,Prophet
3,RMSE,6.03,1.88,5.67,1.26,0.51,0.37,5.53,7.35,6.98
4,MAE,5.38,1.57,5.09,0.96,0.44,0.26,4.03,5.26,5.82
5,RMSE naive,3.90,1.48,3.31,1.11,0.61,0.42,5.36,7.51,7.02
6,MAE naive,3.15,1.08,2.66,0.80,0.48,0.19,4.06,5.39,5.54
7,Min,48.08,16.13,43.99,12.64,4.45,1.33,25.59,25.59,25.59
8,Max,30.11,6.95,27.11,5.31,0.00,0.00,48.99,48.99,48.99
9,alpha (SES),---,---,---,---,---,0.071,---,---,---


CONCLUSIONS:

1. Easiest Series to Predict: LUFL and LULL are the best, with very low errors and stable data. We can trust their forecasts.

2. Hardest Series to Predict: OT is tricky because its data lacks a clear pattern, and our models don’t do much better than a simple guess (using the last value). It needs more work.

3. Best Models: ARIMA works well for most series, but for LULL, a simpler model (SES) is better. For OT, no model stands out yet.

4. Forecasts: The forecasts for July 7th are reasonable, but OT has more uncertainty (a wider error range). We can use them, but be cautious with OT.

5. One-Week Challenge: Predicting a full week with just one or two days of data is tough. To improve, we need more data or better methods.

Next Steps: 

-Keep using the current models for LUFL and LULL—they’re working fine.

-For OT, find more data or try new ideas to make it better.

-If we want longer forecasts, add more days of data.

In [3]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime, timedelta

# Cargar datos
data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])
data = data.head(168)
data.set_index('date', inplace=True)
data_2h = data.resample('2H').mean()
ot_data = data_2h[['OT']].dropna()

# Split the data into training and test sets (80% train, 20% test)
total_rows = len(ot_data)
train_size = int(total_rows * 0.8)
test_size = total_rows - train_size
train = ot_data['OT'][:train_size]
test = ot_data['OT'][train_size:]

# 3. Stationarity Testing (Dickey-Fuller)
def dickey_fuller_test(series, column_name):
    print(f'Resultados de la prueba de Dickey-Fuller para columna: {column_name}')
    dftest = adfuller(series, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', 'No Lags Used', 'Número de observaciones utilizadas'])
    for key, value in dftest[4].items():
        dfoutput[f'Critical Value ({key})'] = value
    print(dfoutput)
    if dftest[1] <= 0.05:
        print("Conclusión: Los datos son estacionarios")
    else:
        print("Conclusión: Los datos no son estacionarios")
    return dftest[1]  # Return p-value

# Get p-value from first test
p_value = dickey_fuller_test(train, "OT")

# Use the last two days (24 points at 2-hour intervals) for modeling
last_two_days = train[-24:]  # 2 days = 24 data points

# Fit ARIMA model (p,d,q) - Usamos (1,1,1) como punto de partida
# Si no es estacionaria, d=1; si es estacionaria, d=0
d = 1 if p_value > 0.05 else 0
model = ARIMA(last_two_days, order=(1, d, 1))
model_fit = model.fit()

# Forecast for one week (7 days = 7 * 12 = 84 data points at 2-hour intervals)
forecast_steps = 84
forecast = model_fit.forecast(steps=forecast_steps)

# Confidence intervals from ARIMA
forecast_obj = model_fit.get_forecast(steps=forecast_steps)
ci_lower = forecast_obj.conf_int()['lower OT']
ci_upper = forecast_obj.conf_int()['upper OT']

# Create a date range for the forecast
last_date = last_two_days.index[-1]
forecast_dates = pd.date_range(start=last_date + timedelta(hours=2), periods=forecast_steps, freq='2H')

# Create a DataFrame for the forecast with confidence intervals
forecast_ci_df = pd.DataFrame({
    'OT_Predicho': forecast,
    'CI_Lower': ci_lower,
    'CI_Upper': ci_upper
}, index=forecast_dates)

# Combine historical and forecast data
historical_pred_df = pd.DataFrame({
    'OT_Historico': last_two_days,
    'OT_Predicho': model_fit.fittedvalues
})

combined_df = pd.concat([historical_pred_df, forecast_ci_df], axis=0)

# Visualize the results
fig = go.Figure()

# Add historical values
fig.add_trace(go.Scatter(
    x=combined_df.index,
    y=combined_df['OT_Historico'],
    mode='lines',
    name='OT Histórico',
    line=dict(color='blue')
))

# Add predicted values for historical data
fig.add_trace(go.Scatter(
    x=combined_df.index[:len(last_two_days)],
    y=combined_df['OT_Predicho'][:len(last_two_days)],
    mode='lines',
    name='OT Predicho (Actual)',
    line=dict(color='green', dash='dash')
))

# Add future forecast
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['OT_Predicho'],
    mode='lines',
    name='OT Predicho (Futuro)',
    line=dict(color='red')
))

# Add 95% confidence interval
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Upper'],
    mode='lines',
    name='Límite Superior (95% IC)',
    line=dict(color='gray', width=0.5)
))

fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Lower'],
    mode='lines',
    name='Límite Inferior (95% IC)',
    line=dict(color='gray', width=0.5),
    fill='tonexty',
    fillcolor='rgba(200, 100, 200, 0.1)'
))

# Customize the plot
fig.update_layout(
    title='Predicción del OT utilizando ARIMA con Intervalo de Confianza del 95%',
    xaxis_title='Fecha',
    yaxis_title='OT',
    hovermode='x unified',
    template="plotly_dark",
    showlegend=True
)

# Show the plot
fig.show()

# Calculate error metrics on the test set
pred_test = model_fit.forecast(steps=len(test))
rmse = sqrt(mean_squared_error(test, pred_test))
mae = np.mean(np.abs(test - pred_test))
print(f"RMSE on test set: {rmse}")
print(f"MAE on test set: {mae}")

# Naive forecast: Use the last value of the training set for all test predictions
naive_forecast = np.full_like(test, test.iloc[0])

# Calculate error metrics for the naive forecast
rmse_naive = sqrt(mean_squared_error(test, naive_forecast))
mae_naive = np.mean(np.abs(test - naive_forecast))

print(f"Naive Forecast RMSE: {rmse_naive}")
print(f"Naive Forecast MAE: {mae_naive}")

# Print maximum and minimum of the series
print(f"Máximo: {ot_data['OT'].max()}")
print(f"Mínimo: {ot_data['OT'].min()}")

Resultados de la prueba de Dickey-Fuller para columna: OT
Test Statistic                        -0.777276
p-value                                0.825706
No Lags Used                           6.000000
Número de observaciones utilizadas    60.000000
Critical Value (1%)                   -3.544369
Critical Value (5%)                   -2.911073
Critical Value (10%)                  -2.593190
dtype: float64
Conclusión: Los datos no son estacionarios


RMSE on test set: 5.526273931834615
MAE on test set: 4.0251221380377284
Naive Forecast RMSE: 5.359593640549082
Naive Forecast MAE: 4.064926035263959
Máximo: 48.98900032043457
Mínimo: 25.588250160217285


In [4]:
import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Cargar datos
data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])
data = data.head(168)
data.set_index('date', inplace=True)
data_2h = data.resample('2H').mean()
ot_data = data_2h[['OT']].dropna()

# Preparar datos para Prophet (requiere columnas 'ds' y 'y')
ot_data_prophet = ot_data.reset_index()
ot_data_prophet.columns = ['ds', 'y']

# Dividir en entrenamiento y prueba
train_size = int(len(ot_data_prophet) * 0.8)
train, test = ot_data_prophet.iloc[:train_size], ot_data_prophet.iloc[train_size:]

# Entrenar el modelo de Prophet
model = Prophet()
model.fit(train)

# Pronosticar en el conjunto de prueba
future = model.make_future_dataframe(periods=len(test), freq='2H')
forecast = model.predict(future)

# Filtrar las predicciones para el conjunto de prueba
forecast_test = forecast.iloc[-len(test):]

# Calcular métricas de error en el conjunto de prueba
rmse_prophet = sqrt(mean_squared_error(test['y'], forecast_test['yhat']))
mae_prophet = mean_absolute_error(test['y'], forecast_test['yhat'])

# Pronosticar una semana (84 puntos cada 2 horas)
future_1week = model.make_future_dataframe(periods=84, freq='2H')
forecast_1week = model.predict(future_1week)

# Filtrar las predicciones para la próxima semana
forecast_1week_filtered = forecast_1week.iloc[-84:]

# Crear un DataFrame con los pronósticos
forecast_df = forecast_1week_filtered[['ds', 'yhat']].rename(columns={'ds': 'Fecha', 'yhat': 'OT_Predicho_Prophet'}).set_index('Fecha')

# Mostrar los pronósticos
print(forecast_df)

# Comparar métricas con ARIMA, Random Forest y Naive
print(f"Prophet RMSE en prueba: {rmse_prophet}")
print(f"Prophet MAE en prueba: {mae_prophet}")

17:26:30 - cmdstanpy - INFO - Chain [1] start processing
17:26:35 - cmdstanpy - INFO - Chain [1] done processing


                     OT_Predicho_Prophet
Fecha                                   
2016-07-06 14:00:00            34.947085
2016-07-06 16:00:00            35.071839
2016-07-06 18:00:00            35.418690
2016-07-06 20:00:00            34.544274
2016-07-06 22:00:00            34.257408
...                                  ...
2016-07-13 04:00:00            43.161341
2016-07-13 06:00:00            43.139374
2016-07-13 08:00:00            43.374546
2016-07-13 10:00:00            44.292462
2016-07-13 12:00:00            45.583453

[84 rows x 1 columns]
Prophet RMSE en prueba: 4.807041792528592
Prophet MAE en prueba: 3.31974775067659


In [5]:
import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import plotly.graph_objects as go
from datetime import datetime

# Cargar datos
data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])
data = data.head(168)  # Limitamos a 168 filas originales
data.set_index('date', inplace=True)
data_2h = data.resample('2H').mean()
ot_data = data_2h[['OT']].dropna()

# Crear un nuevo índice de fechas comenzando el 04/07/2024 a las 14:00
start_date = pd.Timestamp('2024-07-04 14:00:00')
new_dates = pd.date_range(start=start_date, periods=len(ot_data), freq='2H')
ot_data.index = new_dates

# Preparar datos para Prophet (requiere columnas 'ds' y 'y')
ot_data_prophet = ot_data.reset_index()
ot_data_prophet.columns = ['ds', 'y']

# Tomar los primeros 24 puntos como datos históricos (48 horas)
train_size = 24
train = ot_data_prophet.iloc[:train_size]

# Entrenar el modelo de Prophet
model = Prophet()
model.fit(train)

# Pronosticar desde el final de los datos históricos (24 puntos) hasta completar el resto
future_periods = len(ot_data_prophet) - train_size + 84  # Resto de datos + 1 semana (84 puntos)
future = model.make_future_dataframe(periods=future_periods, freq='2H')
forecast = model.predict(future)

# Filtrar las predicciones: datos históricos (train) y pronóstico futuro
forecast_historical = forecast.iloc[:train_size]  # Predicciones sobre datos históricos
forecast_future = forecast.iloc[train_size:]      # Pronóstico futuro

# Calcular métricas de error para Prophet en el conjunto de datos no entrenados
test = ot_data_prophet.iloc[train_size:]
forecast_test = forecast.iloc[train_size:train_size + len(test)]
rmse_prophet = sqrt(mean_squared_error(test['y'], forecast_test['yhat']))
mae_prophet = mean_absolute_error(test['y'], forecast_test['yhat'])

# Calcular pronóstico Naive (último valor histórico repetido)
naive_forecast = np.full(len(test), train['y'].iloc[-1])
rmse_naive = sqrt(mean_squared_error(test['y'], naive_forecast))
mae_naive = mean_absolute_error(test['y'], naive_forecast)

# Filtrar el pronóstico de una semana desde el final de los datos originales
forecast_1week = forecast.iloc[-84:]
forecast_df = forecast_1week[['ds', 'yhat']].rename(columns={'ds': 'Fecha', 'yhat': 'OT_Predicho_Prophet'}).set_index('Fecha')

# --- Visualización con Plotly ---
fig = go.Figure()

# Datos históricos (24 puntos desde 04/07 14:00)
fig.add_trace(go.Scatter(
    x=train['ds'],
    y=train['y'],
    mode='lines',
    name='OT Histórico',
    line=dict(color='blue')
))

# Predicciones sobre datos históricos
fig.add_trace(go.Scatter(
    x=forecast_historical['ds'],
    y=forecast_historical['yhat'],
    mode='lines',
    name='OT Predicho (Histórico)',
    line=dict(color='green', dash='dash')
))

# Pronóstico futuro (resto + 1 semana)
fig.add_trace(go.Scatter(
    x=forecast_future['ds'],
    y=forecast_future['yhat'],
    mode='lines',
    name='OT Predicho (Futuro)',
    line=dict(color='red')
))

# Intervalos de confianza (95%) para el pronóstico futuro
fig.add_trace(go.Scatter(
    x=forecast_future['ds'],
    y=forecast_future['yhat_upper'],
    mode='lines',
    name='Límite Superior (95% IC)',
    line=dict(color='gray', width=0.5)
))

fig.add_trace(go.Scatter(
    x=forecast_future['ds'],
    y=forecast_future['yhat_lower'],
    mode='lines',
    name='Límite Inferior (95% IC)',
    line=dict(color='gray', width=0.5),
    fill='tonexty',  # Rellena el área entre los límites
    fillcolor='rgba(200, 100, 200, 0.1)'  # Color del relleno con transparencia
))



# Personalizar la gráfica
fig.update_layout(
    title='Predicción de OT usando Prophet con IC',
    xaxis_title='Fecha',
    yaxis_title='OT',
    hovermode='x unified',
    template="plotly_dark",
    showlegend=True
)

# Mostrar la gráfica
fig.show()

# Mostrar los pronósticos de la próxima semana
print("\n--- Pronósticos Prophet para OT (1 semana) ---")
print(forecast_df)

# Comparar métricas
print("\n--- Métricas de error en datos no entrenados ---")
print(f"Prophet RMSE: {rmse_prophet}")
print(f"Prophet MAE: {mae_prophet}")
print(f"Naive RMSE: {rmse_naive}")
print(f"Naive MAE: {mae_naive}")

17:26:39 - cmdstanpy - INFO - Chain [1] start processing
17:26:40 - cmdstanpy - INFO - Chain [1] done processing



--- Pronósticos Prophet para OT (1 semana) ---
                     OT_Predicho_Prophet
Fecha                                   
2024-07-11 14:00:00            48.857258
2024-07-11 16:00:00            49.191615
2024-07-11 18:00:00            49.525973
2024-07-11 20:00:00            49.860331
2024-07-11 22:00:00            50.194688
...                                  ...
2024-07-18 04:00:00            75.271506
2024-07-18 06:00:00            75.605863
2024-07-18 08:00:00            75.940221
2024-07-18 10:00:00            76.274578
2024-07-18 12:00:00            76.608936

[84 rows x 1 columns]

--- Métricas de error en datos no entrenados ---
Prophet RMSE: 6.895055553016254
Prophet MAE: 5.8230730073275065
Naive RMSE: 7.024041346199312
Naive MAE: 5.549870872497559


In [6]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pmdarima import auto_arima  # Importamos auto_arima
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime, timedelta

# Cargar datos
data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])
data = data.head(168)
data.set_index('date', inplace=True)
data_2h = data.resample('2H').mean()
hull_data = data_2h[['HUFL']].dropna()  # Usamos HUFL como en el código anterior

# Split the data into training and test sets (80% train, 20% test)
total_rows = len(hull_data)
train_size = int(total_rows * 0.8)
test_size = total_rows - train_size
train = hull_data['HUFL'][:train_size]
test = hull_data['HUFL'][train_size:]

# Stationarity Testing (Dickey-Fuller)
def dickey_fuller_test(series, column_name):
    print(f'Resultados de la prueba de Dickey-Fuller para columna: {column_name}')
    dftest = adfuller(series, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', 'No Lags Used', 'Número de observaciones utilizadas'])
    for key, value in dftest[4].items():
        dfoutput[f'Critical Value ({key})'] = value
    print(dfoutput)
    if dftest[1] <= 0.05:
        print("Conclusión: Los datos son estacionarios")
    else:
        print("Conclusión: Los datos no son estacionarios")
    return dftest[1]

# Get p-value from test
p_value = dickey_fuller_test(train, "HUFL")

# Use the last two days (24 points at 2-hour intervals) for modeling
last_two_days = train[-24:]

# Fit auto_arima model - Optimiza automáticamente p, d, q
model = auto_arima(last_two_days, 
                   seasonal=False,  # No asumimos estacionalidad
                   start_p=0, start_q=0, max_p=5, max_q=5,  # Límites para p y q
                   d=None,  # Deja que auto_arima determine d, pero sabemos que es estacionaria
                   trace=True,  # Muestra el proceso de selección
                   error_action='ignore', 
                   suppress_warnings=True)
model_fit = model.fit(last_two_days)

# Forecast for one week (7 days = 7 * 12 = 84 data points at 2-hour intervals)
forecast_steps = 84
forecast = model.predict(n_periods=forecast_steps)

# Confidence intervals from the fitted model
forecast_obj = model_fit.predict(n_periods=forecast_steps, return_conf_int=True)
forecast, conf_int = forecast_obj[0], forecast_obj[1]
ci_lower = conf_int[:, 0]  # Límite inferior
ci_upper = conf_int[:, 1]  # Límite superior

# Create a date range for the forecast
last_date = last_two_days.index[-1]
forecast_dates = pd.date_range(start=last_date + timedelta(hours=2), periods=forecast_steps, freq='2H')

# Create a DataFrame for the forecast with confidence intervals
forecast_ci_df = pd.DataFrame({
    'HUFL_Predicho': forecast,
    'CI_Lower': ci_lower,
    'CI_Upper': ci_upper
}, index=forecast_dates)

# Combine historical and forecast data
historical_pred_df = pd.DataFrame({
    'HUFL_Historico': last_two_days,
    'HUFL_Predicho': model.predict_in_sample()  # Predicciones en la muestra
})

combined_df = pd.concat([historical_pred_df, forecast_ci_df], axis=0)

# Visualize the results
fig = go.Figure()

# Add historical values
fig.add_trace(go.Scatter(
    x=combined_df.index,
    y=combined_df['HUFL_Historico'],
    mode='lines',
    name='HUFL Histórico',
    line=dict(color='blue')
))

# Add predicted values for historical data
fig.add_trace(go.Scatter(
    x=combined_df.index[:len(last_two_days)],
    y=combined_df['HUFL_Predicho'][:len(last_two_days)],
    mode='lines',
    name='HUFL Predicho (Actual)',
    line=dict(color='green', dash='dash')
))

# Add future forecast
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['HUFL_Predicho'],
    mode='lines',
    name='HUFL Predicho (Futuro)',
    line=dict(color='red')
))

# Add 95% confidence interval
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Upper'],
    mode='lines',
    name='Límite Superior (95% IC)',
    line=dict(color='gray', width=0.5)
))

fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Lower'],
    mode='lines',
    name='Límite Inferior (95% IC)',
    line=dict(color='gray', width=0.5),
    fill='tonexty',
    fillcolor='rgba(200, 100, 200, 0.1)'
))

# Customize the plot
fig.update_layout(
    title='Predicción del HUFL utilizando Auto-ARIMA con Intervalo de Confianza del 95%',
    xaxis_title='Fecha',
    yaxis_title='HUFL',
    hovermode='x unified',
    template="plotly_dark",
    showlegend=True
)

# Show the plot
fig.show()

# Calculate error metrics on the test set
pred_test = model.predict(n_periods=len(test))
rmse = sqrt(mean_squared_error(test, pred_test))
mae = np.mean(np.abs(test - pred_test))
print(f"RMSE on test set: {rmse}")
print(f"MAE on test set: {mae}")

# Naive forecast: Use the last value of the training set for all test predictions
naive_forecast = np.full_like(test, test.iloc[0])

# Calculate error metrics for the naive forecast
rmse_naive = sqrt(mean_squared_error(test, naive_forecast))
mae_naive = np.mean(np.abs(test - naive_forecast))

print(f"Naive Forecast RMSE: {rmse_naive}")
print(f"Naive Forecast MAE: {mae_naive}")

# Print maximum and minimum of the series
print(f"Máximo: {hull_data['HUFL'].max()}")
print(f"Mínimo: {hull_data['HUFL'].min()}")

# Mostrar el orden seleccionado por auto_arima
print(f"Orden ARIMA seleccionado: {model.order}")

Resultados de la prueba de Dickey-Fuller para columna: HUFL
Test Statistic                        -3.046248
p-value                                0.030795
No Lags Used                           0.000000
Número de observaciones utilizadas    66.000000
Critical Value (1%)                   -3.533560
Critical Value (5%)                   -2.906444
Critical Value (10%)                  -2.590724
dtype: float64
Conclusión: Los datos son estacionarios
Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=245.245, Time=0.03 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=inf, Time=0.10 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=inf, Time=0.19 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=110.342, Time=0.14 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=inf, Time=0.69 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=inf, Time=0.47 sec
 ARIMA(0,0,2)(0,0,0)[0]             : AIC=inf, Time=0.21 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=inf, Time=0.12 sec
 ARIMA(2,0

RMSE on test set: 6.0298914557522405
MAE on test set: 5.377264412847701
Naive Forecast RMSE: 3.8989387665752018
Naive Forecast MAE: 3.146235858692842
Máximo: 48.08249855041504
Mínimo: 30.114500045776367
Orden ARIMA seleccionado: (1, 0, 0)


In [7]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pmdarima import auto_arima  # Importamos auto_arima
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime, timedelta

# Cargar datos
data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])
data = data.head(168)
data.set_index('date', inplace=True)
data_2h = data.resample('2H').mean()
hull_data = data_2h[['HULL']].dropna()  # Usamos HULL como en el código anterior

# Split the data into training and test sets (80% train, 20% test)
total_rows = len(hull_data)
train_size = int(total_rows * 0.8)
test_size = total_rows - train_size
train = hull_data['HULL'][:train_size]
test = hull_data['HULL'][train_size:]

# Stationarity Testing (Dickey-Fuller)
def dickey_fuller_test(series, column_name):
    print(f'Resultados de la prueba de Dickey-Fuller para columna: {column_name}')
    dftest = adfuller(series, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', 'No Lags Used', 'Número de observaciones utilizadas'])
    for key, value in dftest[4].items():
        dfoutput[f'Critical Value ({key})'] = value
    print(dfoutput)
    if dftest[1] <= 0.05:
        print("Conclusión: Los datos son estacionarios")
    else:
        print("Conclusión: Los datos no son estacionarios")
    return dftest[1]

# Get p-value from test
p_value = dickey_fuller_test(train, "HULL")

# Use the last two days (24 points at 2-hour intervals) for modeling
last_two_days = train[-24:]

# Fit auto_arima model - Optimiza automáticamente p, d, q
model = auto_arima(last_two_days, 
                   seasonal=False,  # No asumimos estacionalidad
                   start_p=0, start_q=0, max_p=5, max_q=5,  # Límites para p y q
                   d=None,  # Deja que auto_arima determine d, pero sabemos que es estacionaria
                   trace=True,  # Muestra el proceso de selección
                   error_action='ignore', 
                   suppress_warnings=True)
model_fit = model.fit(last_two_days)

# Forecast for one week (7 days = 7 * 12 = 84 data points at 2-hour intervals)
forecast_steps = 84
forecast = model.predict(n_periods=forecast_steps)

# Confidence intervals from the fitted model
forecast_obj = model_fit.predict(n_periods=forecast_steps, return_conf_int=True)
forecast, conf_int = forecast_obj[0], forecast_obj[1]
ci_lower = conf_int[:, 0]  # Límite inferior
ci_upper = conf_int[:, 1]  # Límite superior

# Create a date range for the forecast
last_date = last_two_days.index[-1]
forecast_dates = pd.date_range(start=last_date + timedelta(hours=2), periods=forecast_steps, freq='2H')

# Create a DataFrame for the forecast with confidence intervals
forecast_ci_df = pd.DataFrame({
    'HULL_Predicho': forecast,
    'CI_Lower': ci_lower,
    'CI_Upper': ci_upper
}, index=forecast_dates)

# Combine historical and forecast data
historical_pred_df = pd.DataFrame({
    'HULL_Historico': last_two_days,
    'HULL_Predicho': model.predict_in_sample()  # Predicciones en la muestra
})

combined_df = pd.concat([historical_pred_df, forecast_ci_df], axis=0)

# Visualize the results
fig = go.Figure()

# Add historical values
fig.add_trace(go.Scatter(
    x=combined_df.index,
    y=combined_df['HULL_Historico'],
    mode='lines',
    name='HULL Histórico',
    line=dict(color='blue')
))

# Add predicted values for historical data
fig.add_trace(go.Scatter(
    x=combined_df.index[:len(last_two_days)],
    y=combined_df['HULL_Predicho'][:len(last_two_days)],
    mode='lines',
    name='HULL Predicho (Actual)',
    line=dict(color='green', dash='dash')
))

# Add future forecast
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['HULL_Predicho'],
    mode='lines',
    name='HULL Predicho (Futuro)',
    line=dict(color='red')
))

# Add 95% confidence interval
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Upper'],
    mode='lines',
    name='Límite Superior (95% IC)',
    line=dict(color='gray', width=0.5)
))

fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Lower'],
    mode='lines',
    name='Límite Inferior (95% IC)',
    line=dict(color='gray', width=0.5),
    fill='tonexty',
    fillcolor='rgba(200, 100, 200, 0.1)'
))

# Customize the plot
fig.update_layout(
    title='Predicción del HULL utilizando Auto-ARIMA con Intervalo de Confianza del 95%',
    xaxis_title='Fecha',
    yaxis_title='HULL',
    hovermode='x unified',
    template="plotly_dark",
    showlegend=True
)

# Show the plot
fig.show()

# Calculate error metrics on the test set
pred_test = model.predict(n_periods=len(test))
rmse = sqrt(mean_squared_error(test, pred_test))
mae = np.mean(np.abs(test - pred_test))
print(f"RMSE on test set: {rmse}")
print(f"MAE on test set: {mae}")

# Naive forecast: Use the last value of the training set for all test predictions
naive_forecast = np.full_like(test, test.iloc[0])

# Calculate error metrics for the naive forecast
rmse_naive = sqrt(mean_squared_error(test, naive_forecast))
mae_naive = np.mean(np.abs(test - naive_forecast))

print(f"Naive Forecast RMSE: {rmse_naive}")
print(f"Naive Forecast MAE: {mae_naive}")

# Print maximum and minimum of the series
print(f"Máximo: {hull_data['HULL'].max()}")
print(f"Mínimo: {hull_data['HULL'].min()}")

# Mostrar el orden seleccionado por auto_arima
print(f"Orden ARIMA seleccionado: {model.order}")

Resultados de la prueba de Dickey-Fuller para columna: HULL
Test Statistic                        -2.917836
p-value                                0.043313
No Lags Used                           0.000000
Número de observaciones utilizadas    66.000000
Critical Value (1%)                   -3.533560
Critical Value (5%)                   -2.906444
Critical Value (10%)                  -2.590724
dtype: float64
Conclusión: Los datos son estacionarios
Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=188.664, Time=0.02 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=inf, Time=0.09 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=inf, Time=0.09 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=77.857, Time=0.19 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=79.634, Time=0.19 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=inf, Time=0.26 sec
 ARIMA(0,0,2)(0,0,0)[0]             : AIC=147.384, Time=0.17 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=inf, Time=0.07 sec
 ARI

RMSE on test set: 1.8817469605811574
MAE on test set: 1.5673824656882795
Naive Forecast RMSE: 1.4805364415505442
Naive Forecast MAE: 1.0768823062672335
Máximo: 16.125499725341797
Mínimo: 6.952499866485596
Orden ARIMA seleccionado: (0, 0, 2)


In [8]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pmdarima import auto_arima  # Importamos auto_arima
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime, timedelta

# Cargar datos
data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])
data = data.head(168)
data.set_index('date', inplace=True)
data_2h = data.resample('2H').mean()
hull_data = data_2h[['MUFL']].dropna()  # Usamos MUFL como en el código anterior

# Split the data into training and test sets (80% train, 20% test)
total_rows = len(hull_data)
train_size = int(total_rows * 0.8)
test_size = total_rows - train_size
train = hull_data['MUFL'][:train_size]
test = hull_data['MUFL'][train_size:]

# Stationarity Testing (Dickey-Fuller)
def dickey_fuller_test(series, column_name):
    print(f'Resultados de la prueba de Dickey-Fuller para columna: {column_name}')
    dftest = adfuller(series, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', 'No Lags Used', 'Número de observaciones utilizadas'])
    for key, value in dftest[4].items():
        dfoutput[f'Critical Value ({key})'] = value
    print(dfoutput)
    if dftest[1] <= 0.05:
        print("Conclusión: Los datos son estacionarios")
    else:
        print("Conclusión: Los datos no son estacionarios")
    return dftest[1]

# Get p-value from test
p_value = dickey_fuller_test(train, "MUFL")

# Use the last two days (24 points at 2-hour intervals) for modeling
last_two_days = train[-24:]

# Fit auto_arima model - Optimiza automáticamente p, d, q
model = auto_arima(last_two_days, 
                   seasonal=False,  # No asumimos estacionalidad
                   start_p=0, start_q=0, max_p=5, max_q=5,  # Límites para p y q
                   d=None,  # Deja que auto_arima determine d, pero sabemos que es estacionaria
                   trace=True,  # Muestra el proceso de selección
                   error_action='ignore', 
                   suppress_warnings=True)
model_fit = model.fit(last_two_days)

# Forecast for one week (7 days = 7 * 12 = 84 data points at 2-hour intervals)
forecast_steps = 84
forecast = model.predict(n_periods=forecast_steps)

# Confidence intervals from the fitted model
forecast_obj = model_fit.predict(n_periods=forecast_steps, return_conf_int=True)
forecast, conf_int = forecast_obj[0], forecast_obj[1]
ci_lower = conf_int[:, 0]  # Límite inferior
ci_upper = conf_int[:, 1]  # Límite superior

# Create a date range for the forecast
last_date = last_two_days.index[-1]
forecast_dates = pd.date_range(start=last_date + timedelta(hours=2), periods=forecast_steps, freq='2H')

# Create a DataFrame for the forecast with confidence intervals
forecast_ci_df = pd.DataFrame({
    'MUFL_Predicho': forecast,
    'CI_Lower': ci_lower,
    'CI_Upper': ci_upper
}, index=forecast_dates)

# Combine historical and forecast data
historical_pred_df = pd.DataFrame({
    'MUFL_Historico': last_two_days,
    'MUFL_Predicho': model.predict_in_sample()  # Predicciones en la muestra
})

combined_df = pd.concat([historical_pred_df, forecast_ci_df], axis=0)

# Visualize the results
fig = go.Figure()

# Add historical values
fig.add_trace(go.Scatter(
    x=combined_df.index,
    y=combined_df['MUFL_Historico'],
    mode='lines',
    name='MUFL Histórico',
    line=dict(color='blue')
))

# Add predicted values for historical data
fig.add_trace(go.Scatter(
    x=combined_df.index[:len(last_two_days)],
    y=combined_df['MUFL_Predicho'][:len(last_two_days)],
    mode='lines',
    name='MUFL Predicho (Actual)',
    line=dict(color='green', dash='dash')
))

# Add future forecast
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['MUFL_Predicho'],
    mode='lines',
    name='MUFL Predicho (Futuro)',
    line=dict(color='red')
))

# Add 95% confidence interval
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Upper'],
    mode='lines',
    name='Límite Superior (95% IC)',
    line=dict(color='gray', width=0.5)
))

fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Lower'],
    mode='lines',
    name='Límite Inferior (95% IC)',
    line=dict(color='gray', width=0.5),
    fill='tonexty',
    fillcolor='rgba(200, 100, 200, 0.1)'
))

# Customize the plot
fig.update_layout(
    title='Predicción del MUFL utilizando Auto-ARIMA con Intervalo de Confianza del 95%',
    xaxis_title='Fecha',
    yaxis_title='MUFL',
    hovermode='x unified',
    template="plotly_dark",
    showlegend=True
)

# Show the plot
fig.show()

# Calculate error metrics on the test set
pred_test = model.predict(n_periods=len(test))
rmse = sqrt(mean_squared_error(test, pred_test))
mae = np.mean(np.abs(test - pred_test))
print(f"RMSE on test set: {rmse}")
print(f"MAE on test set: {mae}")

# Naive forecast: Use the last value of the training set for all test predictions
naive_forecast = np.full_like(test, test.iloc[0])

# Calculate error metrics for the naive forecast
rmse_naive = sqrt(mean_squared_error(test, naive_forecast))
mae_naive = np.mean(np.abs(test - naive_forecast))

print(f"Naive Forecast RMSE: {rmse_naive}")
print(f"Naive Forecast MAE: {mae_naive}")

# Print maximum and minimum of the series
print(f"Máximo: {hull_data['MUFL'].max()}")
print(f"Mínimo: {hull_data['MUFL'].min()}")

# Mostrar el orden seleccionado por auto_arima
print(f"Orden ARIMA seleccionado: {model.order}")

Resultados de la prueba de Dickey-Fuller para columna: MUFL
Test Statistic                        -2.803591
p-value                                0.057763
No Lags Used                           0.000000
Número de observaciones utilizadas    66.000000
Critical Value (1%)                   -3.533560
Critical Value (5%)                   -2.906444
Critical Value (10%)                  -2.590724
dtype: float64
Conclusión: Los datos no son estacionarios
Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=241.405, Time=0.05 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=inf, Time=0.38 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=inf, Time=0.48 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=113.226, Time=0.18 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=inf, Time=0.53 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=inf, Time=0.44 sec
 ARIMA(0,0,2)(0,0,0)[0]             : AIC=inf, Time=0.37 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=inf, Time=0.09 sec
 ARIMA(

RMSE on test set: 5.6682848837493
MAE on test set: 5.085785818440292
Naive Forecast RMSE: 3.313182758253987
Naive Forecast MAE: 2.6640883053050324
Máximo: 43.98850059509277
Mínimo: 27.113999366760254
Orden ARIMA seleccionado: (1, 0, 0)


In [9]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pmdarima import auto_arima  # Importamos auto_arima
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime, timedelta

# Cargar datos
data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])
data = data.head(168)
data.set_index('date', inplace=True)
data_2h = data.resample('2H').mean()
hull_data = data_2h[['MULL']].dropna()  # Usamos MULL como en el código anterior

# Split the data into training and test sets (80% train, 20% test)
total_rows = len(hull_data)
train_size = int(total_rows * 0.8)
test_size = total_rows - train_size
train = hull_data['MULL'][:train_size]
test = hull_data['MULL'][train_size:]

# Stationarity Testing (Dickey-Fuller)
def dickey_fuller_test(series, column_name):
    print(f'Resultados de la prueba de Dickey-Fuller para columna: {column_name}')
    dftest = adfuller(series, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', 'No Lags Used', 'Número de observaciones utilizadas'])
    for key, value in dftest[4].items():
        dfoutput[f'Critical Value ({key})'] = value
    print(dfoutput)
    if dftest[1] <= 0.05:
        print("Conclusión: Los datos son estacionarios")
    else:
        print("Conclusión: Los datos no son estacionarios")
    return dftest[1]

# Get p-value from test
p_value = dickey_fuller_test(train, "MULL")

# Use the last two days (24 points at 2-hour intervals) for modeling
last_two_days = train[-24:]

# Fit auto_arima model - Optimiza automáticamente p, d, q
model = auto_arima(last_two_days, 
                   seasonal=False,  # No asumimos estacionalidad
                   start_p=0, start_q=0, max_p=5, max_q=5,  # Límites para p y q
                   d=None,  # Deja que auto_arima determine d, pero sabemos que es estacionaria
                   trace=True,  # Muestra el proceso de selección
                   error_action='ignore', 
                   suppress_warnings=True)
model_fit = model.fit(last_two_days)

# Forecast for one week (7 days = 7 * 12 = 84 data points at 2-hour intervals)
forecast_steps = 84
forecast = model.predict(n_periods=forecast_steps)

# Confidence intervals from the fitted model
forecast_obj = model_fit.predict(n_periods=forecast_steps, return_conf_int=True)
forecast, conf_int = forecast_obj[0], forecast_obj[1]
ci_lower = conf_int[:, 0]  # Límite inferior
ci_upper = conf_int[:, 1]  # Límite superior

# Create a date range for the forecast
last_date = last_two_days.index[-1]
forecast_dates = pd.date_range(start=last_date + timedelta(hours=2), periods=forecast_steps, freq='2H')

# Create a DataFrame for the forecast with confidence intervals
forecast_ci_df = pd.DataFrame({
    'MULL_Predicho': forecast,
    'CI_Lower': ci_lower,
    'CI_Upper': ci_upper
}, index=forecast_dates)

# Combine historical and forecast data
historical_pred_df = pd.DataFrame({
    'MULL_Historico': last_two_days,
    'MULL_Predicho': model.predict_in_sample()  # Predicciones en la muestra
})

combined_df = pd.concat([historical_pred_df, forecast_ci_df], axis=0)

# Visualize the results
fig = go.Figure()

# Add historical values
fig.add_trace(go.Scatter(
    x=combined_df.index,
    y=combined_df['MULL_Historico'],
    mode='lines',
    name='MULL Histórico',
    line=dict(color='blue')
))

# Add predicted values for historical data
fig.add_trace(go.Scatter(
    x=combined_df.index[:len(last_two_days)],
    y=combined_df['MULL_Predicho'][:len(last_two_days)],
    mode='lines',
    name='MULL Predicho (Actual)',
    line=dict(color='green', dash='dash')
))

# Add future forecast
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['MULL_Predicho'],
    mode='lines',
    name='MULL Predicho (Futuro)',
    line=dict(color='red')
))

# Add 95% confidence interval
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Upper'],
    mode='lines',
    name='Límite Superior (95% IC)',
    line=dict(color='gray', width=0.5)
))

fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Lower'],
    mode='lines',
    name='Límite Inferior (95% IC)',
    line=dict(color='gray', width=0.5),
    fill='tonexty',
    fillcolor='rgba(200, 100, 200, 0.1)'
))

# Customize the plot
fig.update_layout(
    title='Predicción del MULL utilizando Auto-ARIMA con Intervalo de Confianza del 95%',
    xaxis_title='Fecha',
    yaxis_title='MULL',
    hovermode='x unified',
    template="plotly_dark",
    showlegend=True
)

# Show the plot
fig.show()

# Calculate error metrics on the test set
pred_test = model.predict(n_periods=len(test))
rmse = sqrt(mean_squared_error(test, pred_test))
mae = np.mean(np.abs(test - pred_test))
print(f"RMSE on test set: {rmse}")
print(f"MAE on test set: {mae}")

# Naive forecast: Use the last value of the training set for all test predictions
naive_forecast = np.full_like(test, test.iloc[0])

# Calculate error metrics for the naive forecast
rmse_naive = sqrt(mean_squared_error(test, naive_forecast))
mae_naive = np.mean(np.abs(test - naive_forecast))

print(f"Naive Forecast RMSE: {rmse_naive}")
print(f"Naive Forecast MAE: {mae_naive}")

# Print maximum and minimum of the series
print(f"Máximo: {hull_data['MULL'].max()}")
print(f"Mínimo: {hull_data['MULL'].min()}")

# Mostrar el orden seleccionado por auto_arima
print(f"Orden ARIMA seleccionado: {model.order}")

Resultados de la prueba de Dickey-Fuller para columna: MULL
Test Statistic                        -2.847443
p-value                                0.051810
No Lags Used                           0.000000
Número de observaciones utilizadas    66.000000
Critical Value (1%)                   -3.533560
Critical Value (5%)                   -2.906444
Critical Value (10%)                  -2.590724
dtype: float64
Conclusión: Los datos no son estacionarios
Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=179.370, Time=0.03 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=inf, Time=0.07 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=inf, Time=0.16 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=74.920, Time=0.13 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=76.504, Time=0.11 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=inf, Time=0.29 sec
 ARIMA(0,0,2)(0,0,0)[0]             : AIC=139.287, Time=0.14 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=inf, Time=0.06 sec
 

RMSE on test set: 1.2604189916529889
MAE on test set: 0.9609899365682194
Naive Forecast RMSE: 1.1077229151357082
Naive Forecast MAE: 0.7970882443820729
Máximo: 12.638999938964844
Mínimo: 5.307500123977661
Orden ARIMA seleccionado: (1, 0, 0)


In [10]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pmdarima import auto_arima  # Importamos auto_arima
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime, timedelta

# Cargar datos
data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])
data = data.head(168)
data.set_index('date', inplace=True)
data_2h = data.resample('2H').mean()
hull_data = data_2h[['LUFL']].dropna()  # Usamos LUFL como en el código anterior

# Split the data into training and test sets (80% train, 20% test)
total_rows = len(hull_data)
train_size = int(total_rows * 0.8)
test_size = total_rows - train_size
train = hull_data['LUFL'][:train_size]
test = hull_data['LUFL'][train_size:]

# Stationarity Testing (Dickey-Fuller)
def dickey_fuller_test(series, column_name):
    print(f'Resultados de la prueba de Dickey-Fuller para columna: {column_name}')
    dftest = adfuller(series, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', 'No Lags Used', 'Número de observaciones utilizadas'])
    for key, value in dftest[4].items():
        dfoutput[f'Critical Value ({key})'] = value
    print(dfoutput)
    if dftest[1] <= 0.05:
        print("Conclusión: Los datos son estacionarios")
    else:
        print("Conclusión: Los datos no son estacionarios")
    return dftest[1]

# Get p-value from test
p_value = dickey_fuller_test(train, "LUFL")

# Use the last two days (24 points at 2-hour intervals) for modeling
last_two_days = train[-24:]

# Fit auto_arima model - Optimiza automáticamente p, d, q
model = auto_arima(last_two_days, 
                   seasonal=False,  # No asumimos estacionalidad
                   start_p=0, start_q=0, max_p=5, max_q=5,  # Límites para p y q
                   d=None,  # Deja que auto_arima determine d, pero sabemos que es estacionaria
                   trace=True,  # Muestra el proceso de selección
                   error_action='ignore', 
                   suppress_warnings=True)
model_fit = model.fit(last_two_days)

# Forecast for one week (7 days = 7 * 12 = 84 data points at 2-hour intervals)
forecast_steps = 84
forecast = model.predict(n_periods=forecast_steps)

# Confidence intervals from the fitted model
forecast_obj = model_fit.predict(n_periods=forecast_steps, return_conf_int=True)
forecast, conf_int = forecast_obj[0], forecast_obj[1]
ci_lower = conf_int[:, 0]  # Límite inferior
ci_upper = conf_int[:, 1]  # Límite superior

# Create a date range for the forecast
last_date = last_two_days.index[-1]
forecast_dates = pd.date_range(start=last_date + timedelta(hours=2), periods=forecast_steps, freq='2H')

# Create a DataFrame for the forecast with confidence intervals
forecast_ci_df = pd.DataFrame({
    'LUFL_Predicho': forecast,
    'CI_Lower': ci_lower,
    'CI_Upper': ci_upper
}, index=forecast_dates)

# Combine historical and forecast data
historical_pred_df = pd.DataFrame({
    'LUFL_Historico': last_two_days,
    'LUFL_Predicho': model.predict_in_sample()  # Predicciones en la muestra
})

combined_df = pd.concat([historical_pred_df, forecast_ci_df], axis=0)

# Visualize the results
fig = go.Figure()

# Add historical values
fig.add_trace(go.Scatter(
    x=combined_df.index,
    y=combined_df['LUFL_Historico'],
    mode='lines',
    name='LUFL Histórico',
    line=dict(color='blue')
))

# Add predicted values for historical data
fig.add_trace(go.Scatter(
    x=combined_df.index[:len(last_two_days)],
    y=combined_df['LUFL_Predicho'][:len(last_two_days)],
    mode='lines',
    name='LUFL Predicho (Actual)',
    line=dict(color='green', dash='dash')
))

# Add future forecast
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['LUFL_Predicho'],
    mode='lines',
    name='LUFL Predicho (Futuro)',
    line=dict(color='red')
))

# Add 95% confidence interval
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Upper'],
    mode='lines',
    name='Límite Superior (95% IC)',
    line=dict(color='gray', width=0.5)
))

fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Lower'],
    mode='lines',
    name='Límite Inferior (95% IC)',
    line=dict(color='gray', width=0.5),
    fill='tonexty',
    fillcolor='rgba(200, 100, 200, 0.1)'
))

# Customize the plot
fig.update_layout(
    title='Predicción del LUFL utilizando Auto-ARIMA con Intervalo de Confianza del 95%',
    xaxis_title='Fecha',
    yaxis_title='LUFL',
    hovermode='x unified',
    template="plotly_dark",
    showlegend=True
)

# Show the plot
fig.show()

# Calculate error metrics on the test set
pred_test = model.predict(n_periods=len(test))
rmse = sqrt(mean_squared_error(test, pred_test))
mae = np.mean(np.abs(test - pred_test))
print(f"RMSE on test set: {rmse}")
print(f"MAE on test set: {mae}")

# Naive forecast: Use the last value of the training set for all test predictions
naive_forecast = np.full_like(test, test.iloc[0])

# Calculate error metrics for the naive forecast
rmse_naive = sqrt(mean_squared_error(test, naive_forecast))
mae_naive = np.mean(np.abs(test - naive_forecast))

print(f"Naive Forecast RMSE: {rmse_naive}")
print(f"Naive Forecast MAE: {mae_naive}")

# Print maximum and minimum of the series
print(f"Máximo: {hull_data['LUFL'].max()}")
print(f"Mínimo: {hull_data['LUFL'].min()}")

# Mostrar el orden seleccionado por auto_arima
print(f"Orden ARIMA seleccionado: {model.order}")

Resultados de la prueba de Dickey-Fuller para columna: LUFL
Test Statistic                        -3.529044
p-value                                0.007272
No Lags Used                           0.000000
Número de observaciones utilizadas    66.000000
Critical Value (1%)                   -3.533560
Critical Value (5%)                   -2.906444
Critical Value (10%)                  -2.590724
dtype: float64
Conclusión: Los datos son estacionarios
Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=124.241, Time=0.02 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=51.789, Time=0.03 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=inf, Time=0.10 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=52.887, Time=0.10 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=52.951, Time=0.06 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=54.865, Time=0.11 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=44.198, Time=0.08 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=47.000, Time=0.04 

RMSE on test set: 0.5146632692527527
MAE on test set: 0.4418378793228457
Naive Forecast RMSE: 0.612142233890129
Naive Forecast MAE: 0.4763235134236953
Máximo: 4.450999975204468
Mínimo: 0.0
Orden ARIMA seleccionado: (3, 0, 1)


In [11]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pmdarima import auto_arima  # Importamos auto_arima
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime, timedelta

# Cargar datos
data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])
data = data.head(168)
data.set_index('date', inplace=True)
data_2h = data.resample('2H').mean()
hull_data = data_2h[['LULL']].dropna()  # Usamos LULL como en el código anterior

# Split the data into training and test sets (80% train, 20% test)
total_rows = len(hull_data)
train_size = int(total_rows * 0.8)
test_size = total_rows - train_size
train = hull_data['LULL'][:train_size]
test = hull_data['LULL'][train_size:]

# Stationarity Testing (Dickey-Fuller)
def dickey_fuller_test(series, column_name):
    print(f'Resultados de la prueba de Dickey-Fuller para columna: {column_name}')
    dftest = adfuller(series, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', 'No Lags Used', 'Número de observaciones utilizadas'])
    for key, value in dftest[4].items():
        dfoutput[f'Critical Value ({key})'] = value
    print(dfoutput)
    if dftest[1] <= 0.05:
        print("Conclusión: Los datos son estacionarios")
    else:
        print("Conclusión: Los datos no son estacionarios")
    return dftest[1]

# Get p-value from test
p_value = dickey_fuller_test(train, "LULL")

# Use the last two days (24 points at 2-hour intervals) for modeling
last_two_days = train[-24:]

# Fit auto_arima model - Optimiza automáticamente p, d, q
model = auto_arima(last_two_days, 
                   seasonal=False,  # No asumimos estacionalidad
                   start_p=0, start_q=0, max_p=5, max_q=5,  # Límites para p y q
                   d=None,  # Deja que auto_arima determine d, pero sabemos que es estacionaria
                   trace=True,  # Muestra el proceso de selección
                   error_action='ignore', 
                   suppress_warnings=True)
model_fit = model.fit(last_two_days)

# Forecast for one week (7 days = 7 * 12 = 84 data points at 2-hour intervals)
forecast_steps = 84
forecast = model.predict(n_periods=forecast_steps)

# Confidence intervals from the fitted model
forecast_obj = model_fit.predict(n_periods=forecast_steps, return_conf_int=True)
forecast, conf_int = forecast_obj[0], forecast_obj[1]
ci_lower = conf_int[:, 0]  # Límite inferior
ci_upper = conf_int[:, 1]  # Límite superior

# Create a date range for the forecast
last_date = last_two_days.index[-1]
forecast_dates = pd.date_range(start=last_date + timedelta(hours=2), periods=forecast_steps, freq='2H')

# Create a DataFrame for the forecast with confidence intervals
forecast_ci_df = pd.DataFrame({
    'LULL_Predicho': forecast,
    'CI_Lower': ci_lower,
    'CI_Upper': ci_upper
}, index=forecast_dates)

# Combine historical and forecast data
historical_pred_df = pd.DataFrame({
    'LULL_Historico': last_two_days,
    'LULL_Predicho': model.predict_in_sample()  # Predicciones en la muestra
})

combined_df = pd.concat([historical_pred_df, forecast_ci_df], axis=0)

# Visualize the results
fig = go.Figure()

# Add historical values
fig.add_trace(go.Scatter(
    x=combined_df.index,
    y=combined_df['LULL_Historico'],
    mode='lines',
    name='LULL Histórico',
    line=dict(color='blue')
))

# Add predicted values for historical data
fig.add_trace(go.Scatter(
    x=combined_df.index[:len(last_two_days)],
    y=combined_df['LULL_Predicho'][:len(last_two_days)],
    mode='lines',
    name='LULL Predicho (Actual)',
    line=dict(color='green', dash='dash')
))

# Add future forecast
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['LULL_Predicho'],
    mode='lines',
    name='LULL Predicho (Futuro)',
    line=dict(color='red')
))

# Add 95% confidence interval
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Upper'],
    mode='lines',
    name='Límite Superior (95% IC)',
    line=dict(color='gray', width=0.5)
))

fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Lower'],
    mode='lines',
    name='Límite Inferior (95% IC)',
    line=dict(color='gray', width=0.5),
    fill='tonexty',
    fillcolor='rgba(200, 100, 200, 0.1)'
))

# Customize the plot
fig.update_layout(
    title='Predicción del LULL utilizando Auto-ARIMA con Intervalo de Confianza del 95%',
    xaxis_title='Fecha',
    yaxis_title='LULL',
    hovermode='x unified',
    template="plotly_dark",
    showlegend=True
)

# Show the plot
fig.show()

# Calculate error metrics on the test set
pred_test = model.predict(n_periods=len(test))
rmse = sqrt(mean_squared_error(test, pred_test))
mae = np.mean(np.abs(test - pred_test))
print(f"RMSE on test set: {rmse}")
print(f"MAE on test set: {mae}")

# Naive forecast: Use the last value of the training set for all test predictions
naive_forecast = np.full_like(test, test.iloc[0])

# Calculate error metrics for the naive forecast
rmse_naive = sqrt(mean_squared_error(test, naive_forecast))
mae_naive = np.mean(np.abs(test - naive_forecast))

print(f"Naive Forecast RMSE: {rmse_naive}")
print(f"Naive Forecast MAE: {mae_naive}")

# Print maximum and minimum of the series
print(f"Máximo: {hull_data['LULL'].max()}")
print(f"Mínimo: {hull_data['LULL'].min()}")

# Mostrar el orden seleccionado por auto_arima
print(f"Orden ARIMA seleccionado: {model.order}")

Resultados de la prueba de Dickey-Fuller para columna: LULL
Test Statistic                        -4.688012
p-value                                0.000089
No Lags Used                           1.000000
Número de observaciones utilizadas    65.000000
Critical Value (1%)                   -3.535217
Critical Value (5%)                   -2.907154
Critical Value (10%)                  -2.591103
dtype: float64
Conclusión: Los datos son estacionarios
Performing stepwise search to minimize aic
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=21.945, Time=0.04 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=23.572, Time=0.05 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=23.689, Time=0.05 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=inf, Time=0.42 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=18.991, Time=0.07 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=20.828, Time=0.09 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=20.842, Time=0.09 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=22.771, Time=0.39 s

RMSE on test set: 0.37231282896641743
MAE on test set: 0.272882352101779
Naive Forecast RMSE: 0.41643249796880566
Naive Forecast MAE: 0.1890588227440329
Máximo: 1.3269999623298645
Mínimo: 0.0
Orden ARIMA seleccionado: (0, 0, 0)


In [12]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime, timedelta

# Cargar datos
data = pd.read_csv("https://huggingface.co/datasets/pkr7098/time-series-forecasting-datasets/resolve/main/ETTh2.csv")
data['date'] = pd.to_datetime(data['date'])
data = data.head(168)
data.set_index('date', inplace=True)
data_2h = data.resample('2H').mean()
hull_data = data_2h[['LULL']].dropna()  # Usamos LULL

# Split the data into training and test sets (80% train, 20% test)
total_rows = len(hull_data)
train_size = int(total_rows * 0.8)
test_size = total_rows - train_size
train = hull_data['LULL'][:train_size]
test = hull_data['LULL'][train_size:]

# Stationarity Testing (Dickey-Fuller)
def dickey_fuller_test(series, column_name):
    print(f'Resultados de la prueba de Dickey-Fuller para columna: {column_name}')
    dftest = adfuller(series, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', 'No Lags Used', 'Número de observaciones utilizadas'])
    for key, value in dftest[4].items():
        dfoutput[f'Critical Value ({key})'] = value
    print(dfoutput)
    if dftest[1] <= 0.05:
        print("Conclusión: Los datos son estacionarios")
    else:
        print("Conclusión: Los datos no son estacionarios")
    return dftest[1]

# Get p-value from test
p_value = dickey_fuller_test(train, "LULL")

# Use the last two days (24 points at 2-hour intervals) for modeling
last_two_days = train[-24:]

# Fit Simple Exponential Smoothing model
model = SimpleExpSmoothing(last_two_days)
model_fit = model.fit(optimized=True)  # Optimiza automáticamente el parámetro de suavizado (alpha)

# Forecast for one week (7 days = 7 * 12 = 84 data points at 2-hour intervals)
forecast_steps = 84
forecast = model_fit.forecast(steps=forecast_steps)

# Confidence intervals (aproximación manual, SES no los proporciona directamente)
fitted_values = model_fit.fittedvalues
errors = last_two_days - fitted_values
std_errors = np.std(errors)
ci_lower = forecast - 1.96 * std_errors  # 95% IC
ci_upper = forecast + 1.96 * std_errors

# Create a date range for the forecast
last_date = last_two_days.index[-1]
forecast_dates = pd.date_range(start=last_date + timedelta(hours=2), periods=forecast_steps, freq='2H')

# Create a DataFrame for the forecast with confidence intervals
forecast_ci_df = pd.DataFrame({
    'LULL_Predicho': forecast,
    'CI_Lower': ci_lower,
    'CI_Upper': ci_upper
}, index=forecast_dates)

# Combine historical and forecast data
historical_pred_df = pd.DataFrame({
    'LULL_Historico': last_two_days,
    'LULL_Predicho': model_fit.fittedvalues
})

combined_df = pd.concat([historical_pred_df, forecast_ci_df], axis=0)

# Visualize the results
fig = go.Figure()

# Add historical values
fig.add_trace(go.Scatter(
    x=combined_df.index,
    y=combined_df['LULL_Historico'],
    mode='lines',
    name='LULL Histórico',
    line=dict(color='blue')
))

# Add predicted values for historical data
fig.add_trace(go.Scatter(
    x=combined_df.index[:len(last_two_days)],
    y=combined_df['LULL_Predicho'][:len(last_two_days)],
    mode='lines',
    name='LULL Predicho (Actual)',
    line=dict(color='green', dash='dash')
))

# Add future forecast
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['LULL_Predicho'],
    mode='lines',
    name='LULL Predicho (Futuro)',
    line=dict(color='red')
))

# Add 95% confidence interval
fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Upper'],
    mode='lines',
    name='Límite Superior (95% IC)',
    line=dict(color='gray', width=0.5)
))

fig.add_trace(go.Scatter(
    x=forecast_ci_df.index,
    y=forecast_ci_df['CI_Lower'],
    mode='lines',
    name='Límite Inferior (95% IC)',
    line=dict(color='gray', width=0.5),
    fill='tonexty',
    fillcolor='rgba(200, 100, 200, 0.1)'
))

# Customize the plot
fig.update_layout(
    title='Predicción del LULL utilizando Suavizado Exponencial Simple con Intervalo de Confianza del 95%',
    xaxis_title='Fecha',
    yaxis_title='LULL',
    hovermode='x unified',
    template="plotly_dark",
    showlegend=True
)

# Show the plot
fig.show()

# Calculate error metrics on the test set
pred_test = model_fit.forecast(steps=len(test))
rmse = sqrt(mean_squared_error(test, pred_test))
mae = np.mean(np.abs(test - pred_test))
print(f"RMSE on test set: {rmse}")
print(f"MAE on test set: {mae}")

# Naive forecast: Use the last value of the training set for all test predictions
naive_forecast = np.full_like(test, test.iloc[0])

# Calculate error metrics for the naive forecast
rmse_naive = sqrt(mean_squared_error(test, naive_forecast))
mae_naive = np.mean(np.abs(test - naive_forecast))

print(f"Naive Forecast RMSE: {rmse_naive}")
print(f"Naive Forecast MAE: {mae_naive}")

# Print maximum and minimum of the series
print(f"Máximo: {hull_data['LULL'].max()}")
print(f"Mínimo: {hull_data['LULL'].min()}")

# Mostrar el parámetro alpha optimizado
print(f"Parámetro de suavizado (alpha): {model_fit.params['smoothing_level']}")

Resultados de la prueba de Dickey-Fuller para columna: LULL
Test Statistic                        -4.688012
p-value                                0.000089
No Lags Used                           1.000000
Número de observaciones utilizadas    65.000000
Critical Value (1%)                   -3.535217
Critical Value (5%)                   -2.907154
Critical Value (10%)                  -2.591103
dtype: float64
Conclusión: Los datos son estacionarios


RMSE on test set: 0.3749123076234599
MAE on test set: 0.2607057668278973
Naive Forecast RMSE: 0.41643249796880566
Naive Forecast MAE: 0.1890588227440329
Máximo: 1.3269999623298645
Mínimo: 0.0
Parámetro de suavizado (alpha): 0.07074985106811812
