# ARIMA Model Example for Time Series Forecasting

This notebook demonstrates how to use the `arima_analyzer` module to perform time series forecasting using an ARIMA model.
We will use sample monthly sales data.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sys

# Make the project-local `arima_analyzer` module importable by adding the
# sibling `src` directory to the module search path. The path is relative, so
# it is resolved against the kernel's working directory: this only works when
# the notebook is run from the folder it lives in.
sys.path.append('../src') 
import arima_analyzer as arima

# Configure plots: apply a white-grid style sheet to all figures created below.
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require
# matplotlib >= 3.6 — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-whitegrid')
%matplotlib inline

## 1. Load Data

Load the sample monthly sales data using the `load_data` function from our module.

In [None]:
# Where the sample dataset lives and which columns hold the dates and values.
data_filepath = '../data/sample_monthly_sales.csv'
date_col = 'Month'
value_col = 'Sales'

# Every later cell treats `ts is None` as "loading failed", so the result is
# checked right away.
ts = arima.load_data(data_filepath, date_col, value_col)

if ts is None:
    print("Failed to load data.")
else:
    print("Data loaded successfully:")
    print(ts.head())

    # Quick visual sanity check of the raw series.
    sales_ax = ts.plot(figsize=(12, 6), title='Monthly Sales Data')
    sales_ax.set_xlabel('Month')
    sales_ax.set_ylabel('Sales')
    plt.show()

## 2. Check for Stationarity

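We'll use the Augmented Dickey-Fuller (ADF) test to check whether the time series is stationary. The test's null hypothesis is that the series has a unit root (i.e., is non-stationary), so a p-value below 0.05 is taken as evidence that the series is stationary.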

In [None]:
if ts is None:
    print("Timeseries not loaded, skipping stationarity check.")
else:
    print("Checking stationarity of the original series:")
    # The result is kept so the optional follow-up below can branch on it.
    p_value_orig = arima.check_stationarity(ts)

    # Optional follow-up (disabled): difference the series once when the test
    # suggests non-stationarity. auto_arima can handle differencing on its own,
    # so this is only useful for manual inspection.
    #
    # if p_value_orig is not None and p_value_orig > 0.05:
    #     print("\nSeries is non-stationary. Let's try differencing once.")
    #     ts_diff, d_val = arima.make_stationary(ts, d=1)
    #     if ts_diff is not None:
    #         print(f"Differenced series (d={d_val}):")
    #         ts_diff.plot(figsize=(12,6), title=f'Differenced Sales Data (d={d_val})')
    #         plt.show()
    #         print("Checking stationarity of the differenced series:")
    #         arima.check_stationarity(ts_diff)
    # else:
    #     print("\nSeries is likely stationary or stationarity check failed.")

## 3. Find Optimal ARIMA Parameters

We'll use `auto_arima` from the `pmdarima` package (wrapped in our module) to find the best (p,d,q) and seasonal (P,D,Q,m) parameters.
For this dataset, we'll assume a yearly seasonal pattern, i.e. a seasonal period of m=12 observations for monthly data.

In [None]:
# Results of the automatic order search; both stay None when the search is
# skipped, and later cells check `optimal_order` for None before fitting.
optimal_order = None
optimal_seasonal_order = None

if ts is not None:
    # The order search runs on the full series so that it sees as much data as
    # possible; the model itself is fitted on the training portion only.
    # Caveat: the search therefore also sees the observations that are held out
    # for testing later in the notebook, so the reported test error can be
    # optimistic. Pass only the training portion here for a strict evaluation.
    print(f"Finding optimal parameters using data up to index {len(ts)-1} (the full series)...")

    # Search settings. NOTE(review): these keyword arguments are assumed to be
    # forwarded unchanged to pmdarima.auto_arima by the wrapper — confirm in
    # arima_analyzer.
    #   m=12           seasonal period: 12 observations per cycle (monthly data)
    #   seasonal=True  include seasonal (P, D, Q) terms in the search
    #   D=1            seasonal differencing order; under auto_arima an explicit
    #                  value is used as given, and only D=None is estimated
    #   trace=True     print each candidate model while searching
    optimal_order, optimal_seasonal_order = arima.find_optimal_parameters(
        ts,
        m=12,
        seasonal=True,
        D=1,
        trace=True,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True
    )

    if optimal_order:
        print(f"Optimal ARIMA order (p,d,q): {optimal_order}")
        if optimal_seasonal_order:
            print(f"Optimal Seasonal order (P,D,Q,m): {optimal_seasonal_order}")
    else:
        print("Could not determine optimal parameters automatically. Consider manual analysis or different data.")
else:
    print("Timeseries not loaded, skipping parameter search.")

## 4. Split Data into Training and Testing Sets

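We'll hold out the last 12 months as a test set for measuring forecasting accuracy and train the model on everything before them. If the series is too short for that, all of it is used for training and no test set is created.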

In [None]:
n_test_periods = 12 # Number of periods to hold out for testing
train_ts, test_ts = None, None

if ts is None:
    print("Timeseries not loaded, skipping data split.")
elif len(ts) <= n_test_periods:
    # Not enough observations to hold anything out: train on everything and
    # leave test_ts as None so later cells skip evaluation.
    print(f"Timeseries is too short for a test split of {n_test_periods} periods. Using all data for training and no test set.")
    train_ts = ts
else:
    # Everything before the split position is training data; the final
    # n_test_periods observations form the test set.
    split_at = len(ts) - n_test_periods
    train_ts, test_ts = ts[:split_at], ts[split_at:]
    print(f"Training data length: {len(train_ts)}")
    print(f"Test data length: {len(test_ts)}")

    # Show both segments on one set of axes to make the cut visible.
    split_fig, split_ax = plt.subplots(figsize=(12, 6))
    train_ts.plot(ax=split_ax, label='Training Data')
    test_ts.plot(ax=split_ax, label='Test Data')
    split_ax.set_title('Train and Test Split')
    split_ax.legend()
    plt.show()

## 5. Fit ARIMA Model

Now we fit the ARIMA model using the training data and the optimal parameters found.

In [None]:
# Stays None unless fitting is attempted and succeeds; later cells check it.
fitted_model = None

if train_ts is None:
    print("Training data is not available. Skipping model fitting.")
elif optimal_order is None:
    print("Optimal parameters not found. Skipping model fitting.")
else:
    print(f"Fitting ARIMA with order={optimal_order} and seasonal_order={optimal_seasonal_order}")
    fitted_model = arima.fit_arima_model(
        train_ts,
        optimal_order,
        seasonal_order=optimal_seasonal_order,
    )

    # The model summary is already printed by fit_arima_model, so only the
    # outcome is reported here.
    print("Model fitting complete." if fitted_model else "Model fitting failed.")

## 6. Make Forecasts

Using the fitted model, we will forecast the values for the test period (or a few steps ahead if no test set).

In [None]:
forecast_df = None
num_forecast_steps = 0

if fitted_model is None:
    print("Fitted model not available. Skipping forecasting.")
else:
    # Forecast exactly over the test window when one exists; otherwise fall
    # back to a fixed 12-step horizon past the end of the training data.
    has_test_window = test_ts is not None and not test_ts.empty
    if has_test_window:
        num_forecast_steps = len(test_ts)
        print(f"Forecasting {num_forecast_steps} steps ahead (test period).")
    else:
        num_forecast_steps = 12
        print(f"No test data or test data is empty. Forecasting {num_forecast_steps} steps ahead from end of training data.")

    forecast_df = arima.forecast(fitted_model, steps=num_forecast_steps)

    if forecast_df is None:
        print("Forecasting failed.")
    else:
        print("Forecast generated:")
        print(forecast_df)

## 7. Evaluate Model

If we have a test set and forecasts, we can evaluate the model's performance.

In [None]:
evaluation_results = None
test_available = test_ts is not None and not test_ts.empty

if not test_available:
    print("No test data available for evaluation.")
elif forecast_df is None:
    print("Forecasts not available. Skipping model evaluation.")
else:
    actual_values = test_ts
    predicted_values = forecast_df['forecast']

    # Date-based alignment is only attempted when both series carry a
    # DatetimeIndex; otherwise common_index stays None.
    both_datetime = all(
        isinstance(series.index, pd.DatetimeIndex)
        for series in (actual_values, predicted_values)
    )
    common_index = (
        actual_values.index.intersection(predicted_values.index)
        if both_datetime
        else None
    )

    if common_index is not None and not common_index.empty:
        # Compare only the time points present in both series.
        actual_values_aligned = actual_values[common_index]
        predicted_values_aligned = predicted_values[common_index]
        print(f"Evaluating model on {len(common_index)} common time points.")
    else:
        if both_datetime:
            # Both are dated but share no timestamps: warn, then compare by position.
            print("Warning: Test set dates and forecast dates do not align. Evaluation might be on mismatched periods.")
            print(f"Test dates from {test_ts.index.min()} to {test_ts.index.max()}")
            print(f"Forecast dates from {forecast_df.index.min()} to {forecast_df.index.max()}")
        # Positional fallback: assume both are in chronological order and trim
        # the forecast to at most the length of the test set.
        actual_values_aligned = actual_values
        predicted_values_aligned = predicted_values.iloc[:len(actual_values)]

    if len(actual_values_aligned) != len(predicted_values_aligned):
        print(f"Skipping evaluation: Length of actual values ({len(actual_values_aligned)}) and predicted values ({len(predicted_values_aligned)}) do not match after alignment.")
    else:
        evaluation_results = arima.evaluate_model(actual_values_aligned, predicted_values_aligned)
        if evaluation_results:
            print(f"RMSE: {evaluation_results['rmse']:.4f}")
            print(f"MAE: {evaluation_results['mae']:.4f}")
        else:
            print("Model evaluation failed.")

## 8. Visualize Results

Plot the original time series, the fitted values (forecasts on training data), and the out-of-sample forecasts.

In [None]:
# Draw the original series, the in-sample fitted values and, when available,
# the out-of-sample forecast with its confidence band on one figure.
# Without a series or a fitted model nothing is drawn; without a forecast the
# figure is still produced, just without the forecast layers.
if ts is None:
    print("Original timeseries not loaded. Cannot plot.")
elif fitted_model is None:
    print("Model not fitted. Cannot plot fitted values.")
else:
    has_forecast = forecast_df is not None
    if not has_forecast:
        print("Forecasts not available. Cannot plot full results.")

    fig, ax = plt.subplots(figsize=(15, 8))

    # Original data (train + test)
    ts.plot(ax=ax, label='Original Data', color='blue')

    # In-sample predictions. The attribute is looked up defensively so that a
    # model object without `fittedvalues` skips this layer instead of raising.
    fitted_values = getattr(fitted_model, 'fittedvalues', None)
    if fitted_values is not None:
        fitted_values.plot(ax=ax, label='Fitted Values (In-sample)', color='green', linestyle='--')

    if has_forecast:
        # Out-of-sample forecast
        forecast_df['forecast'].plot(ax=ax, label='Out-of-Sample Forecast', color='red')

        # Shade the interval only when both bounds are present.
        if {'lower_ci', 'upper_ci'}.issubset(forecast_df.columns):
            ax.fill_between(forecast_df.index,
                            forecast_df['lower_ci'],
                            forecast_df['upper_ci'],
                            color='pink', alpha=0.3, label='Confidence Interval (95%)')
        ax.set_title('Time Series Forecasting with ARIMA')
    else:
        ax.set_title('Time Series Data and Fitted Values')

    ax.set_xlabel('Date')
    # Fall back to a generic label if the data-loading cell was never run.
    ax.set_ylabel(globals().get('value_col', 'Value'))
    ax.legend(loc='upper left')
    ax.grid(True)
    plt.show()


---

End of Example

---