# ARIMA Modeling for Time Series Analysis

This notebook explores the ARIMA model as a predictor of water flow.

In [None]:
# Import required libraries
import numpy as np
import hvplot.pandas
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima
from ombs_senegal.season import SeasonalityHandler
# from statsmodels.stats.diagnostic import acf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import datetime, timedelta
from pathlib import Path
from ombs_senegal.benchmark_model import BenchmarkScores

# Plot styling: use seaborn's 'deep' colour palette.
# The matplotlib 'seaborn' style-sheet call below is left disabled.
#plt.style.use('seaborn')
sns.set_palette('deep')
# Relative path to the directory holding the input CSV files.
DATA_PATH = Path("../../data")

## Load data

Load data for ARIMAX model
1. A target variable (e.g., débit_insitu)
2. Two exogenous variables (e.g., P_cumul_7j, débit_mgb)

In [None]:
# Read the semicolon-separated table, parse the `time` column into timestamps,
# use it as the index, and keep only the rows from 2012-01-01 onwards.
csv_columns = ['time', 'P_mean', 'P_cumul_7j', 'débit_insitu', 'débit_mgb']
data = pd.read_csv(
    DATA_PATH / 'data_cumul.csv',
    sep=';',
    usecols=csv_columns,
    index_col='time',
    converters={"time": pd.to_datetime},
).loc['2012-01-01':]

## ARIMA

First, we check over how many lags the flow data is autocorrelated.

#### Prepare data

In [None]:
# Keep only the observed-flow column for the ARIMA section.
# NOTE(review): this overwrites `data` and drops 'P_cumul_7j' / 'débit_mgb',
# which later cells in the ARIMAX section read from `data` — the loading cell
# presumably has to be re-run before that section; confirm.
col = ['débit_insitu']
data = data[col]

In [None]:
# Chronological split: rows before 2019-01-01 form the training set, the
# remaining rows form the test set.
# NOTE(review): an earlier comment called this an 80-20 split; the actual
# ratio depends on the date range of the data — confirm if it matters.
train_mask = data.index < '2019-01-01'
train_data = data[train_mask]
test_data = data[~train_mask]

#### Remove seasonality

In [None]:
# Compute the seasonal pattern from the training split only, then use the
# same handler to remove it from both the training and the test split.
# NOTE(review): SeasonalityHandler is project code not shown here; what the
# "seasonal pattern" is (e.g. per-day or per-week means) should be confirmed.
seasonality_handler = SeasonalityHandler()
seasonality_handler.compute_seasonal_pattern(train_data)

deseasonalized_train_data = seasonality_handler.remove_seasonality(train_data)
deseasonalized_test_data = seasonality_handler.remove_seasonality(test_data)


#### Normalize data

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the deseasonalized training target (rows up to
# 2018-12-31) and store the scaled values under the column name "q_obs".
arima_scaler = RobustScaler()
arima_data = deseasonalized_train_data[col][:"2018-12-31"]
scaled_arima_data = arima_scaler.fit_transform(arima_data)
arima_data = pd.DataFrame(
    data=scaled_arima_data,
    index=arima_data.index,
    columns=["q_obs"],
)

#### Finding optimal hyperparameters order of magnitude

In [None]:
# Optional automatic order search with pmdarima's auto_arima.
# Last found optimal parameters: p=9; d=0; q=2
# The search only runs when `find_optimal` is switched to True.
find_optimal = False
if find_optimal:
    import warnings
    # Silences all warnings for the rest of the session, not only for this cell.
    warnings.filterwarnings("ignore")

    # Non-seasonal search bounded by max_p=20, max_q=10, max_d=3.
    model = auto_arima(arima_data["q_obs"], max_p=20, max_q=10, max_d=3, seasonal=False)
    print(model.summary())



#### Cross validation of the model

In [None]:
"""Module for hyperparameter tuning using cross validation."""

from typing import Tuple, List, Dict, Optional
import numpy as np
import pandas as pd
from itertools import product
from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error


"""Module for time series cross validation."""

from typing import Tuple, List, Optional, Union, Callable
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.arima.model import ARIMA
from tqdm import tqdm

class TimeSeriesCrossValidator:
    """Time series cross validation built on scikit-learn's ``TimeSeriesSplit``.

    Splits are generated by ``TimeSeriesSplit`` with ``test_size=horizon``.
    Two training-window modes are supported:

    - Fixed (rolling) window: only the last ``min_train_size`` training
      points of each split are kept.
    - Expanding window: the training indices of each split are used as
      produced by ``TimeSeriesSplit``.

    Attributes:
        n_splits: int
            Number of splits for cross validation
        horizon: int
            Forecast horizon (how many steps ahead to predict)
        min_train_size: Optional[int]
            Training window length used in fixed-window mode. ``None`` means
            the length is derived from the data passed to each call.
        step: int
            Step size between training sets. Stored on the instance but not
            used by the split logic in this class.
    """

    def __init__(
        self,
        n_splits: int = 5,
        horizon: int = 10,
        min_train_size: Optional[int] = None,
        step: int = 1
    ):
        """Initialize TimeSeriesCrossValidator.

        Args:
            n_splits: Number of splits for cross validation
            horizon: Number of steps to forecast
            min_train_size: Fixed-window training length. If None, it is
                derived from the data at split time
            step: Number of steps between training sets (currently unused)
        """
        self.n_splits = n_splits
        self.horizon = horizon
        self.min_train_size = min_train_size
        self.step = step

    def _resolve_min_train_size(self, data) -> int:
        """Return the fixed-window length to use for ``data``.

        The configured ``min_train_size`` wins when it is set; otherwise the
        length is ``len(data) // (n_splits + 1)``. The instance is not
        modified, so the same validator can be reused on data of a different
        length without carrying over a stale window size.
        """
        if self.min_train_size is not None:
            return self.min_train_size
        return len(data) // (self.n_splits + 1)

    def _get_time_series_split(self, data: pd.DataFrame) -> "TimeSeriesSplit":
        """Create the ``TimeSeriesSplit`` object for the configured splits.

        ``data`` is accepted for interface compatibility; the splitter only
        depends on ``n_splits`` and ``horizon``.
        """
        return TimeSeriesSplit(
            n_splits=self.n_splits,
            test_size=self.horizon,
            gap=0,
            max_train_size=None
        )

    def rolling_window_split(
        self,
        data: pd.DataFrame,
        fixed_window: bool = True
    ) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Generate rolling window splits of the data.

        Args:
            data: Time series data
            fixed_window: If True, use fixed size windows. If False, use expanding windows

        Returns:
            List of (train_idx, test_idx) tuples
        """
        window = self._resolve_min_train_size(data)
        splitter = self._get_time_series_split(data)
        splits = []

        for train_idx, test_idx in splitter.split(data):
            # For a fixed window, keep only the most recent `window` points.
            if fixed_window and len(train_idx) > window:
                train_idx = train_idx[-window:]
            splits.append((train_idx, test_idx))

        return splits

    def cross_validate_model(
        self,
        data: pd.DataFrame,
        model_func: Callable,
        fixed_window: bool = True,
        **model_params
    ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Perform cross validation using the specified model.

        Args:
            data: Time series data
            model_func: Callable ``model_func(train_data, **model_params)``
                returning a fitted object that exposes ``forecast(steps=...)``
            fixed_window: Whether to use fixed or expanding window
            **model_params: Parameters to pass to the model function

        Returns:
            Tuple of (predictions, actual values), one entry per split.
            Predictions are stored exactly as returned by ``forecast``;
            actuals are the ``.values`` of the test rows.
        """
        splits = self.rolling_window_split(data, fixed_window)
        predictions = []
        actuals = []

        for train_idx, test_idx in tqdm(splits, desc="Cross validation"):
            # Positional selection of the rows belonging to this split
            train_data = data.iloc[train_idx]
            test_data = data.iloc[test_idx]

            # Fit on the training window and forecast the whole test window
            model = model_func(train_data, **model_params)
            pred = model.forecast(steps=len(test_idx))

            predictions.append(pred)
            actuals.append(test_data.values)

        return predictions, actuals

    def cross_validate_arima(
        self,
        data: pd.DataFrame,
        order: Tuple[int, int, int],
        fixed_window: bool = True
    ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
        """Convenience method for ARIMA cross validation.

        Args:
            data: Time series data
            order: ARIMA order (p,d,q)
            fixed_window: Whether to use fixed or expanding window

        Returns:
            Tuple of (predictions, actual values) for each split
        """
        def arima_func(train_data, order):
            model = ARIMA(train_data, order=order)
            return model.fit()

        return self.cross_validate_model(
            data=data,
            model_func=arima_func,
            fixed_window=fixed_window,
            order=order
        )

    def get_cv_scores(
        self,
        predictions: List[np.ndarray],
        actuals: List[np.ndarray],
        metric_funcs: dict
    ) -> pd.DataFrame:
        """Calculate cross validation scores.

        Args:
            predictions: List of predictions for each split
            actuals: List of actual values for each split
            metric_funcs: Dictionary mapping a metric name to a function
                called as ``func(actual, pred)``

        Returns:
            DataFrame with one row per split: a ``split`` column followed by
            one column per metric name
        """
        scores = []
        for split_idx, (pred, actual) in enumerate(zip(predictions, actuals)):
            split_scores = {'split': split_idx}
            for name, func in metric_funcs.items():
                split_scores[name] = func(actual, pred)
            scores.append(split_scores)

        return pd.DataFrame(scores)

class ARIMAHyperparameterTuner:
    """Class to handle hyperparameter tuning for ARIMA models using cross validation.

    This class implements a grid search over ARIMA parameters using time series
    cross validation to find the best parameters based on prediction performance.

    Attributes:
        cv: The ``TimeSeriesCrossValidator`` used to produce predictions
        metric: Name of the metric to minimise ('rmse', 'mae', or 'mse')
        results_: DataFrame from the last ``grid_search`` call, or None
        best_params_: Best (p, d, q) from the last ``grid_search`` call, or
            None when no combination was fitted successfully
    """

    # Metric names accepted by ``_calculate_metric``.
    _SUPPORTED_METRICS = ('rmse', 'mae', 'mse')

    def __init__(
        self,
        cv_splits: int = 5,
        horizon: int = 10,
        min_train_size: Optional[int] = None,
        metric: str = 'rmse'
    ):
        """Initialize the tuner.

        Args:
            cv_splits: Number of cross validation splits
            horizon: Forecast horizon for each validation
            min_train_size: Minimum size of training set
            metric: Metric to optimize ('rmse', 'mae', or 'mse')
        """
        self.cv = TimeSeriesCrossValidator(
            n_splits=cv_splits,
            horizon=horizon,
            min_train_size=min_train_size
        )
        self.metric = metric
        self.results_ = None
        self.best_params_ = None

    def _calculate_metric(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        """Calculate the configured metric for one split.

        Raises:
            ValueError: If ``self.metric`` is not a supported metric name.
        """
        if self.metric == 'rmse':
            return np.sqrt(mean_squared_error(y_true, y_pred))
        elif self.metric == 'mae':
            return mean_absolute_error(y_true, y_pred)
        elif self.metric == 'mse':
            return mean_squared_error(y_true, y_pred)
        else:
            raise ValueError(f"Unknown metric: {self.metric}")

    def grid_search(
        self,
        data: pd.DataFrame,
        p_range: range,
        d_range: range,
        q_range: range,
        fixed_window: bool = True
    ) -> pd.DataFrame:
        """Perform grid search over ARIMA parameters.

        Args:
            data: Time series data
            p_range: Range of p values to try
            d_range: Range of d values to try
            q_range: Range of q values to try
            fixed_window: Whether to use fixed size windows in CV

        Returns:
            DataFrame with one row per parameter combination. Combinations
            whose cross validation raised an exception are kept with
            ``convergence=False``, infinite scores and the error message.

        Raises:
            ValueError: If the configured metric is unknown. This is checked
                up front so a configuration mistake is not recorded as a
                convergence failure for every combination.
        """
        if self.metric not in self._SUPPORTED_METRICS:
            raise ValueError(f"Unknown metric: {self.metric}")

        mean_col = f'mean_{self.metric}'
        std_col = f'std_{self.metric}'

        # Never report the winner of a previous run for this run.
        self.best_params_ = None

        param_combinations = list(product(p_range, d_range, q_range))
        if not param_combinations:
            # Nothing to evaluate: return an empty, correctly-shaped table.
            self.results_ = pd.DataFrame(
                columns=['p', 'd', 'q', mean_col, std_col, 'n_splits', 'convergence']
            )
            return self.results_

        results = []
        for p, d, q in tqdm(param_combinations, desc="Parameter combinations"):
            try:
                # Get predictions for all CV splits
                predictions, actuals = self.cv.cross_validate_arima(
                    data=data,
                    order=(p, d, q),
                    fixed_window=fixed_window
                )

                # One score per split
                split_scores = [
                    self._calculate_metric(actual, pred)
                    for pred, actual in zip(predictions, actuals)
                ]

                results.append({
                    'p': p,
                    'd': d,
                    'q': q,
                    mean_col: np.mean(split_scores),
                    std_col: np.std(split_scores),
                    'n_splits': len(split_scores),
                    'convergence': True
                })
            except Exception as e:
                # Best effort: a failing combination (e.g. a fit that does not
                # converge) is recorded with its error instead of aborting
                # the whole search.
                results.append({
                    'p': p,
                    'd': d,
                    'q': q,
                    mean_col: np.inf,
                    std_col: np.inf,
                    'n_splits': 0,
                    'convergence': False,
                    'error': str(e)
                })

        # Convert results to DataFrame
        self.results_ = pd.DataFrame(results)

        # Best parameters: lowest mean score among the successful combinations
        converged_results = self.results_[self.results_['convergence']]
        if len(converged_results) > 0:
            best_idx = converged_results[mean_col].idxmin()
            self.best_params_ = tuple(
                self.results_.loc[best_idx, ['p', 'd', 'q']].astype(int)
            )

        return self.results_

    def plot_results(self, top_n: int = 10):
        """Plot the top N parameter combinations.

        Args:
            top_n: Number of top results to plot

        Returns:
            The current matplotlib figure.

        Raises:
            ValueError: If ``grid_search`` has not been run yet.
        """
        import matplotlib.pyplot as plt
        import seaborn as sns

        if self.results_ is None:
            raise ValueError("No results available. Run grid_search first.")

        mean_col = f'mean_{self.metric}'
        std_col = f'std_{self.metric}'

        # Get top N converged results
        top_results = self.results_[self.results_['convergence']].nsmallest(
            top_n, mean_col
        )

        # One "(p,d,q)" label per row for the y axis
        order_labels = top_results.apply(
            lambda x: f"({int(x['p'])},{int(x['d'])},{int(x['q'])})",
            axis=1
        )

        # Horizontal bars of the mean score, with the per-split std as error bar
        plt.figure(figsize=(12, 6))
        sns.barplot(
            data=top_results,
            y=order_labels,
            x=mean_col,
            xerr=top_results[std_col]
        )
        plt.title(f'Top {top_n} ARIMA Parameters')
        plt.xlabel(f'Mean {self.metric.upper()}')
        plt.ylabel('Parameters (p,d,q)')
        plt.tight_layout()

        return plt.gcf()

In [None]:
# Initialize the tuner
tuner = ARIMAHyperparameterTuner(
    cv_splits=10,          # number of CV splits
    horizon=10,           # forecast horizon
    min_train_size=90,   # fixed-window length; only applied when fixed_window=True
    metric='rmse'         # metric to optimize
)

# Define parameter ranges to search
# Start with ranges around your auto_arima results
p_range = range(8, 11)    # p in {8, 9, 10}, centered around p=9
d_range = range(0, 2)     # including d=0 and d=1
q_range = range(1, 4)     # q in {1, 2, 3}, centered around q=2

# Run grid search
# NOTE(review): `train_data` is the raw training split, whereas auto_arima
# above was run on the deseasonalized and scaled `arima_data` — confirm which
# series the tuner is meant to see.
results = tuner.grid_search(
    data=train_data,      # use only training data
    p_range=p_range,
    d_range=d_range,
    q_range=q_range,
    fixed_window=False     # expanding window: training folds are not truncated
)

# Print the best (p, d, q) found by the grid search (None if none was selected)
print(f"Best parameters: {tuner.best_params_}")

# Bar chart of the 10 best parameter combinations
tuner.plot_results(top_n=10)

#### Evaluate the model

In [None]:
# Scale the deseasonalized test target with the scaler fitted on the training
# data (transform only, no refit) and store it under the column name "q_obs".
arima_test_data = deseasonalized_test_data[col]
scaled_arima_test_data = arima_scaler.transform(arima_test_data)
arima_test_data = pd.DataFrame(scaled_arima_test_data, index=arima_test_data.index, columns=["q_obs"])

In [None]:
# Quick visual check of the scaled, deseasonalized test series.
arima_test_data.plot()

In [None]:
from statsmodels.tsa.arima.model import ARIMA
import warnings
from tqdm import tqdm

# Silences all warnings (including ARIMA convergence warnings) for the session.
warnings.filterwarnings("ignore")

forecast_horizon = 10        # number of steps forecast after each window
context_length = 120         # number of observations each model is fitted on
forecast_stride = 5          # distance between the ends of consecutive windows
dry_season_threshold = 0.5   # a window whose |values| all stay within this is skipped
forecast = []
p=9; d=0; q=2
for i in tqdm(range(context_length, len(arima_test_data), forecast_stride)):
    # Positional window made of the `context_length` rows that precede row i
    step_data = arima_test_data.iloc[i - context_length:i]
    dry_season = (step_data.abs() <= dry_season_threshold).all().values[0]
    if dry_season:
        # No forecast is produced for windows flagged as dry season
        continue
    model = ARIMA(step_data["q_obs"], order=(p, d, q))
    model_fit = model.fit()
    # print(model_fit.summary())
    # Use the declared horizon rather than a second hard-coded 10
    forecast.append(model_fit.forecast(steps=forecast_horizon))

In [None]:
# One row per forecast: "time" is the timestamp of the first forecast step and
# the columns "t+1" ... "t+k" hold the forecast values in order.
# Values are read positionally; an integer lookup such as f[i] on a
# datetime-indexed Series relies on a deprecated positional fallback in pandas.
formatted_forecast = [
    {
        "time": f.index[0],
        **{f"t+{step}": value for step, value in enumerate(f.to_numpy(), start=1)},
    }
    for f in forecast
]
_ = pd.DataFrame(formatted_forecast)

In [None]:
# Index the forecast table by the timestamp of its first forecast step.
_.set_index("time", inplace=True)

In [None]:
# Undo the scaling on every horizon column, keeping index and column names.
# NOTE(review): arima_scaler was fitted on a single column while this table
# has one column per forecast step — confirm inverse_transform accepts that.
_ = pd.DataFrame(arima_scaler.inverse_transform(_), index=_.index, columns=_.columns)

In [None]:
# Add the seasonal pattern back to the forecasts. The result is only shown as
# cell output; it is not assigned to a variable.
# NOTE(review): presumably add_seasonality returns a new frame — confirm.
seasonality_handler.add_seasonality(_)

#### Make Predictions and Evaluate Model

In [None]:
# Make predictions, then undo the scaling and add the seasonal pattern back.
# NOTE(review): `model_fit` is whichever model was fitted last in the session.
# When the cells are run top to bottom, that is the last model of the
# rolling-forecast loop, fitted on a 120-row window of test data, so the
# start/end offsets below (based on len(train_data) and len(data)) may not
# address the intended samples — confirm before relying on these scores.
predictions = model_fit.predict(start=len(train_data), end=len(data)-1,)
rescaled_predictions = arima_scaler.inverse_transform(predictions.to_frame())
predictions = pd.DataFrame(rescaled_predictions, index=predictions.index, columns=['débit_insitu'])
predictions = seasonality_handler.add_seasonality(predictions)

# Calculate error metrics against the observed test flow
mse = mean_squared_error(test_data['débit_insitu'], predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_data['débit_insitu'], predictions)

print(f'Mean Squared Error: {mse:.2f}')
print(f'Root Mean Squared Error: {rmse:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')

# Rolling-mean helper; it is not called anywhere in this cell.
def smooth(data, window_size): return data.rolling(window=window_size).mean()
# Plot actual vs predicted values
# NOTE(review): the title says "ARIMAX", although the model fitted in this
# section takes no exogenous inputs.
plt.figure(figsize=(15, 6))
plt.plot(test_data.index.get_level_values("time"), test_data['débit_insitu'], label='Actual')
plt.plot(test_data.index.get_level_values("time"), predictions, label='Predicted')
plt.title('ARIMAX: Actual vs Predicted Flow')
plt.xlabel('Date')
plt.ylabel('Flow')
plt.legend()
plt.show()

## Model Diagnostics

Let's examine the model residuals to check if our model assumptions are met.

In [None]:
from scipy import stats

# Residuals of the fitted model, as a one-column frame plus the bare series
residuals = pd.DataFrame(model_fit.resid)
resid_values = residuals.iloc[:, 0]

# 2x2 diagnostics panel
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Model Diagnostics')
(ax_time, ax_hist), (ax_qq, ax_acf) = axes

# Top-left: residuals over time
residuals.plot(ax=ax_time, title='Residuals over Time')
ax_time.set_xlabel('Date')
ax_time.set_ylabel('Residual')

# Top-right: histogram of the residuals
residuals.hist(ax=ax_hist, bins=30)
ax_hist.set_title('Residuals Distribution')

# Bottom-left: Q-Q plot against the normal distribution
stats.probplot(resid_values, dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot')

# Bottom-right: autocorrelation of the residuals
pd.plotting.autocorrelation_plot(resid_values, ax=ax_acf)
ax_acf.set_title('Residuals Autocorrelation')

plt.tight_layout()
plt.show()

## Grid search

In [None]:
from itertools import product
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings("ignore")

def grid_search_arima(data, p_range, d_range, q_range):
    """Fit an ARIMA model for every (p, d, q) combination and rank them by AIC.

    Args:
        data: Series passed unchanged to ``ARIMA`` as the endogenous data.
        p_range: Iterable of p values to try.
        d_range: Iterable of d values to try.
        q_range: Iterable of q values to try.

    Returns:
        DataFrame indexed by (p, d, q) with the columns ``aic``, ``bic``,
        ``hqic`` and ``loglikelihood``, sorted by ascending AIC. Combinations
        whose fit raised an exception are left out; if nothing could be
        fitted (or the grid is empty) the frame is empty but keeps the same
        index names and columns.
    """
    score_columns = ['p', 'd', 'q', 'aic', 'bic', 'hqic', 'loglikelihood']
    best_scores = []

    for p, d, q in product(p_range, d_range, q_range):
        try:
            results = ARIMA(data, order=(p, d, q)).fit()
            best_scores.append({
                'p': p,
                'd': d,
                'q': q,
                'aic': results.aic,
                'bic': results.bic,
                'hqic': results.hqic,
                'loglikelihood': results.llf,
            })
        except Exception:
            # Best effort: skip orders that cannot be fitted. `Exception`
            # (not a bare except) lets KeyboardInterrupt stop a long search.
            continue

    # Passing the columns explicitly keeps the empty case well-formed.
    scores = pd.DataFrame(best_scores, columns=score_columns)
    return scores.set_index(["p", "d", "q"]).sort_values('aic')  # Sort by AIC

# Example usage
p_range = range(5, 7+1)
d_range = range(1, 1+1)
q_range = range(5, 7+1)

results = grid_search_arima(arima_data["q_obs"], p_range, d_range, q_range)
# The table is indexed by (p, d, q) and sorted by AIC, so the first index
# entry is the order with the lowest AIC (there is no 'order' column).
best_params = results.index[0] if len(results) else None

In [None]:
# Show the grid-search table; it is already indexed by (p, d, q).
# NOTE(review): this cell used a `scores` variable that no visible cell
# defines; displaying `results` is the presumed intent — confirm.
results

## Conclusions

The ARIMAX model we built demonstrates how to:
1. Incorporate exogenous variables (cumulative rainfall `P_cumul_7j` and the MGB model flow `débit_mgb`) into time series forecasting
2. Make predictions on test data
3. Evaluate model performance using various metrics
4. Perform model diagnostics

To improve the model, you could:
1. Tune the ARIMAX parameters (p,d,q) using grid search or AIC/BIC criteria
2. Add seasonal components (SARIMAX)
3. Include more relevant exogenous variables
4. Handle any seasonality or trends in the data preprocessing step

## ARIMAX

We'll split our data into training and testing sets, and prepare the exogenous variables.

#### Data Visualization

Let's visualize our time series data to understand the patterns and relationships.

In [None]:
# One stacked panel per series: (column to plot, panel title)
panels = [
    ('débit_insitu', 'Water flow over Time'),
    ('P_cumul_7j', 'Rain cumul over Time'),
    ('débit_mgb', 'MGB model prediction over Time'),
]

fig, axes = plt.subplots(3, 1, figsize=(15, 12))
fig.suptitle('Time Series Components')

for ax, (column, title) in zip(axes, panels):
    data[column].plot(ax=ax, title=title)
    ax.set_xlabel('')
    ax.set_ylabel(column)

# Only the bottom panel carries the shared x-axis label
axes[2].set_xlabel('Date')

plt.tight_layout()
plt.show()

#### Set feature (exogenous) and target columns

In [None]:
# x_col: exogenous feature columns; y_col: target column.
x_col, y_col = ['P_cumul_7j', 'débit_mgb'], ['débit_insitu']

In [None]:
# Chronological split: rows before 2019-01-01 form the training set, the
# remaining rows form the test set.
# NOTE(review): an earlier comment called this an 80-20 split; the actual
# ratio depends on the date range of the data — confirm if it matters.
train_mask = data.index < '2019-01-01'
train_data = data[train_mask]
test_data = data[~train_mask]

#### Remove seasonality

In [None]:
# Seasonal pattern: mean of every column per ISO week number, computed from
# the training split only.
seasson = train_data.groupby(train_data.index.isocalendar().week).mean()

In [None]:
def deseasonalize_weekofyear(data, seasson):
    """Subtract the per-ISO-week seasonal values from ``data``.

    Args:
        data: DataFrame indexed by a DatetimeIndex.
        seasson: DataFrame of seasonal values whose index holds ISO week
            numbers and is named ``week``; the subtraction aligns on the
            column labels it shares with ``data``.

    Returns:
        A new DataFrame holding ``data`` minus the seasonal value of each
        row's ISO week, with the helper ``week`` index level removed again.

    The input frame is not modified: the helper ``week`` column and index
    level are built on a copy, so the same frame can be passed in again.
    """
    keyed = data.copy()
    keyed['week'] = keyed.index.isocalendar().week
    # Expose the week as an extra index level so the subtraction aligns each
    # row with the seasonal row of the same week.
    keyed = keyed.set_index('week', append=True)
    deseasonalized = keyed - seasson
    return deseasonalized.reset_index(level='week', drop=True)

# Remove the weekly pattern (computed from the training split) from both splits.
deseasonalized_train_data = deseasonalize_weekofyear(train_data, seasson)
deseasonalized_test_data = deseasonalize_weekofyear(test_data, seasson)


#### Create features with context and target with horizon

In [None]:
from ombs_senegal.benchmark_model import FeatureGenerator

In [None]:
# Build model inputs and targets from the deseasonalized splits.
# NOTE(review): FeatureGenerator is project code not shown here; presumably
# generate() returns (features with past context, targets over the forecast
# horizon) — confirm. The test target is named `y_test_with_context` although
# its training counterpart is `y_train_with_horizon`.
feature_generator = FeatureGenerator()
x_train_with_context, y_train_with_horizon = feature_generator.generate(deseasonalized_train_data, x_col, y_col)
x_test_with_context, y_test_with_context = feature_generator.generate(deseasonalized_test_data, x_col, y_col)


In [None]:
# Report the length of the generated training and test feature sets.
print(f"Training set size: {len(x_train_with_context)}")
print(f"Test set size: {len(x_test_with_context)}")

#### Normalize data

In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training features only, then apply the same fitted
# scaling to the test features.
features_scaler = RobustScaler()
exog_train = features_scaler.fit_transform(x_train_with_context)
exog_test = features_scaler.transform(x_test_with_context)


#### Build and Train ARIMAX Model

We'll use the SARIMAX class from statsmodels to implement our ARIMAX model. The order parameters (p,d,q) will be set to (1,1,1) for this example, but in practice, you should use techniques like AIC/BIC or grid search to find optimal parameters.

In [None]:
# Initialize and train ARIMAX model
model = SARIMAX(y_train_with_horizon,
                exog=exog_train,
                order=(1, 1, 1),
                enforce_stationarity=False,
                enforce_invertibility=False)

model_fit = model.fit(disp=False)
print(model_fit.summary())