In [1]:
import pandas as pd

df = pd.read_csv('./fredupdated.csv', parse_dates=['observation_date'])


df.set_index('observation_date', inplace=True)

df.head(5)

start_date = '1959-01-01'

df = df.loc[start_date:]

df['Inflation_Rate'] = (df['CPIAUCSL'].pct_change(periods=4)) * 100


df.drop(columns=['CPIAUCSL'], inplace=True)


df.head(5)

df.drop(columns=['CES0500000003'], inplace=True)

predictor_cols = ['UNRATE', 'GDP', 'PCE', 'FEDFUNDS', 'GS10', 'M2SL', 'GDPC1', 'CIVPART', 'PPIACO']

print(df.info())
df_model = df.dropna(subset= predictor_cols).copy()



<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 265 entries, 1959-01-01 to 2025-01-01
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   UNRATE          264 non-null    float64
 1   GDP             264 non-null    float64
 2   PCE             264 non-null    float64
 3   FEDFUNDS        264 non-null    float64
 4   GS10            264 non-null    float64
 5   M2SL            264 non-null    float64
 6   GDPC1           264 non-null    float64
 7   CIVPART         264 non-null    float64
 8   PPIACO          264 non-null    float64
 9   Inflation_Rate  261 non-null    float64
dtypes: float64(10)
memory usage: 22.8 KB
None


In [2]:
from pygam import LinearGAM, s
from sklearn.model_selection import train_test_split

X = df_model[predictor_cols].values
y = df_model['Inflation_Rate'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

In [3]:
inflation_data = df_model['Inflation_Rate']
dates = df_model.index


In [15]:
# 2. Calculate train size and split dates before creating Series
train_size = len(y_train)  # Use y_train length, not train_data length
train_dates = dates[:train_size]
test_dates = dates[train_size:]

In [16]:
# 3. Create Series with proper datetime index
train_data = pd.Series(y_train, index=train_dates)
test_data = pd.Series(y_test, index=test_dates)  # Make sure to include index here

# 4. Clean NaN values while preserving the datetime index
train_data = train_data.dropna()
test_data = test_data.dropna()

In [17]:
# Verify the data structure
print("\nData Structure Check:")
print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")
print(f"\nTraining date range: {train_data.index[0]} to {train_data.index[-1]}")
print(f"Testing date range: {test_data.index[0]} to {test_data.index[-1]}")


Data Structure Check:
Training data shape: (207,)
Testing data shape: (53,)

Training date range: 1960-01-01 00:00:00 to 2011-07-01 00:00:00
Testing date range: 2011-10-01 00:00:00 to 2024-10-01 00:00:00


In [18]:
# 4: ARIMA with Standard Parameters
# Fit ARIMA model (p=1, d=1, q=1) - standard parameters for economic data
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
model = ARIMA(train_data, order=(1, 1, 1))
model_fit = model.fit()
model_fit.summary()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


0,1,2,3
Dep. Variable:,y,No. Observations:,207.0
Model:,"ARIMA(1, 1, 1)",Log Likelihood,-215.096
Date:,"Mon, 10 Mar 2025",AIC,436.193
Time:,12:16:29,BIC,446.176
Sample:,01-01-1960,HQIC,440.23
,- 07-01-2011,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.4502,0.159,2.825,0.005,0.138,0.763
ma.L1,-0.0980,0.172,-0.570,0.568,-0.435,0.239
sigma2,0.4722,0.026,18.426,0.000,0.422,0.522

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,448.56
Prob(Q):,0.97,Prob(JB):,0.0
Heteroskedasticity (H):,2.71,Skew:,-0.45
Prob(H) (two-sided):,0.0,Kurtosis:,10.17


In [19]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import math
import numpy as np

def evaluate_model(actual, predicted, dataset_name=""):
    """
    Evaluate a model's performance and return metrics in a dictionary.
    Handles index alignment between actual and predicted values.
    """
    # Convert inputs to pandas Series if they aren't already
    if not isinstance(actual, pd.Series):
        actual = pd.Series(actual)
    if not isinstance(predicted, pd.Series):
        predicted = pd.Series(predicted)
    
    # Align the indices
    if hasattr(predicted, 'index'):
        actual = actual[predicted.index]
    
    # Remove any NaN values
    mask = ~(np.isnan(actual) | np.isnan(predicted))
    actual = actual[mask]
    predicted = predicted[mask]
    
    # Calculate metrics
    mse = mean_squared_error(actual, predicted)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    accuracy = 1 - (rmse / np.mean(actual))
    
    metrics = {
        "Dataset": dataset_name,
        "MSE": mse,
        "RMSE": rmse,
        "MAE": mae,
        "R^2": r2,
        "Accuracy": accuracy,
        "Accuracy (%)": accuracy * 100
    }
    
    return metrics

def display_metrics_table(train_metrics, test_metrics, model_name="Model"):
    """
    Display a formatted table comparing training and test metrics.
    
    Parameters:
    -----------
    train_metrics : dict
        Dictionary containing training metrics
    test_metrics : dict
        Dictionary containing test metrics
    model_name : str, optional
        Name of the model for the table header
    """
    print(f"\nModel Evaluation: {model_name}")
    print("-" * 60)
    metrics_names = ["MSE", "RMSE", "MAE", "R^2", "Accuracy", "Accuracy (%)"]
    print(f"{'Metric':<15} {'Training Set':<20} {'Test Set':<20}")
    print("-" * 60)
    
    for metric in metrics_names:
        if metric == "Accuracy (%)":
            print(f"{metric:<15} {train_metrics[metric]:.2f}% {' '*12} {test_metrics[metric]:.2f}%")
        else:
            print(f"{metric:<15} {train_metrics[metric]:.4f} {' '*14} {test_metrics[metric]:.4f}")
    
    print("-" * 60)

In [20]:
# Make predictions with proper indexing
train_predictions = pd.Series(
    model_fit.predict(start=0, end=len(train_data)-1),
    index=train_data.index
)
test_predictions = pd.Series(
    model_fit.forecast(steps=len(test_data)),
    index=test_data.index
)

# Before evaluation, check for NaN values
print("\nData Quality Check:")
print(f"Train data NaN count: {train_data.isna().sum()}")
print(f"Train predictions NaN count: {train_predictions.isna().sum()}")
print(f"Test data NaN count: {test_data.isna().sum()}")
print(f"Test predictions NaN count: {test_predictions.isna().sum()}")



Data Quality Check:
Train data NaN count: 0
Train predictions NaN count: 0
Test data NaN count: 0
Test predictions NaN count: 0


In [21]:
train_predictions = model_fit.predict(start=0, end=len(train_data)-1)
test_predictions = model_fit.forecast(steps=len(test_data))

train_metrics = evaluate_model(train_data, train_predictions, "Training")
test_metrics = evaluate_model(test_data, test_predictions, "Test")
display_metrics_table(train_metrics, test_metrics, model_name="ARIMA(1,1,1)")


Model Evaluation: ARIMA(1,1,1)
------------------------------------------------------------
Metric          Training Set         Test Set            
------------------------------------------------------------
MSE             0.4795                6.0985
RMSE            0.6924                2.4695
MAE             0.4809                2.2442
R^2             0.9423                -0.4780
Accuracy        0.8290                0.0596
Accuracy (%)    82.90%              5.96%
------------------------------------------------------------


In [23]:
# --- Model 5: Random Forest Regression ---
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("\n--- Model 5: Random Forest Regression ---")
rf_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)
print("Top 5 Important Features:")
print(rf_importance.head())

# Evaluate Random Forest
model_results.append(evaluate_model(y_test, y_pred_rf, "Random Forest"))


ValueError: Input y contains NaN.