In [14]:
# loading libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [15]:
# loading data
data = pd.read_excel('okdata2.xlsx')
data

Unnamed: 0,Year,Month,Inflation
0,2001,1,7.4
1,2001,2,6.6
2,2001,3,6.2
3,2001,4,6.6
4,2001,5,6.9
...,...,...,...
264,2023,1,16.6
265,2023,2,18.4
266,2023,3,16.1
267,2023,4,14.7


In [16]:
# adding new column
from pandas.tseries.offsets import MonthEnd
data['Date'] = pd.to_datetime(data[['Year', 'Month']].assign(DAY=1)) + MonthEnd(1)

In [17]:
# ordering ascending column "Date" 
data = data.sort_values(by=['Date'])
data

Unnamed: 0,Year,Month,Inflation,Date
0,2001,1,7.4,2001-01-31
1,2001,2,6.6,2001-02-28
2,2001,3,6.2,2001-03-31
3,2001,4,6.6,2001-04-30
4,2001,5,6.9,2001-05-31
...,...,...,...,...
264,2023,1,16.6,2023-01-31
265,2023,2,18.4,2023-02-28
266,2023,3,16.1,2023-03-31
267,2023,4,14.7,2023-04-30


In [24]:
# selecting data for modeling
df = data[['Date', 'Inflation']]

In [25]:
# seting "Date" column as index
df.set_index('Date', inplace=True)

In [26]:
# discarding empty rows
df.dropna(subset=['Inflation'], inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['Inflation'], inplace=True)


Unnamed: 0_level_0,Inflation
Date,Unnamed: 1_level_1
2001-01-31,7.4
2001-02-28,6.6
2001-03-31,6.2
2001-04-30,6.6
2001-05-31,6.9
...,...
2022-12-31,16.6
2023-01-31,16.6
2023-02-28,18.4
2023-03-31,16.1


In [None]:
# Od tego miejsca ctrl C jak leci

In [28]:
# check if inflation series is stationary
from statsmodels.tsa.stattools import adfuller

# ADF test
def adf_test(series):
    result = adfuller(series, autolag='AIC')
    print('1. ADF: ', result[0])
    print('2. P-value: ', result[1])
    print('3. Num of Lags: ', result[2])
    print('4. Num of Observations: ', result[3])
    print('5. Critial Values:')
    for key, value in result[4].items():
        print('\t', key, ': ', value)
        
    if result[1] <= 0.05:
        print('\nStrong evidence against the null hypothesis (H0), reject the null hypothesis. Data has no unit root and is stationary.')
    else:
        print('\nWeak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary.')

# run function
adf_test(df['Inflation'])

1. ADF:  -1.6338569545017785
2. P-value:  0.46547248157920273
3. Num of Lags:  13
4. Num of Observations:  254
5. Critial Values:
	 1% :  -3.456360306409983
	 5% :  -2.8729872043802356
	 10% :  -2.572870232500465

Weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary.


In [30]:
# finding differencing value
from pmdarima.arima.utils import ndiffs
print(ndiffs(df['Inflation'], test='adf'))
print(ndiffs(df['Inflation'], test='kpss'))
print(ndiffs(df['Inflation'], test='pp'))

1
2
1


In [32]:
# auto ARIMA function
from pmdarima import auto_arima
stepwise_fit = auto_arima(df['Inflation'], trace=True, suppress_warnings=True)
stepwise_fit.summary()

Performing stepwise search to minimize aic
 ARIMA(2,2,2)(0,0,0)[0]             : AIC=371.871, Time=0.77 sec
 ARIMA(0,2,0)(0,0,0)[0]             : AIC=469.264, Time=0.06 sec
 ARIMA(1,2,0)(0,0,0)[0]             : AIC=429.725, Time=0.10 sec
 ARIMA(0,2,1)(0,0,0)[0]             : AIC=378.736, Time=0.13 sec
 ARIMA(1,2,2)(0,0,0)[0]             : AIC=inf, Time=0.52 sec
 ARIMA(2,2,1)(0,0,0)[0]             : AIC=372.683, Time=0.42 sec
 ARIMA(3,2,2)(0,0,0)[0]             : AIC=373.580, Time=0.67 sec
 ARIMA(2,2,3)(0,0,0)[0]             : AIC=inf, Time=1.13 sec
 ARIMA(1,2,1)(0,0,0)[0]             : AIC=370.699, Time=0.28 sec
 ARIMA(0,2,2)(0,0,0)[0]             : AIC=371.480, Time=0.24 sec
 ARIMA(2,2,0)(0,0,0)[0]             : AIC=406.732, Time=0.14 sec
 ARIMA(1,2,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.84 sec

Best model:  ARIMA(1,2,1)(0,0,0)[0]          
Total fit time: 5.316 seconds


0,1,2,3
Dep. Variable:,y,No. Observations:,268.0
Model:,"SARIMAX(1, 2, 1)",Log Likelihood,-182.35
Date:,"Tue, 30 May 2023",AIC,370.699
Time:,06:49:45,BIC,381.45
Sample:,01-31-2001,HQIC,375.018
,- 04-30-2023,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.2912,0.049,5.912,0.000,0.195,0.388
ma.L1,-0.9717,0.010,-97.619,0.000,-0.991,-0.952
sigma2,0.2287,0.012,19.225,0.000,0.205,0.252

0,1,2,3
Ljung-Box (L1) (Q):,0.03,Jarque-Bera (JB):,1004.46
Prob(Q):,0.87,Prob(JB):,0.0
Heteroskedasticity (H):,2.95,Skew:,-0.42
Prob(H) (two-sided):,0.0,Kurtosis:,12.48


In [33]:
# p=2, d=0, q=1
from statsmodels.tsa.arima.model import ARIMA

# fitting the model
model = ARIMA(df['Inflation'], order=(2,0,1), freq='M')
model_fit = model.fit()
model_fit.summary()

  self._init_dates(dates, freq)


0,1,2,3
Dep. Variable:,Inflation,No. Observations:,268.0
Model:,"ARIMA(2, 0, 1)",Log Likelihood,-180.01
Date:,"Tue, 30 May 2023",AIC,370.02
Time:,06:50:13,BIC,387.975
Sample:,01-31-2001,HQIC,377.231
,- 04-30-2023,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,3.3317,1.194,2.790,0.005,0.991,5.673
ar.L1,1.9256,0.055,35.129,0.000,1.818,2.033
ar.L2,-0.9331,0.055,-16.987,0.000,-1.041,-0.825
ma.L1,-0.7459,0.082,-9.123,0.000,-0.906,-0.586
sigma2,0.2204,0.013,17.314,0.000,0.195,0.245

0,1,2,3
Ljung-Box (L1) (Q):,1.48,Jarque-Bera (JB):,311.79
Prob(Q):,0.22,Prob(JB):,0.0
Heteroskedasticity (H):,2.77,Skew:,0.37
Prob(H) (two-sided):,0.0,Kurtosis:,8.23


In [34]:
# predict values
pred = model_fit.predict(start=0, end=len(df) - 1, typ='levels', dynamic=False)

In [35]:
# display last rows
pred.tail()

2022-12-31    17.519751
2023-01-31    16.346135
2023-02-28    16.310558
2023-03-31    18.407467
2023-04-30    15.578614
Freq: M, Name: predicted_mean, dtype: float64

In [37]:
# root mean squared error
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(pred, df['Inflation'], squared=False)
rmse

0.5310259998130197

In [38]:
# mean absolute error
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(pred, df['Inflation'])
mae

0.34787320631204216

In [40]:
# mean absolute percentage error
mape = np.mean(np.abs(df['Inflation'] - pred) / df['Inflation']) * 100
mape

inf

In [41]:
# correlation
corr = np.corrcoef(pred, df['Inflation'])[0,1]
corr

0.9887431785666918

In [42]:
# predict values
forecast = model_fit.predict(start=0, end=len(df) + 4, typ='levels', dynamic=False)

In [43]:
# display forecasted values
forecast.tail(5)

2023-05-31    13.963218
2023-06-30    13.195508
2023-07-31    12.404729
2023-08-31    11.598389
2023-09-30    10.783611
Freq: M, Name: predicted_mean, dtype: float64