## Libraries

In [20]:
import matplotlib
matplotlib.use('TkAgg')
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
import pmdarima as pm
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

## Load the data

In [3]:
# Load the raw market data (one row per coin per day) from the local CSV.
data = pd.read_csv('crypto-markets.csv')

# Keep only the Bitcoin rows and index them by parsed date.
# The date column is replaced through `assign` instead of
# `btc_data.loc[:, 'date'] = ...`: under pandas >= 2.0 a full-column `.loc`
# assignment tries to write into the existing object-dtype column in place,
# so the column may never become datetime64. The chain also avoids
# `inplace=True`, so re-running the cell always rebuilds `btc_data` from `data`.
btc_data = (
    data.loc[data['symbol'] == 'BTC']
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

# Per-column count of missing values, printed as a quick data-quality check.
missing_values = btc_data.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 slug           0
symbol         0
name           0
ranknow        0
open           0
high           0
low            0
close          0
volume         0
market         0
close_ratio    0
spread         0
dtype: int64


In [4]:
# Put the BTC frame on a regular calendar-day grid; a day without its own row
# takes the values of the most recent earlier row (forward fill).
btc_data_daily = btc_data.resample(rule='D').ffill()

# Optional sanity checks, left disabled:
# print(btc_data_daily)
# plt.plot(btc_data.index, btc_data['close'])
# plt.show()

In [5]:
# Chronological train / test split on the daily closing price:
# the first 90% of the days train the model, the remaining days test it.

# Position of the first test row (the recorded run had 1866 daily rows,
# which puts the cut at row 1679).
to_row = int(0.9 * len(btc_data_daily))

# Select the 'close' column once per partition and cut it by position.
training_data = btc_data_daily['close'].iloc[:to_row]  # training data - 1679 records
testing_data = btc_data_daily['close'].iloc[to_row:]  # testing data - 187 records


In [17]:
# Show the training series followed by the sizes of the training and test
# partitions, one item per line.
print(training_data, len(training_data), len(testing_data), sep='\n')

date
2013-04-28      134.21
2013-04-29      144.54
2013-04-30      139.00
2013-05-01      116.99
2013-05-02      105.21
                ...   
2017-11-27     9818.35
2017-11-28    10058.80
2017-11-29     9888.61
2017-11-30    10233.60
2017-12-01    10975.60
Freq: D, Name: close, Length: 1679, dtype: float64
1679
187


In [19]:
# Stationarity check: Augmented Dickey-Fuller test on the undifferenced
# training series. Element 1 of the returned tuple is the p-value; a value
# above 0.05 means the unit-root (non-stationary) hypothesis is not rejected.
adf_test = adfuller(x=training_data)
print(f'p-value: {adf_test[1]}')


p-value: 1.0


In [8]:
# Keep differencing until the ADF p-value falls below 0.05.
# Step 1 - first difference (day-over-day change); the leading NaN produced
# by the shift is dropped before the test is run.
df_train_diff = training_data.diff(periods=1).dropna()
adf_test = adfuller(x=df_train_diff)
print(f'p-value: {adf_test[1]}')


p-value: 0.9556425044957317


In [9]:
# Step 2 - second difference.
# Built directly from `training_data` rather than by re-differencing
# `df_train_diff` in place. The self-referential form
# (`df_train_diff = df_train_diff.diff()`) differences one extra time on every
# re-run of the cell and only works if the first-difference cell ran exactly
# once before it. This form always yields the second difference.
df_train_diff = training_data.diff().diff().dropna()

# ADF test on the twice-differenced series; element 1 is the p-value.
adf_test = adfuller(df_train_diff)
print(f'p-value: {adf_test[1]}')

# Visual check of the differenced series.
df_train_diff.plot()
plt.show()

p-value: 1.3345829350039024e-24


In [22]:
# Correlograms of the differenced training series, used to pick the ARIMA
# orders: the autocorrelation plot guides q, the partial autocorrelation
# plot guides p.
acf_diff = plot_acf(x=df_train_diff)
pacf_diff = plot_pacf(x=df_train_diff)
plt.show()


In [40]:
# Fit the hand-picked ARIMA(p=4, d=2, q=1) on the training closes and print
# the estimation report.
model = ARIMA(endog=training_data, order=(4, 2, 1))
model_fit = model.fit()
print(model_fit.summary())

                               SARIMAX Results                                
Dep. Variable:                  close   No. Observations:                 1679
Model:                 ARIMA(4, 2, 1)   Log Likelihood               -9611.262
Date:                Sun, 28 Apr 2024   AIC                          19234.524
Time:                        22:58:20   BIC                          19267.072
Sample:                    04-28-2013   HQIC                         19246.581
                         - 12-01-2017                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.0711      0.009      7.913      0.000       0.053       0.089
ar.L2          0.0025      0.008      0.301      0.763      -0.014       0.019
ar.L3          0.0039      0.008      0.515      0.6

In [31]:
# Residual diagnostics for the fitted ARIMA model (this cell makes no
# predictions): residuals over time on the left, their kernel density
# estimate on the right.

# Drop one start-up residual per order of differencing. The model is fitted
# with d=2, so slicing off only the first residual (`resid[1:]`) leaves one
# initialisation artefact in both plots. The order is read from the fitted
# model so the slice follows whichever model `model_fit` currently holds.
n_startup = model_fit.model.order[1]
residuals = model_fit.resid[n_startup:]

fig, ax = plt.subplots(1, 2)
residuals.plot(title='Residuals', ax=ax[0])
residuals.plot(title='Density', kind='kde', ax=ax[1])
plt.show()

In [13]:
# Checking something (currently disabled): ACF / PACF correlograms of the
# model residuals. Uncomment the two lines below to draw them.
#acf_res = plot_acf(residuals)

#pacf_res = plot_pacf(residuals)

In [33]:
# Mark the predictions on the chart: forecast the whole test horizon in one
# shot with the manually specified model and overlay it on the full series.
forecast_test = model_fit.forecast(len(testing_data))
print(forecast_test)

# Draw on one explicit Axes: the full daily close series first, then the
# forecast placed on the test dates.
forecast_ax = btc_data_daily['close'].plot(label='Original Data')
forecast_ax.plot(testing_data.index, forecast_test, label='Forecasted Data')
forecast_ax.set_xlabel('Time')
forecast_ax.set_ylabel('Close Value')
forecast_ax.set_title('Bitcoin Daily Close Value with Forecast')

# Both series carry labels, so the legend is drawn (it was commented out,
# which left the two lines indistinguishable); this matches the later
# comparison plot.
forecast_ax.legend()

# Show plot
plt.show()

2017-12-02    11158.543366
2017-12-03    11349.665897
2017-12-04    11485.211388
2017-12-05    11571.249882
2017-12-06    11714.748549
                  ...     
2018-06-02    38680.987705
2018-06-03    38832.459885
2018-06-04    38983.932066
2018-06-05    39135.404246
2018-06-06    39286.876427
Freq: D, Name: predicted_mean, Length: 187, dtype: float64


In [25]:
# Let pmdarima choose the ARIMA order on the training closes: non-seasonal
# models only, with the stepwise search heuristic switched off.
auto_arima = pm.auto_arima(y=training_data, seasonal=False, stepwise=False)

# Display the selected estimator as the cell output.
auto_arima

In [26]:
# Estimation report of the automatically selected model, rendered as the
# cell output.
auto_arima_summary = auto_arima.summary()
auto_arima_summary

0,1,2,3
Dep. Variable:,y,No. Observations:,1679.0
Model:,"SARIMAX(0, 2, 5)",Log Likelihood,-9608.624
Date:,"Sun, 28 Apr 2024",AIC,19231.248
Time:,22:33:29,BIC,19269.221
Sample:,04-28-2013,HQIC,19245.315
,- 12-01-2017,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.0907,0.078,1.169,0.242,-0.061,0.243
ma.L1,-0.8896,0.008,-111.974,0.000,-0.905,-0.874
ma.L2,-0.0477,0.012,-4.078,0.000,-0.071,-0.025
ma.L3,-0.0146,0.011,-1.388,0.165,-0.035,0.006
ma.L4,-0.1302,0.011,-12.347,0.000,-0.151,-0.110
ma.L5,0.1210,0.008,14.380,0.000,0.105,0.138
sigma2,5540.6581,49.823,111.207,0.000,5443.007,5638.309

0,1,2,3
Ljung-Box (L1) (Q):,0.0,Jarque-Bera (JB):,61341.88
Prob(Q):,0.95,Prob(JB):,0.0
Heteroskedasticity (H):,14.37,Skew:,0.87
Prob(H) (two-sided):,0.0,Kurtosis:,32.58


In [34]:
# Forecast the test horizon with the automatically selected model.
forecast_test_auto = auto_arima.predict(n_periods=len(testing_data))

# Overlay both forecasts on the full daily close series, using the Axes that
# the pandas plot call returns.
comparison_ax = btc_data_daily['close'].plot(label='Original Data')
comparison_ax.plot(testing_data.index, forecast_test, label='Forecasted Data')
comparison_ax.plot(
    testing_data.index,
    forecast_test_auto,
    label='Automatically Forecasted Data',
)
comparison_ax.set_xlabel('Time')
comparison_ax.set_ylabel('Close Value')
comparison_ax.legend()
plt.show()

In [36]:
# Errors for the manually configured ARIMA, scored against the held-out
# test closes.
# Note: scikit-learn reports MAPE as a fraction, so 1.8 means 180%.
mae = mean_absolute_error(y_true=testing_data, y_pred=forecast_test)
mape = mean_absolute_percentage_error(y_true=testing_data, y_pred=forecast_test)
mse = mean_squared_error(y_true=testing_data, y_pred=forecast_test)
rmse = np.sqrt(mse)

print(f'mae - manual: {mae}')
print(f'mape - manual: {mape}')
print(f'rmse - manual: {rmse}')

mae - manual: 15476.131976143073
mape - manual: 1.8006436048088315
rmse - manual: 18263.539106652577


In [29]:
# Errors for the automatically selected ARIMA, scored against the held-out
# test closes.
# Note: scikit-learn reports MAPE as a fraction, so 1.8 means 180%.
mae = mean_absolute_error(y_true=testing_data, y_pred=forecast_test_auto)
mape = mean_absolute_percentage_error(y_true=testing_data, y_pred=forecast_test_auto)
mse = mean_squared_error(y_true=testing_data, y_pred=forecast_test_auto)
rmse = np.sqrt(mse)

print(f'mae - auto: {mae}')
print(f'mape - auto: {mape}')
print(f'rmse - auto: {rmse}')

mae - auto: 15846.159184876356
mape - auto: 1.8453252176509347
rmse - auto: 18766.820014796584


In [None]:
# This is how I did it first - wrong! Kept for reference only.
# Walk-forward evaluation: before every test day an ARIMA(2, 1, 0) is refitted
# on all closes known so far, a one-step forecast is stored, and the true
# value of that day is then appended to the history.
# ARIMA comes from the import cell at the top of the notebook.

# Train test split
# 1866 rows in total
to_row = int(len(btc_data_daily) * 0.9)  # 1679 rows

# Cell-private names on purpose. Reusing `training_data` / `testing_data` /
# `model` / `model_fit` here replaced the Series and the fitted (4, 2, 1)
# model used by the earlier cells with plain lists and a different model,
# so re-running those cells afterwards failed (e.g. `testing_data.index`).
rolling_history = list(btc_data_daily[0:to_row]['close'])
rolling_actuals = list(btc_data_daily[to_row:]['close'])

model_predictions = []
n_test_obs = len(rolling_actuals)

for i in range(n_test_obs):
    rolling_model = ARIMA(rolling_history, order=(2, 1, 0))
    rolling_fit = rolling_model.fit()
    # forecast() with no arguments predicts a single step ahead.
    output = rolling_fit.forecast()
    yhat = output[0]
    model_predictions.append(yhat)
    # Reveal the real value of this day before predicting the next one.
    actual_test_value = rolling_actuals[i]
    rolling_history.append(actual_test_value)

# Report of the last refit only (it has seen every row except the final one).
print(rolling_fit.summary())

# Visualization
plt.figure(figsize=(15, 9))
plt.grid(True)

date_range = btc_data_daily[to_row:].index
plt.plot(date_range, model_predictions, color='blue', marker='o',linestyle='dashed', label='BTC predicted price')
plt.plot(date_range, rolling_actuals, color='red', label='BTC actual')
plt.title('Bitcoin price predicition')

plt.xlabel('Dates')
plt.ylabel('Closing prices')
#plt.plot(btc_data_daily[0:to_row]['close'], 'green', label='Train data')
#plt.plot(btc_data_daily[to_row:]['close'], 'blue', label='Test data')
plt.legend()
plt.show()

                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 1865
Model:                 ARIMA(2, 1, 0)   Log Likelihood              -12841.691
Date:                Sun, 28 Apr 2024   AIC                          25689.381
Time:                        15:09:20   BIC                          25705.973
Sample:                             0   HQIC                         25695.495
                               - 1865                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.0855      0.005     15.666      0.000       0.075       0.096
ar.L2         -0.0470      0.006     -7.311      0.000      -0.060      -0.034
sigma2      5.649e+04    399.024    141.572      0.0