# ARIMA

## Importing libraries

In [1]:
import datetime as dt
import os
import pandas as pd
import numpy as np
import csv
import warnings
from pandas import Series


# Files
from indicators import get_momentum, get_RSI, get_sma, plot_stock_prices_prediction
from util import fetchOnlineData, slice_df

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import tree, metrics, neighbors

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# ARIMA
import statsmodels
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults
from statsmodels.graphics.tsaplots import plot_acf

# TA Library (https://github.com/bukosabino/ta)
from ta import *

# Add plotly for interactive charts
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools




numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.



## Initial variables

In [2]:
# Ticker symbol of the stock analysed throughout the notebook
symbol = "AABA"

# We'll look back 365 calendar days from today
start_d = dt.date.today() - dt.timedelta(365)

## Getting data from Yahoo

In [3]:
# The download window ends yesterday (today minus one day)
yesterday = dt.date.today() - dt.timedelta(1)
# NOTE(review): fetchOnlineData comes from the project's util module. Judging by
# the info()/head() output further down, it returns a date-indexed DataFrame
# with a single 'Adj Close' column -- confirm against util.py.
portf_value = fetchOnlineData(start_d, symbol, yesterday)

[*********************100%***********************]  1 of 1 downloaded


In [4]:
# Work on a copy so the downloaded frame (portf_value) stays untouched.
# Despite the variable name, no normalisation is applied at this point.
normed = portf_value.copy()

In [5]:
# Convert price column to float
#normed[symbol] = pd.to_numeric(normed[symbol], downcast='float', errors='coerce').fillna(0)

In [6]:
# Print a summary of the frame (index range, columns, non-null counts, dtypes)
normed.info()
# Show the first rows as the cell output
normed.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2018-03-09 to 2019-03-08
Data columns (total 1 columns):
Adj Close    251 non-null float64
dtypes: float64(1)
memory usage: 3.9 KB


Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
2018-03-09,76.620003
2018-03-12,77.559998
2018-03-13,75.970001
2018-03-14,77.199997
2018-03-15,79.419998


## Getting indicator values

In [7]:
def get_indicators(normed, symbol):
    """Compute the technical indicators for one price column.

    Args:
        normed: DataFrame holding the price series under the column ``symbol``.
        symbol: Name of the price column to read.

    Returns:
        Tuple ``(sym_mom, sma, q, rsi_value)``: the result of ``get_momentum``
        (window of 10), the two values returned by ``get_sma`` (window of 10)
        and the result of ``get_RSI`` (called with 7 as its second argument).
    """
    prices = normed[symbol]

    # Momentum over a 10-period window
    momentum = get_momentum(prices, window=10)

    # Relative Strength Index (RSI); 7 is passed positionally to the helper
    rsi = get_RSI(prices, 7)

    # Simple moving average (SMA) and the second value get_sma hands back
    moving_avg, sma_second = get_sma(prices, window=10)

    return momentum, moving_avg, sma_second, rsi

# The scaling step is commented out, so prices stay in their original units
#normed = scaling_data(normed, symbol)

# Rebuild the index from the downloaded frame's index so that it is named 'date'
normed['date'] = portf_value.index
normed.set_index('date', inplace=True)
# Name the price column after the ticker so it can be read as normed[symbol]
normed.rename(columns={'Adj Close': symbol}, inplace=True)


In [8]:
# Get indicators (q, the second value returned by get_sma, is not used by any
# later cell visible in this notebook)
sym_mom, sma, q, rsi_value = get_indicators(normed, symbol)


# Create momentum column
normed['Momentum'] = sym_mom

# Create SMA column
normed['SMA'] = sma

# Create RSI column
normed['RSI'] = rsi_value

date
2018-03-23   -0.047246
2018-03-26   -0.022563
2018-03-27   -0.038700
2018-03-28   -0.059067
2018-03-29   -0.067741
2018-04-02   -0.103880
2018-04-03   -0.093911
2018-04-04   -0.120779
2018-04-05   -0.111310
2018-04-06   -0.094140
2018-04-09   -0.062466
2018-04-10   -0.066350
2018-04-11   -0.033959
2018-04-12   -0.028910
2018-04-13   -0.064155
2018-04-16   -0.023464
2018-04-17    0.007940
2018-04-18    0.040276
2018-04-19    0.038051
2018-04-20    0.051961
2018-04-23    0.021186
2018-04-24   -0.021475
2018-04-25   -0.032318
2018-04-26   -0.010490
2018-04-27    0.006350
2018-04-30    0.002288
2018-05-01   -0.016880
2018-05-02   -0.029176
2018-05-03   -0.022771
2018-05-04    0.023220
                ...   
2019-01-25    0.033434
2019-01-28    0.037214
2019-01-29    0.040135
2019-01-30    0.083466
2019-01-31    0.072312
2019-02-01    0.065133
2019-02-04    0.058323
2019-02-05    0.111728
2019-02-06    0.112454
2019-02-07    0.071094
2019-02-08    0.050598
2019-02-11    0.050291
2019-0

In [9]:
# Report non-null counts *before* filling, so the gaps left by the indicator
# warm-up windows stay visible in the output
normed.info()

# Clean nan values: the first rows of Momentum and SMA have no value yet.
# NOTE(review): zero-filling SMA places 0 next to prices around 77, which
# distorts the SMA correlations computed later; consider dropping those rows.
normed = normed.fillna(0)

# Sort dataframe by index. sort_index() returns a new frame, so the result
# must be assigned back -- otherwise the sort is silently discarded.
normed = normed.sort_index()

# Display the cleaned, sorted frame as the cell output
normed

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 251 entries, 2018-03-09 to 2019-03-08
Data columns (total 4 columns):
AABA        251 non-null float64
Momentum    241 non-null float64
SMA         242 non-null float64
RSI         251 non-null float64
dtypes: float64(4)
memory usage: 9.8 KB


Unnamed: 0_level_0,AABA,Momentum,SMA,RSI
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-03-09,76.620003,0.000000,0.000000,56.744170
2018-03-12,77.559998,0.000000,0.000000,56.744170
2018-03-13,75.970001,0.000000,0.000000,56.744170
2018-03-14,77.199997,0.000000,0.000000,56.744170
2018-03-15,79.419998,0.000000,0.000000,56.744170
2018-03-16,79.900002,0.000000,0.000000,56.744170
2018-03-19,77.839996,0.000000,0.000000,56.744170
2018-03-20,79.070000,0.000000,0.000000,61.838325
2018-03-21,78.070000,0.000000,0.000000,55.624824
2018-03-22,74.570000,0.000000,77.621999,39.442049


In [10]:
# Missing-data report: per column, the number of nulls and their share of rows
_null_mask = normed.isnull()
_null_counts = _null_mask.sum()
total = _null_counts.sort_values(ascending=False)
percent = (_null_counts / _null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
# The head() result is not displayed (not the last expression); kept as in the original
missing_data.head(20)
print(missing_data)

          Total  Percent
RSI           0      0.0
SMA           0      0.0
Momentum      0      0.0
AABA          0      0.0


In [11]:
# Pairwise Pearson correlation between the price and the indicator columns
corr_df = normed.corr(method='pearson')
print("--------------- CORRELATIONS ---------------")
print(corr_df)

# Define X and y: two indicator columns as features, the price column as target
feature_cols = ['Momentum', 'RSI']
X = normed[feature_cols]
y = normed[symbol]

# split X and y into training and testing sets
# shuffle=False keeps the rows in their existing (chronological) order, so the
# last 30% of the rows form the test set.
# NOTE(review): none of X_train/X_test/y_train/y_test is referenced by the ARIMA
# cells below, and X is reassigned there.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, shuffle=False)

--------------- CORRELATIONS ---------------
              AABA  Momentum       SMA       RSI
AABA      1.000000   0.27371  0.145045  0.397336
Momentum  0.273710   1.00000 -0.002980  0.869540
SMA       0.145045  -0.00298  1.000000 -0.023346
RSI       0.397336   0.86954 -0.023346  1.000000


# ARIMA Model

In [19]:
# evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(X, arima_order):
    """Walk-forward evaluation of a single ARIMA order.

    The first 66% of ``X`` seeds the history. For every remaining observation
    a fresh ARIMA model is fitted on the history, its forecast is recorded,
    and the true observation is then appended to the history.

    Args:
        X: One-dimensional sequence of observations in time order (NumPy
            array, list or pandas Series).
        arima_order: ``(p, d, q)`` tuple passed to ``ARIMA`` as ``order``.

    Returns:
        The mean squared error between the held-out observations and the
        recorded forecasts.
    """
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = list(train)

    # make predictions
    predictions = []
    # Iterate over the held-out values directly: ``test[t]`` would be a label
    # lookup on a pandas Series, whose index no longer starts at 0 after the
    # slice above, and would raise KeyError.
    for observation in test:
        model_fit = ARIMA(history, order=arima_order).fit(disp=0)
        # Element 0 of the forecast() result holds the forecast values
        predictions.append(model_fit.forecast()[0])
        history.append(observation)

    # calculate out of sample error
    return mean_squared_error(test, predictions)

In [20]:
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    """Grid-search ARIMA orders and print the MSE of each one.

    Every combination of ``p``, ``d`` and ``q`` is scored with
    ``evaluate_arima_model``. Orders whose evaluation raises are skipped
    without output. A final line reports the best order and its MSE
    (``None`` and ``inf`` when no order could be evaluated).

    Args:
        dataset: Array of observations in time order; it is converted with
            ``astype('float32')`` before evaluation.
        p_values: Iterable of autoregressive orders to try.
        d_values: Iterable of differencing orders to try.
        q_values: Iterable of moving-average orders to try.

    Returns:
        None. Results are printed.
    """
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                except Exception:
                    # Best effort: many orders fail to fit, so skip them.
                    # Catch Exception rather than using a bare ``except`` so
                    # that KeyboardInterrupt/SystemExit can still stop the
                    # (long-running) search.
                    continue
                if mse < best_score:
                    best_score, best_cfg = mse, order
                print('ARIMA%s MSE=%.3f' % (order, mse))
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))

## Getting best parameters values

In [None]:
# Price values wrapped in a Series with a default integer index (the date
# index of normed is dropped by taking .values)
series =  pd.Series(normed[symbol].values)
# evaluate parameters
# Candidate orders: 7 values of p x 3 of d x 3 of q = 63 combinations
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
# Suppress all warnings from here on (this affects the rest of the session)
warnings.filterwarnings("ignore")
# Prints one line per order that could be evaluated, then the best order
evaluate_models(series.values, p_values, d_values, q_values)

ARIMA(0, 0, 0) MSE=55.746
ARIMA(0, 0, 1) MSE=17.231
ARIMA(0, 1, 0) MSE=1.480
ARIMA(0, 1, 1) MSE=1.448
ARIMA(0, 1, 2) MSE=1.475
ARIMA(0, 2, 0) MSE=2.345
ARIMA(0, 2, 1) MSE=1.500
ARIMA(0, 2, 2) MSE=1.466
ARIMA(1, 0, 0) MSE=1.439
ARIMA(1, 0, 1) MSE=1.403
ARIMA(1, 0, 2) MSE=1.421
ARIMA(1, 1, 0) MSE=1.454
ARIMA(1, 2, 0) MSE=2.049
ARIMA(1, 2, 1) MSE=1.473
ARIMA(2, 0, 0) MSE=1.409
ARIMA(2, 0, 1) MSE=1.405
ARIMA(2, 0, 2) MSE=1.415
ARIMA(2, 1, 0) MSE=1.470
ARIMA(2, 1, 1) MSE=1.471
ARIMA(2, 2, 0) MSE=2.049
ARIMA(4, 0, 0) MSE=1.418
ARIMA(4, 0, 1) MSE=1.389
ARIMA(4, 0, 2) MSE=1.571
ARIMA(4, 1, 0) MSE=1.455
ARIMA(4, 1, 1) MSE=1.478
ARIMA(4, 2, 0) MSE=1.730
ARIMA(4, 2, 1) MSE=1.477
ARIMA(6, 0, 0) MSE=1.464
ARIMA(6, 0, 1) MSE=1.478
ARIMA(6, 1, 0) MSE=1.561
ARIMA(6, 1, 1) MSE=1.566
ARIMA(6, 1, 2) MSE=1.648
ARIMA(6, 2, 0) MSE=1.765
ARIMA(6, 2, 1) MSE=1.762
ARIMA(8, 0, 0) MSE=1.540
ARIMA(8, 0, 1) MSE=1.560
ARIMA(8, 1, 0) MSE=1.638
ARIMA(8, 1, 1) MSE=1.664
ARIMA(8, 2, 0) MSE=1.776


## Fitting model and plotting

In [None]:
# Walk-forward validation with a fixed ARIMA(0,1,0): the first 66% of the
# prices seed the history; each later price is forecast and then appended.
# Note that X here replaces the feature matrix X defined in an earlier cell.
X = series.values
size = int(len(X) * 0.66)
train, test = X[:size], X[size:]
history = list(train)
predictions = []
for t, obs in enumerate(test):
    # Refit on everything observed so far
    model = ARIMA(history, order=(0,1,0))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    # Element 0 of the forecast() result holds the forecast values
    yhat = output[0]
    predictions.append(yhat)
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))

# Out-of-sample error over the held-out part
error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)

# Actual test prices (default colour) against the forecasts (red)
plt.plot(test)
plt.plot(predictions, color='red')
plt.title("ARIMA Rolling Forecast Line Plot")
plt.show()

## Showing summary

In [None]:
# Summary of model_fit, i.e. the last model fitted in the forecasting loop
print(model_fit.summary())

# plot residual errors
# DataFrame is not imported by name in this notebook (only ``pd`` and
# ``Series`` are), so it must be reached through the pandas namespace.
residuals = pd.DataFrame(model_fit.resid)
residuals.plot()
plt.title("ARMA Fit Residual Error Line Plot")
plt.show()

# Kernel density estimate of the residuals
residuals.plot(kind='kde')
plt.title("ARMA Fit Residual Error Density Plot")
# Boolean flag, matching the plt.grid(True) call used elsewhere in the notebook
plt.grid(True)
plt.show()

# Descriptive statistics of the residuals
print(residuals.describe())

## RMS

In [None]:
# Root-mean-square error of the rolling forecast.
# The forecasts must be compared with the held-out observations (``test``).
# ``history`` also contains the training part, and each entry of
# ``predictions`` is a length-1 array, so ``history - predictions`` would
# broadcast to a 2-D matrix of all pairwise differences instead of the
# per-step errors. ravel() flattens the forecasts to one value per step.
forecast_errors = np.asarray(test) - np.asarray(predictions).ravel()
rms = np.sqrt(np.mean(np.power(forecast_errors, 2)))
print('RMS: ', rms)

## Saving an ARIMA Time Series Forecasting Model

The statsmodels library provides an implementation of ARIMA for use in Python. ARIMA models can be saved to a file for later use in making predictions on new data. Versions of statsmodels earlier than 0.9 contain a bug that prevents saved models from being loaded.


In [None]:
# Show the installed statsmodels version (relevant to the save/load bug described above)
print('statsmodels: %s' % statsmodels.__version__)

In [None]:
# save model
# model_fit is the last model fitted in the forecasting loop; the file is
# written to the current working directory
model_fit.save('arima_model.pkl')


## Loading a model and predicting future prices

In [None]:
# load model
loaded = ARIMAResults.load('arima_model.pkl')

# Window bounds: the price history shown later starts 180 days back; the
# forecast window runs from today to 7 days ahead
lookback_date = dt.date.today() - dt.timedelta(180)
start_d = dt.date.today()
end_d = dt.date.today() + dt.timedelta(7)

# Calculate steps: one forecast step per day in the window
days = (end_d - start_d).days
forecast = loaded.forecast(steps=days)[0]

# Setting dates for dataframe
# NOTE(review): date_range yields calendar days (weekends included), whereas
# the downloaded price index appears to skip weekends -- confirm the alignment.
dates = pd.date_range(start_d, periods=days)

# Forecast frame: a single 'Price' column indexed by 'Dates'
df = (
    pd.DataFrame(forecast)
    .assign(Dates=dates)
    .set_index('Dates')
    .rename(columns={0: 'Price'})
)


## Plotting predictions

In [None]:
# Daily dates from the look-back date up to today (this replaces the forecast
# dates held in the same variable by the previous cell)
dates = pd.date_range(lookback_date, dt.date.today())
# NOTE(review): slice_df comes from the project's util module; presumably it
# restricts portf_value to the given dates -- confirm against util.py
df_prices = slice_df(portf_value, dates)


In [None]:
# plot
plt.figure(figsize=(12,8))
# Recent downloaded prices ...
plt.plot(df_prices.index, df_prices['Adj Close'], label='Price')
# ... followed by the forecast frame (its 'Price' column over its 'Dates' index)
plt.plot(df, label='Prediction')
plt.legend()


In [None]:


# Autocorrelation plot of the forecast prices (df holds the forecast, not the
# historical series)
plot_acf(df['Price'])
plt.grid(True)
plt.xlabel("Lags")
plt.ylabel("Autocorrelation")
plt.show()