# Data Handling for Multiple Time Series

In [1]:
import scipy.stats as stats
import pylab
import numpy as np
import pandas as pd
import statsmodels.api as sm
import mplfinance as fplt
import math
from matplotlib import pyplot as plt
import scipy
import BarSampler as bs
import ExpSmooth as es

In [2]:
def get_log_return(df):
    log_returns = pd.Series(np.log((df['close'] - df['open'])/df['open'] + 1),name='log returns')
    return pd.concat([df,log_returns], axis=1)

We will begin by extracting trade data from a trade file. Here we use the function es.xExtractandClean to not only extract all the data from a given stock, but we also clean the data as well. This includes removing certian types of trades and excluding certian time frames.

In [3]:
AAPL_Price_Vol_Data = es.xExtractandClean("/Users/michael/Data/TRADE_20181105.csv",'AAPL')

  if (await self.run_code(code, result,  async_=asy)):


In [4]:
IBM_Price_Vol_Data = es.xExtractandClean("/Users/michael/Data/TRADE_20181105.csv",'IBM')

In [5]:
MSFT_Price_Vol_Data = es.xExtractandClean("/Users/michael/Data/TRADE_20181105.csv",'MSFT')

In [6]:
MSFT_Price_Vol_Data[0:5]

Unnamed: 0_level_0,Symbol,Trade Volume,Trade Price,Sale Condition,Trade Correction Indicator
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1900-01-01 09:30:00.171320563,MSFT,9,106.49,@ I,0
1900-01-01 09:30:00.171329137,MSFT,9,106.49,@ Q,0
1900-01-01 09:30:00.369148039,MSFT,20,106.37,@F I,0
1900-01-01 09:30:00.378907585,MSFT,35,106.49,@ I,0
1900-01-01 09:30:00.393842815,MSFT,280,106.37,@,0


We will do our analysis on 1 minute bars. The work done in this notebook can be extended to any time frame (even beyond intraday). As we have seen before we will work with log returns, not returns or price.

In [7]:
AAPL_1Minbars_vol = bs.process_time_bars(AAPL_Price_Vol_Data,frequency='1Min')
IBM_1Minbars_vol = bs.process_time_bars(IBM_Price_Vol_Data,frequency='1Min')
MSFT_1Minbars_vol = bs.process_time_bars(MSFT_Price_Vol_Data,frequency='1Min')

In [8]:
AAPL_1Minbars = get_log_return(AAPL_1Minbars_vol)
IBM_1Minbars = get_log_return(IBM_1Minbars_vol)
MSFT_1Minbars = get_log_return(MSFT_1Minbars_vol)

Ufortunately, the calendars of these match. Often a fair amount of work is needed to align calendars although it is usually worth the work. In our case there are just a few issues with end of the day bars.

In [9]:
AAPL_1Minbars[0:5]

Unnamed: 0_level_0,open,high,low,close,Trade Volume,log returns
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1900-01-01 09:31:00,204.39,204.39,203.02,203.24,2055463,-0.005642
1900-01-01 09:32:00,203.26,204.39,202.28,202.4532,758040,-0.003977
1900-01-01 09:33:00,202.49,204.39,201.63,202.11,903186,-0.001878
1900-01-01 09:34:00,202.16,204.39,201.7,202.42,605879,0.001285
1900-01-01 09:35:00,202.42,204.39,202.39,203.1063,577415,0.003385


In [11]:
AAPL_1Minbars[-20:-1]

Unnamed: 0_level_0,open,high,low,close,Trade Volume,log returns
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1900-01-01 15:57:00,201.675,201.675,201.47,201.635,184265,-0.0001983586
1900-01-01 15:58:00,201.6328,201.65,201.36,201.46,252416,-0.0008573709
1900-01-01 15:59:00,201.45,201.52,201.31,201.4501,212732,4.96401e-07
1900-01-01 16:00:00,201.45,201.89,201.29,201.59,509096,0.0006947202
1900-01-01 16:01:00,201.57,201.59,201.57,201.59,1961223,9.921619e-05
1900-01-01 16:02:00,,,,,0,
1900-01-01 16:03:00,,,,,0,
1900-01-01 16:04:00,,,,,0,
1900-01-01 16:05:00,,,,,0,
1900-01-01 16:06:00,,,,,0,


In [10]:
IBM_1Minbars[0:5]

Unnamed: 0_level_0,open,high,low,close,Trade Volume,log returns
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1900-01-01 09:31:00,116.34,116.34,115.963,116.1,24211,-0.002065
1900-01-01 09:32:00,116.1,116.7562,116.08,116.4811,39270,0.003277
1900-01-01 09:33:00,116.5,116.63,116.1,116.4,21832,-0.000859
1900-01-01 09:34:00,116.445,116.63,116.39,116.63,20725,0.001587
1900-01-01 09:35:00,116.5598,116.89,116.5598,116.89,27293,0.002829


In [12]:
IBM_1Minbars[-20:-1]

Unnamed: 0_level_0,open,high,low,close,Trade Volume,log returns
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1900-01-01 15:43:00,120.13,120.13,120.1,120.12,18444,-8.3e-05
1900-01-01 15:44:00,120.12,120.1447,120.1,120.1301,14904,8.4e-05
1900-01-01 15:45:00,120.1432,120.1652,120.13,120.16,19340,0.00014
1900-01-01 15:46:00,120.16,120.16,120.09,120.09,41443,-0.000583
1900-01-01 15:47:00,120.08,120.12,120.06,120.07,51340,-8.3e-05
1900-01-01 15:48:00,120.067,120.26,120.06,120.19,38291,0.001024
1900-01-01 15:49:00,120.18,120.24,120.14,120.154,31880,-0.000216
1900-01-01 15:50:00,120.155,120.17,120.07,120.08,26119,-0.000624
1900-01-01 15:51:00,120.08,120.1194,120.04,120.04,24983,-0.000333
1900-01-01 15:52:00,120.04,120.12,120.04,120.1157,26512,0.00063


In [13]:
MSFT_1Minbars[0:5]

Unnamed: 0_level_0,open,high,low,close,Trade Volume,log returns
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1900-01-01 09:31:00,106.49,106.5,106.12,106.42,675660,-0.000658
1900-01-01 09:32:00,106.4296,106.43,106.14,106.18,179764,-0.002348
1900-01-01 09:33:00,106.18,106.38,106.13,106.23,245306,0.000471
1900-01-01 09:34:00,106.23,106.32,106.08,106.265,230709,0.000329
1900-01-01 09:35:00,106.26,106.5294,106.18,106.215,254629,-0.000424


In [14]:
MSFT_1Minbars[-20:-1]

Unnamed: 0_level_0,open,high,low,close,Trade Volume,log returns
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1900-01-01 15:57:00,107.67,107.71,107.5855,107.7,143773,0.000279
1900-01-01 15:58:00,107.7,107.71,107.65,107.655,122849,-0.000418
1900-01-01 15:59:00,107.66,107.67,107.52,107.5592,320923,-0.000937
1900-01-01 16:00:00,107.56,107.59,107.47,107.49,356611,-0.000651
1900-01-01 16:01:00,107.49,107.51,107.49,107.51,1864837,0.000186
1900-01-01 16:02:00,,,,,0,
1900-01-01 16:03:00,,,,,0,
1900-01-01 16:04:00,,,,,0,
1900-01-01 16:05:00,,,,,0,
1900-01-01 16:06:00,,,,,0,


In order to align the calendars we will restrict to the trading day, 9:30 to 4:00.

In [15]:
to_drop = []
for time in AAPL_1Minbars.index:
    if time > pd.Timestamp('1900-01-01 16:00:00', freq='T'):
        to_drop.append(time)
        
AAPL_1Minbars = AAPL_1Minbars.drop(to_drop)

In [16]:
AAPL_1Minbars

Unnamed: 0_level_0,open,high,low,close,Trade Volume,log returns
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1900-01-01 09:31:00,204.3900,204.390,203.020,203.2400,2055463,-5.642387e-03
1900-01-01 09:32:00,203.2600,204.390,202.280,202.4532,758040,-3.977199e-03
1900-01-01 09:33:00,202.4900,204.390,201.630,202.1100,903186,-1.878399e-03
1900-01-01 09:34:00,202.1600,204.390,201.700,202.4200,605879,1.285284e-03
1900-01-01 09:35:00,202.4200,204.390,202.390,203.1063,577415,3.384741e-03
...,...,...,...,...,...,...
1900-01-01 15:56:00,201.4000,201.680,201.312,201.6700,354583,1.339718e-03
1900-01-01 15:57:00,201.6750,201.675,201.470,201.6350,184265,-1.983586e-04
1900-01-01 15:58:00,201.6328,201.650,201.360,201.4600,252416,-8.573709e-04
1900-01-01 15:59:00,201.4500,201.520,201.310,201.4501,212732,4.964010e-07


In [17]:
to_drop = []
for time in IBM_1Minbars.index:
    if time > pd.Timestamp('1900-01-01 16:00:00', freq='T'):
        to_drop.append(time)
        
IBM_1Minbars = IBM_1Minbars.drop(to_drop)

In [18]:
IBM_1Minbars

Unnamed: 0_level_0,open,high,low,close,Trade Volume,log returns
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1900-01-01 09:31:00,116.3400,116.3400,115.9630,116.1000,24211,-0.002065
1900-01-01 09:32:00,116.1000,116.7562,116.0800,116.4811,39270,0.003277
1900-01-01 09:33:00,116.5000,116.6300,116.1000,116.4000,21832,-0.000859
1900-01-01 09:34:00,116.4450,116.6300,116.3900,116.6300,20725,0.001587
1900-01-01 09:35:00,116.5598,116.8900,116.5598,116.8900,27293,0.002829
...,...,...,...,...,...,...
1900-01-01 15:56:00,120.1700,120.2500,120.1072,120.2300,36757,0.000499
1900-01-01 15:57:00,120.2200,120.2900,120.2200,120.2800,40464,0.000499
1900-01-01 15:58:00,120.2750,120.2900,120.2300,120.2366,38395,-0.000319
1900-01-01 15:59:00,120.2300,120.2400,120.1400,120.1500,43922,-0.000666


In [19]:
to_drop = []
for time in MSFT_1Minbars.index:
    if time > pd.Timestamp('1900-01-01 16:00:00', freq='T'):
        to_drop.append(time)
        
MSFT_1Minbars = MSFT_1Minbars.drop(to_drop)

In [20]:
MSFT_1Minbars

Unnamed: 0_level_0,open,high,low,close,Trade Volume,log returns
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1900-01-01 09:31:00,106.4900,106.5000,106.1200,106.4200,675660,-0.000658
1900-01-01 09:32:00,106.4296,106.4300,106.1400,106.1800,179764,-0.002348
1900-01-01 09:33:00,106.1800,106.3800,106.1300,106.2300,245306,0.000471
1900-01-01 09:34:00,106.2300,106.3200,106.0800,106.2650,230709,0.000329
1900-01-01 09:35:00,106.2600,106.5294,106.1800,106.2150,254629,-0.000424
...,...,...,...,...,...,...
1900-01-01 15:56:00,107.6400,107.7400,107.6350,107.6650,200114,0.000232
1900-01-01 15:57:00,107.6700,107.7100,107.5855,107.7000,143773,0.000279
1900-01-01 15:58:00,107.7000,107.7100,107.6500,107.6550,122849,-0.000418
1900-01-01 15:59:00,107.6600,107.6700,107.5200,107.5592,320923,-0.000937


Although Pandas dataframes are very useful for a number of applications, they are probably not the best data structure to use in production code. There are a number of reasons for this and what you should use in ti's place depends on the production application. Below we create a standard type of structure that is typically used in production.

In [21]:
mnLogReturns = np.vstack((np.array(AAPL_1Minbars['log returns']),np.array(IBM_1Minbars['log returns']),
                                 np.array(MSFT_1Minbars['log returns'])))

In [22]:
mnLogReturns

array([[-5.64238673e-03, -3.97719898e-03, -1.87839897e-03, ...,
        -8.57370861e-04,  4.96400969e-07,  6.94720155e-04],
       [-2.06504978e-03,  3.27713938e-03, -8.58737708e-04, ...,
        -3.19319320e-04, -6.65612804e-04, -5.82774857e-04],
       [-6.57554859e-04, -2.34796655e-03,  4.70787636e-04, ...,
        -4.17914612e-04, -9.36719469e-04, -6.51011416e-04]])

In [23]:
AAPL_1Minbars.index

DatetimeIndex(['1900-01-01 09:31:00', '1900-01-01 09:32:00',
               '1900-01-01 09:33:00', '1900-01-01 09:34:00',
               '1900-01-01 09:35:00', '1900-01-01 09:36:00',
               '1900-01-01 09:37:00', '1900-01-01 09:38:00',
               '1900-01-01 09:39:00', '1900-01-01 09:40:00',
               ...
               '1900-01-01 15:51:00', '1900-01-01 15:52:00',
               '1900-01-01 15:53:00', '1900-01-01 15:54:00',
               '1900-01-01 15:55:00', '1900-01-01 15:56:00',
               '1900-01-01 15:57:00', '1900-01-01 15:58:00',
               '1900-01-01 15:59:00', '1900-01-01 16:00:00'],
              dtype='datetime64[ns]', name='Date_Time', length=390, freq='T')

In [24]:
vxCalendar = AAPL_1Minbars.index

In [25]:
dbLogReturns = [vxCalendar,['AAPL','IBM','MSFT'],mnLogReturns.transpose()]

In [26]:
dbLogReturns

[DatetimeIndex(['1900-01-01 09:31:00', '1900-01-01 09:32:00',
                '1900-01-01 09:33:00', '1900-01-01 09:34:00',
                '1900-01-01 09:35:00', '1900-01-01 09:36:00',
                '1900-01-01 09:37:00', '1900-01-01 09:38:00',
                '1900-01-01 09:39:00', '1900-01-01 09:40:00',
                ...
                '1900-01-01 15:51:00', '1900-01-01 15:52:00',
                '1900-01-01 15:53:00', '1900-01-01 15:54:00',
                '1900-01-01 15:55:00', '1900-01-01 15:56:00',
                '1900-01-01 15:57:00', '1900-01-01 15:58:00',
                '1900-01-01 15:59:00', '1900-01-01 16:00:00'],
               dtype='datetime64[ns]', name='Date_Time', length=390, freq='T'),
 ['AAPL', 'IBM', 'MSFT'],
 array([[-5.64238673e-03, -2.06504978e-03, -6.57554859e-04],
        [-3.97719898e-03,  3.27713938e-03, -2.34796655e-03],
        [-1.87839897e-03, -8.58737708e-04,  4.70787636e-04],
        ...,
        [-8.57370861e-04, -3.19319320e-04, -4.17914612e-0

In quantitative trading volume contains a substantial amount of information as well. As such we will also create a data structure to hold our volume data.

In [27]:
mnTradeVolume = np.vstack((np.array(AAPL_1Minbars['Trade Volume']),np.array(IBM_1Minbars['Trade Volume']),
                                 np.array(MSFT_1Minbars['Trade Volume'])))

In [28]:
dbTradeVolume = [vxCalendar,['AAPL','IBM','MSFT'],mnTradeVolume.transpose()]

In [29]:
dbTradeVolume

[DatetimeIndex(['1900-01-01 09:31:00', '1900-01-01 09:32:00',
                '1900-01-01 09:33:00', '1900-01-01 09:34:00',
                '1900-01-01 09:35:00', '1900-01-01 09:36:00',
                '1900-01-01 09:37:00', '1900-01-01 09:38:00',
                '1900-01-01 09:39:00', '1900-01-01 09:40:00',
                ...
                '1900-01-01 15:51:00', '1900-01-01 15:52:00',
                '1900-01-01 15:53:00', '1900-01-01 15:54:00',
                '1900-01-01 15:55:00', '1900-01-01 15:56:00',
                '1900-01-01 15:57:00', '1900-01-01 15:58:00',
                '1900-01-01 15:59:00', '1900-01-01 16:00:00'],
               dtype='datetime64[ns]', name='Date_Time', length=390, freq='T'),
 ['AAPL', 'IBM', 'MSFT'],
 array([[2055463,   24211,  675660],
        [ 758040,   39270,  179764],
        [ 903186,   21832,  245306],
        ...,
        [ 252416,   38395,  122849],
        [ 212732,   43922,  320923],
        [ 509096,  111184,  356611]])]

## Exponential Smoothing

Exponential smoothing (http://en.wikipedia.org/wiki/Exponential_smoothing) is a smoothing and forecasting method based on weights that decay exponentially the further back in time observations are observed. If we are smoothing a process $x_{t-1}$ to produce a forecast $s_t$], we begin with a smoothing parameter $0 < \lambda < 1$ and initial condition $s_0$:
$$s_1 = \lambda x_0 + (1-\lambda)s_0$$
$$s_t = \lambda x_{t-1} + (1-\lambda)s_{t-1}$$
Thus, the forecast or smooth for the present time $t$ (i.e., $s_t$)  is a weighted average of the previous observation and the previous forecast. In most cases, the initial condition is assumed to be 0, but if there is a better estimate then there is no reason not to use it.

If we assume history trails off into infinity, then the initial condition goes away and the iteration above is equivalent to:
$$s_t = \lambda \sum_{i=1}^{t-1} (1-\lambda)^i x_{t-i} + (1-\lambda)^t s_0$$

This form motivates the name exponential smoothing. The forecast employs an exponentially damped set of weights applied to the history. Further, examining the weights themselves it is straightforward to show:
$$\lambda \sum_{i=1}^{t-1} (1-\lambda)^i = 1-(1-\lambda)^t \rightarrow \lambda \sum_{i=1}^{t} (1-\lambda)^i=1$$

The weights add to unity; hence, this estimate is also called an exponentially smoothed moving average. Exponential smoothing is extremely easy to apply. It only requires a single parameter and  a single value $s_t$ to bootstrap the forecast from one time step to the next; nevertheless, it accomplishes the relatively sophisticated and intuitively appealing task of exponentially down weighting the past to estimate the future.

The parameter $\lambda$ has a good interpretation as the model's half life $h$, i.e. the period at which the weight placed on an observation is half that of the initial weight. To see this take
$$(1-\lambda)^h = \frac{1}{2} \rightarrow \lambda = 1 - 2^{-1/h} \rightarrow h = -\frac{ln(2)}{ln(1-\lambda)}$$
Thus, a half-life of $h = 20$ periods (roughly a month for daily data) sets $\lambda \approx 0.034$, and a smoothing parameter $\lambda = 0.1$ sets the model's half life to $h \approx 6.6$ periods.

In [30]:
es.xExponentialSmoothLambda(20)

0.0340636710751544

In [31]:
es.xExponentialSmoothHalfLife(0.1)

6.578813478960585

We will develop our own function to perform exponential smoothing. This function we employ will initialize using the initial value of the vector. May languages have built-in exponential smoothing functions. A material problem with the built-in functions is that they only work scalars or vectors. As will be covered shortly, smoothing linear regression models requires that the exponential smoothing function also extends to working with vectors of matrices.

In [32]:
es.xExponentialSmooth([1,2,3,4,5],0.1)

[1.0, 1.1, 1.29, 1.561, 1.9049]

In [33]:
es.xExponentialSmooth(np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5]]),0.1)

[array([1., 1., 1.]),
 array([1.1, 1.1, 1.1]),
 array([1.29, 1.29, 1.29]),
 array([1.561, 1.561, 1.561]),
 array([1.9049, 1.9049, 1.9049])]

In [34]:
es.xExponentialSmooth([np.identity(3),2*np.identity(3),3*np.identity(3),4*np.identity(3)],0.1)

[array([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]),
 array([[1.1, 0. , 0. ],
        [0. , 1.1, 0. ],
        [0. , 0. , 1.1]]),
 array([[1.29, 0.  , 0.  ],
        [0.  , 1.29, 0.  ],
        [0.  , 0.  , 1.29]]),
 array([[1.561, 0.   , 0.   ],
        [0.   , 1.561, 0.   ],
        [0.   , 0.   , 1.561]])]

The ability to smooth matrices is important because it allows us to compute exponentially smoothed regression models by smoothing the information matrix of the regression. It is important to note that both functions have lookahead because the smooth for time $t$ includes data up to and including time $t$. We have to be careful in aligning results so that lookahead is avoided.

## Exponentially Smoothed Regression

This section covers the computation of an exponentially smoothed simple linear regression without lookahead. The extension to multiple linear regression is straightforward. A simple linear regression is the solution of:
$$\left(
\begin{array}{c}
 \Sigma _i y_i \\
 \Sigma _i x_i y_i \\
\end{array}
\right)=\left(
\begin{array}{c}
 \alpha  \\
 \beta  \\
\end{array}
\right) \left(
\begin{array}{cc}
 n & \Sigma _i x_i \\
 \Sigma _i x_i & \Sigma _i x_i^2 \\
\end{array}
\right)$$

We introduce some notation and note that the computation can be re-expressed as:
$$m=\left(
\begin{array}{c}
 \Sigma _i y_i \\
 \Sigma _i x_i y_i \\
\end{array}
\right)=\sum _{i=1}^n y_i \left(
\begin{array}{c}
 1 \\
 x_i \\
\end{array}
\right)$$
$$M=\left(
\begin{array}{cc}
 n & \Sigma _i x_i \\
 \Sigma _i x_i & \Sigma _i x_i^2 \\
\end{array}
\right)=\sum _{i=1}^n \left(
\begin{array}{c}
 1 \\
 x_i \\
\end{array}
\right) \left(
\begin{array}{c}
 1 \\
 x_i \\
\end{array}
\right){}^T$$

If we wished to write the equations for a regression that looked back over $m$ periods, giving each period a weight of $w_i$, then the computation of the regression without lookahead would be:
$$m_t=\sum _{i=t-m}^{t-1} w_i\left(
\begin{array}{c}
 1 \\
 x_i \\
\end{array}
\right)y_i$$
$$M_t=\sum _{i=t-m}^{t-1} w_i\left(
\begin{array}{c}
 1 \\
 x_i \\
\end{array}
\right){} \left(
\begin{array}{c}
 1 \\
 x_i \\
\end{array}
\right)^T$$
$$\left(
\begin{array}{c}
 \alpha _t \\
 \beta _t \\
\end{array}
\right)=\frac{m_t}{M_t}$$

Exponential smoothing can be viewed as a moving average whose weights decay exponentially as one goes further back in time. Clearly, then the exponentially smoothed version of the above is:
$$m_t=(1-\lambda ) m_{t-1}+\lambda  y_{t-1}\left(
\begin{array}{c}
 1 \\
 x_{t-1} \\
\end{array}
\right)$$
$$M_t=(1-\lambda ) M_{t-1}+\lambda  \left(
\begin{array}{c}
 1 \\
 x_{t-1} \\
\end{array}
\right) \left(
\begin{array}{c}
 1 \\
 x_{t-1} \\
\end{array}
\right){}^T$$
$$\left(
\begin{array}{c}
 \alpha _t \\
 \beta _t \\
\end{array}
\right)=\frac{m_t}{M_t}$$

Again, note that there is no lookahead the way the equations above are constructed; the estimates of $(\alpha_t,\beta_t)^T$ use data only from periods $t-1$ and earlier. The version of exponential smoothing we've created, unlike many built-in functions, handles  arbitrary objects, the only requirement being that the observation and the smooth conform so that the basic convex combination works.

## Volatility Smoothing

Volatility smoothing is important. We want to use data across time and across a number of instruments, but the raw data displays large variations in volatility both cross-temporally and cross-sectionally.

In [35]:
[np.std(dbLogReturns[2][0:-1,0]),np.std(dbLogReturns[2][0:-1,1]),np.std(dbLogReturns[2][0:-1,2])]

[0.0008723484387600651, 0.0007433063647429512, 0.0005775364235226612]

In [37]:
[np.mean(dbLogReturns[2][0:-1,0]),np.mean(dbLogReturns[2][0:-1,1]),np.mean(dbLogReturns[2][0:-1,2])]

[-3.224640805801676e-05, 7.210411580413896e-05, 1.823756545518432e-05]

In [36]:
dbLogReturns[1]

['AAPL', 'IBM', 'MSFT']

$Var(r) = \mathbb{E}(r^2) - \mathbb{E}(r)^2$

We can smooth these using exponential smoothing and observing the fact that at the intraday level
$$\mathbb{E}[r^2] \sim Var[r]$$

We can compute these normalized results and put them into a db. It is important to note the use of indexing in order to prevent look ahead.

In [50]:
mnVarSmoothedLogReturns = []
for i in range(len(dbLogReturns[1])):
    VarSmoothed = (dbLogReturns[2].transpose()[i][1:-1])/np.sqrt(es.xExponentialSmooth(dbLogReturns[2].transpose()[i][0:-2]**2,0.1))
    mnVarSmoothedLogReturns.append(VarSmoothed)
mnVarSmoothedLogReturns = np.array(mnVarSmoothedLogReturns)

In [51]:
dbVarSmoothedLogReturns = [vxCalendar,['AAPL','IBM','MSFT'],mnVarSmoothedLogReturns.transpose()]

In [75]:
dbVarSmoothedLogReturns

[DatetimeIndex(['1900-01-01 09:31:00', '1900-01-01 09:32:00',
                '1900-01-01 09:33:00', '1900-01-01 09:34:00',
                '1900-01-01 09:35:00', '1900-01-01 09:36:00',
                '1900-01-01 09:37:00', '1900-01-01 09:38:00',
                '1900-01-01 09:39:00', '1900-01-01 09:40:00',
                ...
                '1900-01-01 15:51:00', '1900-01-01 15:52:00',
                '1900-01-01 15:53:00', '1900-01-01 15:54:00',
                '1900-01-01 15:55:00', '1900-01-01 15:56:00',
                '1900-01-01 15:57:00', '1900-01-01 15:58:00',
                '1900-01-01 15:59:00', '1900-01-01 16:00:00'],
               dtype='datetime64[ns]', name='Date_Time', length=390, freq='T'),
 ['AAPL', 'IBM', 'MSFT'],
 array([[-7.04878835e-01,  1.58695418e+00, -3.57075386e+00],
        [-3.41613565e-01, -3.87465899e-01,  4.85467991e-01],
        [ 2.44809029e-01,  7.48799330e-01,  3.53468291e-01],
        ...,
        [-3.21888445e-01,  1.03551323e+00,  5.49828362e-0

If we have done a reasonable job of estimating volatility then the standard deviation should be near unity.

In [52]:
[np.std(dbVarSmoothedLogReturns[2][0:-1,0]),np.std(dbVarSmoothedLogReturns[2][0:-1,1]),np.std(dbVarSmoothedLogReturns[2][0:-1,2])]

[0.9879617756282181, 1.0126861969268455, 1.072951684365099]

This can be done in the context of volume as well. One of the overall ideas here is that certain explanatory variable are on different scales. It is these differences in scale that can produce effects across assets that will fool models.

In [53]:
[np.mean(dbTradeVolume[2][0:-1,0]),np.mean(dbTradeVolume[2][0:-1,1]),np.mean(dbTradeVolume[2][0:-1,2])]

[154625.3084832905, 20112.987146529562, 63483.12596401028]

In [54]:
[np.std(dbTradeVolume[2][0:-1,0]),np.std(dbTradeVolume[2][0:-1,1]),np.std(dbTradeVolume[2][0:-1,2])]

[177304.51914942756, 12733.801653976585, 53021.110354564626]

As above we will use standard deviation to bring our scales into alignment, but any estiamte of scale could be used in it's place. Just think about why you are using the estimator you've picked. 

In [70]:
mnVarSmoothedTradeVolume = []
for i in range(len(dbTradeVolume[1])):
    VarSmooth = (dbTradeVolume[2].transpose()[i][1:-1])/np.sqrt(es.xExponentialSmooth(dbTradeVolume[2].transpose()[i][0:-2]**2,.9))
    mnVarSmoothedTradeVolume.append(VarSmooth)
mnVarSmoothedTradeVolume = np.array(mnVarSmoothedTradeVolume)

In [71]:
dbVarSmoothedTradeVolume = [vxCalendar,['AAPL','IBM','MSFT'],mnVarSmoothedTradeVolume.transpose()]

In [72]:
[np.std(dbVarSmoothedTradeVolume[2][0:-1,0]),np.std(dbVarSmoothedTradeVolume[2][0:-1,1]),np.std(dbVarSmoothedTradeVolume[2][0:-1,2])]

[0.4848679158158858, 0.5611998722542082, 0.4963947285510212]

In [73]:
dbVarSmoothedTradeVolume[2]

array([[0.36879282, 1.62199   , 0.26605689],
       [0.93173623, 0.57402231, 0.89731615],
       [0.66578665, 0.86532652, 0.92931884],
       ...,
       [0.54199257, 1.12256306, 0.73459793],
       [1.22995688, 0.95881676, 0.82019523],
       [0.85744103, 1.13896479, 2.55102018]])

In [74]:
dbTradeVolume[2]

array([[2055463,   24211,  675660],
       [ 758040,   39270,  179764],
       [ 903186,   21832,  245306],
       ...,
       [ 252416,   38395,  122849],
       [ 212732,   43922,  320923],
       [ 509096,  111184,  356611]])