In [1]:
import pandas as pd
import numpy as np

%matplotlib notebook
import matplotlib.pyplot as plt

import matplotlib.dates as mdates

from numpy import diag, allclose, corrcoef, array, eye, ones, sqrt, zeros

from numpy.linalg import eig, matrix_rank, inv, cholesky, qr, norm, inv, svd

In [2]:
# import data 

## SP futures
es = pd.read_csv('data/es.csv')

## VIX futures
vix1 = pd.read_csv('data/vix1.csv')
vix2 = pd.read_csv('data/vix2.csv')
vix3 = pd.read_csv('data/vix3.csv')

## SP volatility
month1_vol = pd.read_csv('data/1month_vol.csv')
month2_vol = pd.read_csv('data/2month_vol.csv')
month3_vol = pd.read_csv('data/3month_vol.csv')

## SP skew
month1_skew = pd.read_csv('data/1month_skew.csv')
month2_skew = pd.read_csv('data/2month_skew.csv')
month3_skew = pd.read_csv('data/3month_skew.csv')

In [3]:
# clean 

## set date as index
es.set_index('Date', inplace=True)

vix1.set_index('Date', inplace=True)
vix2.set_index('Date', inplace=True)
vix3.set_index('Date', inplace=True)

month1_vol.set_index('Date', inplace=True)
month2_vol.set_index('Date', inplace=True)
month3_vol.set_index('Date', inplace=True)

month1_skew.set_index('Date', inplace=True)
month2_skew.set_index('Date', inplace=True)
month3_skew.set_index('Date', inplace=True)

## drop bad data
vix2 = vix2.drop(['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'], axis=1)
vix3 = vix3.drop(['Unnamed: 6', 'Unnamed: 7'], axis=1)

## reverse time series
es = es.iloc[::-1]

vix1 = vix1.iloc[::-1]
vix2 = vix2.iloc[::-1]
vix3 = vix3.iloc[::-1]

month1_vol = month1_vol.iloc[::-1]
month2_vol = month2_vol.iloc[::-1]
month3_vol = month2_vol.iloc[::-1]

month1_skew = month1_skew.iloc[::-1]
month2_skew = month2_skew.iloc[::-1]
month3_skew = month3_skew.iloc[::-1]

In [4]:
# Data 1
df_es_vix1 = es.merge(vix1, left_index=True, right_index=True)
df_es_vix1 = df_es_vix1.drop(['ES_PX_LAST', 'ES_PX_VOLUME', '1_Future', '1_PX_LAST', '1_PX_VOLUME', 
                              '1_DAYS_UNTIL_EXP'], axis=1)
df_es_vix1.head()

Unnamed: 0_level_0,ES_DAILY_RETURN,1_DAILY_RETURN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1/2/08,-0.012693,0.032007
1/3/08,0.000171,-0.018441
1/4/08,-0.024507,0.0538
1/7/08,-0.001054,-0.02269
1/8/08,-0.017235,0.058872


In [5]:
array = df_es_vix1.to_numpy()

In [6]:
# Linear regression using least squares
A = array[:,0].reshape(-1,1)
b = array[:,1].reshape(-1,1)

In [30]:
x_hat = inv(A.T @ A) @ A.T @ b
p = A @ x_hat
e = b - p
print(x_hat)
print()
print(p)

[[-2.79791763]]

[[ 0.03551258]
 [-0.00047959]
 [ 0.06856936]
 ...
 [ 0.00156192]
 [-0.00026049]
 [ 0.0101572 ]]


In [31]:
r2 = 1 - sum((A - A.mean())**2) / sum(e**2)
r2

array([0.85406674])

In [32]:
# Regression using svd
U, s, V = svd(A, full_matrices=True)

In [34]:
Ur = U[:,:1]
p = Ur @ Ur.T @ b
e = b - p2

print(p)

[[ 0.03551258]
 [-0.00047959]
 [ 0.06856936]
 ...
 [ 0.00156192]
 [-0.00026049]
 [ 0.0101572 ]]


In [35]:
r2 = 1 - sum((A - A.mean())**2) / sum(e**2)
r2

array([0.85406674])

In [12]:
# Predict

In [36]:
x_hat = inv(A[:2000].T @ A[:2000]) @ A[:2000].T @ b[:2000]
p = A[:2000] @ x_hat
e = b[:2000] - p
print(x_hat)
print()
print(p)

[[-2.62507698]]

[[ 0.0333188 ]
 [-0.00044996]
 [ 0.06433351]
 ...
 [-0.02555971]
 [ 0.02312568]
 [ 0.03814969]]


In [37]:
r2 = 1 - sum((A[:2000] - A[:2000].mean())**2) / sum(e**2)
r2

array([0.8184918])

In [40]:
# predict
p = A[-522:] @ x_hat
e = b[-522:] - p

print(p)

[[-0.0476705 ]
 [ 0.0094269 ]
 [ 0.02806726]
 [ 0.02135764]
 [-0.00932018]
 [ 0.05059926]
 [-0.01142899]
 [-0.03511374]
 [-0.03400858]
 [ 0.04782657]
 [ 0.00584569]
 [ 0.01903767]
 [-0.02735812]
 [-0.02191862]
 [ 0.00223764]
 [ 0.00319936]
 [-0.03075136]
 [ 0.02311309]
 [ 0.02427669]
 [ 0.03417565]
 [-0.00359331]
 [ 0.03360046]
 [ 0.07005493]
 [ 0.0291977 ]
 [-0.0037766 ]
 [-0.01474184]
 [ 0.05931992]
 [-0.04604174]
 [ 0.05416064]
 [ 0.00280008]
 [ 0.02522765]
 [-0.00849081]
 [-0.05395443]
 [ 0.04008279]
 [-0.03614262]
 [ 0.02907522]
 [-0.00805024]
 [-0.06874122]
 [-0.00170018]
 [ 0.04587513]
 [-0.01521784]
 [ 0.0010316 ]
 [ 0.04472022]
 [ 0.03254662]
 [ 0.00531536]
 [ 0.00213046]
 [ 0.03162743]
 [-0.04855925]
 [-0.04308616]
 [-0.04725486]
 [ 0.00853295]
 [ 0.00273945]
 [-0.02982263]
 [ 0.027454  ]
 [-0.01952367]
 [-0.02753934]
 [ 0.01043032]
 [ 0.01790363]
 [-0.06598406]
 [-0.00729925]
 [-0.0092642 ]
 [-0.00593461]
 [-0.00526331]
 [ 0.02363751]
 [-0.01060102]
 [ 0.00032995]
 [-0.04124

In [41]:
r2 = 1 - sum((A[-522:] - A[-522].mean())**2) / sum(e**2)
r2

array([0.74014075])

In [50]:
# Predict using svd
U, s, V = svd(A[:2000], full_matrices=True)
Ur = U[:,:1]
p = Ur @ Ur.T @ b[:2000]
e = b[:2000] - p

print(x_hat)
print()
print(p)

[[-2.62507698]]

[[ 0.0333188 ]
 [-0.00044996]
 [ 0.06433351]
 ...
 [-0.02555971]
 [ 0.02312568]
 [ 0.03814969]]


In [44]:
r2 = 1 - sum((A[:2000] - A[:2000].mean())**2) / sum(e**2)
r2

array([0.8184918])

In [49]:
U, s, V = svd(A[-522:], full_matrices=True)
U2, s2, V2 = svd(b)
Ur = U[:,:1]
p = Ur @ x_hat
e = b[-522:] - p

print(p)

[[-0.31106497]
 [ 0.06151347]
 [ 0.18314765]
 [ 0.13936529]
 [-0.0608171 ]
 [ 0.33017606]
 [-0.07457777]
 [-0.22912815]
 [-0.22191662]
 [ 0.31208337]
 [ 0.03814497]
 [ 0.12422675]
 [-0.17852032]
 [-0.14302585]
 [ 0.01460133]
 [ 0.02087684]
 [-0.20066226]
 [ 0.15082015]
 [ 0.15841302]
 [ 0.22300684]
 [-0.02344748]
 [ 0.21925355]
 [ 0.45713034]
 [ 0.19052414]
 [-0.02464347]
 [-0.09619515]
 [ 0.38708109]
 [-0.3004368 ]
 [ 0.35341514]
 [ 0.01827142]
 [ 0.16461831]
 [-0.05540522]
 [-0.35206957]
 [ 0.26155274]
 [-0.23584192]
 [ 0.18972492]
 [-0.05253032]
 [-0.44855798]
 [-0.0110942 ]
 [ 0.29934961]
 [-0.09930115]
 [ 0.00673151]
 [ 0.29181346]
 [ 0.21237687]
 [ 0.03468436]
 [ 0.0139019 ]
 [ 0.20637892]
 [-0.31686432]
 [-0.28115067]
 [-0.30835278]
 [ 0.05568019]
 [ 0.01787576]
 [-0.19460201]
 [ 0.17914596]
 [-0.12739806]
 [-0.17970282]
 [ 0.06806113]
 [ 0.11682677]
 [-0.43056664]
 [-0.04762991]
 [-0.0604518 ]
 [-0.03872522]
 [-0.03434477]
 [ 0.15424217]
 [-0.06917495]
 [ 0.00215302]
 [-0.26916

In [48]:
r2 = 1 - sum((A[-522:] - A[-522].mean())**2) / sum(e**2)
r2

array([0.94723047])

In [21]:
# sklearn

In [51]:
array = df_es_vix1.to_numpy()

In [52]:
X = array[:,0].reshape(-1,1)
y = array[:,1].reshape(-1,1)

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [54]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(X_train, y_train)

score = model.score(X_test, y_test)
print(f"R2 Score: {score}")

R2 Score: 0.5164250729225874


In [55]:
print('Weight coefficients: ', model.coef_)
print('y-axis intercept: ', model.intercept_)
print(type(X))
print(type(y))

Weight coefficients:  [[-2.88046794]]
y-axis intercept:  [-0.00120885]
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [56]:
predictions = model.predict(X)
print(f"True output: {y[0]}")
print(f"Predicted output: {predictions[0]}")
print(f"Prediction Error: {predictions[0]-y[0]}")

True output: [0.03200692]
Predicted output: [0.0353515]
Prediction Error: [0.00334458]


In [28]:
# pd.DataFrame({"Predicted": predictions, "Actual": y, "Error": predictions - y})

In [57]:
e = predictions - y
e

array([[ 0.00334458],
       [ 0.01673832],
       [ 0.01558343],
       ...,
       [ 0.00039915],
       [ 0.007353  ],
       [-0.01302368]])