In [None]:
# This is just a preamble that sets a bunch of options up.

# render graphs inline
%matplotlib inline

import matplotlib

import numpy as np
import matplotlib.pyplot as plt

# Global plot look: ggplot theme and a wide 15x5 default figure size.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (15, 5)})

In [None]:
# Relative path of the GLD quotes CSV that is loaded below.
path = "../data/GLD.quotes.csv"

### First, we have to be able to work with our data

In [None]:
import pandas as pd

# Use the fully-qualified option name: on recent pandas the bare 'precision'
# pattern also matches 'styler.format.precision', and set_option raises
# OptionError when a pattern matches more than one key.
pd.set_option('display.precision', 5)
# Render every float with exactly three decimals.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Load the raw quotes and preview the first rows.
dataset = pd.read_csv(path)
dataset.head()

In [None]:
# Derive a new column: each row's high minus its low.
dataset['range'] = dataset['high'].sub(dataset['low'])
dataset.head()

In [None]:
# Build the working frame: indexed by date and sorted by that index.
# The first row is today's quote, whose date format is off, so it is skipped.
df = dataset.iloc[1:].copy()
df['date'] = pd.to_datetime(df['date'], format='%Y/%m/%d')
df = df.set_index('date').sort_index()
df.head()

In [None]:
# Cast every column to float in one go so all values are numeric.
df = df.astype(float)

In [None]:
# Rescale volume into millions; pop() removes the raw column as it is read,
# so only the scaled 'volume(MM)' column remains.
df['volume(MM)'] = df.pop('volume') / 1000000
df.head()

In [None]:
# Line plot of the scaled volume over the date index.
df['volume(MM)'].plot.line()

## Plot prices!

In [None]:
# Plot from the cleaned frame: it is date-indexed, sorted, and excludes the
# malformed first row, so the x-axis is time rather than raw row position.
df[['open', 'close', 'high', 'low']].plot()

# Let's try some simple predictions using ML!

In [None]:
# Choose the target column (Y) and the explanatory columns (X).
dep = 'close'
indeps = ['volume(MM)', 'open', 'high', 'low']

X, Y = df[indeps], df[dep]

# Preview both pieces.
display(X.head(), Y.head())

In [None]:
# Split the data into training (80%) and test (20%) sets.
from sklearn.model_selection import train_test_split

# Seed the shuffle so the split, and every score computed from it, is the
# same on each fresh run of the notebook.
train, test = train_test_split(df, train_size=0.8, random_state=42)

In [None]:
# Peek at the first five held-out rows.
test.head(n=5)

In [None]:
def prep_data(df):
    """Split a frame into a ``(features, target)`` pair.

    The feature column names come from the module-level ``indeps`` and the
    target column name from the module-level ``dep``. Both are looked up at
    call time, so rebinding either one changes what later calls return.
    """
    return df[indeps], df[dep]
# Feature/target pairs for the training rows and for the held-out rows.
(train_X, train_Y), (test_X, test_Y) = prep_data(train), prep_data(test)

In [None]:
# Import the algorithm!
from sklearn.linear_model import LinearRegression

In [None]:
# Create the regressor and fit it on the training rows (fit returns the
# estimator itself, so both steps chain).
model = LinearRegression().fit(train_X, train_Y)

# Report the model score on the training rows first, then on the test rows.
for features, target in ((train_X, train_Y), (test_X, test_Y)):
    print(model.score(features, target))

## How does the machine predict?

In [None]:
# Render the fitted model as a readable equation:
# one "<coefficient> * <column>" term per feature, then the intercept.

coefs = [
    "{:.3} * {}".format(weight, column)
    for weight, column in zip(model.coef_, indeps)
]

eq = "close = " + ' + '.join(coefs) + " +  {}".format(model.intercept_)

print("Linear regression model:")
print(eq)

# Maybe we can use previous days!

In [None]:
days_to_look_back = 5

# One column per lag: "close-k" holds the close from k rows earlier.
lagged_closes = {
    "close-{}".format(lag): df['close'].shift(lag)
    for lag in range(1, days_to_look_back + 1)
}
df = df.assign(**lagged_closes)

df.head()

In [None]:
# Discard every row that still contains a NaN (the shifted columns leave
# NaNs in the leading rows).
df = df.dropna()
df.head()

In [None]:
# Refit using only the lagged closes as features.

# prep_data looks up `indeps` when it is called, so rebinding it here switches
# the feature columns it returns.
indeps = [c for c in df.columns if c.startswith("close-")]
# Seeded shuffle keeps the split and the resulting scores reproducible.
train, test = train_test_split(df, train_size=0.8, random_state=42)
X2, Y2 = prep_data(train)
X3, Y3 = prep_data(test)

# Report row counts: len() of a frame is its number of rows.
print('Train size:', len(X2))
print('Test size:', len(X3))

model2 = LinearRegression()
model2.fit(X2, Y2)

In [None]:
# Compare predicted and actual closes on the held-out rows.

Y3_pred = model2.predict(X3)

fig, ax = plt.subplots()
ax.scatter(Y3_pred, Y3, c='r')
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Model score on the training rows, then on the held-out rows.
for features, target in ((X2, Y2), (X3, Y3)):
    print(model2.score(features, target))

# Which days are smart investments?

In [None]:
# Return of a row: the open-to-close change as a fraction of the open.
df['return'] = df['close'].sub(df['open']).div(df['open'])

In [None]:
# Lagged open-to-close returns: prev_return_k holds the return from k rows
# earlier.

days_to_look_back = 5

for i in range(1, days_to_look_back + 1):
    df['prev_return_%d' % i] = df['return'].shift(i)

# Shifting leaves NaNs in the leading rows; drop any row that has one.
df.dropna(inplace=True)
df.head()

In [None]:
# Fit the current row's return on the lagged returns. The split is
# positional: the first 80% of rows train the model, the remainder test it.

lag_columns = ['prev_return_%d' % i for i in range(1, days_to_look_back + 1)]
X = df[lag_columns]
Y = df['return']

train_len = int(len(df) * 0.8)

X_train, X_test = X.iloc[:train_len], X.iloc[train_len:]
Y_train, Y_test = Y.iloc[:train_len], Y.iloc[train_len:]

print('Train size:', len(X_train))
print('Test size:', len(X_test))

model = LinearRegression()
model.fit(X_train, Y_train)

In [None]:
# Model score on the training rows, then on the test rows.
for features, target in ((X_train, Y_train), (X_test, Y_test)):
    print(model.score(features, target))

In [None]:
# Compare predicted and actual returns on the test rows; the predictions are
# wrapped in a Series that shares the test index.
Y_pred = pd.Series(model.predict(X_test), index=Y_test.index)

fig, ax = plt.subplots()
ax.scatter(Y_pred, Y_test, c='r')
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Line plot of the per-row return over the date index.
df['return'].plot.line()

In [None]:
# Line plot of the close over the date index.
df['close'].plot.line()

## What have we learned?