In [None]:
# Datasets grabbed from https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs
# Let's look at the problem of forecasting stocks

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

In [None]:
# Let's import one of the datasets from the link.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Column names that the rest of the notebook treats as the numeric features.
numeric_cols = ['Open', 'High', 'Low', 'Close', 'Volume']

# Read a single ticker's CSV; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='aadr.us.csv')


In [None]:
# Let's visualize some of our data.

import plotly.graph_objects as go
# Candlestick chart of the first 100 rows, using the row index as the x-axis.
first_rows = df.iloc[:100]
candles = go.Candlestick(
    x=first_rows.index,
    open=first_rows['Open'],
    high=first_rows['High'],
    low=first_rows['Low'],
    close=first_rows['Close'],
)
fig = go.Figure(data=[candles])

# Layout: a centered, top-anchored title (its text is left empty) and a grey monospace font.
title_style = dict(text='', y=0.9, x=0.5, xanchor='center', yanchor='top')
font_style = dict(family="Courier New, monospace", size=20, color="#7f7f7f")
fig.update_layout(title=title_style, font=font_style)
fig.show()

In [None]:
# To forecast, each row needs features built from *past* data. For every row we
# attach the numeric columns from each of the previous 14 rows as new columns,
# named "<column>_past<k>" (e.g. "Open_past1", "Close_past2").
# NOTE(review): shifting by rows assumes the frame is sorted chronologically with
# one row per trading day -- confirm against the CSV.
df['Date'] = df.index

n_past_days = 14
past_features = pd.DataFrame(
    {
        col + '_past' + str(days_past): df[col].shift(days_past)
        for days_past in range(1, n_past_days + 1)
        for col in numeric_cols
    },
    index=df.index,
)

# Remove lag columns left over from an earlier run of this cell so that
# re-executing it does not create duplicate column names.
df = df.drop(columns=past_features.columns, errors='ignore')

# A single concat avoids inserting the lag columns one at a time.
df = pd.concat([df, past_features], axis=1)

# Keep every column except those whose name contains 'prev_date'.
df = df[[c for c in df.columns if 'prev_date' not in c]]


In [None]:
# Preview the first five rows. Expect NaN in the past-value columns here: the
# earliest rows of the dataset have no earlier rows to take past values from.
df.head(n=5)

In [None]:
# Drop all rows that have any null values (for example the earliest rows, which
# have no past values to fill their lag columns).
df = df.dropna(axis=0, how='any')

In [None]:
# Names of the lag-feature columns, ordered by lag first and then by base column:
# Open_past1, High_past1, ..., Volume_past1, Open_past2, ...
# The range is inclusive of day 14 so that all 14 past days are used as features.
n_past_days = 14
past_cols = [
    c + '_past' + str(days_past)
    for days_past in range(1, n_past_days + 1)
    for c in numeric_cols
]

In [None]:
# Same-day columns; used below both as regression targets and as extra features.
# Note: this binds a second name to the same list object as numeric_cols (no copy is made).
current_cols = numeric_cols

In [None]:
# Inputs are the lagged columns; targets are the same-day columns.
x, y = df[past_cols], df[current_cols]

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# First attempt: Ridge regression with default settings, then predictions on
# the very same rows it was fitted to.
mdl = Ridge()
mdl.fit(x, y)
yhat = mdl.predict(x)

In [None]:
# What's our eval metric?
# Let's compute the average of High and Low, minus the previous day's close. This represents
# a prediction of how high the stock will be tomorrow relative to its close today.
# You can choose other evaluation metrics.
# Let's also use this for our loss function

In [None]:
# Target: the next row's High/Low midpoint minus the current row's Close.
# Dropping the first midpoint and the last close lines the two arrays up.
midpoint = (df['High'] + df['Low']) / 2
y_diff = midpoint.iloc[1:].to_numpy() - df['Close'].iloc[:-1].to_numpy()
y_diff

In [None]:
# The final row has no following row to supply a target value, so it is
# removed from the feature matrix before selecting the feature columns.
x_with_current = df.iloc[:-1][current_cols + past_cols]

In [None]:
# Sanity check: the feature matrix and the target vector should have matching row counts.
tuple(data.shape for data in (x_with_current, y_diff))

In [None]:
# Fit a Ridge (L2-regularised linear) model on the new target, then predict
# on the rows it was trained on.
mdl = Ridge()
mdl.fit(x_with_current, y_diff)
yhat = mdl.predict(x_with_current)

In [None]:
# Correlation between the predictions and the true targets.
# np.corrcoef returns the full correlation matrix; the off-diagonal entry is the
# prediction-vs-target coefficient.
predictive_correlation = np.corrcoef(x=yhat, y=y_diff)
predictive_correlation[1][0]

In [None]:
# Try again with a GradientBoostingRegressor.
# random_state is pinned so that re-running the notebook reproduces the same fit.
treemdl = GradientBoostingRegressor(random_state=0).fit(x_with_current, y_diff)
treeyhat = treemdl.predict(x_with_current)

# Correlation between predictions and targets on the training rows
# (this is not a held-out score).
predictive_correlation_tree = np.corrcoef(treeyhat, y_diff)
predictive_correlation_tree[1,0]

In [None]:
# Try again with an MLPRegressor (one hidden layer of 100 units).
# hidden_layer_sizes is passed by keyword so the call does not depend on the
# constructor's positional parameter order; random_state pins the random weight
# initialisation so re-running the notebook reproduces the same fit.
nnmdl = MLPRegressor(hidden_layer_sizes=[100], random_state=0).fit(x_with_current, y_diff)
nnyhat = nnmdl.predict(x_with_current)

# Correlation between predictions and targets on the training rows
# (this is not a held-out score).
predictive_correlation_nn = np.corrcoef(nnyhat, y_diff)
predictive_correlation_nn[1,0]

In [None]:
# Where should we go from here? We still haven't implemented validation.
# We are also still only using one of our stock datasets.
# How would we use the other stock datasets to help our model learn better?
# How would we learn from our historical data better than just feeding in the past 14 days as features?
# Can we utilize local connections or parameter sharing here?
# Your project is to treat this as a problem to solve, and do it to the best of your abilities.