In [1]:
# Datasets grabbed from https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs
# Let's look at the problem of forecasting stocks

In [2]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from data import picture, dataShift, get_x_y, get_mdl_inputs, calc_y, get_trn_val

In [3]:
# Let's import one of the datasets from the link.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load one ticker's price/volume history. The path is relative, so the CSV is
# read from the notebook's current working directory.
# NOTE(review): the file name suggests the AADR ticker from the Kaggle dataset
# linked in the first cell — confirm.
df = pd.read_csv('aadr.us.csv')

# Column names handed to the `data` helpers below (dataShift, get_x_y,
# get_mdl_inputs, get_trn_val) as the set of numeric columns to work with.
numeric_cols = ['Open', 'High', 'Low', 'Close', 'Volume']


In [24]:
# Toggle for the exploratory call to `picture` (imported from the local
# `data` module); set to False to skip it when re-running the notebook.
draw = True
if draw:
    picture(df)

In [5]:
# Pre-process the raw frame with the project helper `dataShift`.
# NOTE(review): judging by its name it adds shifted (lagged) copies of
# `numeric_cols` — confirm against data.py.
cleaner = dataShift(df, numeric_cols)

In [6]:
# Derive three objects from the processed frame: `x`, `y` and `past_cols`.
# NOTE(review): presumably features, target and the names of the lagged
# columns respectively — confirm against data.get_x_y.
x, y, past_cols = get_x_y(numeric_cols, cleaner)

In [7]:
# Build the arrays used by every model below: `x_with_current` is the feature
# matrix and `y_diff` the target (the next cell shows their shapes).
# NOTE(review): the names suggest the features include the current row's
# values next to the lagged ones and the target is a difference/return —
# confirm against data.get_mdl_inputs.
y_diff, x_with_current = get_mdl_inputs(cleaner, numeric_cols, past_cols)

In [8]:

x_with_current.shape, y_diff.shape

((1558, 35), (1558,))

In [9]:
# mdl = Ridge().fit(x_with_current, y_diff)
# yhat = mdl.predict(x_with_current)
# mdl = Ridge().fit(x, y)
# yhat = mdl.predict(x)


# predictive_correlation = np.corrcoef(yhat, y_diff)
# predictive_correlation[1,0]

# treemdl = GradientBoostingRegressor().fit(x_with_current, y_diff)
# treeyhat = treemdl.predict(x_with_current)

# predictive_correlation_tree = np.corrcoef(treeyhat, y_diff)
# predictive_correlation_tree[1,0]


# nnmdl = MLPRegressor([100]).fit(x_with_current, y_diff)
# nnyhat = nnmdl.predict(x_with_current)

# predictive_correlation_nn = np.corrcoef(nnyhat, y_diff)
# predictive_correlation_nn[1,0]

In [10]:
# Produce the training / validation arrays used by the next two cells.
# Note that only the feature matrix and `numeric_cols` are passed in — no
# target array — so the targets returned here are computed inside get_trn_val.
# NOTE(review): 0.6 is presumably the training fraction; confirm in data.py.
x_trn, y_trn, x_val, y_val = get_trn_val(x_with_current, numeric_cols, 0.6)

In [11]:
# Fit gradient-boosted trees on the training split and score them on the
# validation split. The seed is pinned so the reported correlation is
# reproducible across kernel restarts.
mdl = GradientBoostingRegressor(random_state=0).fit(x_trn, y_trn)
yhat = mdl.predict(x_val)
# np.corrcoef returns the 2x2 correlation matrix of (prediction, target);
# the off-diagonal entry is the correlation between the two.
predictive_correlation = np.corrcoef(yhat, y_val)
predictive_correlation[1,0]


0.023253785102871884

In [12]:
# Fit a ridge regression on the training split.
mdl = Ridge().fit(x_trn, y_trn)
# Predictions are made on the *training* features, so the correlation below is
# an in-sample figure. It is not directly comparable with a correlation
# computed against x_val / y_val.
yhat = mdl.predict(x_trn)
# 2x2 correlation matrix of (prediction, target); [1,0] is the off-diagonal.
predictive_correlation = np.corrcoef(yhat, y_trn)
predictive_correlation[1,0]

0.224573138745675

In [13]:
predictive_correlation

array([[1.        , 0.22457314],
       [0.22457314, 1.        ]])

In [14]:
def validation(mdl, x_val, y_val, threshold, initial_money=100):
    """Simulate a threshold trading rule and return the final balance.

    The model predicts one value per row of ``x_val``. For each period, if the
    prediction is strictly greater than ``threshold`` the whole balance is
    invested and compounds by that period's return (``money * (1 + y)``);
    otherwise the balance is carried over unchanged.

    Parameters
    ----------
    mdl : object
        Anything exposing ``predict(x_val)`` that returns one prediction per
        period.
    x_val : array-like
        Features passed straight to ``mdl.predict``.
    y_val : iterable of float
        Realised per-period returns, expressed as fractions (0.01 == +1%).
    threshold : float
        Minimum prediction (exclusive) required to invest in a period.
    initial_money : float, optional
        Starting balance. Defaults to 100.

    Returns
    -------
    float
        Balance after walking through every period; ``initial_money`` itself
        when there are no periods or none is invested in.
    """
    money = initial_money
    predictions = mdl.predict(x_val)
    for prediction, y in zip(predictions, y_val):
        # Branch explicitly instead of blending both outcomes with boolean
        # arithmetic: with the blend, a NaN return in a period we did NOT
        # invest in still turned the balance into NaN (0 * NaN is NaN).
        if prediction > threshold:
            money = money * (1 + y)
    return money

In [15]:
def baseline(y_val):
    """Return the balance from staying invested through every period.

    Starts from 100 and compounds each fractional return in ``y_val`` in
    order. This is the reference figure the thresholded strategy in
    ``validation`` is compared against.
    """
    balance = 100
    for period_return in y_val:
        growth = 1 + period_return
        balance *= growth
    return balance

In [16]:
baseline(y_val)

115.47425914555012

In [17]:
# Evaluate a Ridge model for several values of `pct_train`. For each split we
# record the in-sample correlation, the validation correlation, the final
# balance of the thresholded strategy and the final balance of always being
# invested. Both balances start from 100, so `profit` and `baseline_profits`
# hold final balances rather than gains.
profit = []
validations_correlations = []
baseline_profits = []
training_correlations = []
for pct_train in [.5, .6, .7, .8, .9]:
    # NOTE(review): pct_train / val_percent are presumably the training and
    # validation fractions of the series — confirm in data.get_trn_val.
    x_trn, y_trn, x_val, y_val = get_trn_val(x_with_current, numeric_cols, pct_train, val_percent=0.1)

    # Fit on the training split only. (A second Ridge that used to be fitted
    # on the validation split was never used and has been dropped.)
    mdl = Ridge().fit(x_trn, y_trn)
    yhat = mdl.predict(x_trn)

    # The trading threshold comes from the training predictions: invest only
    # when a prediction exceeds their 30th percentile.
    threshold = np.percentile(yhat, 30)

    predictive_correlation = np.corrcoef(yhat, y_trn)
    training_correlations.append(predictive_correlation[1, 0])

    # Validation score of the model fitted above.
    yhat_val = mdl.predict(x_val)
    predictive_correlation_val = np.corrcoef(yhat_val, y_val)
    validations_correlations.append(predictive_correlation_val[1, 0])

    baseline_profits.append(baseline(y_val))
    profit.append(validation(mdl, x_val, y_val, threshold))

In [18]:
# `yhat` here is whatever the loop above left behind: the training-set
# predictions from its final iteration (pct_train = 0.9). This inspects their
# 10th percentile; the loop itself thresholds at the 30th percentile.
np.percentile(yhat,10)

-0.001883820237290244

In [19]:
training_correlations

[0.2537294228762245,
 0.224573138745675,
 0.2300496052095839,
 0.23613228048066873,
 0.23224889138305954]

In [20]:
sum(profit)/len(profit)

115.26626447696921

In [21]:
validations_correlations

[0.013653639505625058,
 0.17299048765477906,
 0.06974306527386948,
 0.15012758564700382,
 0.1377407306094663]

In [22]:
sum(baseline_profits)/len(baseline_profits)

116.66678651147879

In [23]:
x_trn.shape

(1402, 35)