In [25]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
plt.style.use('fivethirtyeight')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import Adam

import optuna
from optuna.samplers import RandomSampler


from pandas_datareader import data as pdr
import yfinance as yfin
import datetime as dt

# Yahoo API may have broken previous versions of pd_datareader, so this is a workaround.
yfin.pdr_override()
pd.options.mode.chained_assignment = None  # default='warn'

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Yahoo Finance stock scraping.
# **Careful with how many times you run this to avoid IP ban**
TICKER = 'GOOG'
START = dt.datetime(2015, 1, 1)
END = dt.datetime.today()

stock = pdr.get_data_yahoo(TICKER, START, END).rename(columns= {'Adj Close': 'AdjClose'})

print(stock.shape[0], "unique points loaded with attributes: \n", stock.
      keys())

def series_to_supervised(data, n_in=5, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
    data: Sequence of observations as a list or NumPy array.
    n_in: Number of lag observations as input (X).
    n_out: Number of observations as output (y).
    dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
    Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
    if i == 0:
        names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
    else:
        names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
        
    return agg

[*********************100%***********************]  1 of 1 completed
2065 unique points loaded with attributes: 
 Index(['Open', 'High', 'Low', 'Close', 'AdjClose', 'Volume'], dtype='object')


In [27]:
# Specify number of days to use for beta calculation (255 = 1yr).
window = 252 

# Specify a market highly correlated with 'stock'.
market_ticker = 'SPY'

def beta(df, market=None):
    # If the market values are not passed,
    # I'll assume they are located in a column
    # named 'Market'.  If not, this will fail.
    if market is None:
        market = df['MarketClose']
        df = df.drop('MarketClose', axis=1)
    X = market.values.reshape(-1, 1)
    X = np.concatenate([np.ones_like(X), X], axis=1)
    b = np.linalg.pinv(X.T.dot(X)).dot(X.T).dot(df.values)
    return float(b[1])

def roll(df, w=252):
    # Takes 'w'-sized slices from dataframe, incrementing 1 entry at a time.
    for i in range(df.shape[0] - w + 1):
        yield pd.DataFrame(df.values[i:i+w, :], df.index[i:i+w],
                           df.columns)


#### Combining stock + market data and computing.
market = pdr.get_data_yahoo(market_ticker,
                            START,
                            END).rename(columns={'Adj Close': 'MarketClose'})

betas = np.array([])
dat = pd.concat([stock.AdjClose, market.MarketClose], axis=1)
len(dat)

for  i, sdf in enumerate(roll(dat.pct_change().dropna(), window)):
    betas = np.append(betas, beta(sdf))

datFull = dat.drop(index=dat.index[:window], axis=0, inplace=False)
datFull['Beta'] = betas.tolist()

[*********************100%***********************]  1 of 1 completed


In [28]:
LAG = 60 # Number of days to use for predicting the following day(s).
DAYS = 1 # Number of days to predict with each lag period.
TRAIN_RATIO = 0.70


# Selecting 'AdjClose' prices as input and target feature for time series.
data = dat.filter(['AdjClose']).values

# Scaling data. Ensures quicker convergence to solution.
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(data)

# Splitting input features and target object, X and y.
supervised_data = series_to_supervised(scaled_data, n_in=LAG, n_out=DAYS)
y = supervised_data['var1(t)'] # Isolating target object.
X = supervised_data.loc[:, supervised_data.columns != 'var1(t)'] 

# Selecting converted data for train-test split.
len_training = int(np.ceil(len(scaled_data) * TRAIN_RATIO))

X_train = X.iloc[0:len_training].to_numpy()
y_train = y.iloc[0:len_training].to_numpy()
# X_train, y_train = np.array(X_train), np.array(y_train)

# We subtract lag since we need the lag days to actually make test predictions.
X_test = X.iloc[len_training-60:].to_numpy()
y_test = data[len_training:]

# Reshaping to obtain 3D reps (currently 2d) to pass into LSTM.
# LSTM expects d1 # of samples, d2 # of timesteps, and d3 # of features.
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

if len(X_test) != len(y_test):
    raise Warning('X_test, y_test length mismatch.')

# generator = TimeseriesGenerator(scaled_data, scaled_data, length=60, batch_size=1)

In [69]:
data1 = dat = datImport(TICKER, start = START, end=END, verbose=False)
X_train1, y_train1, X_test1, y_test1, scaler1 = data_split(data1, LAG, DAYS, TRAIN_RATIO)

[*********************100%***********************]  1 of 1 completed


In [81]:
len(X_train), len(X_train1)

(1446, 1446)

In [24]:
def datImport(ticker, start=dt.datetime(2023,1,1), end=dt.datetime.today(), verbose=False):
    # importing daily equity or market data.
    data = pdr.get_data_yahoo(ticker, start, end).rename(columns= {'Adj Close': 'AdjClose'})

    if verbose:
        print(data.shape[0], "days loaded with attributes: \n", data.keys())

        fig, ax = plt.subplots(1,1, dpi=300, figsize=(16,8),
            constrained_layout=False)

        ax.plot(data.index, data.AdjClose)

        ax.set_title("Adjusted Closing Prices for %s (USD), %s-%s" 
                    %(ticker, start.year, end.year))
        ax.set_xlabel('Date', fontsize=18)
        ax.set_ylabel('Adjusted Closing Price (USD)', fontsize=18)

        # Set major and minor date tick locators
        maj_loc = mdates.MonthLocator(bymonth=np.arange(1,12,6))
        ax.xaxis.set_major_locator(maj_loc)
        min_loc = mdates.MonthLocator()
        ax.xaxis.set_minor_locator(min_loc)

        # Set major date tick formatter
        zfmts = ['', '%b\n%Y', '%b', '%b-%d', '%H:%M', '%H:%M']
        maj_fmt = mdates.ConciseDateFormatter(maj_loc, zero_formats=zfmts, 
                                            show_offset=False)
        ax.xaxis.set_major_formatter(maj_fmt)

        ax.figure.autofmt_xdate(rotation=0, ha='center')
        ax.set_xlim(data.index.min(), data.index.max());

    return data

In [50]:
START = dt.datetime(2015, 1, 1)
END = dt.datetime.today()
TICKER = 'GOOG'

dat = datImport(TICKER, start = START, end=END, verbose=False)

# data = pdr.get_data_yahoo(TICKER, START, END) #.rename(columns= {'Adj Close': 'AdjClose'})
# data

[*********************100%***********************]  1 of 1 completed


In [42]:
LAG = 60 # Number of days to use for predicting the following day(s).
DAYS = 1 # Number of days to predict with each lag period.
TRAIN_RATIO = 0.70

def data_split(data, lag=60, days=1, train_ratio=0.70):
    """
    Prepping stock data for neural net; scaling down 
    values and making train-test split.
    data: DataFrame, all stock data.
    lag: int, number of days used for prediction.
    days: int, number of days to predict.
    train_ratio: float, percentage of data for training.
    Returns
        X_train: array, independent training features.
        y_train: array, objective training feature.
        X_test: array, independent test features.
        y_test: array, objective test feature.
    """
    # Selecting 'AdjClose' prices as input and target feature for time series.
    data_adj = data.filter(['AdjClose']).values

    # Scaling data. Ensures quicker convergence to solution.
    scaler = MinMaxScaler(feature_range=(0,1))
    scaled_data = scaler.fit_transform(data_adj)

    # Splitting input features and target object, X and y.
    supervised_data = series_to_supervised(scaled_data, n_in=lag, n_out=days)
    y = supervised_data['var1(t)'] # Isolating target object.
    X = supervised_data.loc[:, supervised_data.columns != 'var1(t)'] 

    # Selecting converted data for train-test split.
    len_training = int(np.ceil(len(scaled_data) * train_ratio))

    X_train = X.iloc[0:len_training].to_numpy()
    y_train = y.iloc[0:len_training].to_numpy()
    # X_train, y_train = np.array(X_train), np.array(y_train)

    # We subtract lag since we need the lag days to actually make test predictions.
    X_test = X.iloc[len_training-60:].to_numpy()
    y_test = data_adj[len_training:]

    # Reshaping to obtain 3D reps (currently 2d) to pass into LSTM.
    # LSTM expects d1 # of samples, d2 # of timesteps, and d3 # of features.
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

    if len(X_test) != len(y_test):
        raise Warning('X_test, y_test length mismatch.')
    
    return X_train, y_train, X_test, y_test, scaler

In [44]:
split = data_split(dat)
split

(array([[[0.01275146],
         [0.0084272 ],
         [0.00371951],
         ...,
         [0.02475193],
         [0.02205221],
         [0.02351077]],
 
        [[0.0084272 ],
         [0.00371951],
         [0.00337958],
         ...,
         [0.02205221],
         [0.02351077],
         [0.02191782]],
 
        [[0.00371951],
         [0.00337958],
         [0.00400411],
         ...,
         [0.02351077],
         [0.02191782],
         [0.01976754]],
 
        ...,
 
        [[0.37802882],
         [0.38578154],
         [0.38768802],
         ...,
         [0.50408618],
         [0.49810115],
         [0.4914067 ]],
 
        [[0.38578154],
         [0.38768802],
         [0.38779507],
         ...,
         [0.49810115],
         [0.4914067 ],
         [0.49472024]],
 
        [[0.38768802],
         [0.38779507],
         [0.39591641],
         ...,
         [0.4914067 ],
         [0.49472024],
         [0.48843009]]]),
 array([0.02191782, 0.01976754, 0.01698878, ..., 0.4947

In [6]:
import datetime as dt
from base import *

LAG = 60 # Number of days to use for predicting the following day(s).
DAYS = 1 # Number of days to predict with each lag period.
TRAIN_RATIO = 0.70

TICKER = 'GOOG'
START = dt.datetime(2015, 1, 1)
END = dt.datetime.today()

In [7]:
data1 = dat = datImport(TICKER, start = START, end=END, verbose=False)
X_train1, y_train1, X_test1, y_test1, scaler1 = data_split(data1, LAG, DAYS, TRAIN_RATIO)

[*********************100%***********************]  1 of 1 completed


In [29]:
X_train

array([[[0.01275146],
        [0.0084272 ],
        [0.00371951],
        ...,
        [0.02475193],
        [0.02205221],
        [0.02351077]],

       [[0.0084272 ],
        [0.00371951],
        [0.00337958],
        ...,
        [0.02205221],
        [0.02351077],
        [0.02191782]],

       [[0.00371951],
        [0.00337958],
        [0.00400411],
        ...,
        [0.02351077],
        [0.02191782],
        [0.01976754]],

       ...,

       [[0.37802882],
        [0.38578154],
        [0.38768802],
        ...,
        [0.50408618],
        [0.49810115],
        [0.4914067 ]],

       [[0.38578154],
        [0.38768802],
        [0.38779507],
        ...,
        [0.49810115],
        [0.4914067 ],
        [0.49472024]],

       [[0.38768802],
        [0.38779507],
        [0.39591641],
        ...,
        [0.4914067 ],
        [0.49472024],
        [0.48843009]]])

In [41]:
v= [X_train1, y_train1, X_test1, y_test1, scaler1]

# v[:], np.array(X_train1)
x, y = v[0::1]

ValueError: too many values to unpack (expected 2)

In [19]:
pd.DataFrame({'Column1': Xte, 'Column2': data[:, 1]})

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [18]:
np.asarray(X_train1)

array([[[0.01275146],
        [0.0084272 ],
        [0.00371951],
        ...,
        [0.02475193],
        [0.02205221],
        [0.02351077]],

       [[0.0084272 ],
        [0.00371951],
        [0.00337958],
        ...,
        [0.02205221],
        [0.02351077],
        [0.02191782]],

       [[0.00371951],
        [0.00337958],
        [0.00400411],
        ...,
        [0.02351077],
        [0.02191782],
        [0.01976754]],

       ...,

       [[0.37802882],
        [0.38578154],
        [0.38768802],
        ...,
        [0.50408618],
        [0.49810115],
        [0.4914067 ]],

       [[0.38578154],
        [0.38768802],
        [0.38779507],
        ...,
        [0.49810115],
        [0.4914067 ],
        [0.49472024]],

       [[0.38768802],
        [0.38779507],
        [0.39591641],
        ...,
        [0.4914067 ],
        [0.49472024],
        [0.48843009]]])