In [318]:
import yfinance as yf
import pandas as pd
import datetime
import tensorflow as tf
import keras
import seaborn
import numpy as np

In [319]:
def get_stock(tickr):
    start = pd.to_datetime('2010-08-02')
    end = pd.to_datetime('2018-08-02')
    stock = [tickr]
    data = yf.download(stock, start = start, end = end)
    data.reset_index(inplace = True)
    data = data[['Open', 'High','Low', 'Close']]

    y = pd.DataFrame()

    for i in range(len(data) - 1):
        if data.loc[i + 1, 'Open'] - data.loc[i, 'Close'] > 0:
            y.loc[i, 0] = 1
        else:
            y.loc[i, 0] = 0

    data_train = data.iloc[0:1601,] 
    data_test = data.iloc[1601:2001,]

    y_train = y.iloc[0:1601,] 
    y_test = y.iloc[1601:2001,]

    return data_train, data_test, y_train, y_test

In [320]:
def preprocess_exp(data):
    # use the estimates to calculate the curve
    preprocessed = pd.DataFrame()
    PF = pd.DataFrame()
    
    for col in ['Open', 'High', 'Low', 'Close']:
        PF = np.polyfit(data.index, np.log(data.loc[:, col]), 1)
        # divide by trend
        str_col = col + 'NoExp'
        preprocessed[str_col] = data[col] / (np.exp(PF[0] * data.index + PF[1]))
        preprocessed[str_col] = (preprocessed[str_col] - np.mean(preprocessed[str_col]))/np.std(preprocessed[str_col])

    return preprocessed

In [321]:
# Create fixed-window sequences for training and validation data
def create_sequences(X, y, window_size):
    seq_X = []
    seq_y = []
    for i in range(len(X) - window_size):
        seq_X.append(X.iloc[i:i+window_size,])
        seq_y.append(y.iloc[i+window_size,])
    return np.array(seq_X), np.array(seq_y)

In [322]:
def build_model(x_train, y_train, x_validation, y_validation, window_size, options):
    # Build the FFNN model
    model = keras.Sequential()
    model.add(keras.layers.Flatten(input_shape=(window_size, 4))) 
    model.add(keras.layers.Dense(options[0][0], activation='relu'))

    if len(options[0]) > 1:
        if len(options[0]) > 2:
            for i in range(1,len(options[0])-1):
                model.add(keras.layers.Dense(options[0][i], activation=options[1]))
                
    model.add(keras.layers.Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(x_train, y_train, epochs=options[3], batch_size=options[2], validation_data=(x_validation, y_validation), verbose = 0)

    # Make predictions
    predictions = model.predict(x_validation)

    # Evaluate the model
    loss, accuracy = model.evaluate(x_validation, y_validation)

    return accuracy, model

In [323]:
def get_accuracy(model, x_validation, y_validation):
    predictValidation = model.predict(x_validation)

    modelResults = pd.DataFrame()
    modelResults["realValues"] = pd.DataFrame(y_validation)
    modelResults["estimatedValues"] = pd.DataFrame(predictValidation)

    for i in range(len(modelResults)):
        if modelResults.loc[i, "estimatedValues"] > 0.5:
            modelResults.loc[i, "estimatedValues"] = 1
        else:
            modelResults.loc[i, "estimatedValues"] = 0

    from sklearn.metrics import confusion_matrix
    cf = confusion_matrix(np.array(modelResults["realValues"]), np.array(modelResults["estimatedValues"]))

    print(cf)

In [324]:
tickers = ['GOOG','AAPL','MSFT','AMZN','NVDA']
window_size = 5

x_train_total = []
y_train_total = []

x_validation_total = []
y_validation_total = []

for tickr in tickers:
    data_train, data_test, y_train, y_test = get_stock(tickr)
    preprocessed = preprocess_exp(data_train)
    x_train_seq, y_train_seq = create_sequences(preprocessed.iloc[0:1301,], y_train.iloc[:1301,], window_size)
    x_validation_seq, y_validation_seq = create_sequences(preprocessed.iloc[1301:,], y_train.iloc[1301:1601,], window_size)

    x_train_total.append(x_train_seq)
    y_train_total.append(y_train_seq)

    x_validation_total.append(x_validation_seq)
    y_validation_total.append(y_validation_seq)

x_train_total = np.array(x_train_total).reshape((len(tickers)*(1301-window_size)),window_size,4)
y_train_total = np.array(y_train_total).reshape((len(tickers)*(1301-window_size)),1)

x_validation_total = np.array(x_validation_total).reshape((len(tickers)*(1601-1301-window_size)),window_size,4)
y_validation_total = np.array(y_validation_total).reshape((len(tickers)*(1601-1301-window_size)),1)



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [325]:
# Print the class distribution before and after undersampling
print("Class distribution before undersampling:")
print(pd.Series(y_train_total.flatten()).value_counts())

zeros = pd.Series(y_train_total.flatten()).value_counts()[0]
ones = pd.Series(y_train_total.flatten()).value_counts()[1]

indexes = np.array(np.where(y_train_total == 1)[0])

random_numbers = np.random.choice(indexes, size=ones-zeros, replace=False)

x_train_total = np.delete(x_train_total, random_numbers, axis = 0)
y_train_total = np.delete(y_train_total, random_numbers, axis = 0)

# Print the class distribution before and after undersampling
print("Class distribution after undersampling:")
print(pd.Series(y_train_total.flatten()).value_counts())


Class distribution before undersampling:
1.0    3458
0.0    3022
dtype: int64
Class distribution after undersampling:
0.0    3022
1.0    3022
dtype: int64


In [326]:
# # Print the class distribution before and after undersampling
# print("Class distribution before undersampling:")
# print(pd.Series(y_validation_total.flatten()).value_counts())

# zeros = pd.Series(y_validation_total.flatten()).value_counts()[0]
# ones = pd.Series(y_validation_total.flatten()).value_counts()[1]

# indexes = np.array(np.where(y_validation_total == 1)[0])

# random_numbers = np.random.choice(indexes, size=ones-zeros, replace=False)

# x_validation_total = np.delete(x_validation_total, random_numbers, axis = 0)
# y_validation_total = np.delete(y_validation_total, random_numbers, axis = 0)

# # Print the class distribution before and after undersampling
# print("Class distribution after undersampling:")
# print(pd.Series(y_validation_total.flatten()).value_counts())

In [327]:
lays = [[64,64]]
epochs = [10]
batchSizes = [64]
activationFunctions = ['sigmoid']

options = []

for layer in lays:
    for activation in activationFunctions:
        for batchSize in batchSizes:
            for epoch in epochs:
                options.append([layer, activation, batchSize, epoch, 0])

for i in range(len(options)):
    ac=[]
    for j in range(10):
        a, model = build_model(x_train_total, y_train_total, x_validation_total, y_validation_total, window_size, options[i])
        ac.append(a)
    options[i][4] = np.mean(ac)


op = pd.DataFrame(options)
res = op.sort_values(4, ascending=False)
print(res.head(84))

get_accuracy(model, x_validation_total, y_validation_total)

          0        1   2   3         4
0  [64, 64]  sigmoid  64  10  0.525966
[[595  72]
 [696 112]]
