In [None]:
!pip install ta
!pip install tensorflow-gpu==1.14

In [None]:
import os, random, ta, time, gc
import pandas as pd
import numpy as np
from collections import deque
import warnings
warnings.filterwarnings('once')

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from keras.callbacks.callbacks import CSVLogger, ModelCheckpoint
from keras.callbacks.tensorboard_v1 import TensorBoard
from keras.optimizers import Adam
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, CuDNNLSTM, CuDNNGRU, BatchNormalization, RNN
from keras.layers import Conv1D, TimeDistributed, MaxPooling1D, Flatten
import keras.backend as K

from hyperopt import fmin, tpe, hp, Trials

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

In [None]:
class Data():
    '''Namespace of static helpers that turn daily price files into labelled,
    windowed training samples.

    Pipeline: `get` (load + label) -> `featuresN` (feature engineering and
    scaling) -> `create_seq` (sliding windows) -> `normalize_movement`
    (class balancing). `preprocess` chains these steps.
    '''

    @staticmethod
    def get(tickers, to_pred):
        '''Load price files and label each row with the future price direction.

        Args:
            tickers: iterable of file names located in the stocks directory.
            to_pred: number of rows to look ahead when building `Target`.

        Returns:
            List of DataFrames indexed by `Date`. `Target` is 1 when the close
            `to_pred` rows later is higher than the current close, else 0.
            Files that cannot be read or are too small are skipped.
        '''
        dir_path = '../input/price-volume-data-for-all-us-stocks-etfs/Data/Stocks/'
        data = []
        for ticker in tickers:
            try:
                df = pd.read_csv(dir_path + ticker)
            except (OSError, ValueError):
                # Missing/unreadable file (OSError) or empty/malformed CSV
                # (pandas raises ValueError subclasses): skip this ticker.
                continue
            # DataFrame.size counts cells (rows * columns), not rows.
            if df.size < 1000:
                continue
            df = df.drop(columns=['OpenInt'], errors='ignore')
            df = df.set_index('Date').sort_index()
            # Zeros are treated as missing values and forward filled; leading
            # rows that still contain gaps are removed.
            df = df.replace({0: np.nan}).ffill().dropna()
            future_close = df['Close'].shift(-to_pred)
            df['Target'] = (future_close > df['Close']).astype(int)
            # Rows whose future close does not exist cannot be labelled;
            # keeping them would mark every one of them as class 0.
            df = df[future_close.notna()].copy()
            data.append(df)
        return data

    @staticmethod
    def preprocess(tickers, seq_len, to_pred=None, features=1, start=1, step=1):
        '''Build balanced (X, y) arrays of sequences for a set of tickers.

        Args:
            tickers: iterable whose items are either file names (loaded with
                `Data.get`) or DataFrames already returned by `Data.get`.
            seq_len: number of rows per sequence.
            to_pred: look-ahead used for labelling; required when an item is a
                file name, ignored for DataFrames (they are already labelled).
            features: key (1-6) selecting one of the `featuresN` builders.
            start, step: forwarded to `create_seq`.

        Returns:
            Tuple `(X, y)` of numpy arrays.

        Raises:
            ValueError: if a file name is given without `to_pred`.
            KeyError: if `features` is not one of 1-6.
        '''
        X = []
        y = []
        chooseFeatures = {
            1: Data.features1,
            2: Data.features2,
            3: Data.features3,
            4: Data.features4,
            5: Data.features5,
            6: Data.features6,
        }
        build_features = chooseFeatures[features]
        print(build_features.__doc__)
        print('---------------------')
        for ticker in tickers:
            if isinstance(ticker, pd.DataFrame):
                frames = [ticker]
            else:
                if to_pred is None:
                    raise ValueError('to_pred is required when tickers are file names')
                frames = Data.get([ticker], to_pred)
            if not frames:
                continue
            df = build_features(frames[0])
            sequential_data = Data.create_seq(df, seq_len, start=start, step=step)
            sequential_data = Data.normalize_movement(sequential_data)
            # Append sequences and targets of df
            for seq, target in sequential_data:
                X.append(seq)
                y.append(target)
        return np.array(X), np.array(y)

    @staticmethod
    def _standardize(df, pct_change=False):
        # Shared tail of the featuresN builders. Works in place on a frame the
        # caller owns: optionally replaces every feature column with its
        # percentage change, drops incomplete rows once, then scales each
        # feature column over exactly the rows that are kept.
        feature_cols = [col for col in df.columns if col != 'Target']
        if pct_change:
            for col in feature_cols:
                df[col] = df[col].pct_change()
        df.dropna(inplace=True)
        for col in feature_cols:
            df[col] = preprocessing.scale(df[col].values)
        df.dropna(inplace=True)
        return df

    # NOTE: the docstrings of features1..features6 are printed by `preprocess`,
    # so they are part of the program output and are kept short on purpose.

    @staticmethod
    def features1(df_org):
        '''Percentage change features'''
        return Data._standardize(df_org.copy(), pct_change=True)

    @staticmethod
    def features2(df_org):
        '''Close and Volume pct.change features'''
        df = df_org[['Close', 'Volume', 'Target']].copy()
        return Data._standardize(df, pct_change=True)

    @staticmethod
    def features3(df_org):
        '''Features without percentage change'''
        return Data._standardize(df_org.copy())

    @staticmethod
    def features4(df_org, n_components=16):
        '''Denoized pct change features'''
        # n_components is not used here; it is kept so the signature matches
        # features6.
        df = df_org.copy()
        for col in df.columns:
            # Smooth every price column; Volume is differenced unsmoothed.
            if col != 'Target' and col != 'Volume':
                df[col] = df[col].ewm(span=10).mean()
        return Data._standardize(df, pct_change=True)

    @staticmethod
    def features5(df_org):
        '''Main technical features'''
        high, low = df_org['High'], df_org['Low']
        close, volume = df_org['Close'], df_org['Volume']
        df = pd.DataFrame(index=df_org.index)
        df['SMA'] = close.rolling(10).mean()
        df['EWMA'] = close.ewm(span=10).mean()
        df['%K'] = ta.momentum.stoch(high=high, low=low, close=close)
        df['%D'] = ta.momentum.stoch_signal(high=high, low=low, close=close)
        df['%R'] = ta.momentum.wr(high=high, low=low, close=close)
        df['MACD'] = ta.trend.macd(close=close)
        df['RSI'] = ta.momentum.rsi(close=close)
        df['CCI'] = ta.trend.cci(high=high, low=low, close=close)
        df['ADI'] = ta.volume.acc_dist_index(high=high, low=low,
                                             close=close, volume=volume)
        df['MOM'] = close.diff(periods=9)
        # Target must stay the last column: create_seq splits on it.
        df['Target'] = df_org['Target']
        return Data._standardize(df)

    @staticmethod
    def features6(df_org, n_components=16):
        '''All technical features compressed with PCA'''
        # Create technical features
        df = ta.add_all_ta_features(df_org.copy(), "Open", "High", "Low", "Close", "Volume", fillna=True)
        df = df.reset_index().drop(['Date'], axis=1)
        # Separate out the features
        X_tmp = df.drop(['Target'], axis=1)
        # Standardize the features
        X_tmp = StandardScaler().fit_transform(X_tmp)
        # Apply PCA
        pca = PCA(n_components=n_components)
        principalComponents = pca.fit_transform(X_tmp)
        principalDf = pd.DataFrame(data=principalComponents)
        # Both frames share the default integer index, so concat aligns rows.
        df = pd.concat([principalDf, df['Target']], axis=1)
        return df

    @staticmethod
    def create_seq(df, seq_len, start=0, step=1):
        '''Cut a feature frame into shuffled `[window, target]` pairs.

        `Target` must be the last column of `df`. Each window holds `seq_len`
        consecutive rows of the feature columns; its target is the `Target`
        value of the window's last row.

        Args:
            df: frame of feature columns followed by `Target`.
            seq_len: number of rows per window.
            start: index of the first window to keep (ignored when step == 1).
            step: keep every `step`-th window.

        Returns:
            Randomly ordered list of `[numpy array, target]` pairs.
        '''
        sequential_data = []
        if step < 6:
            # Small steps: build every window with a rolling deque, then thin.
            prev_days = deque(maxlen=seq_len)
            for val in df.values:
                prev_days.append(val[:-1])
                if len(prev_days) == seq_len:
                    sequential_data.append([np.array(prev_days), val[-1]])
            if step > 1:
                sequential_data = sequential_data[start::step]
        else:
            # Large steps: slice only the windows that are needed. The upper
            # bound is inclusive of the last full window (i + seq_len == rows)
            # so both branches select the same windows.
            for i in range(start, df.shape[0] - seq_len + 1, step):
                sequential_data.append([
                    df.iloc[i:i+seq_len].values[:, :-1],
                    float(df['Target'].iloc[i+seq_len-1])
                ])
        random.shuffle(sequential_data)
        return sequential_data

    @staticmethod
    def normalize_movement(sequential_data):
        '''Balance the classes by down-sampling the larger one.

        Args:
            sequential_data: iterable of `[sequence, target]` pairs.

        Returns:
            Shuffled list with equally many target-1 ("buy") and target-0
            ("sell") pairs. Pairs with any other target value are discarded.
        '''
        buys = []
        sells = []
        for seq, target in sequential_data:
            if target == 0:
                sells.append([seq, target])
            elif target == 1:
                buys.append([seq, target])
        random.shuffle(buys)
        random.shuffle(sells)

        lower = min(len(buys), len(sells))
        sequential_data = buys[:lower] + sells[:lower]
        random.shuffle(sequential_data)
        return sequential_data

    @staticmethod
    def split(X, y, test_size=0.2):
        '''Split X and y into a leading train part and a trailing test part.

        Args:
            X, y: sliceable sequences of equal length.
            test_size: fraction of samples placed in the test part; the count
                is rounded down.

        Returns:
            `(X_train, y_train, X_test, y_test)`.
        '''
        testing = int(test_size * len(X))
        # Slice from the front: X[:-0] would be empty when `testing` is 0.
        cut = len(X) - testing
        return X[:cut], y[:cut], X[cut:], y[cut:]

In [None]:
class Model_LSTM:
    '''Stacked CuDNNLSTM up/down classifier with training and evaluation helpers.

    The instance owns two Keras callbacks created from `name`: a CSV logger
    writing to `<name>.csv` (append mode) and a checkpoint writing the model
    with the best `val_acc` to `<name>.hdf5`. The Keras model itself is stored
    in `self.model` by `build` or `load`.
    '''

    def __init__(self, name='model'):
        '''Create the callbacks; `name` is the prefix of the files they write.'''
        self.name = name
        self.csv_logger = CSVLogger(self.name+'.csv', separator=',', append=True)
        self.checkpoint = ModelCheckpoint(
            self.name+'.hdf5',
            monitor='val_acc',
            verbose=0,
            save_best_only=True,
            mode='max'
        )

    def build(self, input_shape, lr=3e-5, n_RN=128, n_DN=32, dropoutR=0.2, dropoutD=0.2):
        '''Build and compile the network and store it in `self.model`.

        Args:
            input_shape: shape of one sample, without the batch dimension.
            lr: Adam learning rate; the decay is set to `lr * 1e-3`.
            n_RN: units in each of the three recurrent layers.
            n_DN: units in the dense hidden layer.
            dropoutR: dropout rate after each recurrent layer.
            dropoutD: dropout rate after the dense hidden layer.
        '''
        model = Sequential()
        model.add(CuDNNLSTM(n_RN, input_shape=(input_shape), return_sequences=True))
        model.add(Dropout(dropoutR))
        model.add(BatchNormalization())

        model.add(CuDNNLSTM(n_RN, return_sequences=True))
        model.add(Dropout(dropoutR))
        model.add(BatchNormalization())

        # Last recurrent layer returns only its final output.
        model.add(CuDNNLSTM(n_RN))
        model.add(Dropout(dropoutR))
        model.add(BatchNormalization())

        model.add(Dense(n_DN, activation='relu'))
        model.add(Dropout(dropoutD))

        # Two-way softmax over the integer class labels 0 / 1.
        model.add(Dense(2, activation='softmax'))

        # Compile model
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=Adam(lr=lr, decay=lr*1e-3),
            metrics=['acc'],
        )
        self.model = model

    def train(self, X, y, batch_size=32 ,epochs=1, verbose=0):
        '''Fit on (X, y) without validation data.'''
        self.model.fit(
            X, y,
            batch_size=batch_size,
            epochs=epochs,
            verbose=verbose,
            # The checkpoint monitors `val_acc`, which does not exist without
            # validation data, so it could never save here; only log to CSV.
            callbacks=[self.csv_logger],
        )

    def train_test(self, X_train, y_train, X_val, y_val, batch_size=32 ,epochs=1 ,verbose=2, visualize=False):
        '''Fit with validation data, optionally plotting the learning curves.

        Args:
            X_train, y_train: training samples and labels.
            X_val, y_val: validation samples and labels.
            batch_size, epochs, verbose: forwarded to `model.fit`.
            visualize: when True, plot accuracy and loss per epoch.

        Returns:
            The object returned by `model.fit` (the Keras training history).
        '''
        history = self.model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(X_val, y_val),
            verbose=verbose,
            callbacks=[self.csv_logger, self.checkpoint],
        )

        if visualize:
            fig, axarr = plt.subplots(1, 2, figsize=(16, 8))
            # Plot training & validation accuracy values
            axarr[0].plot(history.history['acc'])
            axarr[0].plot(history.history['val_acc'])
            axarr[0].set_title('Model accuracy')
            axarr[0].set_ylabel('Accuracy')
            axarr[0].set_xlabel('Epoch')
            axarr[0].legend(['Train', 'Test'], loc='upper left')

            # Plot training & validation loss values
            axarr[1].plot(history.history['loss'])
            axarr[1].plot(history.history['val_loss'])
            axarr[1].set_title('Model loss')
            axarr[1].set_ylabel('Loss')
            axarr[1].set_xlabel('Epoch')
            axarr[1].legend(['Train', 'Test'], loc='upper left')

            plt.show()
        return history

    def evaluate(self, X, y, is_print=True):
        '''Evaluate on (X, y) and return `{'loss': ..., 'acc': ...}`.

        The dictionary is also printed when `is_print` is True.
        '''
        eval_score = self.model.evaluate(X, y, verbose=0)
        score = {'loss': eval_score[0], 'acc': eval_score[1]}
        if is_print:
            print(score)
        return score

    def save(self, name):
        '''Save the Keras model to the path `name`.'''
        self.model.save(name)

    def load(self, name):
        '''Replace `self.model` with the model stored at the path `name`.'''
        self.model = load_model(name)
        
        
class Model_GRU(Model_LSTM):
    '''Variant of Model_LSTM whose three recurrent layers are CuDNNGRU.'''

    def build(self, input_shape, lr=3e-5, n_RN=128, n_DN=32, dropoutR=0.2, dropoutD=0.2):
        '''Build and compile the GRU network and store it in `self.model`.

        Args:
            input_shape: shape of one sample, without the batch dimension.
            lr: Adam learning rate; the decay is set to `lr * 1e-3`.
            n_RN: units in each recurrent layer.
            n_DN: units in the dense hidden layer.
            dropoutR: dropout rate after each recurrent layer.
            dropoutD: dropout rate after the dense hidden layer.
        '''
        stack = [
            # Recurrent stage 1: receives the input, emits the full sequence.
            CuDNNGRU(n_RN, input_shape=(input_shape), return_sequences=True),
            Dropout(dropoutR),
            BatchNormalization(),
            # Recurrent stage 2: still emits the full sequence.
            CuDNNGRU(n_RN, return_sequences=True),
            Dropout(dropoutR),
            BatchNormalization(),
            # Recurrent stage 3: emits only its final output.
            CuDNNGRU(n_RN),
            Dropout(dropoutR),
            BatchNormalization(),
            # Classification head.
            Dense(n_DN, activation='relu'),
            Dropout(dropoutD),
            Dense(2, activation='softmax'),
        ]

        network = Sequential()
        for layer in stack:
            network.add(layer)

        optimizer = Adam(lr=lr, decay=lr*1e-3)
        network.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=optimizer,
            metrics=['acc'],
        )
        self.model = network
        
        
class Model_RNN(Model_LSTM):
    '''Variant of Model_LSTM whose three recurrent layers are plain RNNs.'''

    def build(self, input_shape, lr=3e-5, n_RN=128, n_DN=32, dropoutR=0.2,dropoutD=0.2):
        '''Build and compile the simple-RNN network and store it in `self.model`.

        Args:
            input_shape: shape of one sample, without the batch dimension.
            lr: Adam learning rate; the decay is set to `lr * 1e-3`.
            n_RN: units in each recurrent layer.
            n_DN: units in the dense hidden layer.
            dropoutR: dropout rate after each recurrent layer.
            dropoutD: dropout rate after the dense hidden layer.
        '''
        # Keras ships CuDNN kernels only for LSTM and GRU; there is no
        # `CuDNNRNN` layer, so the vanilla recurrent layer is SimpleRNN.
        # Imported locally because the notebook's import cell does not list it.
        from keras.layers import SimpleRNN

        model = Sequential()
        model.add(SimpleRNN(n_RN, input_shape=(input_shape), return_sequences=True))
        model.add(Dropout(dropoutR))
        model.add(BatchNormalization())

        model.add(SimpleRNN(n_RN, return_sequences=True))
        model.add(Dropout(dropoutR))
        model.add(BatchNormalization())

        # Last recurrent layer returns only its final output.
        model.add(SimpleRNN(n_RN))
        model.add(Dropout(dropoutR))
        model.add(BatchNormalization())

        model.add(Dense(n_DN, activation='relu'))
        model.add(Dropout(dropoutD))

        model.add(Dense(2, activation='softmax'))

        # Compile model
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=Adam(lr=lr, decay=lr*1e-3),
            metrics=['acc'],
        )
        self.model = model
        
class Model_CLSTM(Model_LSTM):
    '''Variant of Model_LSTM with two Conv1D/MaxPooling1D stages in front of
    the three CuDNNLSTM layers.'''

    def build(self, input_shape, lr=8e-5, n_F=64, kS=3, pS=2, n_RN=220, n_DN=120, dropoutC=0.3, dropoutR=0.3, dropoutD=0.2):
        '''Build and compile the convolutional-LSTM network into `self.model`.

        Args:
            input_shape: shape of one sample, without the batch dimension.
            lr: Adam learning rate; the decay is set to `lr * 1e-3`.
            n_F: filters of each Conv1D layer.
            kS: kernel size of each Conv1D layer.
            pS: pool size of each MaxPooling1D layer.
            n_RN: units in each recurrent layer.
            n_DN: units in the dense hidden layer.
            dropoutC: dropout rate after each convolution.
            dropoutR: dropout rate after each recurrent layer.
            dropoutD: dropout rate after the dense hidden layer.
        '''
        stack = [
            # Convolutional stage 1: receives the input.
            Conv1D(filters=n_F, kernel_size=kS, activation='relu', data_format="channels_last", input_shape=(input_shape)),
            Dropout(dropoutC),
            MaxPooling1D(pool_size=pS),
            # Convolutional stage 2.
            Conv1D(filters=n_F, kernel_size=kS, activation='relu'),
            Dropout(dropoutC),
            MaxPooling1D(pool_size=pS),
            # Recurrent stages 1 and 2 emit the full sequence.
            CuDNNLSTM(n_RN, return_sequences=True),
            Dropout(dropoutR),
            BatchNormalization(),
            CuDNNLSTM(n_RN, return_sequences=True),
            Dropout(dropoutR),
            BatchNormalization(),
            # Recurrent stage 3 emits only its final output.
            CuDNNLSTM(n_RN),
            Dropout(dropoutR),
            BatchNormalization(),
            # Classification head.
            Dense(n_DN, activation='relu'),
            Dropout(dropoutD),
            Dense(2, activation='softmax'),
        ]

        network = Sequential()
        for layer in stack:
            network.add(layer)

        optimizer = Adam(lr=lr, decay=lr*1e-3)
        network.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=optimizer,
            metrics=['acc'],
        )
        self.model = network

In [None]:
def hyperparameter_tuning():
    '''Search Model_LSTM hyperparameters with hyperopt's TPE algorithm.

    Trains one epoch per trial on sequences from up to 300 training tickers and
    scores the trial by its loss on the last 300 tickers of the shuffled
    listing. Prints the best values, writes every trial to `tpe_results.csv`
    and plots the trial history.
    '''
    seq_len = 60
    to_pred = 3

    def objective(space):
        # hp.quniform yields floats, hence the int() casts.
        model = Model_LSTM(name='name')
        model.build(
            input_shape=input_shape,
            lr=space['lr'],
            n_RN=int(space['n_RN']),
            n_DN=int(space['n_DN']),
            dropoutR=space['dropoutR'],
            dropoutD=space['dropoutD'],
        )
        model.train(
            X,
            y,
            batch_size=int(space['batch_size']),
            epochs=1,
        )
        score = model.evaluate(X_val, y_val)
        return score['loss']

    # The ticker shuffle draws from the stdlib `random` module, so that is the
    # generator that has to be seeded; numpy stays seeded as before.
    random.seed(0)
    np.random.seed(0)
    # os.listdir returns entries in arbitrary order; sort so the seeded shuffle
    # selects the same tickers on every run.
    tickets = sorted(os.listdir("../input/price-volume-data-for-all-us-stocks-etfs/Data/Stocks"))
    random.shuffle(tickets)
    train_tickets = tickets[:-300]
    validation_tickets = tickets[-300:]

    # Data.preprocess loads and labels each file itself, so it is given the
    # file names together with the look-ahead.
    X, y = Data.preprocess(
        train_tickets[:300],
        seq_len=seq_len,
        to_pred=to_pred,
        features=1
    )
    X_val, y_val = Data.preprocess(
        validation_tickets,
        seq_len=seq_len,
        to_pred=to_pred,
        features=1
    )
    input_shape = X.shape[1:]

    # Each hyperopt label equals its dictionary key, so `best` and the CSV
    # columns use the same names as the build()/train() arguments.
    space = {
        'lr': hp.uniform('lr', 1e-6, 1e-4),
        'n_RN': hp.quniform('n_RN', 128, 256, 1),
        'n_DN': hp.quniform('n_DN', 32, 256, 1),
        'dropoutR': hp.uniform('dropoutR', 0.1, 0.5),
        'dropoutD': hp.uniform('dropoutD', 0.1, 0.5),
        'batch_size': hp.quniform('batch_size', 32, 64, 1),
    }

    trials = Trials()
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=40,
        trials=trials,
    )

    print('----------BestValues----------')
    for key, val in best.items():
        print(f'{key}: {val}')
    print('-----------------------------')

    # Collect loss and sampled values of every trial, one column per name
    tpe_results = {'loss': []}
    for key in trials.trials[0]['misc']['vals']:
        tpe_results[key] = []
    for trial in trials.trials:
        tpe_results['loss'].append(trial['result']['loss'])
        for key, val in trial['misc']['vals'].items():
            tpe_results[key].append(val[0])

    tpe_results_df = pd.DataFrame(tpe_results)
    tpe_results_df.to_csv('tpe_results.csv', index=False)
    tpe_results_df.plot(subplots=True, figsize=(10, 10))

In [None]:
def test(train_data, validation_data, params, features=1, model='LSTM'):
    '''Train one model on `train_data` and validate it on `validation_data`.

    Args:
        train_data: passed as the first argument of `Data.preprocess` to build
            the training sequences.
        validation_data: same, for the validation sequences.
        params: dict with the keys 'seq_len', 'to_pred', 'lr', 'n_RN', 'n_DN',
            'dropoutR', 'dropoutD', 'batch_size' and 'epochs'.
        features: feature set key forwarded to `Data.preprocess`.
        model: 'LSTM', 'GRU', 'RNN' or 'CLSTM'; any other value selects LSTM.

    Raises:
        ValueError: if preprocessing yields no training or validation samples.
    '''
    X_train, y_train = Data.preprocess(train_data,
                                       seq_len=params['seq_len'],
                                       to_pred=params['to_pred'],
                                       features=features)
    X_validate, y_validate = Data.preprocess(validation_data,
                                             seq_len=params['seq_len'],
                                             to_pred=params['to_pred'],
                                             features=features)
    if X_train.size == 0 or X_validate.size == 0:
        raise ValueError('preprocessing produced no sequences for training or validation')
    input_shape = X_train.shape[1:]

    model_classes = {
        'LSTM': Model_LSTM,
        'GRU': Model_GRU,
        'RNN': Model_RNN,
        'CLSTM': Model_CLSTM,
    }
    # Unknown names fall back to the LSTM model.
    model_name = model if model in model_classes else 'LSTM'
    Model = model_classes[model_name]
    print(model_name)
    print(Model)
    # The name becomes the prefix of the CSV log and checkpoint files.
    name = f"{model_name}-f{features}-sl{params['seq_len']}-tp{params['to_pred']}"
    model = Model(name=name)
    model.build(
        input_shape=input_shape,
        lr=params['lr'],
        n_RN=params['n_RN'],
        n_DN=params['n_DN'],
        dropoutR=params['dropoutR'],
        dropoutD=params['dropoutD'],
    )
    model.train_test(
        X_train,
        y_train,
        X_validate,
        y_validate,
        batch_size=params['batch_size'],
        epochs=params['epochs'],
        visualize=True,
    )

In [None]:
def repetitiveness_test():
    '''Train the same LSTM configuration five times on identical data.

    The data selection is seeded, so the five runs differ only in the model
    training itself; each run plots its learning curves.
    '''
    # params = {
    #     'seq_len': 60,
    #     'to_pred': 3,
    #     'batch_size': 45,
    #     'epochs': 10,
    #     'n_RN': 152,
    #     'n_DN': 231,
    #     'dropoutR': 0.17,
    #     'dropoutD': 0.39,
    #     'lr': 4.4e-5,
    # }
    params = {
        'seq_len': 60,
        'to_pred': 3,
        'batch_size': 32,
        'epochs': 10,
        'n_RN': 128,
        'n_DN': 32,
        'dropoutR': 0.3,
        'dropoutD': 0.2,
        'lr': 4e-5,
    }
    random.seed(0)
    # os.listdir returns entries in arbitrary order; sort so the seeded
    # shuffles select the same tickers on every run.
    tickets = sorted(os.listdir("../input/price-volume-data-for-all-us-stocks-etfs/Data/Stocks"))
    random.shuffle(tickets)
    random.shuffle(tickets)
    train_tickets = tickets[:-300]
    random.shuffle(train_tickets)
    val_tickets = tickets[-300:]

    # Data.preprocess loads and labels each file itself, so it is given the
    # file names together with the look-ahead.
    X, y = Data.preprocess(
        train_tickets[:500],
        seq_len=params['seq_len'],
        to_pred=params['to_pred'],
        features=1,
    )
    X_val, y_val = Data.preprocess(
        val_tickets,
        seq_len=params['seq_len'],
        to_pred=params['to_pred'],
        features=1,
    )
    input_shape = X.shape[1:]
    for i in range(5):
        # Re-seed from the clock so the runs are not tied to the data seed.
        random.seed(int(time.time()))
        model = Model_LSTM()
        model.build(
            input_shape=input_shape,
            lr=params['lr'],
            n_RN=params['n_RN'],
            dropoutR=params['dropoutR'],
            n_DN=params['n_DN'],
            dropoutD=params['dropoutD'],
        )
        model.train_test(
            X,
            y,
            X_val,
            y_val,
            batch_size=params['batch_size'],
            epochs=params['epochs'],
            visualize=True,
        )

In [None]:
def preprocessing_methods_test():
    '''Compare feature sets 1-3 by training one model per set on the same
    tickers with the same hyperparameters.'''
    params = {
        'seq_len': 60,
        'to_pred': 3,
        'batch_size': 45,
        'epochs': 3,
        'n_RN': 152,
        'n_DN': 231,
        'dropoutR': 0.17,
        'dropoutD': 0.39,
        'lr': 4.4e-5,
    }

    # The ticker shuffles draw from the stdlib `random` module, so that is the
    # generator that has to be seeded; numpy stays seeded as before.
    random.seed(10)
    np.random.seed(10)
    # os.listdir returns entries in arbitrary order; sort so the seeded
    # shuffles select the same tickers on every run.
    tickers = sorted(os.listdir("../input/price-volume-data-for-all-us-stocks-etfs/Data/Stocks"))
    random.shuffle(tickers)
    train_tickers = tickers[:-300]
    random.shuffle(train_tickers)
    data = Data.get(train_tickers[:500], to_pred=params['to_pred'])
    val_tickers = tickers[-300:]
    val_data = Data.get(val_tickers, to_pred=params['to_pred'])

    for features in range(1, 4):
        test(data, val_data, params, features=features)

In [None]:
def horizon_test():
    '''Train one model per prediction horizon (10 and 15 rows ahead) with a
    sequence length of 120.'''
    params = {
        'seq_len': 60,
        'to_pred': 3,
        'batch_size': 45,
        'epochs': 10,
        'n_RN': 152,
        'n_DN': 231,
        'dropoutR': 0.17,
        'dropoutD': 0.39,
        'lr': 4.4e-5,
    }

    # The ticker shuffles draw from the stdlib `random` module, so that is the
    # generator that has to be seeded; numpy stays seeded as before.
    random.seed(0)
    np.random.seed(0)
    # os.listdir returns entries in arbitrary order; sort so the seeded
    # shuffles select the same tickers on every run.
    tickers = sorted(os.listdir("../input/price-volume-data-for-all-us-stocks-etfs/Data/Stocks"))
    random.shuffle(tickers)
    random.shuffle(tickers)
    train_tickers = tickers[:500]
    random.shuffle(train_tickers)
    val_tickers = tickers[-300:]

    # test seq_len in [20, 60, 120]
    params['seq_len'] = 120
    for to_pred in [10, 15]:
        params['to_pred'] = to_pred
        # Labels depend on the horizon, so the data is reloaded per horizon.
        train_data = Data.get(train_tickers, to_pred=params['to_pred'])
        val_data = Data.get(val_tickers, to_pred=params['to_pred'])
        print(f"Sequence Length: {params['seq_len']}")
        print(f"To Predict: {params['to_pred']}")
        test(train_data, val_data, params)

In [None]:
def train_final_model():
    '''Continue training the model on ten interleaved subsets of the data.

    Reads `model`, `train_tickers`, `params`, `X_val` and `y_val` from the
    enclosing (notebook) scope; none of them is defined in this function.
    For each learning rate the ten start offsets are visited in random order,
    and for each offset the training set is rebuilt from every tenth window
    beginning at that offset.
    '''
    step = 10
    offsets = list(range(step))
    X, y = [], []
    for rate in (1e-6, 1e-6):
        random.shuffle(offsets)
        # Overwrite the optimizer's learning rate in place.
        K.set_value(model.model.optimizer.lr, rate)
        for iteration, offset in enumerate(offsets, start=1):
            print('Iter: ', iteration)
            print('Day: ', offset)
            # Release the previous arrays before the next ones are built.
            del X, y
            gc.collect()
            X, y = Data.preprocess(
                train_tickers,
                seq_len=params['seq_len'],
                to_pred=params['to_pred'],
                features=1,
                start=offset,
                step=step
            )
            model.train_test(
                X,
                y,
                X_val,
                y_val,
                batch_size=params['batch_size'],
                epochs=params['epochs'],
            )