In [1]:
# Imports

import math
import numpy as np
from matplotlib import pyplot as plt
from numpy import random as rnd
import warnings,datetime,os,calendar,csv,time

import tensorflow as tf
import pandas as pd
import seaborn as sns

from keras.models import Model,Sequential
from keras.layers import Dense,LSTM,Conv2D,Dropout,BatchNormalization,Input,Concatenate,Add,Activation,MaxPooling2D,AveragePooling2D,Flatten
import keras.backend as K

from sklearn import preprocessing as pp

from sklearn.cluster import KMeans,MeanShift
from sklearn.dummy import DummyClassifier,DummyRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier,AdaBoostRegressor,RandomForestClassifier,RandomForestRegressor
from sklearn.linear_model import BayesianRidge,Lasso,LinearRegression,SGDClassifier,SGDRegressor
from sklearn.mixture import BayesianGaussianMixture,GaussianMixture
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor,RadiusNeighborsClassifier,RadiusNeighborsRegressor,NearestNeighbors
from sklearn.manifold import Isomap,TSNE
from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn.svm import LinearSVC,LinearSVR
from sklearn.neural_network import BernoulliRBM,MLPClassifier,MLPRegressor
from sklearn.decomposition import FactorAnalysis,KernelPCA,PCA,MiniBatchSparsePCA,FastICA
from sklearn.preprocessing import CategoricalEncoder,KBinsDiscretizer,LabelEncoder,MinMaxScaler,OneHotEncoder,StandardScaler

from xgboost import XGBClassifier,XGBRegressor

import pickle,h5py,json

import pandas_datareader as pdr
import urllib
from bs4 import BeautifulSoup
import iexfinance as iex
from iexfinance.stocks import get_historical_data
from scipy.signal import resample,correlate
from scipy import fftpack

# Silence every warning for the rest of the session (this also hides
# deprecation notices from the libraries imported above).
warnings.filterwarnings('ignore')
# Apply seaborn's default plot styling.
sns.set()
# Switch matplotlib to its hand-drawn "xkcd" style. Called after sns.set() and
# as the last expression of the cell, so its return value is what the cell shows.
plt.xkcd()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


<matplotlib.rc_context at 0x1732473d8d0>

In [2]:
# Date range of the historical prices requested for every ticker.
start_date = datetime.datetime(2015,1,1)
end_date = datetime.datetime(2017,1,1)

# Build the path with os.path.join instead of the literal
# 'Documents\stock_symb.txt': '\s' is an invalid escape sequence and the
# backslash separator only resolves on Windows.
symbols_path = os.path.join('Documents','stock_symb.txt')
with open(symbols_path,'r') as f:
    lines = f.readlines()

# Each non-blank line is treated as one symbol; blank lines are dropped so
# that no empty ticker string is requested later.
content = [line.strip() for line in lines if line.strip()]

# Random ordering of the symbols (np.random.permutation(n) permutes arange(n)).
tickers = [content[i] for i in np.random.permutation(len(content))]

In [3]:
# Download the opening prices of the first 50 (shuffled) tickers.
ticker_set = tickers[:50]
print_progress = 10  # print a progress line every this many tickers

DATA = []            # one array of 'open' prices per successfully fetched ticker
failed_tickers = []  # (ticker, error message) for every download that failed

ctr = 0
for ticker in ticker_set:

    if ctr%print_progress==0: print('Processing %d/%d'%(ctr,len(ticker_set)))
    ctr += 1
    try:
        df = get_historical_data(ticker,start=start_date,end=end_date,output_format='pandas')
    except Exception as exc:
        # Best effort: skip tickers that cannot be fetched, but keep a record
        # of them. `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the cell can still be interrupted.
        failed_tickers.append((ticker,str(exc)))
        continue
    data = df['open'].values
    DATA.append(data)

if failed_tickers:
    print('Skipped %d/%d tickers: %s'%(len(failed_tickers),len(ticker_set),
                                       ', '.join(t for t,_ in failed_tickers)))

Processing 0/50
Processing 10/50
Processing 20/50
Processing 30/50
Processing 40/50


In [4]:
# First project: Fill in NaNs

In [5]:
# Length of every downloaded series, in the same order as DATA.
datalen = list(map(len,DATA))
# Mode of the lengths: the distinct length that occurs most often in datalen.
most_common_length = max(set(datalen),key=lambda n: datalen.count(n))

In [6]:
# xtrain: NaN-free series of the most common length (used to build training windows).
# xuse:   every series that contains at least one NaN.
xtrain = []
xuse = []
for series in DATA:
    if np.isnan(series).any():
        xuse.append(series)
    elif len(series)==most_common_length:
        xtrain.append(series)

In [7]:
# Number of consecutive samples in each training window.
lookback = 50

# Slide a window of `lookback` samples over every complete series.
S = []
for x in xtrain:
    # A series of length n holds n-lookback+1 full windows. The previous bound
    # (n-lookback) dropped the final window, so the last sample of each series
    # never appeared in any window. Series shorter than `lookback` give an
    # empty range and contribute nothing.
    for k in range(len(x)-lookback+1):
        S.append(x[k:k+lookback])

In [8]:
# Fraction of each window that is hidden from the model.
perc_nans = 0.2

# Targets: the untouched windows, one per row.
Y = np.stack(S,axis=0)

# Inputs: the same windows with a random subset of positions overwritten by -1.
X = Y.copy()
for row in X:
    num_nans = round(perc_nans*len(row))
    # First num_nans entries of a random permutation = positions to hide.
    nan_indices = rnd.permutation(np.arange(len(row)))[:num_nans]
    row[nan_indices] = -1  # `row` is a view, so this writes into X

In [9]:
# Hold out 20% of the (masked, original) window pairs for testing; rows are
# shuffled before the split.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
)

In [10]:
# Inspect the first masked training window (-1 marks a hidden position).
Xtrain[0]

array([47.5246, 47.534 , 48.2714, 48.5645, -1.    , 49.364 , 48.1669,
       46.866 , 47.5399, -1.    , 49.535 , -1.    , 49.402 , 48.2334,
       49.3925, 49.573 , 50.3045, 50.732 , 50.7415, 51.1311, 51.2165,
       51.0361, 51.3211, 51.2736, 51.3251, 51.3211, 51.2054, 51.3306,
       -1.    , 51.7514, 51.8417, -1.    , -1.    , -1.    , 51.8341,
       52.1571, 52.7081, 53.0121, 53.4872, 53.5561, 53.6637, 54.1522,
       54.1712, 53.6772, -1.    , 53.7817, -1.    , 53.6962, 53.5917,
       -1.    ])

In [11]:
# Inspect the matching target: the same window with no positions hidden.
Ytrain[0]

array([47.5246, 47.534 , 48.2714, 48.5645, 48.8225, 49.364 , 48.1669,
       46.866 , 47.5399, 48.6325, 49.535 , 50.1335, 49.402 , 48.2334,
       49.3925, 49.573 , 50.3045, 50.732 , 50.7415, 51.1311, 51.2165,
       51.0361, 51.3211, 51.2736, 51.3251, 51.3211, 51.2054, 51.3306,
       51.6821, 51.7514, 51.8417, 52.0431, 52.0431, 51.4826, 51.8341,
       52.1571, 52.7081, 53.0121, 53.4872, 53.5561, 53.6637, 54.1522,
       54.1712, 53.6772, 53.9812, 53.7817, 53.7342, 53.6962, 53.5917,
       53.3447])

In [13]:
# Encoder: consumes a window of shape (timesteps, 1) and keeps only its final
# hidden and cell states.
# Activations are given by name rather than as tf.keras / tf.nn callables so
# that standalone-Keras layers are not mixed with objects from another package.
enc_in = Input(shape=(Xtrain.shape[1],1,))
encoder = LSTM(units=15,activation='relu',return_state=True)
[enc_out,states_h,states_c] = encoder(enc_in)
states = [states_h,states_c]

# Decoder: starts from the encoder states and emits one vector per timestep.
dec_in = Input(shape=(Xtrain.shape[1],1,))
decoder = LSTM(units=15,activation='relu',return_state=True,return_sequences=True)
dec_out,_,_ = decoder(dec_in,initial_state=states)

# Regression head: one linear unit per timestep -> (batch, timesteps, 1),
# flattened to (batch, timesteps) so it lines up with Ytrain.
# The previous head (Dense with Ytrain.shape[1] units and softmax) produced a
# (batch, timesteps, timesteps) tensor of values in [0, 1] that sum to 1,
# which can neither match the target shape nor reach price-scale values
# under an MSE loss.
dec_dense = Dense(units=1,activation='linear')
decdense_out = Flatten()(dec_dense(dec_out))

model = Model([enc_in,dec_in],decdense_out)

# 'accuracy' is a classification metric and is meaningless for real-valued
# targets; report mean absolute error alongside the MSE loss instead.
model.compile(optimizer='rmsprop',loss='mse',metrics=['mae'])