In [94]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, scale

time: 18.3 ms


In [1148]:
from pandas_datareader import data
from datetime import datetime, timedelta
from finsymbols import symbols
import numpy as np
import pandas as pd
import numba
from math import isnan
import os
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 277 ms


In [2]:
class Stock(object):
    def __init__(self, stock_dict):
        self.stock_dict = stock_dict
        self.historical_data = self.get_historical_data()
        
    @property
    def company(self):
        return self.stock_dict['company']
    
    @property
    def industry(self):
        return self.stock_dict['industry']
    
    @property
    def sector(self):
        return self.stock_dict['sector']
    
    @property
    def symbol(self):
        return self.stock_dict['symbol']
    
    def get_historical_data(self, source='google', start_date='2002-08-01', end_date='2017-08-01'):
        return data.DataReader(self.symbol, source, start_date, end_date)
    
    @property
    def dates(self):
        return self.historical_data.index

time: 15.5 ms


In [343]:
class Day(Stock):
    def __init__(self, stock_obj, date):
        self.historical_data = stock_obj.historical_data
        self.stock_dict = stock_obj.stock_dict
        self.date = date
        self.info = self.historical_data.ix[date]
    
    @property
    def open_price(self):
        return self.info['Open']
    
    @property
    def close_price(self):
        return self.info['Close']
    
    @property
    def high(self):
        return self.info['High']
    
    @property
    def low(self):
        return self.info['Low']
    
    @property
    def volume(self):
        return self.info['Volume']
    
    @property
    def price_change(self):
        return self.close_price - self.open_price
    
    @property
    def percent_change(self):
        return self.price_change / self.open_price * 100
    
    def get_past_info(self, days_back):
        info = None
        while info is None:
            try:
                info = self.historical_data.ix[self.date - timedelta(days_back)]
            except:
                pass
            days_back += 1
        return info
    
    @property
    def week_percent_change(self):
        info = self.get_past_info(-7)
        price_change = info['Close'] - self.open_price
        return price_change / self.open_price * 100
    
    @property
    def month_percent_change(self):
        info = self.get_past_info(-30)
        price_change = info['Close'] - self.open_price
        return price_change / self.open_price * 100
    
    @staticmethod
    def get_percent_change(info):
        open_price = info['Open']
        close_price = info['Close']
        price_change  = close_price - open_price
        return price_change / open_price * 100
    
    @property
    def previous_day_info(self):
        return self.get_past_info(1)
    
    @property
    def previous_day_percent_change(self):
        return self.get_percent_change(self.previous_day_info)
    
    @property
    def previous_day_volume(self):
        return self.previous_day_info['Volume']
    
    def get_past_percent_change(self, days):
        close_price = self.previous_day_info['Close']
        open_price = self.get_past_info(days)['Open']
        price_change  = close_price - open_price
        return price_change / open_price * 100
    
    @property
    def previous_week_percent_change(self):
        return self.get_past_percent_change(7)
    
    @property
    def previous_month_percent_change(self):
        return self.get_past_percent_change(30)
    
    def get_average_volume(self, days):
        total_volume = 0
        for days_back in range(days):
            total_volume += self.get_past_info(days_back+1)['Volume']
        return total_volume / days
    
    @property
    def previous_week_average_volume(self):
        return self.get_average_volume(7)
    
    @property
    def previous_month_average_volume(self):
        return self.get_average_volume(30)
    
    def get_high_or_low(self, days_back, high=True):
        high_low = None
        while high_low is None:
            try:
                if high:
                    high_low = max(self.historical_data.ix[self.date - timedelta(days_back):self.date]['Close'])
                else:
                    high_low = min(self.historical_data.ix[self.date - timedelta(days_back):self.date]['Close'])
            except:
                pass
            days_back += 1
        return high_low

time: 168 ms


In [4]:
all_stocks = symbols.get_nyse_symbols() + symbols.get_nasdaq_symbols()

time: 91.2 ms


In [6]:
stocks = []
for stock in all_stocks:
    try:
        stocks.append(Stock(stock))
    except:
        pass

time: 4h 20min 2s


In [710]:
for stock in stocks:
    stock.historical_data['BIAS6'] = (stock.historical_data['Close'].shift()-stock.historical_data['MA5'])/stock.historical_data['MA5']*100


time: 12.9 s


In [442]:
for num in range(len(stocks)):
    symbol = stocks[num].symbol
    stocks[num].historical_data.to_csv('/Users/JasonKatz/Desktop/Historical_Data/' + symbol + '_data')

time: 5min 33s


In [842]:
stocks[0].historical_data.ix[253:500, 15:]

Unnamed: 0_level_0,Stochastic_K_Month,Disparity_5,Disparity_10,Price_Oscillator,Price_Osc_J,Positive,NumPos10,AveVol10,AveVol20,Vol_20_10,log_return,ASY5,ASY4,ASY3,ASY2,ASY1,PSY12,BIAS6
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2012-05-29,90.883978,104.396728,109.385044,18.605603,0.954397,0.0,6.0,1215854.1,1329816.65,0.0,-0.000472,0.033628,0.027946,0.027977,0.010649,0.006387,58.333333,4.396728
2012-05-30,100.000000,105.210918,111.902877,19.209801,0.940199,0.0,5.0,1305750.2,1341404.80,0.0,-0.012470,0.022263,0.020865,0.006942,0.002958,-0.000472,58.333333,5.210918
2012-05-31,88.206388,101.033743,107.978529,19.572316,0.935684,0.0,5.0,1412495.8,1272338.40,1.0,-0.021475,0.014198,0.002089,-0.002185,-0.006471,-0.012470,50.000000,1.033743
2012-06-01,77.149877,98.512830,104.285641,19.631356,0.944644,0.0,5.0,1504465.4,1276610.60,1.0,-0.013402,-0.002624,-0.007007,-0.011472,-0.016972,-0.021475,41.666667,-1.487170
2012-06-04,52.579853,94.572046,98.121086,19.412170,0.963830,1.0,5.0,1535933.6,1247535.80,1.0,0.034751,-0.008286,-0.011955,-0.015782,-0.017438,-0.013402,41.666667,-5.427954
2012-06-05,68.304668,98.204597,99.964854,19.291609,0.982391,1.0,6.0,1503789.9,1232632.90,1.0,0.039550,-0.002614,-0.003149,-0.000042,0.010674,0.034751,50.000000,-1.795403
2012-06-06,85.995086,102.331349,102.356735,19.160248,0.999752,1.0,6.0,1543061.9,1241750.20,1.0,0.053685,0.005391,0.009856,0.020300,0.037150,0.039550,58.333333,2.331349
2012-06-07,100.000000,107.027186,106.686885,19.374810,1.003190,0.0,6.0,1595541.1,1266777.95,1.0,-0.067002,0.018622,0.028646,0.042662,0.046618,0.053685,66.666667,7.027186
2012-06-08,77.991453,101.464844,101.227592,19.477656,1.002344,1.0,5.0,1511849.6,1278896.20,1.0,0.001926,0.009516,0.015246,0.008745,-0.006658,-0.067002,58.333333,1.464844
2012-06-11,78.205128,100.028868,101.020408,19.793815,0.990185,0.0,5.0,1348482.2,1285686.80,1.0,-0.024634,0.012582,0.007040,-0.003797,-0.032538,0.001926,58.333333,0.028868


time: 120 ms


In [409]:
test['Close'].shift().rolling(window=10).mean()

Date
2011-05-26       NaN
2011-05-27       NaN
2011-05-31       NaN
2011-06-01       NaN
2011-06-02       NaN
2011-06-03       NaN
2011-06-06       NaN
2011-06-07       NaN
2011-06-08       NaN
2011-06-09       NaN
2011-06-10    12.484
2011-06-13    12.369
2011-06-14    12.252
2011-06-15    12.126
2011-06-16    12.054
2011-06-17    11.987
2011-06-20    11.944
2011-06-21    11.937
2011-06-22    11.994
2011-06-23    12.071
2011-06-24    12.153
2011-06-27    12.207
2011-06-28    12.217
2011-06-29    12.274
2011-06-30    12.350
2011-07-01    12.467
2011-07-05    12.634
2011-07-06    12.850
2011-07-07    13.001
2011-07-08    13.245
               ...  
2017-06-20    20.791
2017-06-21    20.846
2017-06-22    20.941
2017-06-23    21.022
2017-06-26    21.210
2017-06-27    21.345
2017-06-28    21.369
2017-06-29    21.394
2017-06-30    21.327
2017-07-03    21.095
2017-07-05    20.721
2017-07-06    20.375
2017-07-07    19.989
2017-07-10    19.563
2017-07-11    19.162
2017-07-12    18.838
2017-07-

time: 16.5 ms


In [1090]:
#data_test = pd.DataFrame()
temp = stocks[1].historical_data.ix[15:-1,-7:].values
for stock in stocks[1000:2000]:
    temp = np.vstack((temp, stock.historical_data.ix[15:-1,-7:].values))
    #data_test = pd.concat([data_test, stock.historical_data.ix[253:]])

KeyboardInterrupt: 

time: 45.7 s


In [1057]:
test = stocks[1].historical_data.ix[15:-1,-7:].values

time: 41.8 ms


In [1068]:
np.column_stack((stocks[1].historical_data.ix[15:-1,-7:].values, stocks[1].historical_data.ix[15:-1,-7:].values))

time: 12.8 ms


In [1031]:
#data_test = pd.DataFrame()
#data_test['day'] = previous_day
#data_test['week'] = previous_week
#data_test['result'] = current_day
enc = OneHotEncoder(categorical_features=np.arange(0, -1))
x_raw = enc.fit_transform(data_test[data_test.columns[-7:]].fillna(0.0))
X = scale(x_raw)
Y = np.array(data_test['Positive'].fillna(1.0))

time: 1.81 s


In [1032]:
1-np.mean(Y)

-inf

time: 16.2 ms


In [847]:
accuracies = []
for i in range(5):
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y)
    ytrain = np.ravel(ytrain)
    clf = MLPClassifier()
    clf.fit(xtrain, ytrain)
    ypred_test = clf.predict(xtest)
    accuracies.append(accuracy_score(ytest, ypred_test))
np.mean(accuracies)

0.51624945103205966

time: 33 s


In [374]:
previous_day = []
previous_week = []
current_day = []
for stock in stocks[54:55]:
    for date in stock.dates[100:-100]:
        day = Day(stock, date)
        previous_day_loop = day.previous_day_percent_change
        previous_week_loop = day.previous_week_percent_change
        current = day.week_percent_change
        if not isnan(previous_day_loop) and not isnan(previous_week_loop) and not isnan(current):
            previous_day.append(previous_day_loop)
            previous_week.append(previous_week_loop)
            if current > 0:
                current_day.append(1)
            else:
                current_day.append(0)

time: 4.06 s


In [1033]:
xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

time: 398 ms


In [1034]:
for idx, y in enumerate(ytrain):
    if y != 1 and y != 0:
        ytrain[idx] = 1.0

time: 506 ms


In [1035]:
bb = np.zeros((len(ytrain), 2))
bb[np.arange(len(ytrain)), ytrain.astype(int)] = 1
bb.shape

(821895, 2)

time: 48.5 ms


In [1036]:
for idx, y in enumerate(ytest):
    if y != 1 and y != 0:
        ytest[idx] = 1.0
c = np.zeros((len(ytest), 2))
c[np.arange(len(ytest)), ytest.astype(int)] = 1

time: 200 ms


In [947]:
import tensorflow as tf

time: 1.23 ms


In [1037]:
x = tf.placeholder(tf.float32, [None, 7])
W = tf.Variable(tf.zeros([7, 2]))
b = tf.Variable(tf.zeros([2]))
y = tf.matmul(x, W) + b
y_ = tf.placeholder(tf.float32, [None, 2])

time: 56 ms


In [1038]:
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

time: 789 ms


In [1039]:
for _ in range(25):
    sess.run(train_step, feed_dict={x: xtrain, y_: bb})

time: 6.93 s


In [1040]:
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

time: 34.3 ms


In [1041]:
1-ytest.mean()

0.51161279725512387

time: 11.8 ms


In [1042]:
print(sess.run(accuracy, feed_dict={x: xtest, y_: c}))

0.516763
time: 116 ms


In [None]:
# Create the model
x = tf.placeholder(tf.float32, [None, 784])
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.matmul(x, W) + b

# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, 10])

# The raw formulation of cross-entropy,
#
#   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
#                                 reduction_indices=[1]))
#
# can be numerically unstable.
#
# So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
# outputs of 'y', and then average across the batch.
cross_entropy = tf.reduce_mean(
  tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
# Train
for _ in range(1000):
batch_xs, batch_ys = mnist.train.next_batch(100)
sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

# Test trained model
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy, feed_dict={x: mnist.test.images,
                                  y_: mnist.test.labels}))

In [1092]:
test = pd.read_csv('/Users/JasonKatz/Desktop/Raw_Stock_Data/BAC_data')

time: 66.7 ms


In [1096]:
test

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2002-08-01,33.12,33.14,32.22,32.40,14339400
1,2002-08-02,32.28,32.36,31.34,31.76,13136000
2,2002-08-05,31.76,31.88,30.90,31.03,14106800
3,2002-08-06,31.62,32.77,31.58,31.86,12956800
4,2002-08-07,32.42,32.62,31.98,32.62,13074800
5,2002-08-08,32.87,34.35,32.82,34.20,16330200
6,2002-08-09,34.12,34.70,33.57,34.40,11033800
7,2002-08-12,33.92,34.44,33.45,34.22,9462600
8,2002-08-13,34.22,35.00,33.66,33.68,13264000
9,2002-08-14,33.68,34.65,33.25,34.50,15384600


time: 40 ms


In [1167]:
MA5 = test['Close'].shift().rolling(window=5).mean()
BIAS5 = (test['Close'].shift() - MA5.shift())/MA5.shift()
positive = pd.Series(np.where(test['Close'] >= test['Open'], 1, 0))
PSY10 = positive.shift().rolling(window=10).sum()
SY = np.log(test['Close']) - np.log(test['Close'].shift())
ASY1 = SY.shift()
ASY2 = SY.shift().rolling(window=2).mean()
ASY3 = SY.shift().rolling(window=3).mean()
ASY4 = SY.shift().rolling(window=4).mean()
ASY5 = SY.shift().rolling(window=5).mean()

time: 19.2 ms


In [1188]:
BIAS5 = []
result = []
PSY10 = []
ASY1 = []
ASY2 = []
ASY3 = []
ASY4 = []
ASY5 = []
for file in os.listdir('/Users/JasonKatz/Desktop/Raw_Stock_Data')[1:]:
    data = pd.read_csv('/Users/JasonKatz/Desktop/Raw_Stock_Data/' + file)
    MA5 = test['Close'].shift().rolling(window=5).mean()
    BIAS5.extend(((test['Close'].shift() - MA5.shift())/MA5.shift())[10:])
    positive = pd.Series(np.where(test['Close'] >= test['Open'], 1, 0))
    result.extend(positive[10:])
    PSY10.extend((positive.shift().rolling(window=10).sum())[10:])
    SY = np.log(test['Close']) - np.log(test['Close'].shift())
    ASY1.extend((SY.shift())[10:])
    ASY2.extend((SY.shift().rolling(window=2).mean())[10:])
    ASY3.extend((SY.shift().rolling(window=3).mean())[10:])
    ASY4.extend((SY.shift().rolling(window=4).mean())[10:])
    ASY5.extend((SY.shift().rolling(window=5).mean())[10:])

time: 1min 29s


In [1245]:
predictors = np.column_stack((scale(BIAS5), scale(PSY10), scale(ASY1), scale(ASY2), scale(ASY3), scale(ASY4), scale(ASY5)))

time: 1min 21s


In [1246]:
xtrain, xtest, ytrain, ytest = train_test_split(predictors, result)

time: 1min 15s


In [1247]:
x = tf.placeholder(tf.float32, [None, 7])
W = tf.Variable(tf.zeros([7, 2]))
b = tf.Variable(tf.zeros([2]))
y = tf.matmul(x, W) + b
y_ = tf.placeholder(tf.float32, [None, 2])

time: 6.76 s


In [1248]:
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

time: 1.76 s


In [1249]:
ytrain_hot = np.zeros((len(ytrain), 2))
ytrain_hot[np.arange(len(ytrain)), ytrain] = 1

time: 9.48 s


In [1250]:
ytest_hot = np.zeros((len(ytest), 2))
ytest_hot[np.arange(len(ytest)), ytest] = 1

time: 2.96 s


In [1251]:
for _ in range(25):
    sess.run(train_step, feed_dict={x: xtrain, y_: ytrain_hot})

time: 1min 23s


In [1252]:
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

time: 269 ms


In [1253]:
np.mean(ytest)

0.5034339298333137

time: 3.51 s


In [1254]:
print(sess.run(accuracy, feed_dict={x: xtest, y_: ytest_hot}))

0.537234
time: 4.51 s
