In [1]:
import bs4 as bs
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
import os
import pandas as pd
import pandas_datareader.data as web
import pickle
import requests
from collections import Counter
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn import svm, neighbors
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
def process_data_for_labels(ticker):
    hm_days = 7
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    tickers = df.columns.values.tolist()
    df.fillna(0, inplace=True)

    for i in range(1,hm_days+1):
        df['{}_{}d'.format(ticker,i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]

    df.fillna(0, inplace=True)
    return tickers, df

a, b = process_data_for_labels('XOM')

def buy_sell_hold(*args):
    cols = [c for c in args]
    requirement = 0.02
    for col in cols:
        if col > requirement:
            return 1
        if col < -requirement:
            return -1
    return 0

In [2]:
def extract_featuresets(ticker):
    tickers, df = process_data_for_labels(ticker)

    df['{}_target'.format(ticker)] = list(map( buy_sell_hold,
                                               df['{}_1d'.format(ticker)],
                                               df['{}_2d'.format(ticker)],
                                               df['{}_3d'.format(ticker)],
                                               df['{}_4d'.format(ticker)],
                                               df['{}_5d'.format(ticker)],
                                               df['{}_6d'.format(ticker)],
                                               df['{}_7d'.format(ticker)] ))
    #print(df.head())
    vals = df['{}_target'.format(ticker)].values.tolist()
    str_vals = [str(i) for i in vals]
    print('Data spread:',Counter(str_vals))
    df.fillna(0, inplace=True)
    df = df.replace([np.inf, -np.inf], np.nan)
    df.dropna(inplace=True)
    df_vals = df[[ticker for ticker in tickers]].pct_change()
    df_vals = df_vals.replace([np.inf, -np.inf], 0)
    df_vals.fillna(0, inplace=True)
    X = df_vals.values
    y = df['{}_target'.format(ticker)].values

    return X,y,df
#extract_featuresets('XOM')

In [3]:
def do_ml(ticker):
    X, y, df = extract_featuresets(ticker)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier())])

    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('accuracy:', confidence)
    predictions = clf.predict(X_test)
    print('predicted class counts:', Counter(predictions))
    print()
    print()
    return confidence


# examples of running:
do_ml('XOM')
do_ml('AAPL')
do_ml('ABT')

Data spread: Counter({'-1': 1347, '1': 1224, '0': 59})




accuracy: 0.5045592705167173
predicted class counts: Counter({-1: 434, 1: 223, 0: 1})


Data spread: Counter({'-1': 1376, '1': 1201, '0': 53})




accuracy: 0.5501519756838906
predicted class counts: Counter({-1: 376, 1: 281, 0: 1})


Data spread: Counter({'-1': 1340, '1': 1215, '0': 75})




accuracy: 0.47416413373860183
predicted class counts: Counter({-1: 401, 1: 256, 0: 1})




0.47416413373860183

In [None]:
from statistics import mean

with open("sp500tickers.pickle","rb") as f:
    tickers = pickle.load(f)

accuracies = []
for count,ticker in enumerate(tickers):
    for ticker in tickers:
        ticker = ticker.strip("\n")
        if "." in ticker:
            ticker = ticker.replace(".","-")
    if count%10==0:
        print(count)

    accuracy = do_ml(ticker)
    accuracies.append(accuracy)
    print("{} accuracy: {}. Average accuracy:{}".format(ticker,accuracy,mean(accuracies)))

0
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5304878048780488
predicted class counts: Counter({-1: 356, 1: 168, 0: 132})


ZTS accuracy: 0.5304878048780488. Average accuracy:0.5304878048780488
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5548780487804879
predicted class counts: Counter({-1: 357, 1: 159, 0: 140})


ZTS accuracy: 0.5548780487804879. Average accuracy:0.5426829268292683
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.4878048780487805
predicted class counts: Counter({-1: 391, 1: 152, 0: 113})


ZTS accuracy: 0.4878048780487805. Average accuracy:0.524390243902439
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5091463414634146
predicted class counts: Counter({-1: 339, 1: 187, 0: 130})


ZTS accuracy: 0.5091463414634146. Average accuracy:0.520579268292683
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5121951219512195
predicted class counts: Counter({-1: 336, 1: 176, 0: 144})


ZTS accuracy: 0.5121951219512195. Average accuracy:0.5189024390243903
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.524390243902439
predicted class counts: Counter({-1: 298, 1: 210, 0: 148})


ZTS accuracy: 0.524390243902439. Average accuracy:0.5198170731707317
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5121951219512195
predicted class counts: Counter({-1: 370, 1: 145, 0: 141})


ZTS accuracy: 0.5121951219512195. Average accuracy:0.5187282229965157
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5274390243902439
predicted class counts: Counter({-1: 307, 1: 198, 0: 151})


ZTS accuracy: 0.5274390243902439. Average accuracy:0.5198170731707317
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.510670731707317
predicted class counts: Counter({-1: 332, 1: 218, 0: 106})


ZTS accuracy: 0.510670731707317. Average accuracy:0.5188008130081301
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5182926829268293
predicted class counts: Counter({-1: 366, 1: 164, 0: 126})


ZTS accuracy: 0.5182926829268293. Average accuracy:0.51875
10
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.49390243902439024
predicted class counts: Counter({-1: 403, 1: 129, 0: 124})


ZTS accuracy: 0.49390243902439024. Average accuracy:0.5164911308203991
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.4801829268292683
predicted class counts: Counter({-1: 416, 1: 134, 0: 106})


ZTS accuracy: 0.4801829268292683. Average accuracy:0.5134654471544715
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5213414634146342
predicted class counts: Counter({-1: 309, 1: 213, 0: 134})


ZTS accuracy: 0.5213414634146342. Average accuracy:0.5140712945590995
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5167682926829268
predicted class counts: Counter({-1: 333, 1: 188, 0: 135})


ZTS accuracy: 0.5167682926829268. Average accuracy:0.51426393728223
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5350609756097561
predicted class counts: Counter({-1: 370, 1: 150, 0: 136})


ZTS accuracy: 0.5350609756097561. Average accuracy:0.515650406504065
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5457317073170732
predicted class counts: Counter({-1: 323, 1: 184, 0: 149})


ZTS accuracy: 0.5457317073170732. Average accuracy:0.5175304878048781
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5076219512195121
predicted class counts: Counter({-1: 353, 1: 168, 0: 135})


ZTS accuracy: 0.5076219512195121. Average accuracy:0.5169476327116213
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5030487804878049
predicted class counts: Counter({-1: 350, 1: 194, 0: 112})


ZTS accuracy: 0.5030487804878049. Average accuracy:0.5161754742547425
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.49085365853658536
predicted class counts: Counter({-1: 351, 1: 160, 0: 145})


ZTS accuracy: 0.49085365853658536. Average accuracy:0.5148427471116817
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5167682926829268
predicted class counts: Counter({-1: 306, 1: 215, 0: 135})


ZTS accuracy: 0.5167682926829268. Average accuracy:0.5149390243902439
20
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5274390243902439
predicted class counts: Counter({-1: 325, 1: 184, 0: 147})


ZTS accuracy: 0.5274390243902439. Average accuracy:0.515534262485482
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5533536585365854
predicted class counts: Counter({-1: 350, 1: 185, 0: 121})


ZTS accuracy: 0.5533536585365854. Average accuracy:0.5172533259423503
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5365853658536586
predicted class counts: Counter({-1: 333, 1: 180, 0: 143})


ZTS accuracy: 0.5365853658536586. Average accuracy:0.5180938494167551
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5365853658536586
predicted class counts: Counter({-1: 362, 1: 155, 0: 139})


ZTS accuracy: 0.5365853658536586. Average accuracy:0.5188643292682927
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.4801829268292683
predicted class counts: Counter({-1: 416, 1: 123, 0: 117})


ZTS accuracy: 0.4801829268292683. Average accuracy:0.5173170731707317
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5091463414634146
predicted class counts: Counter({-1: 320, 1: 216, 0: 120})


ZTS accuracy: 0.5091463414634146. Average accuracy:0.5170028142589118
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.4878048780487805
predicted class counts: Counter({-1: 343, 1: 188, 0: 125})


ZTS accuracy: 0.4878048780487805. Average accuracy:0.5159214092140921
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.48628048780487804
predicted class counts: Counter({-1: 388, 1: 148, 0: 120})


ZTS accuracy: 0.48628048780487804. Average accuracy:0.5148628048780488
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.510670731707317
predicted class counts: Counter({-1: 304, 1: 227, 0: 125})


ZTS accuracy: 0.510670731707317. Average accuracy:0.5147182506307821
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5198170731707317
predicted class counts: Counter({-1: 359, 1: 179, 0: 118})


ZTS accuracy: 0.5198170731707317. Average accuracy:0.5148882113821138
30
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5274390243902439
predicted class counts: Counter({-1: 342, 1: 202, 0: 112})


ZTS accuracy: 0.5274390243902439. Average accuracy:0.5152930763178599
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5350609756097561
predicted class counts: Counter({-1: 380, 0: 145, 1: 131})


ZTS accuracy: 0.5350609756097561. Average accuracy:0.5159108231707317
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5472560975609756
predicted class counts: Counter({-1: 340, 1: 169, 0: 147})


ZTS accuracy: 0.5472560975609756. Average accuracy:0.516860679970436
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.48628048780487804
predicted class counts: Counter({-1: 335, 1: 194, 0: 127})


ZTS accuracy: 0.48628048780487804. Average accuracy:0.515961262553802
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5076219512195121
predicted class counts: Counter({-1: 343, 1: 193, 0: 120})


ZTS accuracy: 0.5076219512195121. Average accuracy:0.5157229965156794
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5121951219512195
predicted class counts: Counter({-1: 388, 1: 135, 0: 133})


ZTS accuracy: 0.5121951219512195. Average accuracy:0.515625
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5060975609756098
predicted class counts: Counter({-1: 340, 1: 198, 0: 118})


ZTS accuracy: 0.5060975609756098. Average accuracy:0.5153675016479895
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5091463414634146
predicted class counts: Counter({-1: 336, 1: 177, 0: 143})


ZTS accuracy: 0.5091463414634146. Average accuracy:0.5152037869062901
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.4740853658536585
predicted class counts: Counter({-1: 319, 1: 233, 0: 104})


ZTS accuracy: 0.4740853658536585. Average accuracy:0.5141494684177611
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5320121951219512
predicted class counts: Counter({-1: 365, 1: 158, 0: 133})


ZTS accuracy: 0.5320121951219512. Average accuracy:0.5145960365853659
40
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5426829268292683
predicted class counts: Counter({-1: 353, 1: 154, 0: 149})


ZTS accuracy: 0.5426829268292683. Average accuracy:0.5152810826888756
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.4725609756097561
predicted class counts: Counter({-1: 382, 1: 139, 0: 135})


ZTS accuracy: 0.4725609756097561. Average accuracy:0.51426393728223
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.510670731707317
predicted class counts: Counter({-1: 336, 1: 178, 0: 142})


ZTS accuracy: 0.510670731707317. Average accuracy:0.5141803743618831
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5228658536585366
predicted class counts: Counter({-1: 314, 1: 205, 0: 137})


ZTS accuracy: 0.5228658536585366. Average accuracy:0.5143777716186253
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5152439024390244
predicted class counts: Counter({-1: 363, 1: 161, 0: 132})


ZTS accuracy: 0.5152439024390244. Average accuracy:0.5143970189701897
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.49390243902439024
predicted class counts: Counter({-1: 365, 1: 181, 0: 110})


ZTS accuracy: 0.49390243902439024. Average accuracy:0.5139514846235419
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5076219512195121
predicted class counts: Counter({-1: 349, 1: 174, 0: 133})


ZTS accuracy: 0.5076219512195121. Average accuracy:0.5138168137000518
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5060975609756098
predicted class counts: Counter({-1: 324, 1: 183, 0: 149})


ZTS accuracy: 0.5060975609756098. Average accuracy:0.5136559959349594
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5457317073170732
predicted class counts: Counter({-1: 354, 1: 153, 0: 149})


ZTS accuracy: 0.5457317073170732. Average accuracy:0.5143106022896964
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5274390243902439
predicted class counts: Counter({-1: 352, 1: 164, 0: 140})


ZTS accuracy: 0.5274390243902439. Average accuracy:0.5145731707317073
50
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5503048780487805
predicted class counts: Counter({-1: 361, 0: 150, 1: 145})


ZTS accuracy: 0.5503048780487805. Average accuracy:0.5152737924438068
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5579268292682927
predicted class counts: Counter({-1: 342, 1: 175, 0: 139})


ZTS accuracy: 0.5579268292682927. Average accuracy:0.5160940431519699
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.538109756097561
predicted class counts: Counter({-1: 368, 1: 166, 0: 122})


ZTS accuracy: 0.538109756097561. Average accuracy:0.5165094339622641
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5335365853658537
predicted class counts: Counter({-1: 334, 1: 167, 0: 155})


ZTS accuracy: 0.5335365853658537. Average accuracy:0.5168247515808492
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5457317073170732
predicted class counts: Counter({-1: 391, 0: 138, 1: 127})


ZTS accuracy: 0.5457317073170732. Average accuracy:0.517350332594235
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5396341463414634
predicted class counts: Counter({-1: 363, 0: 155, 1: 138})


ZTS accuracy: 0.5396341463414634. Average accuracy:0.5177482578397212
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.49847560975609756
predicted class counts: Counter({-1: 349, 1: 204, 0: 103})


ZTS accuracy: 0.49847560975609756. Average accuracy:0.5174101412066753
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5274390243902439
predicted class counts: Counter({-1: 348, 1: 157, 0: 151})


ZTS accuracy: 0.5274390243902439. Average accuracy:0.5175830529857023
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.4740853658536585
predicted class counts: Counter({-1: 366, 1: 180, 0: 110})


ZTS accuracy: 0.4740853658536585. Average accuracy:0.5168458040512609
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5503048780487805
predicted class counts: Counter({-1: 357, 0: 154, 1: 145})


ZTS accuracy: 0.5503048780487805. Average accuracy:0.5174034552845529
60
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5
predicted class counts: Counter({-1: 337, 1: 168, 0: 151})


ZTS accuracy: 0.5. Average accuracy:0.5171181527389045
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.4893292682926829
predicted class counts: Counter({-1: 345, 1: 173, 0: 138})


ZTS accuracy: 0.4893292682926829. Average accuracy:0.5166699449252558
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5045731707317073
predicted class counts: Counter({-1: 392, 1: 157, 0: 107})


ZTS accuracy: 0.5045731707317073. Average accuracy:0.5164779326364692
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5365853658536586
predicted class counts: Counter({-1: 309, 1: 238, 0: 109})


ZTS accuracy: 0.5365853658536586. Average accuracy:0.5167921112804879
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5625
predicted class counts: Counter({-1: 322, 1: 193, 0: 141})


ZTS accuracy: 0.5625. Average accuracy:0.5174953095684803
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5457317073170732
predicted class counts: Counter({-1: 365, 0: 151, 1: 140})


ZTS accuracy: 0.5457317073170732. Average accuracy:0.5179231337767923
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5152439024390244
predicted class counts: Counter({-1: 328, 1: 186, 0: 142})


ZTS accuracy: 0.5152439024390244. Average accuracy:0.517883145249363
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5259146341463414
predicted class counts: Counter({-1: 356, 1: 170, 0: 130})


ZTS accuracy: 0.5259146341463414. Average accuracy:0.5180012553802008
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5167682926829268
predicted class counts: Counter({-1: 320, 1: 219, 0: 117})


ZTS accuracy: 0.5167682926829268. Average accuracy:0.5179833863556027
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5030487804878049
predicted class counts: Counter({-1: 385, 1: 151, 0: 120})


ZTS accuracy: 0.5030487804878049. Average accuracy:0.5177700348432056
70
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.49085365853658536
predicted class counts: Counter({-1: 318, 1: 204, 0: 134})


ZTS accuracy: 0.49085365853658536. Average accuracy:0.5173909309515631
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5335365853658537
predicted class counts: Counter({-1: 347, 1: 157, 0: 152})


ZTS accuracy: 0.5335365853658537. Average accuracy:0.5176151761517616
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5091463414634146
predicted class counts: Counter({-1: 347, 1: 193, 0: 116})


ZTS accuracy: 0.5091463414634146. Average accuracy:0.5174991647176745
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5304878048780488
predicted class counts: Counter({-1: 345, 1: 181, 0: 130})


ZTS accuracy: 0.5304878048780488. Average accuracy:0.5176746868820039
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.510670731707317
predicted class counts: Counter({-1: 386, 1: 160, 0: 110})


ZTS accuracy: 0.510670731707317. Average accuracy:0.5175813008130081
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5060975609756098
predicted class counts: Counter({-1: 324, 1: 205, 0: 127})


ZTS accuracy: 0.5060975609756098. Average accuracy:0.5174301989730423
Data spread: Counter({'-1': 921, '1': 905, '0': 804})




accuracy: 0.5015243902439024
predicted class counts: Counter({-1: 348, 1: 186, 0: 122})


ZTS accuracy: 0.5015243902439024. Average accuracy:0.5172236300285081
Data spread: Counter({'-1': 921, '1': 905, '0': 804})
