In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pandas.io import data, wb
import datetime
import os.path
import math

from sklearn.cross_validation import train_test_split
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

from sklearn import metrics
    
from sklearn.externals import joblib

import talib as ta
from pandas.tseries.offsets import BDay


In [76]:
def _load_prices(symbol):
    """Load the OHLC history for ``symbol`` with a ``Dividends`` column attached.

    Reads ``data/<symbol>-ohlc.csv`` indexed by its ``Date`` column. When
    ``data/<symbol>-dividends.csv`` exists it is joined on that index and the
    missing ``Dividends`` entries are filled with 0; otherwise ``Dividends``
    is set to 0 for every row.
    """
    prices = pd.read_csv('data/'+symbol+'-ohlc.csv', index_col='Date', parse_dates=True)
    dividends_path = 'data/'+symbol+'-dividends.csv'
    if os.path.isfile(dividends_path):
        dividends = pd.read_csv(dividends_path, index_col='Date', parse_dates=True)
        prices = prices.join(dividends)
        # Assign the filled column back: ``fillna(..., inplace=True)`` on a
        # column selection can operate on a temporary object and leave the
        # frame itself untouched.
        prices['Dividends'] = prices['Dividends'].fillna(0)
    else:
        prices['Dividends'] = 0
    return prices


def _add_features(prices):
    """Return a copy of ``prices`` with the model's feature columns appended.

    Columns are appended in a fixed order (it determines the column order of
    the training matrix and of the exported CSV):

    * ``diffma10/20/50/200`` - percent distance of Close from the
      10/20/50/200-row moving average of Close plus the same-window moving
      average of Dividends.
    * ``hh`` / ``hl`` - whether High (resp. Low) plus Dividends is above the
      previous row's High (resp. Low); undefined for the first row.
    * ``hh10`` / ``hl10`` - 10-row rolling mean of ``hh`` / ``hl``.
    * ``nvx1`` - Volume divided by its 50-row moving average.
    * ``ropen`` - percent change from the previous Close to Open.
    * ``rhigh`` / ``rlow`` - High / Low relative to Close, in percent.
    * ``rbody`` - (Open - Close) relative to Close, in percent.
    * ``ropenx1`` / ``rhighx1`` / ``rlowx1`` / ``rbodyx1`` - the four candle
      features of the previous row.
    """
    open_ = prices['Open']
    high = prices['High']
    low = prices['Low']
    close = prices['Close']
    volume = prices['Volume']
    dividends = prices['Dividends']

    def dividend_adjusted_ma(window):
        # Moving average of Close plus moving average of Dividends.
        return (close.rolling(window=window, center=False).mean()
                + dividends.rolling(window=window, center=False).mean())

    def pct_from(reference):
        # Percent distance of Close from ``reference``.
        return (close/reference - 1)*100

    # Higher-high / higher-low flags. ``.values`` on the previous-row operand
    # makes the comparison positional (row t against row t-1); the two slices
    # carry different index labels, and pandas refuses to compare Series that
    # are not identically labelled.
    hh = (high.iloc[1:] + dividends.iloc[1:]) > high.iloc[:-1].values
    hl = (low.iloc[1:] + dividends.iloc[1:]) > low.iloc[:-1].values

    vma50 = volume.rolling(window=50, center=False).mean()

    rhigh = (high/close - 1)*100
    rlow = (low/close - 1)*100
    ropen = (open_.iloc[1:]/close.iloc[:-1].values - 1)*100
    rbody = ((open_ - close)/close)*100

    feature_columns = [
        # distance from MAs
        ('diffma10', pct_from(dividend_adjusted_ma(10))),
        ('diffma20', pct_from(dividend_adjusted_ma(20))),
        ('diffma50', pct_from(dividend_adjusted_ma(50))),
        ('diffma200', pct_from(dividend_adjusted_ma(200))),
        # higher high / higher low and their rolling averages
        ('hh', hh),
        ('hl', hl),
        ('hh10', hh.rolling(window=10, center=False).mean()),
        ('hl10', hl.rolling(window=10, center=False).mean()),
        # volume relative to its 50-row average
        ('nvx1', volume/vma50),
        # candle sticks
        ('ropen', ropen),
        ('rhigh', rhigh),
        ('rlow', rlow),
        ('rbody', rbody),
        # previous row's candle sticks
        ('ropenx1', ropen.shift(1)),
        ('rhighx1', rhigh.shift(1)),
        ('rlowx1', rlow.shift(1)),
        ('rbodyx1', rbody.shift(1)),
    ]

    frame = prices.copy()
    for column_name, column in feature_columns:
        # Index-aligned assignment: rows a feature does not cover become NaN.
        frame[column_name] = column
    return frame


def predict(symbol):
    """Train on ``symbol``'s history and predict the next session's hh / hl.

    Side effects: reads ``data/<symbol>-ohlc.csv`` (and the optional
    ``data/<symbol>-dividends.csv``) and writes the price-plus-feature table
    to ``data/<symbol>-train.csv``.

    Returns
    -------
    tuple
        ``(pdate, atr, prob_hh, prob_hl, hh_score, hl_score,
        recent_hh_score, recent_hl_score, rowCount, adjhigh, adjlow)`` where

        * ``pdate`` - last date in the data plus one business day.
        * ``atr`` - latest 10-period ATR formatted with two decimals.
        * ``prob_hh`` / ``prob_hl`` - probability of the positive class for
          the most recent row, i.e. for the session after the data ends.
        * ``hh_score`` / ``hl_score`` - score string returned by ``train``.
        * ``recent_hh_score`` / ``recent_hl_score`` - number of the last 10
          realised outcomes that the held-out predictions got right.
        * ``rowCount`` - number of training rows.
        * ``adjhigh`` / ``adjlow`` - last High / Low minus the last dividend.
    """
    warmup = 200    # leading rows skipped: the 200-row MA is undefined there
    holdout = 11    # trailing rows kept out of training and used for prediction
    recent = holdout - 1  # held-out rows whose next-row outcome is already known

    prices = _load_prices(symbol)

    high = prices['High'].values
    low = prices['Low'].values
    close = prices['Close'].values

    atr = ta.ATR(high, low, close, timeperiod=10)[-1]
    atr = '%.2f' % round(atr, 2)

    # ``.iloc`` because the Series is date-indexed and -1 is meant as a
    # position, not a label.
    last_dividend = prices['Dividends'].iloc[-1]
    adjhigh = high[-1] - last_dividend
    adjlow = low[-1] - last_dividend

    df_train = _add_features(prices)
    df_train.to_csv('data/'+symbol+'-train.csv')
    remove = ['Open','High','Low','Close','Volume','Adj Close','Dividends']
    df_train = df_train.drop(remove, axis=1)

    X_train = df_train.values[warmup:-holdout, :]
    X_test = df_train.values[-holdout:, :]
    rowCount = len(X_train)

    def fit_and_score(target):
        # The label for row t is the target's value on row t+1 (shift one
        # position back to get the future value).
        y_train = df_train[target].shift(-1).values[warmup:-holdout].astype(bool)
        model, cv_score = train(X_train, y_train)
        prob_positive = model.predict_proba(X_test)[:, 1]
        # Out-of-sample check: the first ``recent`` held-out predictions are
        # compared with the outcomes realised on the following rows.
        realised = df_train[target].values[-recent:].astype(bool)
        hits = sum(realised == (prob_positive > 0.5)[:recent])
        return prob_positive[-1], cv_score, hits

    prob_hh, hh_score, recent_hh_score = fit_and_score('hh')
    prob_hl, hl_score, recent_hl_score = fit_and_score('hl')

    pdate = df_train.index[-1] + BDay(1)

    return pdate, atr, prob_hh, prob_hl, hh_score, hl_score, recent_hh_score, recent_hl_score, rowCount, adjhigh, adjlow


In [77]:
def save_model(target, symbol, model=None):
    """Persist a fitted model under ``models/predict-<target>-<symbol>/``.

    Parameters
    ----------
    target : str
        Name of the predicted target; becomes part of the directory name.
    symbol : str
        Ticker symbol; becomes part of the directory name.
    model : object, optional
        The fitted estimator to dump. It must be supplied: the function has no
        other source for the model.

    Returns
    -------
    str
        Path of the file the model was written to.

    Raises
    ------
    ValueError
        If ``model`` is not given.
    """
    # Validate before touching the filesystem so a bad call leaves no
    # empty directory behind.
    if model is None:
        raise ValueError("save_model() needs the fitted model to save (model=...)")

    model_dir = 'models/predict-'+target+'-'+symbol
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    f_name = os.path.join(model_dir, 'model.pkl')
    joblib.dump(model, f_name)
    return f_name

In [78]:
def train(X_train, y_train):
    """Cross-validate a random forest, then fit it on the full training set.

    Parameters
    ----------
    X_train : array-like
        Feature matrix handed to scikit-learn.
    y_train : array-like
        Labels for the rows of ``X_train``.

    Returns
    -------
    tuple
        ``(model, scoreStr)``: the classifier fitted on all of ``X_train``,
        and the 5-fold cross-validated precision formatted as ``"mean/std"``
        with two decimals each.
    """
    # Other estimators that were tried here: AdaBoostClassifier(n_estimators=100),
    # LogisticRegression(random_state=1), GradientBoostingClassifier(n_estimators=300),
    # KNeighborsClassifier(n_neighbors=15). The random forest was the best so far.
    classifier = RandomForestClassifier(n_estimators=350)

    # Score first, fit afterwards: cross_val_score works on its own clones.
    fold_precision = cross_validation.cross_val_score(
        classifier, X_train, y_train, cv=5, scoring='precision')
    summary = "/".join('%.2f' % round(statistic, 2)
                       for statistic in (fold_precision.mean(), fold_precision.std()))

    classifier.fit(X_train, y_train)
    return classifier, summary



In [79]:
def get_type(prob_hh, prob_hl):
    """Map the two predicted probabilities to a short pattern marker.

    ``prob_hh`` counts as "up" when strictly above 0.5 and ``prob_hl`` counts
    as "down" when strictly below 0.5:

    * ``'***'`` - prob_hh up and prob_hl down
    * ``'++'``  - prob_hh up, prob_hl not down
    * ``'--'``  - prob_hh not up, prob_hl down
    * ``'..'``  - neither
    """
    hh_is_up = prob_hh > 0.5
    hl_is_down = prob_hl < 0.5

    if hh_is_up:
        return '***' if hl_is_down else '++'
    return '--' if hl_is_down else '..'

In [80]:
# Tickers to score; each one becomes a column of the summary table.
symbols = ['spy', 'qqq','iwm', 'eem', 'aapl', 'ibm', 'fb','goog', 'amzn','gs','intc', 'dva','noah', 'lc', 'renn', 'tsla', 'hog','600519.SS']
#symbols = ['gps']
# Row labels of the summary table, in the order the values are stored below.
params = ['atr', 'high','low', 'prob_hh','prob_hl', 'hh_score', 'hl_score','rhh_score', 'rhl_score', 'rowCount', 'pdate', 'type']
df_predicts = pd.DataFrame(columns=symbols, index=params)

for symbol in symbols:
    print('\npredicting ' + symbol)
    (pdate, atr, prob_hh, prob_hl, hh_score, hl_score,
     recent_hh_score, recent_hl_score, rowCount, adjhigh, adjlow) = predict(symbol)
    pdateStr = pdate.strftime("%Y-%m-%d")
    ptype = get_type(prob_hh, prob_hl)

    # Values shared, unformatted, by the table column and the progress line.
    trailing = (hh_score, hl_score, recent_hh_score, recent_hl_score, rowCount, pdateStr, ptype)

    # Prices and probabilities are stored as two-decimal strings.
    formatted = tuple('%.2f' % round(value, 2) for value in (adjhigh, adjlow, prob_hh, prob_hl))
    df_predicts[symbol] = (atr,) + formatted + trailing

    print(symbol, atr,  adjhigh, adjlow, 'hh=%.2f' % round(prob_hh, 2), 'hl=%.2f' % round(prob_hl, 2),
          *trailing)

# The file name and the message use the prediction date of the last symbol processed.
df_predicts.to_csv('predictions-'+pdateStr+'.csv')
print("Prediction for "+pdateStr+" completed.")


predicting spy
spy 2.18 206.419998 203.089996 hh=0.16 hl=0.26 0.70/0.02 0.71/0.02 6 8 5629 2016-04-08 --

predicting qqq
qqq 1.39 110.230003 108.610001 hh=0.35 hl=0.36 0.70/0.06 0.69/0.05 7 8 4087 2016-04-08 --

predicting iwm
iwm 1.67 109.879997 108.129997 hh=0.39 hl=0.48 0.70/0.01 0.71/0.03 8 8 3779 2016-04-08 --

predicting eem
eem 0.59 33.110001 32.73 hh=0.35 hl=0.42 0.60/0.03 0.63/0.03 5 9 3058 2016-04-08 --

predicting aapl
aapl 1.94 110.419998 108.120003 hh=0.16 hl=0.27 0.69/0.02 0.73/0.04 4 6 8695 2016-04-08 --

predicting ibm
ibm 2.52 149.600006 148.0 hh=0.39 hl=0.24 0.69/0.03 0.72/0.04 6 8 13449 2016-04-08 --

predicting fb
fb 2.20 114.889999 113.07 hh=0.35 hl=0.39 0.70/0.04 0.73/0.02 6 5 766 2016-04-08 --

predicting goog
goog 11.41 747.0 736.280029 hh=0.47 hl=0.41 0.69/0.03 0.70/0.04 7 8 2718 2016-04-08 --

predicting amzn
amzn 13.30 599.599976 589.080017 hh=0.37 hl=0.28 0.69/0.04 0.74/0.02 7 8 4543 2016-04-08 --

predicting gs
gs 3.48 154.029999 149.369995 hh=0.10 hl=0.41

In [81]:
def e_predict(X_train, y_train, X_test):
    """Score five base classifiers and their soft-voting ensemble, then
    return the ensemble's class probabilities for ``X_test``.

    For each base classifier and finally for the ensemble, the 5-fold
    cross-validated precision (mean and standard deviation) is printed and
    the estimator is fitted on the full training set.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Labels for the rows of ``X_train``.
    X_test : array-like
        Rows to predict.

    Returns
    -------
    The result of ``predict_proba(X_test)`` from the fitted VotingClassifier.
    """
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(n_estimators=350)
    clf3 = GaussianNB()
    clf4 = GradientBoostingClassifier(n_estimators=100)
    clf5 = KNeighborsClassifier(n_neighbors=15)

    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3),
                                        ('gb', clf4), ('knn',clf5)], voting='soft')

    # The ensemble is deliberately last, so ``y_test_pred`` holds its
    # probabilities once the loop has finished.
    for clf in [clf1, clf2, clf3, clf4, clf5, eclf]:
        print(type(clf).__name__, "\n")
        scores = cross_validation.cross_val_score(clf, X_train, y_train, cv=5, scoring='precision')
        print("Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
        clf.fit(X_train, y_train)
        y_test_pred = clf.predict_proba(X_test)

    # Return only after every estimator has been evaluated; returning from
    # inside the loop would stop after the first classifier and never reach
    # the ensemble.
    return y_test_pred
