In [709]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pandas.io import data, wb
import datetime
import os.path
import math

from sklearn.cross_validation import train_test_split
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn import linear_model

from sklearn import cross_validation
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

from sklearn import metrics
    
from sklearn.externals import joblib

import talib as ta
from pandas.tseries.offsets import BDay

import urllib.request
import re


In [710]:
def predict(symbol, period=10):
    df_train = pd.read_csv('data/'+symbol+'-ohlc.csv', index_col='Date', parse_dates=True)
    if (os.path.isfile('data/'+symbol+'-dividends.csv')):
        df_dividends = pd.read_csv('data/'+symbol+'-dividends.csv', index_col='Date', parse_dates=True)
        df_train = df_train.join(df_dividends)
        df_train['Dividends'].fillna(0, inplace=True)
    else:
        df_train['Dividends']=0
        

    #features: 
    
    #distance from MAs
    pma10 = df_train['Close'].rolling(window=10).mean() + df_train['Dividends'].rolling(
        window=10).mean()
    pma20 = df_train['Close'].rolling(window=20).mean() + df_train['Dividends'].rolling(
        window=20).mean()
    pma50 = df_train['Close'].rolling(window=50).mean() + df_train['Dividends'].rolling(
        window=50).mean()
    pma200 = df_train['Close'].rolling(window=200).mean() + df_train['Dividends'].rolling(
        window=200).mean()
    
    diffma10  = (df_train['Close']/pma10 -1)*100
    diffma20 = (df_train['Close']/pma20 -1)*100
    diffma50 = (df_train['Close']/pma50 -1)*100
    diffma200 = (df_train['Close']/pma200 -1)*100
    
    #volume
    vma100 = df_train['Volume'].rolling(window=100).mean()
    vma = df_train['Volume'].rolling(window=period).mean()/vma100
    
    #min, max range lookback
    atr = ta.ATR(df_train['High'].values, df_train['Low'].values, df_train['Close'].values, timeperiod=10)
    rmax = (df_train['High'].rolling(window=period).max()-df_train['Close'])/atr
    rmin = (df_train['High'].rolling(window=period).min()-df_train['Close'])/atr
     
    
    
    #add features for training data
    df_train = df_train.assign(diffma10=diffma10)
    df_train = df_train.assign(diffma20=diffma20)
    df_train = df_train.assign(diffma50=diffma50)
    df_train = df_train.assign(diffma200=diffma200)

    df_train = df_train.assign(vma=vma)
    df_train = df_train.assign(rmax=rmax)
    df_train = df_train.assign(rmin=rmin)
    
    p_rmax = rmax.shift(-period)
    p_rmin = rmin.shift(-period)
    df_train = df_train.assign(p_rmax=p_rmax)
    df_train = df_train.assign(p_rmin=p_rmin)

    
    df_train.to_csv('data/'+symbol+'-train.csv')
    remove = ['Open','High','Low','Close','Volume','Adj Close','Dividends','p_rmax', 'p_rmin']
    df_train = df_train.drop(remove, axis=1)
    
    #df_train.describe()
    #df_train.corr()
    
    X_train = df_train.values[200:-(200+period), :]
    X_test = df_train.values[-(200+period):,:]
    rowCount = len(X_train)
    
    #for hh, shift one position back to get future value for hh
    y_train = df_train['rmax'].shift(-period).values[200:-(200+period)]
    model, score  = train(X_train, y_train)
    y_test_pred = model.predict(X_test);
    y_actual = df_train['rmax'].shift(-period).values[-(200+period):]
    r2 = model.score(X_test[:-period], y_actual[:-period])
    print('out of sample R2=',r2)
    
    diff = pd.Series(y_actual[:-period]) - pd.Series(y_test_pred[:-period])
    diff.plot(kind='bar')
    plt.show()
    
    '''
    #for hl
    y_train = df_train['hl'].shift(-1).values[200:-11].astype(bool)        
    model, hl_score = train(X_train, y_train)
    #save_model('hl', symbol)
    y_test_pred = model.predict_proba(X_test);
    prob_hl = y_test_pred[-1,1]
    #predict score on the most recent 10 period(out of sample test)
    recent_hl_score = sum(df_train['hl'].values[-10:].astype(bool) == (y_test_pred[:, 1]>0.5)[:10])
    
    pdate = df_train.index[-1] + BDay(1)
    '''
    return 3, 3


In [711]:
def save_model(target, symbol):
    model_dir = 'models/predict-'+target+'-'+symbol
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    joblib.dump(model, f_name) 

In [712]:
def train(X_train, y_train):  

    model = RandomForestRegressor(n_estimators=350)  
    #model = KNeighborsRegressor(n_neighbors=6) #6 is best for gold: r2 = 0.12
    #model = linear_model.BayesianRidge()
    #model = linear_model.SGDRegressor()
    
    
    
    
    
    #model = AdaBoostClassifier(n_estimators=100)
    #model = LogisticRegression(random_state=1)
    #model = GradientBoostingClassifier(n_estimators=300)
    #model = KNeighborsClassifier(n_neighbors=15)
    
    model.fit(X_train, y_train)
    sc = model.score(X_train, y_train)
    print('in sample R2 =', sc)
    
    #print(model.feature_importances_)
    return model, 1



In [713]:
def get_type(prob_hh, prob_hl):
    ptype = '4..'
    if(prob_hh > 0.5):
        if(prob_hl < 0.5 ):
            ptype = '1**'
        else:
            ptype = '2++'
    else:
        if(prob_hl < 0.5):
            ptype = '3--'
            
    return ptype

In [714]:
def get_signal(prob_hh, prob_hl):
    signal = 2
    if(prob_hh > 0.5):
        if(prob_hl > 0.5 ):
            signal = 1
        else:
            signal = 0
    else: #
        if(prob_hl < 0.5):
            signal = -1
            
    return signal

In [715]:
#symbols = ['SPY', 'QQQ','IWM', 'EEM', 'AAPL', 'GOOG', 'XOM', 'FB', 'JNJ','GE',  'AMZN','WFC', 'T', 'PG', 'WMT', 'JPM', 
#           'VZ', 'KO', 'PFE',  'V', 'HD', 'CVX', 'DIS', 'INTC', 'IBM', 'GS','DVA', 'MO', 'GILD', 'TSLA']
symbols = ['SPY']
params = ['atr', 'high','low', 'prob_hh','prob_hl', 'hh_score', 'hl_score','rhh_score', 'rhl_score', 'rowCount', 
          'pdate', 'score_diff','rrange','ptype', 'support', 'support_count', 'resistance', 'resis_count']
df_predicts = pd.DataFrame(columns=params, index=symbols)
df_signals = pd.DataFrame(columns=['Date','symbol','signal','atr','prev_high', 'prev_low','prev_close', 
                                   'support', 'supp_count','resistance', 'resis_count'])
index = 0
for symbol in symbols:
    symbol = symbol.upper()
    print('\npredicting ' + symbol)
    r_max, r_min = predict(symbol, 20)
    print(r_max,r_min)
    '''
    ptype = get_type(prob_hh, prob_hl)
    df_predicts.loc[symbol] = ('%.2f' % round(atr, 2), '%.2f' % round(adjhigh, 2) , '%.2f' % round(adjlow, 2), 
                               '%.2f' % round(prob_hh, 2) , '%.2f' % round(prob_hl, 2), hh_score, hl_score, recent_hh_score, 
                               recent_hl_score, rowCount, pdateStr, '%.2f' % round(prob_hh - prob_hl, 2), 
                               '%.2f' % round(atr/(adjhigh-adjlow), 2), ptype, 
                               support[0], support[1], resistance[0], resistance[1])
    print(symbol, atr,  adjhigh, adjlow, 'hh=%.2f' % round(prob_hh, 2), 'hl=%.2f' % round(prob_hl, 2), 
          hh_score, hl_score, recent_hh_score, recent_hl_score, rowCount, pdateStr, ptype)

    df_signals.loc[index] = [pdateStr, symbol, get_signal(prob_hh, prob_hl), '%.2f' % round(atr, 2), 
                             adjhigh, adjlow, adjclose, support[0], support[1], resistance[0], resistance[1]]
    index = index+1


#df_predicts.sort(['ptype', 'rrange']).to_csv('predictions-'+pdateStr+'.csv')
df_signals.to_csv('signal-today.txt')
print("Prediction for "+pdateStr+" completed.")
    '''


predicting SPY
in sample R2 = 0.913323504995
out of sample R2= -0.214412212601
3 3
