In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LogisticRegression
import sklearn
import pandas_datareader.data as web
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC #SVM: first we try to predict stock prices with the help of linear support vector classifier

In [2]:
def create_dataset(stock_symbol,start_date,end_date,lags=5):
    
    #fetch stock data from yahoo finance
    df = web.DataReader(stock_symbol, 'yahoo', start_date, end_date)
    
    #create new dataframe
    #we want to use additional features: lagged returns...today's returns, yesterday's returns etc.
    dflag = pd.DataFrame(index=df.index)
    dflag['Today'] = df['Adj Close']
    dflag['Volume'] = df['Volume']
    
    #create the shifted lag series of prior trading period close values
    for i in range(0,lags):
        dflag['Lag%s' % str(i+1)] = df['Adj Close'].shift(i+1)
    
    #create the returns dataframe
    dfret = pd.DataFrame(index=dflag.index)
    dfret['Volume'] = dflag['Volume']
    dfret['Today'] = dflag['Today'].pct_change()*100
    
    #create the lagged percentage returns columns
    for i in range(0,lags):
        dfret['Lag%s' % str(i+1)] = dflag['Lag%s' % str(i+1)].pct_change()*100
    
    #'Direction' column (+1 or -1) indicating positive or negative return way
    dfret['Direction'] = np.sign(dfret['Today'])
    
    #because of the shifts there are NaN values, we want to get rid of them
    dfret.dropna(inplace = True)
    
    return dfret

In [4]:
data = create_dataset('AAPL',datetime(2012,1,1),datetime(2017,5,31),lags=5)

#Use the prior 4 days of returns as predictor
#values, with direction as the response
X = data[['Lag1','Lag2','Lag3','Lag4']] #features
Y = data['Direction'] #target variable

#The test data is split into 2 parts: Before and after 1st Jan 2017
start_test = datetime(2017,1,1)

#Create training and test sets
X_train = X[X.index < start_test]
X_test = X[X.index >= start_test]
Y_train = Y[Y.index < start_test]
Y_test = Y[Y.index >= start_test]

'''________________________________________________________________________________________'''
'''LOGISTIC REGRESSION'''

#We use Logistic regression as the machine learning model
LR_model = LogisticRegression()

#train the model on the training set
LR_model.fit(X_train, Y_train)

#make an array of predictions on the test set
LR_pred = LR_model.predict(X_test)

#output the hit-rate and the confusion matrix for the model
#We see if our predictions are the same as in Y_test
print('Accuracy of logistic regression model: %0.3f' % LR_model.score(X_test, Y_test))
print('Confusion matrix: \n%s' % confusion_matrix(LR_pred, Y_test))

'''________________________________________________________________________________________'''
'''KNN CLASSIFIER'''

#We use kNN as the machine learning model
kNN_model = KNeighborsClassifier(300) # k = 300

#train the model on the training set
kNN_model.fit(X_train,Y_train)

#make an array of predictions on the test set
kNN_pred = kNN_model.predict(X_test)

#output the hit-rate and the confusion matrix for the model
#We see if our predictions are the same as in Y_test
print('Accuracy of kNN model: %0.3f' % kNN_model.score(X_test, Y_test))
print('Confusion matrix: \n%s' % confusion_matrix(kNN_pred, Y_test))

'''________________________________________________________________________________________'''
'''SVM CLASSIFIER'''

#We use SVM as the machine learning model
SVM_model = LinearSVC()
#We now try the Gaussian RBF Kernel
SVM_RBF_model = SVC(C=1000000.0, cache_size=200,class_weight=None,coef0=0.0,degree=3,gamma=0.0001,kernel='rbf',max_iter=-1,probability=False,random_state=None,shrinking=True,tol=0.001,verbose=False)

#train the model on the training set
SVM_model.fit(X_train,Y_train)
SVM_RBF_model.fit(X_train,Y_train)

#make an array of predictions on the test set
SVM_pred = SVM_model.predict(X_test)
SVM_RBF_pred = SVM_RBF_model.predict(X_test)

#output the hit-rate and the confusion matrix for the model
#We see if our predictions are the same as in Y_test
print('Accuracy of SVM model: %0.3f' % SVM_model.score(X_test, Y_test))
print('Confusion matrix: \n%s' % confusion_matrix(SVM_pred, Y_test))
print('Accuracy of SVM RBF model: %0.3f' % SVM_RBF_model.score(X_test, Y_test))
print('Confusion matrix: \n%s' % confusion_matrix(SVM_RBF_pred, Y_test))

  result = getattr(ufunc, method)(*inputs, **kwargs)


                Volume     Today      Lag1      Lag2      Lag3      Lag4  \
Date                                                                       
2012-01-11  53771200.0 -0.163035  0.358055 -0.158618  1.045340  1.110220   
2012-01-12  53146800.0 -0.274512 -0.163035  0.358055 -0.158618  1.045340   
2012-01-13  56505400.0 -0.374959 -0.274512 -0.163035  0.358055 -0.158618   
2012-01-17  60724300.0  1.164819 -0.374959 -0.274512 -0.163035  0.358055   
2012-01-18  69197800.0  1.038398  1.164819 -0.374959 -0.274512 -0.163035   
...                ...       ...       ...       ...       ...       ...   
2017-05-25  19235600.0  0.345612 -0.299086 -0.123382  0.607619  0.340889   
2017-05-26  21927600.0 -0.168930  0.345612 -0.299086 -0.123382  0.607619   
2017-05-30  20126900.0  0.039048 -0.168930  0.345612 -0.299086 -0.123382   
2017-05-31  24451200.0 -0.592189  0.039048 -0.168930  0.345612 -0.299086   
2017-06-01  16404100.0  0.274941 -0.592189  0.039048 -0.168930  0.345612   

           



Accuracy of SVM model: 0.577
Confusion matrix: 
[[ 7  7]
 [37 53]]
Accuracy of SVM RBF model: 0.538
Confusion matrix: 
[[ 4  8]
 [40 52]]


In [5]:
#Accuracy for LR is approx 57%, not that good because like tossing a coin, because market movements are random
#Accuracy for kNN is approx 62% with k = 300 not too bad for a ML algorithm