<a href="https://colab.research.google.com/github/jtao22/PythonAI/blob/main/StockMarket/stockmovementclassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#import 
import numpy as np
import pandas as pd
import pandas_datareader as pdr
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC

In [3]:
#upload data
data = pdr.DataReader('GOOG', data_source="yahoo", start = "2005-1-1")
data = data.dropna(axis=0)
data.head(5)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2005-01-03,101.439781,97.365051,98.331429,100.976517,31807176.0,100.976517
2005-01-04,101.086105,96.378746,100.323959,96.886841,27614921.0,96.886841
2005-01-05,98.082359,95.756081,96.363808,96.393692,16534946.0,96.393692
2005-01-06,97.584229,93.509506,97.175758,93.922951,20852067.0,93.922951
2005-01-07,96.762314,94.037521,94.96405,96.563057,19398238.0,96.563057


In [4]:
#create functions to calculate Simple Moving Average, Exponential Moving Average, Moving Average Convergence Divergence, Relative Strength Index
def SMA(data, p, column):
  return data[column].rolling(window=p).mean()

def EMA(data, p, column):
  return data[column].ewm(span=p, adjust = False).mean()

def MACD(data, plong, pshort, psignal, column):
  #short term EMA calculation
  sEMA = EMA(data, pshort, column)
  #long term EMA calculation
  lEMA = EMA(data, plong, column)
  #calculate and store final MACD into data frame
  data['MACD'] = sEMA - lEMA
  #calculate and store signal line
  data['Signal Line'] = EMA(data, psignal, 'MACD')
  return data

def RSI(data, p, column):
  d = data[column].diff(1)
  d = d.dropna()
  up = d.copy()
  down = d.copy()
  up[up<0] = 0
  down[down>0] = 0
  data['up'] = up
  data['down'] = down
  avg_gain = SMA(data,p,'up')
  avg_loss = abs(SMA(data,p,'down'))
  RS = avg_gain/avg_loss
  rsi = 100.0 - (100.0/(1.0+RS))
  data['RSI'] = rsi
  return data

In [5]:
#run functions
MACD(data,26,12,9,'Close')
RSI(data,14,'Close')
data['SMA'] = SMA(data,30,'Close')
data['EMA'] = EMA(data,20,'Close')
data.head(5)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,MACD,Signal Line,up,down,RSI,SMA,EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2005-01-03,101.439781,97.365051,98.331429,100.976517,31807176.0,100.976517,0.0,0.0,,,,,100.976517
2005-01-04,101.086105,96.378746,100.323959,96.886841,27614921.0,96.886841,-0.326242,-0.065248,0.0,-4.089676,,,100.587024
2005-01-05,98.082359,95.756081,96.363808,96.393692,16534946.0,96.393692,-0.617466,-0.175692,0.0,-0.493149,,,100.187659
2005-01-06,97.584229,93.509506,97.175758,93.922951,20852067.0,93.922951,-1.035693,-0.347692,0.0,-2.470741,,,99.59102
2005-01-07,96.762314,94.037521,94.96405,96.563057,19398238.0,96.563057,-1.140954,-0.506345,2.640106,0.0,,,99.302643


In [6]:
#create target
data['Target'] = np.where(data['Close'].shift(-1) > data['Close'], 1, 0)
data = data.dropna(axis = 0)
data.head(5)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,MACD,Signal Line,up,down,RSI,SMA,EMA,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2005-02-14,96.179497,90.162048,91.083595,96.134666,77413481.0,96.134666,-0.535722,-0.350967,2.784569,0.0,60.203163,96.71449,96.87584,1
2005-02-15,99.546875,96.179497,96.438522,97.250481,51758881.0,97.250481,-0.441769,-0.369127,1.115814,0.0,54.411549,96.590289,96.91152,1
2005-02-16,99.292824,96.787216,96.986473,98.834541,33188534.0,98.834541,-0.236761,-0.342654,1.584061,0.0,57.388069,96.655212,97.094665,0
2005-02-17,99.502045,98.037529,98.545624,98.580498,20906871.0,98.580498,-0.093709,-0.292865,0.0,-0.254044,55.545773,96.728105,97.236173,1
2005-02-18,99.048744,97.962814,98.884354,98.6054,17035414.0,98.6054,0.021423,-0.230007,0.024902,0.0,51.851255,96.884187,97.366576,0


In [7]:
#split data
keep = ['Close', 'MACD', 'Signal Line', 'RSI', 'SMA', 'EMA']
X = data[keep].values
Y = data['Target'].values
#train test split
trainX, testX, trainY, testY = tts(X, Y, test_size=0.2, random_state = 42, shuffle=False)

In [8]:
#create architectures for DTC, LR, RFC
dtc = DTC().fit(trainX, trainY)
lr = LR().fit(trainX,trainY)
rfc = RFC().fit(trainX,trainY)
#check accuracy
print("DTC:",100* dtc.score(trainX,trainY), '%')
print("LR:",100* lr.score(trainX,trainY), '%')
print("RFC:",100* rfc.score(trainX,trainY), '%')

DTC: 100.0 %
LR: 51.2892202547375 %
RFC: 100.0 %


In [10]:
#test
print("Decision Tree Classifier:",100*dtc.score(testX,testY),'%')
print("Logistic Regression:",100*lr.score(testX,testY),'%')
print("Random Forest Classifier:",100*rfc.score(testX,testY),'%')

Decision Tree Classifier: 54.161490683229815 %
Logistic Regression: 54.78260869565217 %
Random Forest Classifier: 55.15527950310559 %
