#Description:
Use stock indicators with machine learning to predict the direction of a stock's price

In [1]:
# Import the libraries
import yfinance as yf
import numpy as np
import pandas as pd
from pandas_datareader import data as pdr
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
yf.pdr_override()

In [2]:
df = pdr.get_data_yahoo("GOOG", start="2011-02-11", end="2021-02-11")

[*********************100%***********************]  1 of 1 completed


In [3]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-02-11,305.748993,311.333038,305.355438,311.083984,311.083984,5199215
2011-02-14,310.506134,313.644379,308.862305,312.902161,312.902161,4271953
2011-02-15,312.488708,313.868530,310.386597,310.909637,310.909637,4199683
2011-02-16,311.646881,312.080261,309.888458,310.944519,310.944519,3380624
2011-02-17,309.465057,312.453857,308.981873,311.462555,311.462555,2968686
...,...,...,...,...,...,...
2021-02-04,2068.889893,2078.550049,2042.589966,2062.370117,2062.370117,1852300
2021-02-05,2070.000000,2102.510010,2059.330078,2098.000000,2098.000000,1533900
2021-02-08,2105.909912,2123.547119,2072.000000,2092.909912,2092.909912,1241900
2021-02-09,2078.540039,2105.129883,2078.540039,2083.510010,2083.510010,889900


In [4]:
# Create functions to calculation the Simple Moving Average (SMA) and the Exponential Moving Average (EMA)
# Typical time periods for MAs are 15, 20 and 30 days
# Create the SMA
def SMA(data, period=30, column="Close"):
  return data[column].rolling(window=period).mean()

# Create EMA
def EMA(data, period=20, column="Close"):
  return data[column].ewm(span=period, adjust=False).mean()

In [5]:
# Create a function to calculate the Moving Average Convergence/Divergence (MACD)
def MACD(data, period_long=26, period_short=12, period_signal=9, column="Close"):
  # Calculate Short Term EMA
  ShortEMA = EMA(data, period=period_short, column=column)
  # Calculate Long Term EMA
  LongEMA = EMA(data, period=period_long, column=column)
  # Calculate and store the MACD into dataframe
  data["MACD"] = ShortEMA - LongEMA
  # Calculate the signal line and store it into dataframe
  data["Signal_Line"] = EMA(data, period=period_signal, column="MACD")
  return data

In [6]:
# Create a function to calculate the Relative Strength Index (RSI)
def RSI(data, period=14, column="Close"):
  delta = data[column].diff(1)
  delta.dropna(inplace=True)
  up = delta.copy()
  down = delta.copy()
  up[up<0] = 0
  down[down>0] = 0
  data["Up"] = up
  data["Down"] = down
  AVG_Gain = SMA(data, period, column='Up')
  AVG_Loss = abs(SMA(data, period, column='Down'))
  RS = AVG_Gain / AVG_Loss
  RSI = 100.0 - (100.0 / (1.0 + RS))
  data["RSI"] = RSI
  return data

In [7]:
# Add the indicators to dataset
MACD(df)
RSI(df)
df["SMA"] = SMA(df)
df["EMA"] = EMA(df)

# Show data
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,MACD,Signal_Line,Up,Down,RSI,SMA,EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-02-11,305.748993,311.333038,305.355438,311.083984,311.083984,5199215,0.000000,0.000000,,,,,311.083984
2011-02-14,310.506134,313.644379,308.862305,312.902161,312.902161,4271953,0.145040,0.029008,1.818176,0.000000,,,311.257144
2011-02-15,312.488708,313.868530,310.386597,310.909637,310.909637,4199683,0.098074,0.042821,0.000000,-1.992523,,,311.224048
2011-02-16,311.646881,312.080261,309.888458,310.944519,310.944519,3380624,0.062943,0.046846,0.034882,0.000000,,,311.197426
2011-02-17,309.465057,312.453857,308.981873,311.462555,311.462555,2968686,0.076026,0.052682,0.518036,0.000000,,,311.222677
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-04,2068.889893,2078.550049,2042.589966,2062.370117,2062.370117,1852300,60.351636,39.038503,0.000000,-7.699951,77.943140,1818.051009,1879.803199
2021-02-05,2070.000000,2102.510010,2059.330078,2098.000000,2098.000000,1533900,70.076286,45.246060,35.629883,0.000000,79.746783,1830.534342,1900.583847
2021-02-08,2105.909912,2123.547119,2072.000000,2092.909912,2092.909912,1241900,76.490677,51.494983,0.000000,-5.090088,77.037789,1842.552006,1918.900615
2021-02-09,2078.540039,2105.129883,2078.540039,2083.510010,2083.510010,889900,79.894658,57.174918,0.000000,-9.399902,70.830427,1854.040674,1934.577700


In [8]:
# Create the Target column
df["Target"] = np.where(df["Close"].shift(-1) > df["Close"], 1, 0)
# Show data
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,MACD,Signal_Line,Up,Down,RSI,SMA,EMA,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-02-11,305.748993,311.333038,305.355438,311.083984,311.083984,5199215,0.000000,0.000000,,,,,311.083984,1
2011-02-14,310.506134,313.644379,308.862305,312.902161,312.902161,4271953,0.145040,0.029008,1.818176,0.000000,,,311.257144,0
2011-02-15,312.488708,313.868530,310.386597,310.909637,310.909637,4199683,0.098074,0.042821,0.000000,-1.992523,,,311.224048,1
2011-02-16,311.646881,312.080261,309.888458,310.944519,310.944519,3380624,0.062943,0.046846,0.034882,0.000000,,,311.197426,1
2011-02-17,309.465057,312.453857,308.981873,311.462555,311.462555,2968686,0.076026,0.052682,0.518036,0.000000,,,311.222677,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-04,2068.889893,2078.550049,2042.589966,2062.370117,2062.370117,1852300,60.351636,39.038503,0.000000,-7.699951,77.943140,1818.051009,1879.803199,1
2021-02-05,2070.000000,2102.510010,2059.330078,2098.000000,2098.000000,1533900,70.076286,45.246060,35.629883,0.000000,79.746783,1830.534342,1900.583847,0
2021-02-08,2105.909912,2123.547119,2072.000000,2092.909912,2092.909912,1241900,76.490677,51.494983,0.000000,-5.090088,77.037789,1842.552006,1918.900615,0
2021-02-09,2078.540039,2105.129883,2078.540039,2083.510010,2083.510010,889900,79.894658,57.174918,0.000000,-9.399902,70.830427,1854.040674,1934.577700,1


In [9]:
# Remove the first 29 days of data
df = df[29:]
# Show data
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,MACD,Signal_Line,Up,Down,RSI,SMA,EMA,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-03-25,292.344208,292.359161,288.538483,288.787567,288.787567,5738228,-4.688918,-5.671034,0.000000,-3.561646,41.880103,296.825924,291.294732,0
2011-03-28,289.948212,291.402740,286.281952,286.605743,286.605743,4453430,-4.502873,-5.437402,0.000000,-2.181824,39.012088,296.009982,290.848162,1
2011-03-29,286.924530,289.858551,285.435120,289.778839,289.778839,3221630,-4.052671,-5.160456,3.173096,0.000000,43.948895,295.239205,290.746321,1
2011-03-30,291.098877,291.656799,289.205994,289.833618,289.833618,2855262,-3.649396,-4.858244,0.054779,0.000000,51.075414,294.536671,290.659397,1
2011-03-31,290.411469,292.981842,289.783813,292.284454,292.284454,4074013,-3.096342,-4.505863,2.450836,0.000000,56.890198,293.914669,290.814165,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-04,2068.889893,2078.550049,2042.589966,2062.370117,2062.370117,1852300,60.351636,39.038503,0.000000,-7.699951,77.943140,1818.051009,1879.803199,1
2021-02-05,2070.000000,2102.510010,2059.330078,2098.000000,2098.000000,1533900,70.076286,45.246060,35.629883,0.000000,79.746783,1830.534342,1900.583847,0
2021-02-08,2105.909912,2123.547119,2072.000000,2092.909912,2092.909912,1241900,76.490677,51.494983,0.000000,-5.090088,77.037789,1842.552006,1918.900615,0
2021-02-09,2078.540039,2105.129883,2078.540039,2083.510010,2083.510010,889900,79.894658,57.174918,0.000000,-9.399902,70.830427,1854.040674,1934.577700,1


In [10]:
# Split dataset into features (X) and label (Y)
keep_columns = ["Close", "MACD", "Signal_Line", "RSI", "SMA", "EMA"]
X = df[keep_columns].values
Y = df["Target"].values

In [11]:
# Split data again into 80% training and 20% testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)

In [12]:
# Create and train decision tree classifier model
tree = DecisionTreeClassifier().fit(X_train, Y_train)

In [13]:
# Check score on training dataset
print(tree.score(X_train, Y_train))

1.0


In [14]:
# Check score on testing dataset
print(tree.score(X_test, Y_test))

0.4598393574297189


In [15]:
# Get model metrics
from sklearn.metrics import classification_report
print(classification_report(Y_test, tree.predict(X_test)))

              precision    recall  f1-score   support

           0       0.43      0.80      0.56       217
           1       0.56      0.20      0.29       281

    accuracy                           0.46       498
   macro avg       0.50      0.50      0.43       498
weighted avg       0.51      0.46      0.41       498

