In [None]:
# Description: Use stock indicators with machine learning to try to predict the direction of a stocks price

In [None]:
#Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from sklearn.model_selection import train_test_split

In [None]:
!pip install yfinance



In [None]:
# Load the data set
import yfinance as yf
df = yf.download("GOOG", start="2019-06-03", end="2019-12-17")


[*********************100%***********************]  1 of 1 completed


In [None]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-06-03,1065.500000,1065.500000,1025.000000,1036.229980,1036.229980,5130600
2019-06-04,1042.900024,1056.050049,1033.689941,1053.050049,1053.050049,2833500
2019-06-05,1051.540039,1053.550049,1030.489990,1042.219971,1042.219971,2168400
2019-06-06,1044.989990,1047.489990,1033.699951,1044.339966,1044.339966,1703200
2019-06-07,1050.630005,1070.920044,1048.400024,1066.040039,1066.040039,1802400
...,...,...,...,...,...,...
2019-12-10,1341.500000,1349.974976,1336.040039,1344.660034,1344.660034,1094100
2019-12-11,1350.839966,1351.199951,1342.670044,1345.020020,1345.020020,850400
2019-12-12,1345.939941,1355.775024,1340.500000,1350.270020,1350.270020,1281000
2019-12-13,1347.949951,1353.093018,1343.869995,1347.829956,1347.829956,1549600


In [None]:
#Create functions to calculate teh Simpple Moving Average (SMA) & the Exponential Moving Average(EMA)
# Typical time periods for moving averages are 15, 20, & 30
# Create the Simple Moving Average (EMA)
def SMA(data, period=30, column='Close'):
  return data[column].rolling(window=period).mean()

# Create teh Exponential Moving Average (EMA)
def EMA( data, period=20, column='Close'):
  return data[column].ewm(span=period, adjust=False).mean()

In [None]:
# Create a function to calculate the Moving Average Convergence / Divergence(MACD)

def MACD(data, period_long=26, period_short=21, period_signal=9, column='Close'):
  #calculate the Short Term EMA
  ShortEMA= EMA(data, period=period_short, column=column)
  # Calculate the Long Term EMA
  LongEMA=EMA(data, period=period_long, column=column)
  #Calculate the store the MACD into the data frame
  data['MACD']=ShortEMA - LongEMA
  #Calculate the signal line and store it into the data frame
  data['signal_Line']=EMA(data, period=period_signal, column='MACD')
  return data

In [None]:
# Create a function to calculate the Relative Strength Index(RSI)

def  RSI( data,  period=14, column='Close'):
  delta= data[column].diff(1)
  delta=delta.dropna()
  up= delta.copy()
  down=delta.copy()
  up[up<0] = 0
  down[down > 0] = 0
  data['up']=up
  data['down']=down
  AVG_Gain=SMA(data, period, column='up')
  AVG_Loss= abs(SMA(data, period, column='down'))
  RS=AVG_Gain/ AVG_Loss
  RSI=100.0-(100.0/(1.0+RS))

  data['RSI']= RSI
  return data

In [None]:
# Add the indicators to the data set
MACD(df)
RSI(df)
df['SMA']=SMA(df)
df['EMA']=EMA(df)
#Show the data
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,MACD,signal_Line,up,down,RSI,SMA,EMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-06-03,1065.500000,1065.500000,1025.000000,1036.229980,1036.229980,5130600,0.000000,0.000000,,,,,1036.229980
2019-06-04,1042.900024,1056.050049,1033.689941,1053.050049,1053.050049,2833500,0.283166,0.056633,16.820068,0.000000,,,1037.831892
2019-06-05,1051.540039,1053.550049,1030.489990,1042.219971,1042.219971,2168400,0.337290,0.112765,0.000000,-10.830078,,,1038.249804
2019-06-06,1044.989990,1047.489990,1033.699951,1044.339966,1044.339966,1703200,0.416268,0.173465,2.119995,0.000000,,,1038.829819
2019-06-07,1050.630005,1070.920044,1048.400024,1066.040039,1066.040039,1802400,0.845265,0.307825,21.700073,0.000000,,,1041.421269
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-10,1341.500000,1349.974976,1336.040039,1344.660034,1344.660034,1094100,5.462580,5.198051,1.099976,0.000000,62.510735,1304.410335,1313.003203
2019-12-11,1350.839966,1351.199951,1342.670044,1345.020020,1345.020020,850400,5.615473,5.281535,0.359985,0.000000,70.052514,1307.157003,1316.052424
2019-12-12,1345.939941,1355.775024,1340.500000,1350.270020,1350.270020,1281000,5.794740,5.384176,5.250000,0.000000,72.606290,1310.123002,1319.311243
2019-12-13,1347.949951,1353.093018,1343.869995,1347.829956,1347.829956,1549600,5.865538,5.480448,0.000000,-2.440063,75.083593,1313.047001,1322.027311


In [None]:
# Create the Target Column
df['Target']= np.where(df['Close'].shift(-1) > df['Close'],1, 0)

# Show the data
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,MACD,signal_Line,up,down,RSI,SMA,EMA,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-06-03,1065.500000,1065.500000,1025.000000,1036.229980,1036.229980,5130600,0.000000,0.000000,,,,,1036.229980,1
2019-06-04,1042.900024,1056.050049,1033.689941,1053.050049,1053.050049,2833500,0.283166,0.056633,16.820068,0.000000,,,1037.831892,0
2019-06-05,1051.540039,1053.550049,1030.489990,1042.219971,1042.219971,2168400,0.337290,0.112765,0.000000,-10.830078,,,1038.249804,1
2019-06-06,1044.989990,1047.489990,1033.699951,1044.339966,1044.339966,1703200,0.416268,0.173465,2.119995,0.000000,,,1038.829819,1
2019-06-07,1050.630005,1070.920044,1048.400024,1066.040039,1066.040039,1802400,0.845265,0.307825,21.700073,0.000000,,,1041.421269,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-10,1341.500000,1349.974976,1336.040039,1344.660034,1344.660034,1094100,5.462580,5.198051,1.099976,0.000000,62.510735,1304.410335,1313.003203,1
2019-12-11,1350.839966,1351.199951,1342.670044,1345.020020,1345.020020,850400,5.615473,5.281535,0.359985,0.000000,70.052514,1307.157003,1316.052424,1
2019-12-12,1345.939941,1355.775024,1340.500000,1350.270020,1350.270020,1281000,5.794740,5.384176,5.250000,0.000000,72.606290,1310.123002,1319.311243,0
2019-12-13,1347.949951,1353.093018,1343.869995,1347.829956,1347.829956,1549600,5.865538,5.480448,0.000000,-2.440063,75.083593,1313.047001,1322.027311,1


In [None]:
# Remove the first 29 days of data
df=df[29:]


In [None]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,MACD,signal_Line,up,down,RSI,SMA,EMA,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-07-15,1146.859985,1150.819946,1139.400024,1150.339966,1150.339966,903800,6.344890,5.471322,5.439941,0.000000,62.063456,1096.864327,1113.511647,1
2019-07-16,1146.000000,1158.579956,1145.000000,1153.579956,1153.579956,1238800,6.572656,5.691589,3.239990,0.000000,78.393463,1100.775993,1117.327677,0
2019-07-17,1150.969971,1158.359985,1145.770020,1146.349976,1146.349976,1170000,6.598400,5.872951,0.000000,-7.229980,77.945728,1103.885990,1120.091705,0
2019-07-18,1141.739990,1147.604980,1132.729980,1146.329956,1146.329956,1290700,6.575300,6.013421,0.000000,-0.020020,80.494359,1107.356323,1122.590586,0
2019-07-19,1148.189941,1151.140015,1129.619995,1130.099976,1130.099976,1647200,6.238346,6.058406,0.000000,-16.229980,69.422722,1110.214990,1123.305766,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-10,1341.500000,1349.974976,1336.040039,1344.660034,1344.660034,1094100,5.462580,5.198051,1.099976,0.000000,62.510735,1304.410335,1313.003203,1
2019-12-11,1350.839966,1351.199951,1342.670044,1345.020020,1345.020020,850400,5.615473,5.281535,0.359985,0.000000,70.052514,1307.157003,1316.052424,1
2019-12-12,1345.939941,1355.775024,1340.500000,1350.270020,1350.270020,1281000,5.794740,5.384176,5.250000,0.000000,72.606290,1310.123002,1319.311243,0
2019-12-13,1347.949951,1353.093018,1343.869995,1347.829956,1347.829956,1549600,5.865538,5.480448,0.000000,-2.440063,75.083593,1313.047001,1322.027311,1


In [None]:
# Split the data set into feature or independent data set(X) and a Target or dependent data set(Y)
keep_columns=['Close', 'MACD', 'signal_Line','RSI','SMA', 'EMA']
X=df[keep_columns].values
Y=df['Target'].values

In [None]:
# Split the data again but this time into 80% training and 20% testing data sets
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size=0.2, random_state=2)

In [None]:
from sklearn import tree

In [None]:
# Create and train the decision tree classifier model
tree= tree.DecisionTreeClassifier().fit(X_train,Y_train)

In [None]:
#Check how well the model did on the training data set
print(tree.score(X_train, Y_train))

1.0


In [None]:
#Check how well the model did on the testing data set
print(tree.score(X_test,Y_test))

0.5909090909090909


In [None]:
# Show the model tree predictions
tree_predictions=tree.predict(X_test)
print(tree_predictions)

[0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0]


In [None]:
#Show the actaul values from the test data
Y_test

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1])

In [None]:
# Get the model metrics
from sklearn.metrics import classification_report
print(classification_report(Y_test, tree_predictions))

              precision    recall  f1-score   support

           0       0.83      0.38      0.53        13
           1       0.50      0.89      0.64         9

    accuracy                           0.59        22
   macro avg       0.67      0.64      0.58        22
weighted avg       0.70      0.59      0.57        22

