In [1]:
import requests
import pandas as pd
import numpy as np
import yfinance as yf
import talib
import datetime as datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

**1.- Import prices from the Yahoo Finance API:**

In [2]:
import yfinance as yf
MSFT_yf = yf.Ticker('MSFT')
MSFT = MSFT_yf.history(period='max')
MSFT.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13 00:00:00-05:00,0.055241,0.063365,0.055241,0.060657,1031788800,0.0,0.0
1986-03-14 00:00:00-05:00,0.060657,0.063907,0.060657,0.062823,308160000,0.0,0.0
1986-03-17 00:00:00-05:00,0.062823,0.064448,0.062823,0.063907,133171200,0.0,0.0
1986-03-18 00:00:00-05:00,0.063907,0.064448,0.06174,0.062281,67766400,0.0,0.0
1986-03-19 00:00:00-05:00,0.062281,0.062823,0.060657,0.061198,47894400,0.0,0.0


In [3]:
MSFT.index = pd.to_datetime(MSFT.index)

In [4]:
MSFT.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13 00:00:00-05:00,0.055241,0.063365,0.055241,0.060657,1031788800,0.0,0.0
1986-03-14 00:00:00-05:00,0.060657,0.063907,0.060657,0.062823,308160000,0.0,0.0
1986-03-17 00:00:00-05:00,0.062823,0.064448,0.062823,0.063907,133171200,0.0,0.0
1986-03-18 00:00:00-05:00,0.063907,0.064448,0.06174,0.062281,67766400,0.0,0.0
1986-03-19 00:00:00-05:00,0.062281,0.062823,0.060657,0.061198,47894400,0.0,0.0


In [5]:
MSFT.index = MSFT.index.strftime('%Y-%m-%d')
MSFT.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13,0.055241,0.063365,0.055241,0.060657,1031788800,0.0,0.0
1986-03-14,0.060657,0.063907,0.060657,0.062823,308160000,0.0,0.0
1986-03-17,0.062823,0.064448,0.062823,0.063907,133171200,0.0,0.0
1986-03-18,0.063907,0.064448,0.06174,0.062281,67766400,0.0,0.0
1986-03-19,0.062281,0.062823,0.060657,0.061198,47894400,0.0,0.0


In [6]:
MSFT.index = pd.to_datetime(MSFT.index)

**2.- Obtain the target variable**

First we are going to create the target variable. As we are going to study the prediction from a categorical point of view, we are going to build a column where 1 means that the closing price is higher than the open price and 0 means a decrease in price. 

To start with, we are going to create a new column that calculates the difference between the close and open price:

In [7]:
MSFT['Daily change'] = MSFT['Close'] - MSFT['Open']

Now we are going to convert the column to a binary variable:

In [8]:
def func(x):
    if x > 0:
        return 1
    elif x < 0:
        return 0
    

MSFT['Daily change']  = MSFT['Daily change'] .apply(func)
MSFT.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1986-03-13,0.055241,0.063365,0.055241,0.060657,1031788800,0.0,0.0,1.0
1986-03-14,0.060657,0.063907,0.060657,0.062823,308160000,0.0,0.0,1.0
1986-03-17,0.062823,0.064448,0.062823,0.063907,133171200,0.0,0.0,1.0
1986-03-18,0.063907,0.064448,0.06174,0.062281,67766400,0.0,0.0,0.0
1986-03-19,0.062281,0.062823,0.060657,0.061198,47894400,0.0,0.0,0.0


Now we are going to build a new DataFrame where we are going to initially start with the target variable (Daily change), one independent variable (yesterdays's daily change) and continuously add one by one variable to test the accuracy of the model:

In [9]:
MSFT ['Yesterday daily change'] = MSFT ['Daily change'].shift(1)

In [10]:
df = MSFT [['Daily change', 'Yesterday daily change']]

**3.- Clean the dataset**

Next, we need to **remove the NaN values**, because most ML models do not work with NaN values:

In [11]:
df = df[1:]

In [12]:
df = df.fillna(0)

**4.- Training and evaluation of the model**

Performing a train test split is not accurate enough to test the performance of Machine Learning models. A more accurate procedure is to perform **Cross-Validation**.

Cross-Validation on time series is not performed as on standard cases. There are two main methods to perform Cross-Validation on time series: time series split and blocked cross-validation. **Time series split** starts with a small train gap and small test gap and increases the train gap until all the dataset is evaluated. On the other side, **blocked cross-validation** uses the same size of train and test and rolls the window to the next subset of the dataset.

We are now going to use blocked cross-validation because the windows do not overlap in time and the dataset is big enough to use this technique. 

In [13]:
class BlockingTimeSeriesSplit():
    def __init__(self, n_splits):
        self.n_splits = n_splits
    
    def get_n_splits(self, X, y, groups):
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.8 * (stop - start)) + start
            yield indices[start: mid], indices[mid + margin: stop]

We now **split the data into train and test**. The split number will help us take 75% of the data for the training and the remaining 25% for the test subset. The new matrix X is create to **consider only the independent variables**. If we would consider all the columns (including the target variable) the accuracy would be 100% because the model would be capable of knowing the correct answer beforehand. The **target variable is the 'Daily change' column.**

In [14]:
# Training dataset length
split = int(len(df) * 0.75)

# Splitting the X and y into train and test datasets
X = df.drop(['Daily change'], axis=1)
X_train = X[:split]
X_test = X[split:]
y_train = df['Daily change'][:split]
y_test = df['Daily change'][split:]

Next, we are going to perform cross-validation and evalate the mean and standard deviation of the accuracies of the splitted fragments.

In [15]:
model = DecisionTreeClassifier(random_state=12)
btscv = BlockingTimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X_train, y_train, cv=btscv, scoring='accuracy')
print("Loss: {0:.3f} (+/- {1:.3f})".format(scores.mean(), scores.std()))

Loss: 0.490 (+/- 0.023)


We finally apply adnt evaluate the model on the test subset.

In [16]:
clf = DecisionTreeClassifier(random_state=12)
model = clf.fit(X_train, y_train)
round(accuracy_score(y_test, model.predict(X_test), normalize=True)*100, 3)

47.179

As we can see, **if yesterday the price change was positive or negative does not offer any predictive power**.

**5.- Model improvement**

Now we are going to train more models based on price and other variables we will add.

**5.1.- Multi-category price change**

This model consists of having the price change rounded as an integer

In [17]:
df1 = pd.DataFrame()
df1 ['Daily change'] = df['Daily change']

In [18]:
df1 ['Yesterday daily change'] = round(((MSFT['Close']-MSFT['Open'])/MSFT['Close'])*100, 0).shift(1)

In [19]:
df1.head()

Unnamed: 0_level_0,Daily change,Yesterday daily change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1986-03-14,1.0,9.0
1986-03-17,1.0,3.0
1986-03-18,0.0,2.0
1986-03-19,0.0,-3.0
1986-03-20,0.0,-2.0


In [20]:
X1 = df1.drop(['Daily change'], axis=1)
X1_train = X1[:split]
X1_test = X1[split:]
y1_train = df1['Daily change'][:split]
y1_test = df1['Daily change'][split:]

In [21]:
model = DecisionTreeClassifier(random_state=12)
btscv = BlockingTimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X1_train, y1_train, cv=btscv, scoring='accuracy')
print("Loss: {0:.3f} (+/- {1:.3f})".format(scores.mean(), scores.std()))

Loss: 0.495 (+/- 0.021)


In [22]:
clf = DecisionTreeClassifier(random_state=12)
model = clf.fit(X1_train, y1_train)
round(accuracy_score(y1_test, model.predict(X1_test), normalize=True)*100, 3)

48.162

As we can see, using yesterday's percentage change is not useful to forecast if today is going to be a positive or negative day.

**5.2.- Shadows of candles**

In [23]:
df2 = df1

In [24]:
df2 ['Yest_upper_shadow1'] = ((MSFT['High'] - MSFT['Close']) / MSFT['Close'])*100
df2 ['Yest_upper_shadow2'] = ((MSFT['High'] - MSFT['Open']) / MSFT['Close'])*100
df2 ['Yest_lower_shadow1'] = ((MSFT['Close'] - MSFT['Low']) / MSFT['Close'])*100
df2 ['Yest_lower_shadow2'] = ((MSFT['Open'] - MSFT['Low']) / MSFT['Close'])*100
df2 ['Yest_upper_shadow'] = df2 [['Yest_upper_shadow1', 'Yest_upper_shadow2']].min(axis=1)
df2 ['Yest_lower_shadow'] = df2 [['Yest_lower_shadow1', 'Yest_lower_shadow2']].min(axis=1)

In [25]:
df2.drop(['Yest_upper_shadow1', 'Yest_upper_shadow2','Yest_lower_shadow1', 'Yest_lower_shadow2'], axis=1, inplace=True)
df2.head()

Unnamed: 0_level_0,Daily change,Yesterday daily change,Yest_upper_shadow,Yest_lower_shadow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1986-03-14,1.0,9.0,1.725027,0.0
1986-03-17,1.0,3.0,0.8474,0.0
1986-03-18,0.0,2.0,0.869513,0.869513
1986-03-19,0.0,-3.0,0.884902,0.884902
1986-03-20,0.0,-2.0,0.0,0.909034


In [26]:
X2 = df2.drop(['Daily change'], axis=1)
X2_train = X2[:split]
X2_test = X2[split:]
y2_train = df2['Daily change'][:split]
y2_test = df2['Daily change'][split:]

In [27]:
model = DecisionTreeClassifier(random_state=12)
btscv = BlockingTimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X2_train, y2_train, cv=btscv, scoring='accuracy')
print("Loss: {0:.3f} (+/- {1:.3f})".format(scores.mean(), scores.std()))

Loss: 0.520 (+/- 0.030)


In [28]:
clf = DecisionTreeClassifier(random_state=12)
model = clf.fit(X2_train, y2_train)
round(accuracy_score(y2_test, model.predict(X2_test), normalize=True)*100, 3)

50.769

As we can see, the improvement is very small. 

**5.3.- Previous 3 days**

In this model, we are going to add the change of the previous 3 days. If the price has been very negative, it may be likely that a price reversal may happen:

In [29]:
df3 = df2

In [30]:
df3 ['D-2 daily change'] = df3['Yesterday daily change'].shift(1)

In [31]:
df3 ['D-3 daily change'] = df3['Yesterday daily change'].shift(2)

As many Machine Learning models do not work with NA values, let's fill the two NaNs with 0:

In [32]:
df3 = df3.fillna(0)

In [33]:
df3.head()

Unnamed: 0_level_0,Daily change,Yesterday daily change,Yest_upper_shadow,Yest_lower_shadow,D-2 daily change,D-3 daily change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1986-03-14,1.0,9.0,1.725027,0.0,0.0,0.0
1986-03-17,1.0,3.0,0.8474,0.0,9.0,0.0
1986-03-18,0.0,2.0,0.869513,0.869513,3.0,9.0
1986-03-19,0.0,-3.0,0.884902,0.884902,2.0,3.0
1986-03-20,0.0,-2.0,0.0,0.909034,-3.0,2.0


In [34]:
X3 = df3.drop(['Daily change'], axis=1)
X3_train = X3[:split]
X3_test = X3[split:]
y3_train = df3['Daily change'][:split]
y3_test = df3['Daily change'][split:]

In [35]:
model3 = DecisionTreeClassifier(random_state=12)
btscv = BlockingTimeSeriesSplit(n_splits=5)
scores3 = cross_val_score(model3, X3_train, y3_train, cv=btscv, scoring='accuracy')
print("Loss: {0:.3f} (+/- {1:.3f})".format(scores3.mean(), scores3.std()))

Loss: 0.505 (+/- 0.036)


In [36]:
clf3 = DecisionTreeClassifier(random_state=12)
model3 = clf3.fit(X3_train, y3_train)
round(accuracy_score(y3_test, model3.predict(X3_test), normalize=True)*100, 3)

49.872

**5.6.- Oil price**

To start with, we import oil price from Yahoo! Finance API:

In [37]:
oil_price = yf.download('CL=F', period='max')

[*********************100%***********************]  1 of 1 completed


We build a DataFrame as a copy of the previous DataFrame and add oil price as the difference between yesterday's close and open:

In [38]:
df6_2 = df3.copy()

In [39]:
df6_2 ['oil_price'] = oil_price ['Close']

In [40]:
df6_2 ['yest_oil_diff'] = ((oil_price ['Close'].shift(1) - oil_price ['Open'].shift(1))/(oil_price['Close'].shift(1)))*100

There are 21 observations where the oil price and the difference are NaN. As the oil price may have no prediction power by itself, let's assign the previous day's close price. As there are very little missing values for yesterday's oil price difference, let's assign a value of 0:

In [41]:
df6_2['oil_price'].fillna(df6_2['oil_price'].shift(1), inplace=True)

In [42]:
df6_2['yest_oil_diff'].fillna(0, inplace=True)

As the data available for oil price is much shorter than the available data for Microsoft stock price, let's remove the observations of the DataFrame that contain NaN:

In [43]:
df6_2.dropna(inplace=True)

Now are ready to split the train and test subdatasets to prepare the data for the modelling:

In [44]:
# Training dataset length
split6_2 = int(len(df6_2) * 0.75)

# Splitting the X and y into train and test datasets
X6_2 = df6_2.drop(['Daily change'], axis=1)
X6_2train = X6_2[:split6_2]
X6_2test = X6_2[split6_2:]
y6_2train = df6_2['Daily change'][:split6_2]
y6_2test = df6_2['Daily change'][split6_2:]

In [45]:
model6_2 = DecisionTreeClassifier(random_state=12)
btscv6_2 = BlockingTimeSeriesSplit(n_splits=5)
scores6_2 = cross_val_score(model6_2, X6_2train, y6_2train, cv=btscv6_2, scoring='accuracy')
print("Loss: {0:.3f} (+/- {1:.3f})".format(scores6_2.mean(), scores6_2.std()))

Loss: 0.524 (+/- 0.047)


In [46]:
clf6_2 = DecisionTreeClassifier(random_state=12)
model6_2 = clf6_2.fit(X6_2train, y6_2train)
round(accuracy_score(y6_2test, model6_2.predict(X6_2test), normalize=True)*100, 3)

50.175

As we can see, the improvement of the model is unappreciable.

Disclamer: The information on this notebook is intended for personal use only and does not represent any investment advice or recommendation of any form.