In [28]:
import requests
import pandas as pd
import numpy as np
import yfinance as yf
import datetime as datetime
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

**1.- Import prices from the Yahoo! Finance API:**

We can retrieve daily price values using the Yahoo! Finance API:

In [2]:
# Create a yfinance Ticker handle for the 'MSFT' symbol and request its
# price history with period='max' (the longest range the API offers).
MSFT_yf = yf.Ticker('MSFT')
MSFT = MSFT_yf.history(period='max')
# Preview the first rows of the downloaded frame.
MSFT.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13 00:00:00-05:00,0.055241,0.063365,0.055241,0.060657,1031788800,0.0,0.0
1986-03-14 00:00:00-05:00,0.060657,0.063906,0.060657,0.062823,308160000,0.0,0.0
1986-03-17 00:00:00-05:00,0.062823,0.064448,0.062823,0.063907,133171200,0.0,0.0
1986-03-18 00:00:00-05:00,0.063906,0.064448,0.06174,0.062281,67766400,0.0,0.0
1986-03-19 00:00:00-05:00,0.062281,0.062823,0.060657,0.061198,47894400,0.0,0.0


Next we are going to change the index of the DataFrame into a datetime format that can be used to build diagrams with matplotlib:

In [7]:
# Make sure the row labels are held as pandas datetimes.
MSFT.index = pd.to_datetime(MSFT.index)

In [8]:
# Preview the first rows to check the index after the conversion.
MSFT.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13,0.055241,0.063365,0.055241,0.060657,1031788800,0.0,0.0
1986-03-14,0.060657,0.063906,0.060657,0.062823,308160000,0.0,0.0
1986-03-17,0.062823,0.064448,0.062823,0.063907,133171200,0.0,0.0
1986-03-18,0.063906,0.064448,0.06174,0.062281,67766400,0.0,0.0
1986-03-19,0.062281,0.062823,0.060657,0.061198,47894400,0.0,0.0


In [9]:
# Keep only the calendar date of every row: drop any timezone information
# (preserving the local wall-clock date) and reset the time of day to midnight.
# The index stays a DatetimeIndex instead of being round-tripped through
# 'YYYY-MM-DD' strings, so it remains directly usable for date arithmetic
# and plotting.
MSFT.index = MSFT.index.tz_localize(None).normalize()
MSFT.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1986-03-13,0.055241,0.063365,0.055241,0.060657,1031788800,0.0,0.0
1986-03-14,0.060657,0.063906,0.060657,0.062823,308160000,0.0,0.0
1986-03-17,0.062823,0.064448,0.062823,0.063907,133171200,0.0,0.0
1986-03-18,0.063906,0.064448,0.06174,0.062281,67766400,0.0,0.0
1986-03-19,0.062281,0.062823,0.060657,0.061198,47894400,0.0,0.0


In [10]:
# Guarantee a DatetimeIndex from here on (parses the labels if they are date strings).
MSFT.index = pd.to_datetime(MSFT.index)

**2.- Obtain the target variable**

First we are going to create the target variable. As we are going to study the prediction from a categorical point of view, we are going to build a column where 1 means that the closing price is higher than the open price and 0 means that it is lower. Days on which the close equals the open receive no label (NaN) and are removed later, when the dataset is cleaned.

To start with, we are going to create a new column that calculates the difference between the close and open price:

In [11]:
# Intraday move: closing price minus opening price, row by row.
MSFT['Daily change'] = MSFT['Close'].sub(MSFT['Open'])

Now we are going to convert the column to a binary variable:

In [12]:
def func(x):
    """Map a daily price change to a binary direction label.

    Returns 1 for a strictly positive change and 0 for a strictly negative
    one. Any other input (exactly zero, or NaN) yields None.
    """
    if x > 0:
        return 1
    return 0 if x < 0 else None
    

# Replace the raw price difference with its direction label, element by element.
MSFT['Daily change'] = MSFT['Daily change'].apply(func)
MSFT.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1986-03-13,0.055241,0.063365,0.055241,0.060657,1031788800,0.0,0.0,1.0
1986-03-14,0.060657,0.063906,0.060657,0.062823,308160000,0.0,0.0,1.0
1986-03-17,0.062823,0.064448,0.062823,0.063907,133171200,0.0,0.0,1.0
1986-03-18,0.063906,0.064448,0.06174,0.062281,67766400,0.0,0.0,0.0
1986-03-19,0.062281,0.062823,0.060657,0.061198,47894400,0.0,0.0,0.0


Now we are going to build a new DataFrame that initially contains the target variable (Daily change) and one independent variable (yesterday's daily change). Further variables can then be added one at a time to test their effect on the accuracy of the model:

In [13]:
# Lag the label by one row so each day also carries the previous row's direction.
MSFT['Yesterday daily change'] = MSFT['Daily change'].shift(periods=1)

In [14]:
# Modelling frame: the target plus the single lagged feature.
df = MSFT.loc[:, ['Daily change', 'Yesterday daily change']]

**3.- Clean the dataset**

Next, we need to **remove the NaN values**, because most ML models do not work with NaN values:

In [15]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [16]:
# Preview the cleaned modelling frame.
df.head()

Unnamed: 0_level_0,Daily change,Yesterday daily change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
1986-03-14,1.0,1.0
1986-03-17,1.0,1.0
1986-03-18,0.0,1.0
1986-03-19,0.0,0.0
1986-03-20,0.0,0.0


**4.- Training and evaluation of the model**

A single train/test split yields only one, possibly noisy, estimate of the performance of a Machine Learning model. A more robust procedure is to perform **Cross-Validation**.

Cross-Validation on time series is not performed as in the standard case, because the temporal order of the observations has to be respected. There are two main methods to perform Cross-Validation on time series: time series split and blocked cross-validation. **Time series split** starts with a small training window followed by a small test window and grows the training window until the whole dataset has been evaluated. On the other hand, **blocked cross-validation** uses the same training and test sizes in every fold and rolls the window forward to the next subset of the dataset.

We are now going to use blocked cross-validation because the windows do not overlap in time and the dataset is big enough to use this technique. 

To apply this technique, we are going to create the following class:

In [18]:
class BlockingTimeSeriesSplit():
    """Blocked cross-validation splitter for ordered (time-series) samples.

    The samples are cut into ``n_splits`` consecutive, non-overlapping blocks
    of ``len(X) // n_splits`` samples each. Inside every block the leading
    ``train_size`` fraction forms the training fold and the remainder forms
    the test fold, optionally separated by ``margin`` skipped samples.
    Samples left over by the integer division (at the end of the data) are
    never yielded.

    Parameters
    ----------
    n_splits : int
        Number of blocks (folds). Must be at least 1.
    train_size : float, default 0.8
        Fraction of each block used for training; must lie strictly
        between 0 and 1.
    margin : int, default 0
        Number of samples skipped between the end of the training fold and
        the start of the test fold within a block.

    Raises
    ------
    ValueError
        If any of the parameters is outside its valid range.
    """

    def __init__(self, n_splits, train_size=0.8, margin=0):
        if n_splits < 1:
            raise ValueError("n_splits must be at least 1, got {0}".format(n_splits))
        if not 0 < train_size < 1:
            raise ValueError("train_size must be in (0, 1), got {0}".format(train_size))
        if margin < 0:
            raise ValueError("margin must be non-negative, got {0}".format(margin))
        self.n_splits = n_splits
        self.train_size = train_size
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; all arguments are accepted but ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` arrays, one pair per block.

        Only ``len(X)`` is used; ``y`` and ``groups`` are ignored.

        Raises
        ------
        ValueError
            If ``X`` has fewer samples than ``n_splits`` (every block would
            be empty). Raised when iteration starts, since this is a generator.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        if k_fold_size == 0:
            raise ValueError(
                "Cannot create {0} blocks from {1} samples".format(self.n_splits, n_samples)
            )
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # Position inside the block where training ends.
            mid = int(self.train_size * (stop - start)) + start
            yield indices[start: mid], indices[mid + self.margin: stop]

We now **split the data into train and test**. The split index lets us take the first 75% of the data for training and the remaining 25% for the test subset. The new matrix X is created to **consider only the independent variables**. If we considered all the columns (including the target variable), the accuracy would be 100% because the model would know the correct answer beforehand. The **target variable is the 'Daily change' column.**

In [19]:
# Training dataset length
# Row position at which the training window ends (first 75% of the rows)
split = int(len(df) * 0.75)

# Features are every column except the target; both are cut by row position
X = df.drop(columns=['Daily change'])
X_train = X.iloc[:split]
X_test = X.iloc[split:]
y_train = df['Daily change'].iloc[:split]
y_test = df['Daily change'].iloc[split:]

Next, we are going to perform cross-validation and evaluate the mean and standard deviation of the accuracies across the folds.

In [26]:
# Estimate accuracy on the training window with blocked time-series
# cross-validation over 5 consecutive, non-overlapping blocks.
model = DecisionTreeClassifier(random_state=12)
btscv = BlockingTimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X_train, y_train, cv=btscv, scoring='accuracy')
# The scores are accuracies (scoring='accuracy'), not losses, so label them as such.
print("Accuracy: {0:.3f} (+/- {1:.3f})".format(scores.mean(), scores.std()))

Loss: 0.515 (+/- 0.041)


We finally apply and evaluate the model on the test subset.

In [29]:
# Fit a fresh tree on the whole training window, then score it on the held-out rows.
clf = DecisionTreeClassifier(random_state=12)
model = clf.fit(X_train, y_train)
test_accuracy = accuracy_score(y_test, model.predict(X_test), normalize=True)
# Express the accuracy as a percentage with three decimals.
round(test_accuracy * 100, 3)

52.002

As we can see, **whether yesterday's price change was positive or negative offers essentially no predictive power**: the test accuracy is only about two percentage points above 50%.

Disclaimer: The information in this notebook is intended for personal use only and does not represent investment advice or a recommendation of any form.