In [None]:
import warnings
# Silence every warning for the rest of the session (the 'ignore' action is
# registered without a category or module filter, so it applies to all of them).
# NOTE(review): this also hides deprecation and data-conversion warnings raised
# by the libraries used further down.
warnings.filterwarnings('ignore')

In [None]:
# Importing the lib
from glob import glob
import os
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

In [None]:
# Directory holding the CSV files read by the loading cells; the path is
# relative, so it is resolved against the notebook's working directory.
DATA_DIRECTORY='../data/forecasting'

In [None]:
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn afterwards.
plt.style.use('fivethirtyeight')

# Forecasting time series with ML - as a regression problem
## Check original dataset for NY exchange prices

In [None]:
# Read the New York stock exchange prices, indexed by the parsed 'date' column
prices = pd.read_csv(
    '{}/prices.csv'.format(DATA_DIRECTORY),
    parse_dates=True,
    index_col='date',
)

In [None]:
# Preview the first rows (head() defaults to 5) to check the columns and the date index.
prices.head()

In [None]:
# Draw the 'close' column over time on a dedicated axis, then label it
fig, ax1 = plt.subplots()
prices['close'].plot(ax=ax1, figsize=(12,10))
ax1.set(
    title='New York stock prices change',
    xlabel='Date',
    ylabel='Stock prices',
)
plt.show()

In [None]:
# Show the dtype pandas inferred for each column of the prices frame
prices.dtypes

In [None]:
# Convert the index to datetime values.
# NOTE(review): the loading cell already passes parse_dates=True with
# index_col='date', so this is presumably a no-op safeguard — confirm before removing.
prices.index = pd.to_datetime(prices.index)

## Load preprocessed prices and fit a regression model

If you want to predict patterns from data over time, there are special considerations to take in how you choose and construct your model. This section covers how to gain insights into the data before fitting your model, as well as best practices in using predictive modeling for time series data.

We will deal with stock market prices that fluctuate over time. In this section we've got historical prices for several tech companies (including Ebay and Yahoo) in the DataFrame `preprocessed_prices`.

In [None]:
# Read the preprocessed prices, indexed by the parsed 'date' column
preprocessed_prices = pd.read_csv(
    '{}/preprocessed_prices.csv'.format(DATA_DIRECTORY),
    index_col='date',
    parse_dates=True,
)

In [None]:
# Preview the first five rows of the preprocessed prices.
preprocessed_prices.head(5)

In [None]:
# Line plot of the raw YHOO and EBAY columns against the date index
preprocessed_prices.plot(y=['YHOO','EBAY']).set(
    title='Market stock change for Yahoo and Ebay',
    ylabel='Stock price',
    xlabel='Time',
)
plt.show()

In [None]:
# Scatter EBAY (x axis) against YHOO (y axis): one company per axis
preprocessed_prices.plot.scatter(x='EBAY', y='YHOO').set_title('Scatter plot of Yahoo and Ebay')
plt.show()

Finally, encode time as the color of each datapoint in order to visualize how the relationship between these two variables changes.




In [None]:
# Scatterplot with color relating to time: the frame's index (the parsed 'date'
# column) is passed as the per-point color values and mapped through viridis,
# with a colorbar drawn next to the plot.
preprocessed_prices.plot.scatter('EBAY', 'YHOO', c=preprocessed_prices.index, 
                    cmap=plt.cm.viridis, colorbar=True, figsize=(10,8))

# The title goes through pyplot, i.e. onto whichever axes is current after the
# scatter call. NOTE(review): presumably the main scatter axes rather than the
# colorbar axes — confirm on the matplotlib version in use.
plt.title('Time color coded scatter plot of Yahoo and ebay')
plt.show()

In [None]:
# Features are the EBAY, NVDA and YHOO columns; the target is the AAPL column,
# kept as a one-column DataFrame
X = preprocessed_prices.loc[:, ['EBAY', 'NVDA', 'YHOO']]
y = preprocessed_prices.loc[:, ['AAPL']]

In [None]:
# Preview the first three rows of the feature frame.
X.head(3)

In [None]:
# Preview the first three rows of the target frame.
y.head(3)

## Cross-validating time-series data


In [None]:
# Report how many rows (index entries) the feature frame has.
print("We have a total of {} samples".format(len(X.index)))

In [None]:
# Import TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit

# Build a 10-split time-series cross-validator
cv = TimeSeriesSplit(n_splits=10)

# Draw one line per split: the training target scaled by 1/1000 and shifted up
# by the split number, so each iteration sits on its own row of the figure
fig, ax = plt.subplots()
for split_no, (train_rows, test_rows) in enumerate(cv.split(X, y)):
    ax.plot(train_rows, split_no + y.iloc[train_rows]/1000)

ax.set(title='Training data on each CV iteration', ylabel='CV iteration', xlabel='time')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Keep at most the last 70 data points in the training data
window = 70

# Seed for the forest so the reported scores are reproducible across runs
SEED = 42

# Initialize the CV with this window size
cv = TimeSeriesSplit(n_splits=10, max_train_size=window)

for fold, (rows_train, rows_test) in enumerate(cv.split(X, y), start=1):
    # Fit on the training window only. Fitting on the test rows would score the
    # model on data it has already seen, making the MAE an in-sample score and
    # the max_train_size window irrelevant.
    # ravel() turns the one-column target frame into the 1-D array the
    # regressor expects for a single output.
    model = RandomForestRegressor(random_state=SEED)
    model.fit(X.iloc[rows_train], y.iloc[rows_train].values.ravel())

    # Predict the held-out rows of this split, score the predictions, and print them
    prediction = model.predict(X.iloc[rows_test])
    score = round(mean_absolute_error(y.iloc[rows_test], prediction), 4)
    print("MAE for fold {} is : {}".format(fold, score))