In [None]:
# Suppress every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this also hides deprecation and chained-assignment warnings raised by
# the libraries used below — consider narrowing the filter while developing.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importing the lib
from glob import glob
import os
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt

In [None]:
# Folder (relative to this notebook) that holds the CSV files loaded below
DATA_DIRECTORY = "../data/forecasting"

In [None]:
# Apply one matplotlib style sheet to every figure in the notebook
plt.style.use("fivethirtyeight")

# 1. Introduction to Time Series and Machine Learning

## 1.2. Machine learning and time-series data

In [None]:
# Read the New York Stock Exchange prices; the 'date' column becomes a parsed index
prices = pd.read_csv(
    DATA_DIRECTORY + '/prices.csv',
    parse_dates=True,
    index_col='date',
)

In [None]:
# Peek at the first five rows
prices.head(n=5)

In [None]:
# Draw the closing price as a time series on its own axis (ax1)
fig, ax1 = plt.subplots(figsize=(12, 10))
prices['close'].plot(ax=ax1)
ax1.set(
    title='New York stock prices change',
    xlabel='Date',
    ylabel='Stock prices',
)
plt.show()

In [None]:
# Display the dtype of every column in the prices frame
prices.dtypes

In [None]:
# Make sure the index holds datetimes (read_csv above was already asked to parse dates)
prices.index = pd.to_datetime(prices.index)

# 2. Time Series Forecasting with Machine Learning

If you want to predict patterns from data over time, there are special considerations to take in how you choose and construct your model. This section covers how to gain insights into the data before fitting your model, as well as best practices in using predictive modeling for time series data.

## 2.1. Predicting data over time

We will work with stock market prices that fluctuate over time. In this section we have historical prices for several companies, including two tech companies (eBay and Yahoo), in the DataFrame `preprocessed_prices`.

In [None]:
# Read the per-company price table; the 'date' column becomes a parsed index
preprocessed_prices = pd.read_csv(
    DATA_DIRECTORY + '/preprocessed_prices.csv',
    index_col='date',
    parse_dates=True,
)

In [None]:
# Peek at the first five rows
preprocessed_prices.head()

In [None]:
# Line plot of the two raw price series against time
preprocessed_prices[['YHOO', 'EBAY']].plot()
plt.gca().set(
    title='Market stock change for Yahoo and Ebay',
    ylabel='Stock price',
    xlabel='Time',
)
plt.show()

In [None]:
# One company per axis: EBAY on x, YHOO on y
preprocessed_prices.plot.scatter(x='EBAY', y='YHOO')
plt.gca().set_title('Scatter plot of Yahoo and Ebay')
plt.show()

Finally, encode time as the color of each datapoint in order to visualize how the relationship between these two variables changes.




In [None]:
# Same scatter, with each point coloured by its position in time
preprocessed_prices.plot.scatter(
    x='EBAY',
    y='YHOO',
    c=preprocessed_prices.index,
    cmap=plt.cm.viridis,
    colorbar=True,
    figsize=(10, 8),
)
plt.gca().set_title('Time color coded scatter plot of Yahoo and ebay')
plt.show()

Now we will fit a regression model. We will use the eBay, Nvidia, and Yahoo stock prices as the features, and the target will be the Apple stock price. The model is a ridge regression (a linear regression with L2 regularization), which we will use for prediction.

In [None]:
# Fit a simple regularised regression and score it with 3-fold cross-validation
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Features: three companies' prices; target: Apple's price (kept as a one-column frame)
X = preprocessed_prices.loc[:, ['EBAY', 'NVDA', 'YHOO']]
y = preprocessed_prices.loc[:, ['AAPL']]

# One R2 value per fold
scores = cross_val_score(estimator=Ridge(), X=X, y=y, scoring='r2', cv=3)

In [None]:
# Three R2 scores come back - one per cross-validation run.
# Background on R2: https://en.wikipedia.org/wiki/Coefficient_of_determination
np.around(scores, decimals=4)

In [None]:
# Hold out the final 20% of rows, fit on the rest, and score the hold-out predictions
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# shuffle=False keeps chronological order, so the test set is the most recent data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.8, shuffle=False, random_state=1
)

# Ridge.fit returns the fitted estimator itself
model = Ridge().fit(X_train, y_train)
predictions = model.predict(X_test)
score = r2_score(y_true=y_test, y_pred=predictions)

In [None]:
# Report the hold-out R2 rounded to four decimals
print("R2 score: {}".format(round(score, ndigits=4)))

In [None]:
# Build the comparison frame as a NEW object instead of writing into the slice that
# train_test_split returned (that write triggers a chained-assignment warning which
# the global warnings filter hides) and instead of renaming with inplace=True.
# Re-running this cell is safe: the rename is a no-op the second time and the
# 'predictions' column is simply replaced.
y_test = (
    y_test
    .rename(columns={'AAPL': 'True_vlaue'})
    # ravel() turns the model's prediction array into a flat 1-D column
    .assign(predictions=np.ravel(predictions))
)

In [None]:
# Overlay the model's predictions (red) on the "true" hold-out values (black)
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(y_test['True_vlaue'], color='k', lw=3, label='True values')
ax.plot(y_test['predictions'], color='r', lw=2, label='Predicitions')
ax.legend()
ax.set_title('Apple stock price true value and predicted price')
plt.show()

Now you have an explanation for your poor score. The predictions clearly deviate from the true time series values.

In [None]:
# Compare Ridge fits at three regularisation strengths against the hold-out truth.
from matplotlib import cm
alphas = [.1, 1e2, 1e3]

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(y_test['True_vlaue'], color='k', alpha=.3, lw=3, label='True values')
color = ['r','b','g']
for ii, alpha in enumerate(alphas):
    # Keep each model's predictions in a local array: the original loop overwrote
    # y_test['predictions'], silently replacing the single-model predictions that
    # the earlier cells stored and plotted.
    alpha_predictions = Ridge(alpha=alpha).fit(X_train, y_train).predict(X_test)
    ax.plot(y_test.index, np.ravel(alpha_predictions), color[ii],
            label='Model {}'.format(ii + 1))
ax.legend()
ax.set(xlabel="Time", ylabel="stock price")
plt.show()

## 2.2. Advanced time series forecasting

We will use the AIG company data. First we will blank out some of the rows to simulate missing data, and then we will deal with the gaps.

In [None]:
# Create missing rows at random
def remove_n_consecutive_rows(frame, n, percent, seed=None):
    """Return a copy of ``frame`` with random runs of ``n`` consecutive rows set to NaN.

    Roughly ``percent`` percent of the rows are blanked, in
    ``int(percent / 100 * len(frame) / n)`` runs of exactly ``n`` rows. Each run sits
    inside a window of ``n + 2`` rows whose first and last rows are kept, and windows
    never overlap, so runs are always separated by observed rows and row 0 is never
    blanked.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to punch holes into. It is NOT modified; a copy is returned.
    n : int
        Length of each run of missing rows (must be non-zero).
    percent : float
        Approximate percentage of rows to blank out.
    seed : int, optional
        Seed for a private random generator, for reproducible output. When omitted,
        the module-level ``random`` state is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frame`` with the selected rows replaced by NaN. If the frame is too
        short to fit the requested number of non-overlapping windows, fewer runs are
        removed rather than raising.
    """
    rng = random if seed is None else random.Random(seed)
    n_rows = frame.shape[0]
    chunks_to_remove = int(percent / 100 * n_rows / n)
    window_len = n + 2

    # A candidate window is identified by its start position and spans
    # start .. start + n + 1; only its inner n rows are blanked.
    starts = list(range(0, n_rows - n))
    drop_indices = []
    for _ in range(chunks_to_remove):
        if not starts:
            # No room left for another non-overlapping window.
            break
        chosen = rng.choice(starts)
        drop_indices.extend(range(chosen + 1, chosen + n + 1))
        # Two windows of equal length overlap exactly when their starts are
        # closer than the window length.
        starts = [s for s in starts if abs(s - chosen) >= window_len]

    result = frame.copy()
    result.iloc[drop_indices, :] = np.nan
    return result

In [None]:
# The AIG data without missing values.
# Take an explicit copy: a frame built directly from the column may share memory with
# preprocessed_prices (depending on the pandas version), in which case NaNs written
# into AIG later would also appear in the source table that later cells reuse.
AIG = preprocessed_prices[['AIG']].copy()
AIG.plot(figsize=(8,8))

plt.title('The stock prices of the AIG company')
plt.xlabel('Time (Years)')
plt.ylabel('Stock price')
plt.show()

In [None]:
# Blank out ~20% of the rows in runs of 100 consecutive rows, then plot the gaps
AIG_missing_data = remove_n_consecutive_rows(frame=AIG, n=100, percent=20)

AIG_missing_data.plot(figsize=(8, 8))
plt.gca().set(
    title='The stock prices of the AIG company with missing data',
    xlabel='Time (Years)',
    ylabel='Stock price',
)

Let's now interpolate the missing data.

In [None]:
# Interpolation in Pandas

# Boolean frame: True wherever a value is missing
missing_index = AIG_missing_data.isnull()

# Fill every gap by drawing a straight line between its neighbouring observations
AIG_interp = AIG_missing_data.interpolate(method='linear')

In [None]:
# Plot the interpolated data in red and the data w/ missing values in black.
# The black line is drawn on top, so red is only visible inside the filled gaps.
ax = AIG_interp.plot(c='r')
AIG_missing_data.plot(c='k',ax=ax, lw=2)
# Legend entries follow plot order: the interpolated (red) line was drawn first.
# The original listed them the other way round, mislabelling both lines.
ax.legend(['AIG_interpolated','AIG_missing'])
ax.set(xlabel='Time (Years)')
ax.set(ylabel='Stock value')
ax.set(title='AIG stock price value with missing and interpolated values')

In [None]:
#Transforming to percent change with Pandas

def percent_change(values):
    """Relative change of the last value versus the mean of all earlier values.

    Parameters
    ----------
    values : array-like
        One window of observations (list, NumPy array or pandas Series) with at
        least two elements.

    Returns
    -------
    float
        ``(last - mean(previous)) / mean(previous)``.
    """
    # Work on a plain array so that [-1] is always positional. On a pandas Series,
    # values[-1] is a label lookup: it raises KeyError for an integer index and
    # relies on a deprecated fallback for other index types.
    window = np.asarray(values)
    baseline = np.mean(window[:-1])
    return (window[-1] - baseline) / baseline

In [None]:
# Applying the transformation to our data

# Percent change of each value relative to the mean of the 19 values before it
AIG_perc_change = AIG_interp.rolling(window=20).aggregate(percent_change)

# Left panel: interpolated prices; right panel: the transformed series
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
axs[0].plot(AIG_interp, label='AIG')
axs[1].plot(AIG_perc_change, label='AIG_transfomred')
for panel in axs:
    panel.legend()

# Axis labels and overall title
axs[0].set(xlabel='Time (Years)', ylabel='Stock market value')
axs[1].set(xlabel='Time (Years)', ylabel='Percentage chnage in the stock market value')
plt.suptitle('AIG stock prices vs percentage change in stock prices')


#### Finding outliers
Outliers are data points that are statistically different from the dataset as a whole. A common definition is any data point that is more than three standard deviations away from the mean of the dataset.

In [None]:
# Plotting a threshold on our data

fig, axs = plt.subplots(1, 2, figsize=(20, 10))
legends = ['AIG','AIG_transfromed']
for data, ax, l in zip([AIG_interp, AIG_perc_change], axs, legends):
    # Each frame has a single column, so take its mean / standard deviation by
    # POSITION with .iloc[0]. The original data_mean[0] indexed a string-labelled
    # Series with an integer, which relies on a deprecated positional fallback.
    data_mean = data.mean().iloc[0]
    data_std = data.std().iloc[0]
    # Plot the data, with a band 3 standard deviations either side of the mean
    ax.plot(data, label=l)
    ax.legend()
    for sign in (1, -1):
        ax.axhline(data_mean + sign * data_std * 3, ls='--', c='r')

# set the title and x-axis and y-axis labels
axs[0].set(xlabel='Time (Years)')
axs[1].set(xlabel='Time (Years)')
axs[0].set(ylabel='Stock market value')
axs[1].set(ylabel='Percentage chnage in the stock market value')
plt.suptitle('AIG stock prices vs percentage change in stock prices with applied threshold')
    

Let's replace the outliers using the threshold.


In [None]:
# Center the data so the mean is 0
# NOTE(review): the identifier below contains a non-ASCII combining character right
# after "AIG". It is left untouched here because the plotting cell that follows
# refers to exactly the same name; rename both together if it is ever cleaned up.
AIGِ_outlier_centered = AIG_perc_change - AIG_perc_change.mean()

# Calculate the standard deviation
std = AIG_perc_change.std()

# Use the absolute value of each data point
# to make it easier to find outliers (more than 3 standard deviations from 0)
outliers = np.abs(AIGِ_outlier_centered) > (std * 3)

# Replace outliers with the median value
# Use np.nanmedian (the NaN-ignoring median) since the data may contain NaNs
AIG_outlier_fixed = AIGِ_outlier_centered.copy()
AIG_outlier_fixed[outliers] = np.nanmedian(AIG_outlier_fixed)

In [None]:
# Side-by-side view (shared y-axis) of the centered series before and after
# the outliers were replaced
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 10), sharey=True)
axs[0].plot(AIGِ_outlier_centered, label='AIG with outliers')
axs[1].plot(AIG_outlier_fixed, label='AIG without outliers')

for panel in axs:
    panel.legend()
    panel.set(xlabel='Time (Years)')

plt.suptitle('AIG stock prices with outliers Vs AIG stock prices without outliers ')
axs[0].set(ylabel='Percentage chnage in the stock market value')
    

## 2.3. Creating features over time

In [None]:
# Using .aggregate for feature extraction
# Start with a look at the raw data
preprocessed_prices.head()

In [None]:
# Over a 20-row rolling window, extract two features per column (std and max);
# rows where the window is not yet full are dropped
feats = (
    preprocessed_prices
    .rolling(window=20)
    .aggregate([np.std, np.max])
    .dropna()
)
feats.head()

In [None]:
# Plot both rolling features for Apple
feats['AAPL'].plot(figsize=(10, 10))
plt.gca().set(
    xlabel='Date [Years] ',
    ylabel='Stock price',
    title='The std Vs max of the stock price change of Apple',
)

#### Using partial() in Python
A useful tool when using the `.aggregate()` method is the `partial` function. It is part of Python's standard library (`functools`) and lets you create a *new* function from an old one, with some of the parameters pre-configured.

In [None]:
# Taking the mean of the whole array collapses it to one number
a = np.array([[0, 1, 2]] * 3)
print("Mean: {}".format(a.mean()))

In [None]:
# partial() pre-binds the axis argument, giving a new function
# that always averages over the first axis
from functools import partial
mean_over_first_axis = partial(np.mean, axis=0)
column_means = mean_over_first_axis(a)
print("Mean over axis: {}".format(column_means))

In [None]:
# A percentile summarises the data: here the 20th percentile of 50 evenly spaced
# points between 0 and 200
print("Percentile q20: {}".format(np.percentile(np.linspace(0, 200, num=50), 20)))

In [None]:
# Pair np.percentile() with partial() to get one ready-made function per percentile,
# then try them on toy data
data = np.linspace(0, 100)
# One pre-configured percentile function for each requested q
percentile_funcs = list(map(lambda q: partial(np.percentile, q=q), [20, 40, 60]))
# Apply every function to the same data
percentiles = list(map(lambda func: func(data), percentile_funcs))
print("Percentiles 20, 40, and 60: {}".format(percentiles))

In [None]:
# Rolling 20-row window over the price table, with every percentile function applied
(
    preprocessed_prices
    .rolling(window=20)
    .aggregate(percentile_funcs)
    .dropna()
    .head()
)

#### Date and time features using Pandas

In [None]:
# Ensure our index is datetime, so the date accessors used in the next cells
# (.weekday, .day_name()) are available
preprocessed_prices.index = pd.to_datetime(preprocessed_prices.index)

In [None]:
# Day of the week as an integer for every row of the index
day_of_week_num = preprocessed_prices.index.dayofweek
print('Days of the week in numbers:', day_of_week_num[:10])

In [None]:
# Day of the week as a name for every row of the index; show the first ten
day_of_week = preprocessed_prices.index.day_name()
print('Days of the week in names:', day_of_week[:10])

# 3. Evaluation and Inspecting Time Series Models

## 3.1. Creating features from the past


In [None]:
# Build lagged copies of a series to use as features

# The AIG price series
rough_signal = preprocessed_prices['AIG']

# Lags of one to seven steps
shifts = list(range(1, 8))

# One column per lag; the rows at the start that have no history are filled with 0
many_shifts = pd.DataFrame(
    {'lag_{}'.format(ii): rough_signal.shift(ii) for ii in shifts}
).fillna(0)
many_shifts.head()

In [None]:
# Regress the series on its own lagged values
model = Ridge()
model.fit(X=many_shifts, y=rough_signal)

In [None]:
# Fitted weight of each lag feature, in the column order of many_shifts
model.coef_ 

In [None]:
# Visualize the fit model coefficients: lagged inputs on the left,
# one bar per lag coefficient on the right
fig, axs = plt.subplots(1, 2,figsize=(20, 10))

axs[0].plot(many_shifts)

axs[1].bar(many_shifts.columns, model.coef_)
axs[1].set(xlabel='Coefficient name', ylabel='Coefficient value',
           title='Model coefficients for each lag')

# Rotate the tick labels of the bar chart. The original passed `ax`, a leftover
# axis from an earlier cell, so the labels of THIS figure were never rotated.
plt.setp(axs[1].get_xticklabels(), rotation=45, horizontalalignment='right')

In [None]:
# Column labels available in the price table
list(preprocessed_prices.columns)

#### Applying to a smooth signal

In [None]:
# Build lagged copies of a second series

# The INTC price series
smooth_signal = preprocessed_prices['INTC']

# Lags of one to seven steps
shifts = list(range(1, 8))

# One column per lag; the rows at the start that have no history are filled with 0
many_shifts = pd.DataFrame(
    {'lag_{}'.format(ii): smooth_signal.shift(ii) for ii in shifts}
).fillna(0)
many_shifts.head()

In [None]:
# Plot every lagged column of the INTC frame on one figure
many_shifts.plot(figsize=(10,10))

In [None]:
# Regress the series on its own lagged values
model = Ridge()
model.fit(X=many_shifts, y=smooth_signal)

In [None]:
# Visualize the fit model coefficients: lagged inputs on the left,
# one bar per lag coefficient on the right
fig, axs = plt.subplots(1, 2,figsize=(20, 10))

axs[0].plot(many_shifts)

axs[1].bar(many_shifts.columns, model.coef_)
axs[1].set(xlabel='Coefficient name', ylabel='Coefficient value',
           title='Model coefficients for each lag')

# Rotate the tick labels of the bar chart. The original passed `ax`, a leftover
# axis from an earlier cell, so the labels of THIS figure were never rotated.
plt.setp(axs[1].get_xticklabels(), rotation=45, horizontalalignment='right')

## 3.2. Cross-validating time-series data


In [None]:
# Independent (deep) copy of the feature frame, used later for labels and plotting
X_df = X.copy(deep=True)

In [None]:
# First three rows of the copy
X_df.head(n=3)

In [None]:
# First three rows of the original feature frame, for comparison
X.head(n=3)

In [None]:
def visualize_predictions(results):
    """Draw two scatter figures summarising cross-validation predictions.

    Parameters
    ----------
    results : sequence of tuples
        One tuple per CV iteration. The FIRST element holds that iteration's
        predictions and the LAST element holds the row positions of its test set;
        anything in between (for example a score) is ignored, so both
        ``(prediction, score, rows_test)`` and ``(prediction, rows_test)`` work.

    The first figure plots predictions at their original row positions, one
    colour per iteration. The second plots them back to back in the order
    they were produced.
    """
    results = list(results)

    # Figure 1: predictions against the row positions they belong to
    plt.figure(figsize=(10, 10))
    for fold, result in enumerate(results):
        plt.plot(result[-1], result[0], 'o', label='iteration' + str(fold))
    if results:
        # Decorate once rather than on every loop iteration
        plt.legend()
        plt.xlabel('time')
        plt.title('predicition order by time')

    # Figure 2: predictions laid end to end. A running offset is used because
    # folds are not guaranteed to have equal length; the original computed the
    # start as fold_number * len(current fold), which misplaces unequal folds.
    plt.figure(figsize=(10, 10))
    offset = 0
    for result in results:
        n_predictions = len(result[0])
        plt.plot(np.arange(offset, offset + n_predictions), result[0], 'o')
        offset += n_predictions
    if results:
        plt.xlabel('time')
        plt.title('Prediction order by test prediction number')

In [None]:
# Cross-validate with randomly shuffled splits
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=10, random_state=1)

# For each split: fit on the training rows, predict and score the test rows
results = []
for rows_train, rows_test in cv.split(X, y):
    # fit() returns the estimator, so the prediction can be chained onto it
    prediction = (
        model
        .fit(X.iloc[rows_train], y.iloc[rows_train])
        .predict(X.iloc[rows_test])
    )
    score = r2_score(y.iloc[rows_test], prediction)
    results.append((prediction, score, rows_test))

# Plot the collected predictions
visualize_predictions(results)

In [None]:
def visualize_predictions(results):
    """Draw two scatter figures summarising cross-validation predictions.

    Parameters
    ----------
    results : sequence of tuples
        One tuple per CV iteration. The FIRST element holds that iteration's
        predictions and the LAST element holds the row positions of its test set;
        anything in between (for example a score) is ignored, so both
        ``(prediction, rows_test)`` and ``(prediction, score, rows_test)`` work.

    The first figure plots predictions at their original row positions, one
    colour per iteration. The second plots them back to back in the order
    they were produced.
    """
    results = list(results)

    # Figure 1: predictions against the row positions they belong to
    plt.figure(figsize=(10, 10))
    for fold, result in enumerate(results):
        plt.plot(result[-1], result[0], 'o', label='iteration' + str(fold))
    if results:
        # Decorate once rather than on every loop iteration
        plt.legend()
        plt.xlabel('time')
        plt.title('predicition order by time')

    # Figure 2: predictions laid end to end. A running offset is used because
    # folds are not guaranteed to have equal length; the original computed the
    # start as fold_number * len(current fold), which misplaces unequal folds.
    plt.figure(figsize=(10, 10))
    offset = 0
    for result in results:
        n_predictions = len(result[0])
        plt.plot(np.arange(offset, offset + n_predictions), result[0], 'o')
        offset += n_predictions
    if results:
        plt.xlabel('time')
        plt.title('Prediction order by test prediction number')

In [None]:
# Cross-validate with contiguous, unshuffled folds
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=False)

# For each fold: fit on the training rows and predict the test rows
results = []
for rows_train, rows_test in cv.split(X, y):
    # fit() returns the estimator, so the prediction can be chained onto it
    prediction = (
        model
        .fit(X.iloc[rows_train], y.iloc[rows_train])
        .predict(X.iloc[rows_test])
    )
    results.append((prediction, rows_test))

# Plot the collected predictions
visualize_predictions(results)

In [None]:
# Show how TimeSeriesSplit grows the training set from one iteration to the next
from sklearn.model_selection import TimeSeriesSplit

cv = TimeSeriesSplit(n_splits=10)

fig, ax = plt.subplots()
for ii, (tr, tt) in enumerate(cv.split(X, y)):
    # Training rows of iteration ii, scaled down and offset vertically by ii
    # so that each iteration is drawn on its own level
    ax.plot(tr, ii + y.iloc[tr]/1000)

ax.set(
    title='Training data on each CV iteration',
    ylabel='CV iteration',
    xlabel='time',
)
plt.show()

## 3.3. Stationarity and stability

In [None]:
# Unshuffled K-fold again, this time also keeping each fold's fitted coefficients
from sklearn.model_selection import KFold
cv = KFold(n_splits=10, shuffle=False)
cv_coefficeints = []
results = []

for rows_train, rows_test in cv.split(X, y):
    # fit() returns the estimator, so the prediction can be chained onto it
    prediction = (
        model
        .fit(X.iloc[rows_train], y.iloc[rows_train])
        .predict(X.iloc[rows_test])
    )
    results.append((prediction, rows_test))
    # Coefficients of the model fitted on this fold
    cv_coefficeints.append(model.coef_)

# Plot the collected predictions
visualize_predictions(results)

In [None]:
# Size of the last axis of the stacked coefficients (one entry per model coefficient)
np.shape(cv_coefficeints)[-1]

In [None]:
# Bootstrapping the mean
from sklearn.utils import resample

# The last axis of cv_coefficeints indexes the model coefficients
n_coefficients = np.shape(cv_coefficeints)[-1]
n_boots = 100
bootstrap_means = np.zeros((n_boots, n_coefficients))

# One seeded generator shared by every draw: the intervals are then identical on
# each run, while successive draws still differ from one another. (Passing the same
# integer seed to every resample() call would repeat a single sample 100 times.)
bootstrap_rng = np.random.RandomState(1)

for ii in range(n_boots):
    # Draw the per-fold coefficients with replacement,
    # then take the sample mean
    random_sample = resample(cv_coefficeints, random_state=bootstrap_rng)
    bootstrap_means[ii] = np.mean(random_sample,axis=0)

# Lower and upper bounds of the central 95% of the bootstrapped means
percentiles = np.percentile(bootstrap_means, (2.5, 97.5), axis=0)

# Plot both bounds for each coefficient
fig, ax = plt.subplots(figsize=(10,5))
ax.scatter(X_df.columns, percentiles[0], marker='_', s=200)
ax.scatter(X_df.columns, percentiles[1], marker='_', s=200)
ax.set(title='95% confidence intervals for model coefficients')

### Assessing model performance stability

In [None]:
# score function will be the correlation between the predicted and the true values
def correlation_coefficient(est, X, y):
    """Pearson correlation between a model's predictions and the true targets.

    Uses the ``scorer(estimator, X, y)`` signature that scikit-learn accepts
    for a callable ``scoring`` argument.

    Parameters
    ----------
    est : fitted estimator
        Any object with a ``predict(X)`` method.
    X : array-like
        Validation features, passed straight to ``est.predict``.
    y : array-like
        True targets; 1-D, or a single-column 2-D array / DataFrame.

    Returns
    -------
    float
        Correlation coefficient between ``y`` and ``est.predict(X)``.
    """
    # Flatten both sides to 1-D so that corrcoef compares two VARIABLES. The original
    # hstack-ed two (n, 1) columns into an (n, 2) array, which corrcoef reads as n
    # variables with 2 observations each, so the returned entry was a meaningless
    # +/-1 or NaN; with 1-D inputs it failed outright.
    y_true = np.asarray(y, dtype=float).ravel()
    y_pred = np.asarray(est.predict(X), dtype=float).ravel()
    return np.corrcoef(y_true, y_pred)[1, 0]

In [None]:
# Model performance over time
# Many small chronological splits, scored with a fresh ridge model
cv = TimeSeriesSplit(n_splits=100)
model = Ridge()

# Date on which each test set starts (used to index the scores)
first_indices = [X_df.index[test_rows[0]] for _, test_rows in cv.split(X, y)]

# One correlation score per split, stored in a DataFrame indexed by those dates
cv_scores = pd.DataFrame(
    cross_val_score(model, X, y, cv=cv, scoring=correlation_coefficient),
    index=first_indices,
)

# Two stacked panels: smoothed scores on top, raw features underneath
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(20, 20), sharex=False)

# Rolling mean over up to 10 consecutive scores
cv_scores_mean = cv_scores.rolling(10, min_periods=1).mean()
cv_scores_mean.plot(ax=axs[0])
axs[0].set(title='Validation scores (correlation)', ylim=[0, 1])

# The raw feature series
X_df.plot(ax=axs[1])
axs[1].set(title='Validation data')

In [None]:
# Cap every training set at the most recent 100 data points
window = 100

# Chronological splits with the training size limited to that window
cv = TimeSeriesSplit(n_splits=10, max_train_size=window)

# Date on which each test set starts (used to index the scores)
first_indices = [X_df.index[test_rows[0]] for _, test_rows in cv.split(X, y)]

# One correlation score per split, stored in a DataFrame indexed by those dates
model = Ridge()
cv_scores = pd.DataFrame(
    cross_val_score(model, X, y, cv=cv, scoring=correlation_coefficient),
    index=first_indices,
)

# Two stacked panels: smoothed scores on top, raw features underneath
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(20, 20), sharex=False)

# Rolling mean over up to 10 consecutive scores
cv_scores_mean = cv_scores.rolling(10, min_periods=1).mean()
cv_scores_mean.plot(ax=axs[0])
axs[0].set(title='Test scores (correlation)', ylim=[0, 1])

# The raw feature series
X_df.plot(ax=axs[1])
axs[1].set(title='Test data')