In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
%matplotlib inline

# Regression Analysis: Seasonal Effects with Sklearn Linear Regression
In this notebook, you will build a scikit-learn linear regression model to predict Yen futures ("Settle") returns from *lagged* Yen futures returns.

In [2]:
# Futures contract on the Yen-dollar exchange rate:
# This is the continuous chain of the futures contracts that are 1 month to expiration.
# The path is relative to the notebook's working directory; read_csv raises
# FileNotFoundError when ../Resources/yen.csv is not present there.
# `infer_datetime_format` is intentionally omitted: it is deprecated in pandas >= 2.0
# (it only emits a warning there) and is not needed for parse_dates to build a DatetimeIndex.
yen_futures = pd.read_csv(
    Path("../Resources/yen.csv"), index_col="Date", parse_dates=True
)
yen_futures.head()

FileNotFoundError: [Errno 2] No such file or directory: '../Resources/yen.csv'

In [None]:
# Trim the dataset to begin on January 1st, 1990.
# .copy() detaches the trimmed frame from the full history so that columns
# added to it afterwards do not raise SettingWithCopyWarning.
yen_futures = yen_futures.loc["1990-01-01":, :].copy()
yen_futures.head()

# Data Preparation

### Returns

In [None]:
# Add a "Return" column holding the percentage change of the "Settle" price
# (pct_change() scaled by 100), then drop rows with missing values.
# Both +inf and -inf are converted to NaN first so that they are removed by
# dropna() as well; previously only -inf was handled.
# assign() builds a new frame instead of writing a column into a possible slice.
# NOTE: dropna() discards any row with a NaN in *any* column, and the first row
# always has a NaN return, so re-running this cell drops one more leading row.
yen_futures = (
    yen_futures
    .assign(Return=yen_futures['Settle'].pct_change() * 100)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
yen_futures.head()

### Lagged Returns 

In [None]:
# Add the previous row's return as "Lagged Return" (shift by one period) and
# discard the rows left with missing values by the shift.
yen_futures = yen_futures.assign(
    **{'Lagged Return': yen_futures['Return'].shift(periods=1)}
).dropna()
yen_futures.head()

### Train Test Split

In [None]:
# Chronological split: everything through the end of 2017 is used for training,
# everything from the start of 2018 onwards for testing.
train = yen_futures.loc[:'2017']
test = yen_futures.loc['2018':]

In [None]:
# Build the feature and target sets for the training and test periods:
#   X_train / X_test - the independent variable, i.e. the lagged return only
#   y_train / y_test - the dependent variable, i.e. the "Return" column
# The features were previously every column *except* the two return columns,
# which excluded the lagged return and fed same-day columns such as "Settle"
# to the model. Double brackets keep X as a 2-D DataFrame, as scikit-learn expects.
X_train = train[['Lagged Return']]
X_test = test[['Lagged Return']]
y_train = train['Return']
y_test = test['Return']

In [None]:
# Preview the first rows of the training features (consistent with the other
# preview cells) instead of rendering the whole frame.
X_train.head()

# Linear Regression Model

In [None]:
# Build a scikit-learn linear regression and estimate it on the training period.
from sklearn.linear_model import LinearRegression

# Instantiate with default settings, then fit on (X_train, y_train) only;
# the value returned by fit() is shown as the cell output.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

# Make predictions using the Testing Data

Note: We want to evaluate the model using data that it has never seen before, in this case: X_test.

In [None]:
# Predict the target for the held-out test features with the fitted model.
pred_y = model.predict(X=X_test)

In [None]:
# Put the observed test returns and the model's predictions side by side,
# indexed like y_test, in columns "Actual" and "Predicted".
actual = y_test.to_frame(name='Actual').assign(Predicted=pred_y)
actual.head()

In [None]:
# Chart the first 20 test observations (actual vs. predicted) and write the
# figure to disk.
first_twenty = actual.head(20)
returns_ax = first_twenty.plot(figsize=(15, 8), title='Yen Futures Returns')
returns_ax.get_figure().savefig('../Resources/returns.png')

# Out-of-Sample Performance

Evaluate the model using "out-of-sample" data (X_test and y_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Out-of-sample error: mean squared error between the observed test returns
# and the predictions, plus its square root (RMSE).
mse = mean_squared_error(y_true=y_test, y_pred=pred_y)
rmse = np.sqrt(mse)

# Report both metrics, one per line.
print(f'MSE: {mse}', f'RMSE: {rmse}', sep='\n')

# In-Sample Performance

Evaluate the model using in-sample data (X_train and y_train)

In [None]:
# Predict on the same data the model was fitted to and pair the predictions
# with the observed training returns.
in_pred_y = model.predict(X_train)
in_sample = pd.DataFrame({'Training Y': y_train, 'In-Sample': in_pred_y})

# Chart the first 20 training observations and write the figure to disk.
in_sample_ax = in_sample.head(20).plot(
    figsize=(15, 8),
    title='In-Sample Performance',
)
in_sample_ax.get_figure().savefig('../Resources/in_sample.png')

# In-sample MSE and RMSE, for comparison with the out-of-sample values.
in_mse = mean_squared_error(y_true=y_train, y_pred=in_pred_y)
in_rmse = np.sqrt(in_mse)

# Print out-of-sample and in-sample metrics side by side.
print(f'MSE: {mse}\tIn-Sample MSE: {in_mse}')
print(f'RMSE: {rmse}\tIn-Sample RMSE: {in_rmse}')


In [None]:
# Summary of the error metrics: one column per evaluation set, one labelled
# row per metric so the values can be read without knowing the row order.
table = pd.DataFrame(
    {
        'Out-of-Sample': [mse, rmse],
        'In-Sample': [in_mse, in_rmse],
    },
    index=['MSE', 'RMSE'],
)
table

# Conclusions

YOUR CONCLUSIONS HERE!