# Backtest of Model-Based Strategies | Price Lags
---

__Authors:__ Emily Bertani, Max Acheson, Josh Mischung  
__Data Source:__ Yahoo Finance accessed by `pandas-datareader` and `yfinance`  
__Strategy:__ Vectorized Price Lags  
__Instrument(s):__ SPY  

__Intended Use:__  
The primary objective of this notebook is to determine the optimal ML model(s) for a strategy based on lagged percent changes. The model(s) chosen at the end of the notebook can then be held constant, along with the strategy, allowing for further exploration of feature engineering and hyperparameter tuning.

<br>

## Imports & Functions
---
__Imports__

In [None]:
# Data Objects
import numpy as np
import pandas as pd
from datetime import datetime as dt, timedelta as td

# Plotting
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

# Data Collection
import pandas_datareader.data as pdr
import yfinance as yf

# Preprocessing and Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis as LDA,
    QuadraticDiscriminantAnalysis as QDA
)
from sklearn.svm import LinearSVC, SVC

# Evaluation
from sklearn.metrics import classification_report

# Settings
yf.pdr_override()


<br>

__Functions__

In [None]:
def create_lagged_series(symbol, start_date, end_date, lags=5):
    """Build a DataFrame of daily percentage returns and lagged returns.

    Prices for ``symbol`` are downloaded from Yahoo Finance through
    ``pdr.get_data_yahoo``. The download starts 365 calendar days before
    ``start_date`` so the lag columns have prior history to draw on; rows
    dated before ``start_date`` are dropped from the result.

    Parameters
    ----------
    symbol : str
        The ticker symbol to download from Yahoo Finance.
    start_date : datetime.datetime or datetime.date
        First date to keep in the returned frame.
    end_date : datetime.datetime or datetime.date
        Passed (formatted as ``YYYY-MM-DD``) as ``end`` to the download call.
    lags : int, optional
        Number of lagged-return columns to create (default 5).

    Returns
    -------
    pd.DataFrame
        Indexed by ``datetime.date`` with columns, in order:
        ``volume``, ``today``, ``lag_1`` ... ``lag_<lags>``, ``direction``.

        * ``today`` is the percent change of the adjusted close, with
          magnitudes below 0.0001 pushed out to +/-0.0001.
        * ``lag_k`` is the unmodified percent change from ``k`` rows earlier.
        * ``direction`` is ``np.sign`` of ``today`` (+1.0 or -1.0).
    """
    # Returns closer to zero than this are pushed out to +/- this value so
    # that np.sign never yields a 0 "direction" class (a near-empty third
    # class causes issues with the QDA model in scikit-learn).
    min_abs_return = 0.0001

    # Obtain stock pricing, including one extra year of history before
    # start_date so the lagged columns are not computed from missing rows.
    lookback_start = start_date - td(days=365)
    ts = pdr.get_data_yahoo(
        symbol,
        start=lookback_start.strftime('%Y-%m-%d'),
        end=end_date.strftime('%Y-%m-%d')
    )
    ts.index = ts.index.date

    # Daily percentage return of the adjusted close; every lag column is
    # this same series shifted down by the lag length.
    raw_returns = ts['Adj Close'].pct_change()

    # Clamp tiny returns on a copy so the lag columns keep the raw values.
    # The sign is preserved: a very small loss must remain a "down" day
    # rather than being relabelled as +0.0001. Exact zeros become +0.0001.
    # NaN rows fail the comparison and are left untouched.
    today = raw_returns.copy()
    tiny = today.abs() < min_abs_return
    today[tiny] = np.where(
        today[tiny] < 0, -min_abs_return, min_abs_return
    )

    # Assemble the output in the column order: volume, today, lags, direction
    tsret = pd.DataFrame(index=ts.index)
    tsret['volume'] = ts['Volume']
    tsret['today'] = today
    for lag in range(1, lags + 1):
        tsret[f"lag_{lag}"] = raw_returns.shift(lag)

    # "Direction" column (+1 or -1) indicating an up/down day
    tsret['direction'] = np.sign(tsret['today'])

    # The index holds datetime.date objects, so compare against a date.
    # Accept either a datetime (which has .date()) or a plain date.
    first_day = start_date.date() if isinstance(start_date, dt) else start_date
    return tsret[tsret.index >= first_day]
