-
Notifications
You must be signed in to change notification settings - Fork 139
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from hudson-and-thames/ml_based_pairs_selection
ML based Pairs Selection
- Loading branch information
Showing
26 changed files
with
2,842 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
""" | ||
This module houses the ML Based Approaches. | ||
""" | ||
|
||
from arbitragelab.ml_approach.pairs_selector import PairsSelector |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
# Copyright 2019, Hudson and Thames Quantitative Research | ||
# All rights reserved | ||
# Read more: https://hudson-and-thames-arbitragelab.readthedocs-hosted.com/en/latest/additional_information/license.html | ||
""" | ||
This module houses utility functions used by the PairsSelector. | ||
""" | ||
|
||
import sys | ||
import pandas as pd | ||
import numpy as np | ||
import statsmodels.api as sm | ||
from statsmodels.tsa.adfvalues import mackinnonp | ||
from statsmodels.tsa.stattools import adfuller | ||
from scipy.odr import ODR, Model, RealData | ||
|
||
|
||
def _print_progress(iteration, max_iterations, prefix='', suffix='', decimals=1, bar_length=50): | ||
# pylint: disable=expression-not-assigned | ||
""" | ||
Calls in a loop to create a terminal progress bar. | ||
https://gist.github.com/aubricus/f91fb55dc6ba5557fbab06119420dd6a | ||
:param iteration: (int) Current iteration. | ||
:param max_iterations: (int) Maximum number of iterations. | ||
:param prefix: (str) Prefix string. | ||
:param suffix: (str) Suffix string. | ||
:param decimals: (int) Positive number of decimals in percent completed. | ||
:param bar_length: (int) Character length of the bar. | ||
""" | ||
str_format = "{0:." + str(decimals) + "f}" | ||
# Calculate the percent completed. | ||
percents = str_format.format(100 * (iteration / float(max_iterations))) | ||
# Calculate the length of bar. | ||
filled_length = int(round(bar_length * iteration / float(max_iterations))) | ||
# Fill the bar. | ||
block = '█' * filled_length + '-' * (bar_length - filled_length) | ||
# Print new line. | ||
sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, block, percents, '%', suffix)), | ||
|
||
if iteration == max_iterations: | ||
sys.stdout.write('\n') | ||
sys.stdout.flush() | ||
|
||
|
||
def _outer_ou_loop(spreads_df: pd.DataFrame, test_period: str,
                   cross_overs_per_delta: int, molecule: list) -> pd.DataFrame:
    # pylint: disable=too-many-locals
    """
    This function gets mean reversion calculations (half-life and a mean
    crossover flag) for each pair in the molecule. Uses the linear regression
    method to get the half-life, which is much lighter computationally wise
    compared to the version using the OrnsteinUhlenbeck class.

    The regression is delta_y(t) = const + lambda * y(t-1). The half-life is
    reported as ln(2) / |lambda|, where lambda is the coefficient on the lagged
    spread (not the intercept). When mean reversion is expected lambda is
    negative, and the expected duration of mean reversion is inversely
    proportional to the absolute value of lambda.

    :param spreads_df: (pd.DataFrame) Spreads Universe. Columns are looked up by
        ``str(pair)`` and the index is expected to be a sorted DatetimeIndex.
    :param test_period: (str) Time delta format, to be used as the time
        period where the mean crossovers will be calculated.
    :param cross_overs_per_delta: (int) Crossovers per time delta selected.
    :param molecule: (list) Indices of pairs.
    :return: (pd.DataFrame) Mean Reversion statistics, indexed by ``molecule`` with
        columns 'hl' (half-life) and 'crossovers' (bool, True when at least one
        calendar year of the test period has more than ``cross_overs_per_delta``
        mean crossovers).
    """

    ou_results = []

    # Parsed once; the same offset is used to split every spread.
    test_offset = pd.tseries.frequencies.to_offset(test_period)

    for iteration, pair in enumerate(molecule):

        spread = spreads_df.loc[:, str(pair)]

        # 'axis' must not be passed positionally; recent pandas versions reject dropna(0).
        lagged_spread = spread.shift(1).dropna()

        # Setup regression parameters.
        lagged_spread_c = sm.add_constant(lagged_spread)
        delta_y_t = np.diff(spread)

        model = sm.OLS(delta_y_t, lagged_spread_c)
        res = model.fit()

        # add_constant places the constant first, so the coefficient on the
        # lagged spread is the last fitted parameter.
        mean_reversion_speed = np.asarray(res.params)[-1]

        # Split the spread in two periods. The training data is used to
        # extract the long term mean of the spread. Then the mean is used
        # to find the number of crossovers in the test period.
        # This reproduces the windowing of the deprecated Series.last(offset):
        # everything strictly after (last timestamp - offset) is test data.
        split_date = spread.index[-1] - test_offset
        split_position = spread.index.searchsorted(split_date, side='right')

        train_df = spread.iloc[:split_position]
        test_df = spread.iloc[split_position:]

        long_term_mean = train_df.mean()

        # Set the spread to a mean of zero and classify each value
        # based on its sign; a change of sign marks a crossover.
        centered_series = test_df - long_term_mean
        sign_changes = np.diff(np.sign(centered_series.to_numpy()))
        cross_over_indices = np.where(sign_changes)[0]

        # The positions are relative to the test window, so the dates have to be
        # read from the test window's own index rather than the full universe index.
        cross_overs_dates = centered_series.index[cross_over_indices]

        # Count the crossover occurrences in each calendar year.
        yearly_counts = pd.Series(cross_overs_dates.year).value_counts()

        # Check that the number of crossovers is in accordance with the given
        # selection criteria in at least one year.
        cross_overs = bool((yearly_counts > cross_overs_per_delta).any())

        # Append half-life and the crossover flag.
        ou_results.append([np.log(2) / abs(mean_reversion_speed), cross_overs])

        _print_progress(iteration + 1, len(molecule), prefix='Outer OU Loop Progress:',
                        suffix='Complete')

    return pd.DataFrame(ou_results, index=molecule, columns=['hl', 'crossovers'])
|
||
def _linear_f(beta: np.array, x_variable: np.array) -> np.array: | ||
""" | ||
This is the helper linear model that is going to be used in the Orthogonal Regression. | ||
:param beta: (np.array) Model beta coefficient. | ||
:param x_variable: (np.array) Model X vector. | ||
:return: (np.array) Vector result of equation calculation. | ||
""" | ||
|
||
return beta[0]*x_variable + beta[1] | ||
|
||
def _outer_cointegration_loop(prices_df: pd.DataFrame, molecule: list) -> pd.DataFrame:
    """
    Runs an Engle-Granger style test for every pair in the molecule. The hedge ratio is
    estimated with Total Least Squares (orthogonal distance regression) so that the
    variance of both price series is taken into consideration.

    :param prices_df: (pd.DataFrame) Price Universe.
    :param molecule: (list) Indices of pairs; each pair holds two column labels of ``prices_df``.
    :return: (pd.DataFrame) Cointegration statistics, indexed by ``molecule`` with columns
        'coint_t', 'pvalue', 'hedge_ratio' and 'constant'.
    """

    total_pairs = len(molecule)
    rows = []

    for position, pair in enumerate(molecule, start=1):
        first_leg = prices_df.loc[:, pair[0]]
        second_leg = prices_df.loc[:, pair[1]]

        # Orthogonal regression of the second leg on the first leg.
        linear = Model(_linear_f)
        odr_fit = ODR(RealData(first_leg, second_leg), linear, beta0=[1., 2.]).run()

        # ADF test without a constant on the difference of the fitted ODR errors.
        # NOTE(review): newer statsmodels releases appear to spell the no-constant
        # option "n" instead of "nc" - confirm against the pinned version.
        adf_statistic = adfuller(odr_fit.delta - odr_fit.eps, maxlag=None,
                                 autolag="aic", regression="nc")[0]

        # Asymptotic MacKinnon p-value for the test statistic.
        pval_asy = mackinnonp(adf_statistic, regression="c")

        rows.append((adf_statistic, pval_asy, odr_fit.beta[0], odr_fit.beta[1]))

        _print_progress(position, total_pairs, prefix='Outer Cointegration Loop Progress:',
                        suffix='Complete')

    return pd.DataFrame(rows,
                        index=molecule,
                        columns=['coint_t', 'pvalue', 'hedge_ratio', 'constant'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
# Copyright 2019, Hudson and Thames Quantitative Research | ||
# All rights reserved | ||
# Read more: https://hudson-and-thames-arbitragelab.readthedocs-hosted.com/en/latest/additional_information/license.html | ||
""" | ||
This module is a user data helper wrapping various yahoo finance libraries. | ||
""" | ||
|
||
import pandas as pd | ||
import yfinance as yf | ||
import yahoo_fin.stock_info as ys | ||
|
||
class DataImporter:
    """
    Wrapper class that imports data from yfinance and yahoo_fin.

    This class allows for fast pulling/mangling of information needed
    for the research process. These would include; ticker groups of
    various indexes, pulling of relevant pricing data and processing
    said data.
    """

    @staticmethod
    def get_sp500_tickers() -> list:
        """
        Gets all S&P 500 stock tickers.

        :return: (list) List of tickers.
        """

        return ys.tickers_sp500()

    @staticmethod
    def get_dow_tickers() -> list:
        """
        Gets all DOW stock tickers.

        :return: (list) List of tickers.
        """

        return ys.tickers_dow()

    @staticmethod
    def remove_nuns(dataframe: pd.DataFrame, threshold: int = 100) -> pd.DataFrame:
        """
        Remove tickers with more null values than a threshold.

        :param dataframe: (pd.DataFrame) Asset price data.
        :param threshold: (int) The number of null values allowed per ticker.
        :return: (pd.DataFrame) Price data restricted to the tickers whose null
            count does not exceed the threshold. Remaining nulls are not filled.
        """

        null_sum_each_ticker = dataframe.isnull().sum()
        tickers_under_threshold = null_sum_each_ticker[null_sum_each_ticker <= threshold].index

        return dataframe[tickers_under_threshold]

    @staticmethod
    def get_price_data(tickers: list, start_date: str, end_date: str,
                       interval: str = '5m') -> pd.DataFrame:
        """
        Get the price data with custom start and end date and interval.

        Only the 'Close' column of the downloaded data is kept.

        :param tickers: (list) List of tickers to download.
        :param start_date: (str) Download start date string (YYYY-MM-DD).
        :param end_date: (str) Download end date string (YYYY-MM-DD).
        :param interval: (str) Valid intervals: [1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo].
        :return: (pd.DataFrame) The requested price_data.
        """

        price_data = yf.download(tickers, start=start_date, end=end_date,
                                 interval=interval, group_by='column')['Close']

        return price_data

    @staticmethod
    def get_returns_data(price_data: pd.DataFrame) -> pd.DataFrame:
        """
        Convert price data to period-over-period percentage returns.

        :param price_data: (pd.DataFrame) Asset price data.
        :return: (pd.DataFrame) Price Data converted to returns. The first row is
            dropped because it has no previous price to compare against.
        """

        returns_data = price_data.pct_change()

        return returns_data.iloc[1:]

    def get_ticker_sector_info(self, tickers: list, yf_call_chunk: int = 20) -> pd.DataFrame:
        """
        This method will loop through all the tickers, using the yfinance library
        do a ticker info request and retrieve back 'sector' and 'industry' information.

        This method uses the yfinance 'Tickers' object which has a limit of the amount of
        tickers supplied as a string argument. To go around this, this method uses the
        chunking approach, where the supplied ticker list is broken down into small chunks
        and supplied sequentially to the helper function.

        :param tickers: (list) List of asset symbols.
        :param yf_call_chunk: (int) Ticker values allowed per 'Tickers'
            object. This should always be less than 200.
        :return: (pd.DataFrame) DataFrame with input asset tickers and their
            respective sector and industry information. An empty ticker list
            yields an empty DataFrame with the same columns.
        """

        # Slicing clamps at the end of the list, so the last (possibly shorter)
        # chunk needs no special handling.
        ticker_sector_queue = [
            self._sector_info_helper(tickers[start: start + yf_call_chunk])
            for start in range(0, len(tickers), yf_call_chunk)
        ]

        # pd.concat raises on an empty list, so return the empty result directly.
        if not ticker_sector_queue:
            return pd.DataFrame(columns=['ticker', 'industry', 'sector'])

        return pd.concat(ticker_sector_queue, axis=0).reset_index(drop=True)

    @staticmethod
    def _sector_info_helper(tickers: list) -> pd.DataFrame:
        """
        Helper method to supply chunked sector info to the main method.

        :param tickers: (list) List of asset symbols.
        :return: (pd.DataFrame) DataFrame with input asset tickers and their respective sector
            and industry information.
        """

        tckrs = yf.Tickers(' '.join(tickers))

        tckr_info = []

        # NOTE(review): 'tckrs.tickers' is indexed by position here - confirm that the
        # installed yfinance version exposes it as a sequence in input order.
        for i, tckr in enumerate(tickers):
            ticker_info = tckrs.tickers[i].info
            tckr_info.append((tckr, ticker_info['industry'], ticker_info['sector']))

        return pd.DataFrame(data=tckr_info, columns=['ticker', 'industry', 'sector'])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Copyright 2019, Hudson and Thames Quantitative Research | ||
# All rights reserved | ||
# Read more: https://hudson-and-thames-arbitragelab.readthedocs-hosted.com/en/latest/additional_information/license.html | ||
""" | ||
This module houses the extension HighlightingDataCursor class to support cluster | ||
by cluster highlighting. | ||
""" | ||
|
||
import matplotlib.pyplot as plt | ||
from mpldatacursor import DataCursor, HighlightingDataCursor | ||
|
||
class IndexedHighlight(HighlightingDataCursor):
    """
    HighlightingDataCursor extension that highlights a whole cluster group
    (one artist) at a time.
    """

    def __init__(self, axes, **kwargs):
        """
        Initializes the highlighting object for each AxesSubplot in a plot.

        :param axes: Artists handed to the parent cursor; one highlight is created for each.
        :param kwargs: Extra keyword arguments forwarded to HighlightingDataCursor.
            The 'display' option is always forced to 'single'.
        """

        kwargs['display'] = 'single'
        HighlightingDataCursor.__init__(self, axes, **kwargs)

        # One highlight per artist, all hidden until a selection happens.
        self.highlights = list(map(self.create_highlight, axes))
        plt.setp(self.highlights, visible=False)

    def update(self, event, annotation):
        """
        On each update event, every highlight is hidden and then the highlight
        belonging to the artist that triggered the event is shown.

        :param event: Selection event; its 'artist' attribute identifies the selected artist.
        :param annotation: Annotation object passed through to DataCursor.update.
        """

        # Hide all highlights before revealing the selected one.
        plt.setp(self.highlights, visible=False)

        for position, candidate in enumerate(self.artists):
            if candidate is event.artist:
                self.highlights[position].set(visible=True)

        # Delegates directly to DataCursor, bypassing the parent class' own update.
        DataCursor.update(self, event, annotation)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.