Commit

Merge pull request #5 from hudson-and-thames/ml_based_pairs_selection
ML based Pairs Selection
PanPip committed Dec 9, 2020
2 parents 17c1f23 + 8a98d2e commit 87b2f79
Showing 26 changed files with 2,842 additions and 0 deletions.
5 changes: 5 additions & 0 deletions arbitragelab/ml_approach/__init__.py
@@ -0,0 +1,5 @@
"""
This module houses the ML Based Approaches.
"""

from arbitragelab.ml_approach.pairs_selector import PairsSelector
706 changes: 706 additions & 0 deletions arbitragelab/ml_approach/pairs_selector.py

Large diffs are not rendered by default.

157 changes: 157 additions & 0 deletions arbitragelab/ml_approach/stat_arb_utils.py
@@ -0,0 +1,157 @@
# Copyright 2019, Hudson and Thames Quantitative Research
# All rights reserved
# Read more: https://hudson-and-thames-arbitragelab.readthedocs-hosted.com/en/latest/additional_information/license.html
"""
This module houses utility functions used by the PairsSelector.
"""

import sys
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.adfvalues import mackinnonp
from statsmodels.tsa.stattools import adfuller
from scipy.odr import ODR, Model, RealData


def _print_progress(iteration, max_iterations, prefix='', suffix='', decimals=1, bar_length=50):
    """
    Call in a loop to create a terminal progress bar.
    https://gist.github.com/aubricus/f91fb55dc6ba5557fbab06119420dd6a

    :param iteration: (int) Current iteration.
    :param max_iterations: (int) Maximum number of iterations.
    :param prefix: (str) Prefix string.
    :param suffix: (str) Suffix string.
    :param decimals: (int) Positive number of decimals in percent completed.
    :param bar_length: (int) Character length of the bar.
    """
str_format = "{0:." + str(decimals) + "f}"
# Calculate the percent completed.
percents = str_format.format(100 * (iteration / float(max_iterations)))
# Calculate the length of bar.
filled_length = int(round(bar_length * iteration / float(max_iterations)))
# Fill the bar.
block = '█' * filled_length + '-' * (bar_length - filled_length)
    # Print the bar, using a carriage return to overwrite the previous one.
    sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, block, percents, '%', suffix))

if iteration == max_iterations:
sys.stdout.write('\n')
sys.stdout.flush()
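A minimal driver sketch for the helper above; illustrative only, since the library calls this private function from inside its own processing loops:

from time import sleep

from arbitragelab.ml_approach.stat_arb_utils import _print_progress

for i in range(1, 101):
    sleep(0.01)  # Stand-in for real work.
    _print_progress(i, 100, prefix='Progress:', suffix='Complete')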


def _outer_ou_loop(spreads_df: pd.DataFrame, test_period: str,
cross_overs_per_delta: int, molecule: list) -> pd.DataFrame:
# pylint: disable=too-many-locals
"""
This function gets mean reversion calculations (half-life and number of
mean cross overs) for each pair in the molecule. Uses the linear regression
method to get the half-life, which is much lighter computationally wise
compared to the version using the OrnsteinUhlenbeck class.
Note that when mean reversion is expected, lambda / StdErr has a negative value.
This result implies that the expected duration of mean reversion lambda is
inversely proportional to the absolute value of lambda.
:param spreads_df: (pd.DataFrame) Spreads Universe.
:param test_period: (str) Time delta format, to be used as the time
period where the mean crossovers will be calculated.
:param cross_overs_per_delta: (int) Crossovers per time delta selected.
:param molecule: (list) Indices of pairs.
:return: (pd.DataFrame) Mean Reversion statistics.
"""

ou_results = []

for iteration, pair in enumerate(molecule):

spread = spreads_df.loc[:, str(pair)]
        lagged_spread = spread.shift(1).dropna()

# Setup regression parameters.
lagged_spread_c = sm.add_constant(lagged_spread)
delta_y_t = np.diff(spread)

model = sm.OLS(delta_y_t, lagged_spread_c)
res = model.fit()

        # Split the spread into two periods. The training data is used to
        # extract the long-term mean of the spread. Then the mean is used
        # to find the number of crossovers in the test period.
        test_df = spread.last(test_period)
        train_df = spread.iloc[: -len(test_df)]

long_term_mean = np.mean(train_df)

centered_series = test_df - long_term_mean

        # Center the spread at zero and classify each value by its sign.
        # The crossover indices refer to the test period, so the dates are
        # taken from the test set's index.
        cross_over_indices = np.where(np.diff(np.sign(centered_series)))[0]
        cross_overs_dates = test_df.index[cross_over_indices]

        # Resample the mean crossovers series to a yearly index and count
        # the occurrences in each year.
cross_overs_counts = cross_overs_dates.to_frame().resample('Y').count()
cross_overs_counts.columns = ['counts']

        # Check whether the number of crossovers is in accordance with the given
        # selection criteria.
cross_overs = len(cross_overs_counts[cross_overs_counts['counts'] > cross_overs_per_delta]) > 0

        # Append the half-life and the number of crossovers. The slope on the
        # lagged spread is the second fitted parameter, after the constant.
        ou_results.append([np.log(2) / abs(res.params[1]), cross_overs])

_print_progress(iteration + 1, len(molecule), prefix='Outer OU Loop Progress:',
suffix='Complete')

return pd.DataFrame(ou_results, index=molecule, columns=['hl', 'crossovers'])
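For orientation, a minimal standalone sketch of the half-life regression used above, applied to a simulated AR(1) spread (a sketch only; it assumes nothing beyond numpy, pandas and statsmodels, and the phi value is illustrative):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(42)
values = [0.0]
for _ in range(499):
    values.append(0.5 * values[-1] + rng.normal())  # AR(1) with phi = 0.5.
spread = pd.Series(values, name='spread')

# Regress the first difference of the spread on its lagged level.
lagged_spread = sm.add_constant(spread.shift(1).dropna())
res = sm.OLS(np.diff(spread), lagged_spread).fit()

# The slope is roughly phi - 1 = -0.5, so the half-life is about
# ln(2) / 0.5, i.e. 1.4 periods.
half_life = np.log(2) / abs(res.params['spread'])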

def _linear_f(beta: np.ndarray, x_variable: np.ndarray) -> np.ndarray:
    """
    This is the helper linear model to be used in the Orthogonal Regression.

    :param beta: (np.ndarray) Model beta coefficients (slope and intercept).
    :param x_variable: (np.ndarray) Model X vector.
    :return: (np.ndarray) Vector result of the equation calculation.
    """

    return beta[0] * x_variable + beta[1]

def _outer_cointegration_loop(prices_df: pd.DataFrame, molecule: list) -> pd.DataFrame:
"""
    This function calculates the Engle-Granger test for each pair in the molecule.
    It uses the Total Least Squares approach to take the variance of both price
    series into consideration.
:param prices_df: (pd.DataFrame) Price Universe.
:param molecule: (list) Indices of pairs.
:return: (pd.DataFrame) Cointegration statistics.
"""

cointegration_results = []

for iteration, pair in enumerate(molecule):
maxlag = None
autolag = "aic"
trend = "c"

linear = Model(_linear_f)
mydata = RealData(prices_df.loc[:, pair[0]], prices_df.loc[:, pair[1]])
myodr = ODR(mydata, linear, beta0=[1., 2.])
res_co = myodr.run()

res_adf = adfuller(res_co.delta - res_co.eps, maxlag=maxlag,
autolag=autolag, regression="nc")

pval_asy = mackinnonp(res_adf[0], regression=trend)

cointegration_results.append((res_adf[0], pval_asy,
res_co.beta[0], res_co.beta[1]))

_print_progress(iteration + 1, len(molecule), prefix='Outer Cointegration Loop Progress:',
suffix='Complete')

return pd.DataFrame(cointegration_results,
index=molecule,
columns=['coint_t', 'pvalue', 'hedge_ratio', 'constant'])
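For orientation, a minimal sketch of the orthogonal-regression cointegration test above, run on two simulated cointegrated series. This is a simplification under stated assumptions: the column names are hypothetical, and the ADF test here is applied to the plain fitted-line spread rather than to the delta/eps residuals used above:

import numpy as np
import pandas as pd
from scipy.odr import ODR, Model, RealData
from statsmodels.tsa.stattools import adfuller

rng = np.random.default_rng(0)
common = np.cumsum(rng.normal(size=1000))  # Shared random walk.
prices = pd.DataFrame({'leg_x': common + rng.normal(size=1000),
                       'leg_y': 2.0 * common + rng.normal(size=1000)})

# Orthogonal (total least squares) regression of leg_y on leg_x.
model = Model(lambda beta, x: beta[0] * x + beta[1])
odr = ODR(RealData(prices['leg_x'], prices['leg_y']), model, beta0=[1., 2.])
fit = odr.run()

# A strongly negative ADF statistic on the spread suggests cointegration.
spread = prices['leg_y'] - fit.beta[0] * prices['leg_x'] - fit.beta[1]
adf_stat = adfuller(spread)[0]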
1 change: 1 addition & 0 deletions arbitragelab/network/validation.py
@@ -41,6 +41,7 @@ def __check_api_key():
@staticmethod
def __import_libraries():
# pylint: disable=import-outside-toplevel, unused-import
import arbitragelab.ml_approach as ml_approach
import arbitragelab.codependence as codependence
import arbitragelab.cointegration_approach as cointegration_approach
import arbitragelab.distance_approach as distance_approach
2 changes: 2 additions & 0 deletions arbitragelab/util/__init__.py
@@ -2,4 +2,6 @@
Utility functions.
"""

from arbitragelab.util.data_importer import DataImporter
from arbitragelab.util.indexed_highlight import IndexedHighlight
from arbitragelab.util.generate_dataset import get_classification_data
146 changes: 146 additions & 0 deletions arbitragelab/util/data_importer.py
@@ -0,0 +1,146 @@
# Copyright 2019, Hudson and Thames Quantitative Research
# All rights reserved
# Read more: https://hudson-and-thames-arbitragelab.readthedocs-hosted.com/en/latest/additional_information/license.html
"""
This module is a data helper that wraps various Yahoo Finance libraries.
"""

import pandas as pd
import yfinance as yf
import yahoo_fin.stock_info as ys

class DataImporter:
"""
Wrapper class that imports data from yfinance and yahoo_fin.
    This class allows for fast pulling/mangling of the information needed
    for the research process, including ticker groups of various indexes,
    relevant pricing data, and the processing of said data.
"""

@staticmethod
def get_sp500_tickers() -> list:
"""
Gets all S&P 500 stock tickers.
:return: (list) List of tickers.
"""

tickers_sp500 = ys.tickers_sp500()

return tickers_sp500

@staticmethod
def get_dow_tickers() -> list:
"""
Gets all DOW stock tickers.
:return: (list) List of tickers.
"""

tickers_dow = ys.tickers_dow()

return tickers_dow

@staticmethod
def remove_nuns(dataframe: pd.DataFrame, threshold: int = 100) -> pd.DataFrame:
"""
        Remove tickers with a number of null values above a threshold.

        :param dataframe: (pd.DataFrame) Asset price data.
        :param threshold: (int) The maximum number of null values allowed.
        :return: (pd.DataFrame) Price data for the tickers under the threshold.
"""

null_sum_each_ticker = dataframe.isnull().sum()
tickers_passing = null_sum_each_ticker[null_sum_each_ticker <= threshold]
tickers_under_threshold = tickers_passing.index
dataframe = dataframe[tickers_under_threshold]

return dataframe

@staticmethod
def get_price_data(tickers: list, start_date: str, end_date: str,
interval: str = '5m') -> pd.DataFrame:
"""
        Get price data for custom start and end dates and a given interval.
        Only the closing price is kept.
:param tickers: (list) List of tickers to download.
:param start_date: (str) Download start date string (YYYY-MM-DD).
:param end_date: (str) Download end date string (YYYY-MM-DD).
:param interval: (str) Valid intervals: [1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo].
:return: (pd.DataFrame) The requested price_data.
"""

price_data = yf.download(tickers, start=start_date, end=end_date,
interval=interval, group_by='column')['Close']

return price_data

@staticmethod
def get_returns_data(price_data: pd.DataFrame) -> pd.DataFrame:
"""
        Convert the given price data to percentage returns.
:param price_data: (pd.DataFrame) Asset price data.
:return: (pd.DataFrame) Price Data converted to returns.
"""

returns_data = price_data.pct_change()
returns_data = returns_data.iloc[1:]

return returns_data

def get_ticker_sector_info(self, tickers: list, yf_call_chunk: int = 20) -> pd.DataFrame:
"""
        This method loops through all the tickers, makes a ticker-info request
        through the yfinance library, and retrieves the 'sector' and 'industry'
        fields for each one.

        The yfinance 'Tickers' object limits the number of tickers that can be
        supplied in one string argument. To work around this, the supplied
        ticker list is broken into small chunks that are fed sequentially to
        the helper function.

        :param tickers: (list) List of asset symbols.
        :param yf_call_chunk: (int) Ticker values allowed per 'Tickers'
            object. This should always be less than 200.
        :return: (pd.DataFrame) DataFrame with input asset tickers and their
            respective sector and industry information.
"""

ticker_sector_queue = []

# For each chunk of size 'yf_call_chunk'.
for i in range(0, len(tickers), yf_call_chunk):

            # Clamp the chunk end to the length of the ticker list so the
            # last, possibly shorter, chunk is handled correctly.
            end = min(i + yf_call_chunk, len(tickers))

ticker_sector_queue.append(self._sector_info_helper(tickers[i: end]))

return pd.concat(ticker_sector_queue, axis=0).reset_index(drop=True)

@staticmethod
def _sector_info_helper(tickers: list) -> pd.DataFrame:
"""
Helper method to supply chunked sector info to the main method.
:param tickers: (list) List of asset symbols.
:return: (pd.DataFrame) DataFrame with input asset tickers and their respective sector
and industry information.
"""

tckrs = yf.Tickers(' '.join(tickers))

tckr_info = []

for i, tckr in enumerate(tickers):
ticker_info = tckrs.tickers[i].info
tckr_tuple = (tckr, ticker_info['industry'], ticker_info['sector'])
tckr_info.append(tckr_tuple)

return pd.DataFrame(data=tckr_info, columns=['ticker', 'industry', 'sector'])
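A short usage sketch of the importer above, exercising the price, returns, and chunked sector lookups (a sketch only; network access and current Yahoo Finance availability are assumed, and the dates are illustrative):

from arbitragelab.util import DataImporter

importer = DataImporter()

tickers = DataImporter.get_dow_tickers()
prices = DataImporter.get_price_data(tickers, '2019-01-01', '2020-01-01',
                                     interval='1d')
prices = DataImporter.remove_nuns(prices)  # Drop tickers with too many nulls.
returns = DataImporter.get_returns_data(prices)
sectors = importer.get_ticker_sector_info(tickers[:5])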
43 changes: 43 additions & 0 deletions arbitragelab/util/indexed_highlight.py
@@ -0,0 +1,43 @@
# Copyright 2019, Hudson and Thames Quantitative Research
# All rights reserved
# Read more: https://hudson-and-thames-arbitragelab.readthedocs-hosted.com/en/latest/additional_information/license.html
"""
This module houses the IndexedHighlight class, an extension of HighlightingDataCursor
that supports cluster-by-cluster highlighting.
"""

import matplotlib.pyplot as plt
from mpldatacursor import DataCursor, HighlightingDataCursor

class IndexedHighlight(HighlightingDataCursor):
"""
This class extends HighlightingDataCursor to add support for
highlighting of cluster groups.
"""

def __init__(self, axes, **kwargs):
"""
        Initializes a highlight for each of the supplied plot artists.
"""
artists = axes

kwargs['display'] = 'single'
HighlightingDataCursor.__init__(self, artists, **kwargs)
self.highlights = [self.create_highlight(artist) for artist in artists]
plt.setp(self.highlights, visible=False)

def update(self, event, annotation):
"""
        On each pick event, this method loops through all artists and
        highlights the group of points corresponding to the currently
        selected artist.
"""

# Hide all other annotations
plt.setp(self.highlights, visible=False)

for i, artst in enumerate(self.artists):
if event.artist is artst:
self.highlights[i].set(visible=True)

DataCursor.update(self, event, annotation)
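A minimal sketch of how the class above might be wired up, assuming mpldatacursor is installed; the three synthetic clusters stand in for the pair clusters produced by the selector:

import matplotlib.pyplot as plt
import numpy as np

from arbitragelab.util import IndexedHighlight

rng = np.random.default_rng(1)
fig, axis = plt.subplots()

# One artist per cluster, so each cluster highlights as a group on click.
artists = [axis.plot(rng.normal(loc=3 * k, size=30),
                     rng.normal(loc=3 * k, size=30), '.')[0]
           for k in range(3)]

IndexedHighlight(artists)
plt.show()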
2 changes: 2 additions & 0 deletions docs/source/changelog.rst
@@ -31,3 +31,5 @@ Changelog
* :support:`17` Added Licence, ReadMe, and RoadMap
* :support:`20` Added API Key Validation.
* :support:`20` Add install documentation and test on OS/Ubuntu/Windows.
* :feature:`5` ML Based Pairs Selection (Horta, 2020) and Data Importer added.
* :support:`5` ML Based Pairs Selection and Data Importer documentation.
