Commit

Merge pull request #5 from hudson-and-thames/ml_based_pairs_selection
ML based Pairs Selection
PanPip committed Dec 9, 2020
2 parents 17c1f23 + 8a98d2e commit 87b2f79
Showing 26 changed files with 2,842 additions and 0 deletions.
5 changes: 5 additions & 0 deletions arbitragelab/ml_approach/__init__.py
@@ -0,0 +1,5 @@
"""
This module houses the ML Based Approaches.
"""

from arbitragelab.ml_approach.pairs_selector import PairsSelector
706 changes: 706 additions & 0 deletions arbitragelab/ml_approach/pairs_selector.py

Large diffs are not rendered by default.

157 changes: 157 additions & 0 deletions arbitragelab/ml_approach/stat_arb_utils.py
@@ -0,0 +1,157 @@
# Copyright 2019, Hudson and Thames Quantitative Research
# All rights reserved
# Read more: https://hudson-and-thames-arbitragelab.readthedocs-hosted.com/en/latest/additional_information/license.html
"""
This module houses utility functions used by the PairsSelector.
"""

import sys
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.adfvalues import mackinnonp
from statsmodels.tsa.stattools import adfuller
from scipy.odr import ODR, Model, RealData


def _print_progress(iteration, max_iterations, prefix='', suffix='', decimals=1, bar_length=50):
    """
    Call in a loop to create a terminal progress bar.
    https://gist.github.com/aubricus/f91fb55dc6ba5557fbab06119420dd6a

    :param iteration: (int) Current iteration.
    :param max_iterations: (int) Maximum number of iterations.
    :param prefix: (str) Prefix string.
    :param suffix: (str) Suffix string.
    :param decimals: (int) Positive number of decimals in percent completed.
    :param bar_length: (int) Character length of the bar.
    """
str_format = "{0:." + str(decimals) + "f}"
# Calculate the percent completed.
percents = str_format.format(100 * (iteration / float(max_iterations)))
# Calculate the length of bar.
filled_length = int(round(bar_length * iteration / float(max_iterations)))
# Fill the bar.
block = '█' * filled_length + '-' * (bar_length - filled_length)
    # Print the bar, using a carriage return to overwrite the previous one.
    sys.stdout.write('\r%s |%s| %s%s %s' % (prefix, block, percents, '%', suffix))

if iteration == max_iterations:
sys.stdout.write('\n')
sys.stdout.flush()
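A minimal driver sketch for the helper above; illustrative only, since the library calls this private function from inside its own processing loops:

from time import sleep

from arbitragelab.ml_approach.stat_arb_utils import _print_progress

for i in range(1, 101):
    sleep(0.01)  # Stand-in for real work.
    _print_progress(i, 100, prefix='Progress:', suffix='Complete')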


def _outer_ou_loop(spreads_df: pd.DataFrame, test_period: str,
cross_overs_per_delta: int, molecule: list) -> pd.DataFrame:
# pylint: disable=too-many-locals
"""
This function gets mean reversion calculations (half-life and number of
mean cross overs) for each pair in the molecule. Uses the linear regression
method to get the half-life, which is much lighter computationally wise
compared to the version using the OrnsteinUhlenbeck class.
Note that when mean reversion is expected, lambda / StdErr has a negative value.
This result implies that the expected duration of mean reversion lambda is
inversely proportional to the absolute value of lambda.
:param spreads_df: (pd.DataFrame) Spreads Universe.
:param test_period: (str) Time delta format, to be used as the time
period where the mean crossovers will be calculated.
:param cross_overs_per_delta: (int) Crossovers per time delta selected.
:param molecule: (list) Indices of pairs.
:return: (pd.DataFrame) Mean Reversion statistics.
"""

ou_results = []

for iteration, pair in enumerate(molecule):

spread = spreads_df.loc[:, str(pair)]
        lagged_spread = spread.shift(1).dropna()

# Setup regression parameters.
lagged_spread_c = sm.add_constant(lagged_spread)
delta_y_t = np.diff(spread)

model = sm.OLS(delta_y_t, lagged_spread_c)
res = model.fit()

        # Split the spread into two periods. The training data is used to
        # extract the long-term mean of the spread. Then the mean is used
        # to find the number of crossovers in the test period.
        test_df = spread.last(test_period)
        train_df = spread.iloc[: -len(test_df)]

long_term_mean = np.mean(train_df)

centered_series = test_df - long_term_mean

        # Center the spread at zero and classify each value by its sign.
        # The crossover indices refer to the test period, so the dates are
        # taken from the test set's index.
        cross_over_indices = np.where(np.diff(np.sign(centered_series)))[0]
        cross_overs_dates = test_df.index[cross_over_indices]

        # Resample the mean crossovers series to a yearly index and count
        # the occurrences in each year.
cross_overs_counts = cross_overs_dates.to_frame().resample('Y').count()
cross_overs_counts.columns = ['counts']

        # Check whether the number of crossovers is in accordance with the given
        # selection criteria.
cross_overs = len(cross_overs_counts[cross_overs_counts['counts'] > cross_overs_per_delta]) > 0

        # Append the half-life and the number of crossovers. The slope on the
        # lagged spread is the second fitted parameter, after the constant.
        ou_results.append([np.log(2) / abs(res.params[1]), cross_overs])

_print_progress(iteration + 1, len(molecule), prefix='Outer OU Loop Progress:',
suffix='Complete')

return pd.DataFrame(ou_results, index=molecule, columns=['hl', 'crossovers'])
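For orientation, a minimal standalone sketch of the half-life regression used above, applied to a simulated AR(1) spread (a sketch only; it assumes nothing beyond numpy, pandas and statsmodels, and the phi value is illustrative):

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(42)
values = [0.0]
for _ in range(499):
    values.append(0.5 * values[-1] + rng.normal())  # AR(1) with phi = 0.5.
spread = pd.Series(values, name='spread')

# Regress the first difference of the spread on its lagged level.
lagged_spread = sm.add_constant(spread.shift(1).dropna())
res = sm.OLS(np.diff(spread), lagged_spread).fit()

# The slope is roughly phi - 1 = -0.5, so the half-life is about
# ln(2) / 0.5, i.e. 1.4 periods.
half_life = np.log(2) / abs(res.params['spread'])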

def _linear_f(beta: np.ndarray, x_variable: np.ndarray) -> np.ndarray:
    """
    This is the helper linear model to be used in the Orthogonal Regression.

    :param beta: (np.ndarray) Model beta coefficients (slope and intercept).
    :param x_variable: (np.ndarray) Model X vector.
    :return: (np.ndarray) Vector result of the equation calculation.
    """

    return beta[0] * x_variable + beta[1]

def _outer_cointegration_loop(prices_df: pd.DataFrame, molecule: list) -> pd.DataFrame:
"""
    This function calculates the Engle-Granger test for each pair in the molecule.
    It uses the Total Least Squares approach to take the variance of both price
    series into consideration.
:param prices_df: (pd.DataFrame) Price Universe.
:param molecule: (list) Indices of pairs.
:return: (pd.DataFrame) Cointegration statistics.
"""

cointegration_results = []

for iteration, pair in enumerate(molecule):
maxlag = None
autolag = "aic"
trend = "c"

linear = Model(_linear_f)
mydata = RealData(prices_df.loc[:, pair[0]], prices_df.loc[:, pair[1]])
myodr = ODR(mydata, linear, beta0=[1., 2.])
res_co = myodr.run()

res_adf = adfuller(res_co.delta - res_co.eps, maxlag=maxlag,
autolag=autolag, regression="nc")

pval_asy = mackinnonp(res_adf[0], regression=trend)

cointegration_results.append((res_adf[0], pval_asy,
res_co.beta[0], res_co.beta[1]))

_print_progress(iteration + 1, len(molecule), prefix='Outer Cointegration Loop Progress:',
suffix='Complete')

return pd.DataFrame(cointegration_results,
index=molecule,
columns=['coint_t', 'pvalue', 'hedge_ratio', 'constant'])
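For orientation, a minimal sketch of the orthogonal-regression cointegration test above, run on two simulated cointegrated series. This is a simplification under stated assumptions: the column names are hypothetical, and the ADF test here is applied to the plain fitted-line spread rather than to the delta/eps residuals used above:

import numpy as np
import pandas as pd
from scipy.odr import ODR, Model, RealData
from statsmodels.tsa.stattools import adfuller

rng = np.random.default_rng(0)
common = np.cumsum(rng.normal(size=1000))  # Shared random walk.
prices = pd.DataFrame({'leg_x': common + rng.normal(size=1000),
                       'leg_y': 2.0 * common + rng.normal(size=1000)})

# Orthogonal (total least squares) regression of leg_y on leg_x.
model = Model(lambda beta, x: beta[0] * x + beta[1])
odr = ODR(RealData(prices['leg_x'], prices['leg_y']), model, beta0=[1., 2.])
fit = odr.run()

# A strongly negative ADF statistic on the spread suggests cointegration.
spread = prices['leg_y'] - fit.beta[0] * prices['leg_x'] - fit.beta[1]
adf_stat = adfuller(spread)[0]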
1 change: 1 addition & 0 deletions arbitragelab/network/validation.py
@@ -41,6 +41,7 @@ def __check_api_key():
@staticmethod
def __import_libraries():
# pylint: disable=import-outside-toplevel, unused-import
import arbitragelab.ml_approach as ml_approach
import arbitragelab.codependence as codependence
import arbitragelab.cointegration_approach as cointegration_approach
import arbitragelab.distance_approach as distance_approach
2 changes: 2 additions & 0 deletions arbitragelab/util/__init__.py
@@ -2,4 +2,6 @@
Utility functions.
"""

from arbitragelab.util.data_importer import DataImporter
from arbitragelab.util.indexed_highlight import IndexedHighlight
from arbitragelab.util.generate_dataset import get_classification_data
146 changes: 146 additions & 0 deletions arbitragelab/util/data_importer.py
@@ -0,0 +1,146 @@
# Copyright 2019, Hudson and Thames Quantitative Research
# All rights reserved
# Read more: https://hudson-and-thames-arbitragelab.readthedocs-hosted.com/en/latest/additional_information/license.html
"""
This module is a data helper that wraps various Yahoo Finance libraries.
"""

import pandas as pd
import yfinance as yf
import yahoo_fin.stock_info as ys

class DataImporter:
"""
Wrapper class that imports data from yfinance and yahoo_fin.
    This class allows for fast pulling/mangling of the information needed
    for the research process, including ticker groups of various indexes,
    relevant pricing data, and the processing of said data.
"""

@staticmethod
def get_sp500_tickers() -> list:
"""
Gets all S&P 500 stock tickers.
:return: (list) List of tickers.
"""

tickers_sp500 = ys.tickers_sp500()

return tickers_sp500

@staticmethod
def get_dow_tickers() -> list:
"""
Gets all DOW stock tickers.
:return: (list) List of tickers.
"""

tickers_dow = ys.tickers_dow()

return tickers_dow

@staticmethod
def remove_nuns(dataframe: pd.DataFrame, threshold: int = 100) -> pd.DataFrame:
"""
        Remove tickers with a number of null values above a threshold.

        :param dataframe: (pd.DataFrame) Asset price data.
        :param threshold: (int) The maximum number of null values allowed.
        :return: (pd.DataFrame) Price data for the tickers under the threshold.
"""

null_sum_each_ticker = dataframe.isnull().sum()
tickers_passing = null_sum_each_ticker[null_sum_each_ticker <= threshold]
tickers_under_threshold = tickers_passing.index
dataframe = dataframe[tickers_under_threshold]

return dataframe

@staticmethod
def get_price_data(tickers: list, start_date: str, end_date: str,
interval: str = '5m') -> pd.DataFrame:
"""
        Get price data for custom start and end dates and a given interval.
        Only the closing price is kept.
:param tickers: (list) List of tickers to download.
:param start_date: (str) Download start date string (YYYY-MM-DD).
:param end_date: (str) Download end date string (YYYY-MM-DD).
:param interval: (str) Valid intervals: [1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo].
:return: (pd.DataFrame) The requested price_data.
"""

price_data = yf.download(tickers, start=start_date, end=end_date,
interval=interval, group_by='column')['Close']

return price_data

@staticmethod
def get_returns_data(price_data: pd.DataFrame) -> pd.DataFrame:
"""
        Convert the given price data to percentage returns.
:param price_data: (pd.DataFrame) Asset price data.
:return: (pd.DataFrame) Price Data converted to returns.
"""

returns_data = price_data.pct_change()
returns_data = returns_data.iloc[1:]

return returns_data

def get_ticker_sector_info(self, tickers: list, yf_call_chunk: int = 20) -> pd.DataFrame:
"""
        This method loops through all the tickers, makes a ticker-info request
        through the yfinance library, and retrieves the 'sector' and 'industry'
        fields for each one.

        The yfinance 'Tickers' object limits the number of tickers that can be
        supplied in one string argument. To work around this, the supplied
        ticker list is broken into small chunks that are fed sequentially to
        the helper function.

        :param tickers: (list) List of asset symbols.
        :param yf_call_chunk: (int) Ticker values allowed per 'Tickers'
            object. This should always be less than 200.
        :return: (pd.DataFrame) DataFrame with input asset tickers and their
            respective sector and industry information.
"""

ticker_sector_queue = []

# For each chunk of size 'yf_call_chunk'.
for i in range(0, len(tickers), yf_call_chunk):

            # Clamp the chunk end to the length of the ticker list so the
            # last, possibly shorter, chunk is handled correctly.
            end = min(i + yf_call_chunk, len(tickers))

ticker_sector_queue.append(self._sector_info_helper(tickers[i: end]))

return pd.concat(ticker_sector_queue, axis=0).reset_index(drop=True)

@staticmethod
def _sector_info_helper(tickers: list) -> pd.DataFrame:
"""
Helper method to supply chunked sector info to the main method.
:param tickers: (list) List of asset symbols.
:return: (pd.DataFrame) DataFrame with input asset tickers and their respective sector
and industry information.
"""

tckrs = yf.Tickers(' '.join(tickers))

tckr_info = []

for i, tckr in enumerate(tickers):
ticker_info = tckrs.tickers[i].info
tckr_tuple = (tckr, ticker_info['industry'], ticker_info['sector'])
tckr_info.append(tckr_tuple)

return pd.DataFrame(data=tckr_info, columns=['ticker', 'industry', 'sector'])
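A short usage sketch of the importer above, exercising the price, returns, and chunked sector lookups (a sketch only; network access and current Yahoo Finance availability are assumed, and the dates are illustrative):

from arbitragelab.util import DataImporter

importer = DataImporter()

tickers = DataImporter.get_dow_tickers()
prices = DataImporter.get_price_data(tickers, '2019-01-01', '2020-01-01',
                                     interval='1d')
prices = DataImporter.remove_nuns(prices)  # Drop tickers with too many nulls.
returns = DataImporter.get_returns_data(prices)
sectors = importer.get_ticker_sector_info(tickers[:5])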
43 changes: 43 additions & 0 deletions arbitragelab/util/indexed_highlight.py
@@ -0,0 +1,43 @@
# Copyright 2019, Hudson and Thames Quantitative Research
# All rights reserved
# Read more: https://hudson-and-thames-arbitragelab.readthedocs-hosted.com/en/latest/additional_information/license.html
"""
This module houses the IndexedHighlight class, an extension of HighlightingDataCursor
that supports cluster-by-cluster highlighting.
"""

import matplotlib.pyplot as plt
from mpldatacursor import DataCursor, HighlightingDataCursor

class IndexedHighlight(HighlightingDataCursor):
"""
This class extends HighlightingDataCursor to add support for
highlighting of cluster groups.
"""

def __init__(self, axes, **kwargs):
"""
        Initializes a highlight for each of the supplied plot artists.
"""
artists = axes

kwargs['display'] = 'single'
HighlightingDataCursor.__init__(self, artists, **kwargs)
self.highlights = [self.create_highlight(artist) for artist in artists]
plt.setp(self.highlights, visible=False)

def update(self, event, annotation):
"""
        On each pick event, this method loops through all artists and
        highlights the group of points corresponding to the currently
        selected artist.
"""

# Hide all other annotations
plt.setp(self.highlights, visible=False)

for i, artst in enumerate(self.artists):
if event.artist is artst:
self.highlights[i].set(visible=True)

DataCursor.update(self, event, annotation)
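A minimal sketch of how the class above might be wired up, assuming mpldatacursor is installed; the three synthetic clusters stand in for the pair clusters produced by the selector:

import matplotlib.pyplot as plt
import numpy as np

from arbitragelab.util import IndexedHighlight

rng = np.random.default_rng(1)
fig, axis = plt.subplots()

# One artist per cluster, so each cluster highlights as a group on click.
artists = [axis.plot(rng.normal(loc=3 * k, size=30),
                     rng.normal(loc=3 * k, size=30), '.')[0]
           for k in range(3)]

IndexedHighlight(artists)
plt.show()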
2 changes: 2 additions & 0 deletions docs/source/changelog.rst
@@ -31,3 +31,5 @@ Changelog
* :support:`17` Added Licence, ReadMe, and RoadMap
* :support:`20` Added API Key Validation.
* :support:`20` Add install documentation and test on OS/Ubuntu/Windows.
* :feature:`5` ML Based Pairs Selection (Horta, 2020) and Data Importer added.
* :support:`5` ML Based Pairs Selection and Data Importer documentation.
