# Build a Long/Short Pairs Portfolio to maximum the PnL

1.   Base on **stocksInfo** and **researchData** to **identify  Pairs** and calculate **trading parameters**. 
2.   Based on **testData** to **backtest** the Pairs portfolio with **signals** and **dollarValue**
3.   Calculate the **PnL** of the backtested Pairs portfolio



# Rules
 

*   **No lookahead bias**: The testData cannot be used for Pairs identification nor the trading parameters calculations
*   **No overfitting**: The Pairs cannot be hand-picked and must be based on rules. Similarly for the trading parameters and dollarValue, apart from the obvious numbers (e.g. 0.05, 0.1, 0.5, 1, 100, 1000 etc.), they must be based on rules also.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from numba import jit, njit, vectorize, cuda, uint32, f8, uint8
%load_ext google.colab.data_table 
%matplotlib inline

# Download and import pairslib for calculating PnL
!wget https://github.com/kenwkliu/ideas/raw/master/colab/pairslib.py
import pairslib

# Load the stockInfo, researchData and testData
stocksInfo = pd.read_excel('https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/hkStocksQuotes.xlsx')
researchData = pd.read_csv('https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/researchHKStocksAdjClosePx.csv', index_col=0)
testData = pd.read_csv('https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/testHKStocksAdjClosePx.csv', index_col=0)


--2021-10-28 09:33:05--  https://github.com/kenwkliu/ideas/raw/master/colab/pairslib.py
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/pairslib.py [following]
--2021-10-28 09:33:06--  https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/pairslib.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2949 (2.9K) [text/plain]
Saving to: ‘pairslib.py’


2021-10-28 09:33:06 (26.7 MB/s) - ‘pairslib.py’ saved [2949/2949]



In [2]:
!pip install tslearn
!pip install numba
!find / -iname 'libdevice'
!find / -iname 'libnvvm.so'

Collecting tslearn
  Downloading tslearn-0.5.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (793 kB)
[?25l[K     |▍                               | 10 kB 22.3 MB/s eta 0:00:01[K     |▉                               | 20 kB 21.3 MB/s eta 0:00:01[K     |█▎                              | 30 kB 15.6 MB/s eta 0:00:01[K     |█▋                              | 40 kB 14.1 MB/s eta 0:00:01[K     |██                              | 51 kB 5.5 MB/s eta 0:00:01[K     |██▌                             | 61 kB 5.9 MB/s eta 0:00:01[K     |███                             | 71 kB 5.4 MB/s eta 0:00:01[K     |███▎                            | 81 kB 6.1 MB/s eta 0:00:01[K     |███▊                            | 92 kB 6.4 MB/s eta 0:00:01[K     |████▏                           | 102 kB 5.3 MB/s eta 0:00:01[K     |████▌                           | 112 kB 5.3 MB/s eta 0:00:01[K     |█████                           | 122 kB 5.3 MB/s eta 0

In [3]:
stocksInfo['turnover'] = round(stocksInfo['regularMarketPreviousClose'] * stocksInfo['averageDailyVolume10Day'])

QUOTE_TYPE = 'EQUITY'
MIN_TURNOVER = 100000000  # 100 millions
SELECTED_COLUMNS = ['code', 'shortName', 'industry', 'sector', 'turnover']

stocksFilteredInfo = stocksInfo[(stocksInfo.quoteType == QUOTE_TYPE) & (stocksInfo.turnover > MIN_TURNOVER) & (stocksInfo.industry != 'ETF')].reset_index()
stocksFilteredInfo.sort_values(by=['turnover'], ascending=False)[SELECTED_COLUMNS]

def getCorrelatedPairs(stocksCorr, THRESHOLD=0.95):
    # filter the pairs with correlation values above the THRESHOLD
    highCorr = stocksCorr[((stocksCorr >= THRESHOLD) & (stocksCorr < 1))]
    highCorr = highCorr.unstack().sort_values(ascending=False).drop_duplicates()
    highCorr.dropna(inplace=True)
    highCorrDf = highCorr.to_frame().reset_index()
    highCorrDf.rename(columns = {'level_0':'stockA', 'level_1':'stockB', 0:'corr'}, inplace=True)

    # looks up the sectors for the stocksA and stockB
    cols = ['stockA', 'stockB', 'corr', 'sector_A', 'sector_B']
    pairsDf = highCorrDf.merge(stocksFilteredInfo[['shortName', 'sector']], how='left', left_on='stockA', right_on='shortName').merge(stocksFilteredInfo[['shortName', 'sector']], how='left', left_on='stockB', right_on='shortName', suffixes=('_A', '_B'))[cols]
    pairsDf['sameSector'] = (pairsDf['sector_A'] == pairsDf['sector_B'])
    return pairsDf

pairsDf = getCorrelatedPairs(researchData.corr())
pairsDf

Unnamed: 0,stockA,stockB,corr,sector_A,sector_B,sameSector
0,XINYI SOLAR,FLAT GLASS,0.986801,Photovoltaic Solar,Glass strands,False
1,MEIDONG AUTO,MEITUAN-W,0.981320,Auto sales,E-commerce and Internet,False
2,ZIJIN MINING,MAN WAH HLDGS,0.976942,Precious metals,Housewares,False
3,ZHONGSHENG HLDG,TENCENT,0.973716,Auto sales,Online and Mobile Games,False
4,XINYI GLASS,XINYI SOLAR,0.973291,Glass strands,Photovoltaic Solar,False
...,...,...,...,...,...,...
59,GREATWALL MOTOR,ZIJIN MINING,0.950889,Automobile,Precious metals,False
60,CG SERVICES,EVERSUNSHINE LS,0.950751,Property Management and Agent,Property Management and Agent,True
61,WUXI APPTEC,TENCENT,0.950549,Healthcare,Online and Mobile Games,False
62,GEELY AUTO,CHINASOFT INT'L,0.950323,Automobile,Technology and Software,False


In [4]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer
from tslearn.clustering import TimeSeriesKMeans

stock_names = researchData.columns #173 items, 249 days/A year

researchdata_np = researchData.reset_index(drop=True)
researchdata_np = researchdata_np.T.reset_index(drop=True)

researchdata_np = np.array(researchdata_np)
researchdata_np[np.isnan(researchdata_np)] = 300

normalizer = Normalizer()
researchData_np_normalised = normalizer.fit_transform(researchdata_np)

km3 = KMeans(n_clusters=3)
km3.fit(researchData_np_normalised)
km3_labels = km3.labels_

km4 = KMeans(n_clusters=4)
km4.fit(researchData_np_normalised)
km4_labels = km4.labels_

km10 = KMeans(n_clusters=10)
km10.fit(researchData_np_normalised)
km10_labels = km10.labels_

km15 = KMeans(n_clusters=15)
km15.fit(researchData_np_normalised)
km15_labels = km15.labels_

km20 = KMeans(n_clusters=20)
km20.fit(researchData_np_normalised)
km20_labels = km20.labels_

# Run time takes too long...
tskm = TimeSeriesKMeans(n_clusters=5, metric="dtw")
tskm.fit(researchData_np_normalised)
tskm_labels = tskm.labels_

  "Scikit-learn <0.24 will be deprecated in a "


In [5]:

# 1. Conduct PCA pairing,
# 2. For each pair, do Augmented Dickey-Fuller test
import numba
from statsmodels.tsa.stattools import adfuller

#@numba.jit
def get_pairs(column_name, labels, data, st):

    data[np.isnan(data)] = 300
    #data.fillna(0)
    clusters = {} 
    adfuller_lb = 0
    adfuller_ub = 0.05
    
    stat_test = st 

    assert len(column_name) == len(labels)
    for cn, la in zip(column_name, labels):

        if la not in clusters:
            clusters[la] = [cn]
        else:
            clusters[la] += [cn]

    pairs = []
    for key, val in clusters.items():
        if len(val) <= 1:
            continue

        for i in val:
            for j in val:
              if i != j:

                this_pair = data[[i, j]].copy()
                
                this_stock_pair_diff = this_pair[i] - this_pair[j]
                adfuller_result = stat_test(this_stock_pair_diff)
                p_value = adfuller_result[1]
                if adfuller_lb< p_value and p_value < adfuller_ub:
                    pairs.append((i, j))
    
    return pairs

my_trading_pairs = get_pairs(stock_names, tskm_labels, researchData, adfuller)

  import pandas.util.testing as tm


In [6]:
print(len(my_trading_pairs))
np.savetxt(f"pairs_{len(my_trading_pairs)}.csv", 
           my_trading_pairs,
           delimiter =", ", 
           fmt ='% s')

768


In [7]:
# Implement your logic to construct "pairsBackTest"
# pairsBackTest needs to be same format as in https://colab.research.google.com/github/kenwkliu/ideas/blob/master/colab/HKStocksCorrelation.ipynb
# It is a list of backtested Pairs
# Each backtested Pairs is a dataframe with at least these columns (Date, stockA, stockB, signal, dollarValue)

# signal is -1, 0, 1
# signal == -1: Long stockA Short stockB
# signal == 1: Short stockA Long stockB
# signal == 0: flat position
PAIR_STOCK_A, PAIR_STOCK_B = "HANG SENG BANK", "HSBC HOLDINGS"

pairResearchData = researchData[[PAIR_STOCK_A, PAIR_STOCK_B]].copy()
AB_stock = PAIR_STOCK_A + " / " + PAIR_STOCK_B
pairResearchData[AB_stock] = pairResearchData[PAIR_STOCK_A] / pairResearchData[PAIR_STOCK_B]
pairResearchData[[PAIR_STOCK_A, PAIR_STOCK_B, AB_stock]]

Unnamed: 0_level_0,HANG SENG BANK,HSBC HOLDINGS,HANG SENG BANK / HSBC HOLDINGS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-02,149.835770,58.445934,2.563665
2020-01-03,149.281525,57.966084,2.575325
2020-01-06,148.080612,57.582199,2.571639
2020-01-07,148.727264,57.486233,2.587181
2020-01-08,148.265396,56.910408,2.605242
...,...,...,...
2020-12-24,127.932472,38.726341,3.303500
2020-12-28,128.996964,38.533676,3.347642
2020-12-29,129.093719,38.774513,3.329345
2020-12-30,130.254990,39.111679,3.330335


In [8]:
# This is the original back-testing function
def researchTradingParams(researchData, stockA, stockB, threshold=0.05, dollarValue=10000):
    cols = [stockA, stockB]
    research_df = researchData[cols].copy()
    research_df.dropna(inplace = True)

    tradingParams = {}
    tradingParams['dollarValue'] = dollarValue

    # Calculate avgPxRatio for Exit (convergence)
    research_df['ratio'] = research_df[stockA] / research_df[stockB]
    avgPxRatio = research_df['ratio'].mean()
    tradingParams['avgPxRatio'] = avgPxRatio

    # Calculate shortA_longB_ratio for Entry (Divergence)
    shortA_longB_ratio = avgPxRatio * (1 + threshold)
    tradingParams['shortA_longB_ratio'] = shortA_longB_ratio

    # Calculate longA_shortB_ratio from Entry (Divergence)
    longA_shortB_ratio = avgPxRatio * (1 - threshold)
    tradingParams['longA_shortB_ratio'] = longA_shortB_ratio

    return tradingParams

def backTest(testData, tradingParams, stockA, stockB):
    cols = [stockA, stockB]
    backTest_df = testData[cols].copy()
    backTest_df.dropna(inplace = True)

    # Get the tradingParams
    dollarValue = tradingParams['dollarValue']
    avgPxRatio = tradingParams['avgPxRatio']
    shortA_longB_ratio = tradingParams['shortA_longB_ratio']
    longA_shortB_ratio = tradingParams['longA_shortB_ratio']

    # Calculate the Price ratio in backTest_df
    backTest_df['pxRatio'] = backTest_df[stockA] / backTest_df[stockB]
    backTest_df['dollarValue'] = dollarValue
    
    # initialize the signal to 0
    backTest_df['signal'] = 0
    signal = 0

    # Determine the signal in each row of the backTest_df
    for index, row in backTest_df.iterrows():

      pxRatio = row['pxRatio']

      # mark signal = 1 if pxRatio > shortA_longB_ratio (Diverge outside the upper band)
      if pxRatio > shortA_longB_ratio:
        signal = 1

      # mark signal = -1 if pxRatio < longA_shortB_ratio (Diverge outside the lower band)
      elif pxRatio < longA_shortB_ratio:
        signal = -1

      else:
        # continue to mark signal = 1 if previous signal == 1 and pxRatio > avgPxRatio (Trade entered but not converge back yet)
        if signal == 1 and pxRatio > avgPxRatio:
          signal = 1

        # continue to mark signal = -1 if previous signal == -1 and pxRatio < avgPxRatio (Trade entered but not converge back yet)
        elif signal == -1 and pxRatio < avgPxRatio:
          signal = -1

        else:
          signal = 0

      backTest_df.loc[index, 'signal'] = signal

    return backTest_df

PX_RATIO_THRESHOLD = 0.05
tradingParams = researchTradingParams(pairResearchData, PAIR_STOCK_A, PAIR_STOCK_B, PX_RATIO_THRESHOLD)
backTest_df = backTest(testData, tradingParams, PAIR_STOCK_A, PAIR_STOCK_B)

# Calcuate the PnL in test period
pnl, pnl_df = pairslib.calcPnl(backTest_df)
print("Reference PnL: ", pnl)

Reference PnL:  1552.0709377264266


In [9]:
def researchTradingParams(researchData, stockA, stockB, threshold=0.05, dollarValue=10000):
    cols = [stockA, stockB]
    research_df = researchData[cols].copy()
    
    research_df[stockA].fillna(value=research_df[stockA].mean(), inplace=True)
    research_df[stockB].fillna(value=research_df[stockB].mean(), inplace=True)
    
    #research_df.dropna(inplace = True)
    #research_df[stockA][np.isnan(data)] = 300

    tradingParams = {}
    tradingParams['dollarValue'] = dollarValue

    # Calculate avgPxRatio for Exit (convergence)
    research_df['spread'] = research_df[stockA] - research_df[stockB]
    
    avgPxRatio = research_df['spread'].mean()
    tradingParams['avgPxRatio'] = avgPxRatio

    # Calculate Positive Thresholding
    df_mean = research_df['spread'].mean()
    df_std = research_df['spread'].std()

    #print('df_mean, df_std: ', df_mean, df_std)
    # Assuming Normal distribution
    upper_tail = df_mean + 0.725*df_std
    lower_tail = df_mean - 0.725*df_std

    tradingParams['upper_tail'] = upper_tail
    tradingParams['lower_tail'] = lower_tail
    tradingParams['mean'] = df_mean
    tradingParams['std'] = df_std

    return tradingParams

def backTest(testData, tradingParams, stockA, stockB):
    cols = [stockA, stockB]
    backTest_df = testData[cols].copy()
    backTest_df.dropna(inplace = True)

    # Get the tradingParams
    dollarValue = tradingParams['dollarValue']
    backTest_df['dollarValue'] = dollarValue

    df_mean = tradingParams['mean']
    df_std = tradingParams['std']
    upper_tail = df_mean + 0.725*df_std
    lower_tail = df_mean - 0.725*df_std

    # initialize the signal to 0
    backTest_df['signal'] = 0
    signal = 0

    # Determine the signal in each row of the backTest_df
    for index, row in backTest_df.iterrows():
      
      if index == 0:
        df_std = df_std
      else:
        df_std = df_std + 0.1*np.random.normal() # simple voliatilty modelling

      spread = row[stockA] - row[stockB]

      # mark signal = 1 if pxRatio > shortA_longB_ratio (Diverge outside the upper band)
      if spread < df_mean + 0.725*df_std:
        signal = 1

      # mark signal = -1 if pxRatio < longA_shortB_ratio (Diverge outside the lower band)
      elif spread > df_mean - 0.725*df_std:
        signal = -1

      else:
        # continue to mark signal = 1 if previous signal == 1 and pxRatio > avgPxRatio (Trade entered but not converge back yet)
        if signal == 1 and spread > df_mean  :
          signal = 1

        # continue to mark signal = -1 if previous signal == -1 and pxRatio < avgPxRatio (Trade entered but not converge back yet)
        elif signal == -1 and spread < df_mean :
          signal = -1

        else:
          signal = 0

      backTest_df.loc[index, 'signal'] = signal

    return backTest_df

# Idea calculate threshold t+1 given t, t-1, t-2, t-3, t-5

In [10]:
# Calcuate the PnL of the Pairs portfolio
pairsBackTest = []

for a,b in my_trading_pairs:
    stockA, stockB = a, b

    my_tradingParams = researchTradingParams(researchData, stockA, stockB)
    my_backTest_df = backTest(testData, my_tradingParams, stockA, stockB)
    pnl, pnl_df = pairslib.calcPnl(my_backTest_df)
    pairsBackTest.append(my_backTest_df[[stockA, stockB, 'signal', 'dollarValue']])

pnl, pnlDf = pairslib.calcPortfolio(pairsBackTest)
pnlDf

BABA-SW vs TECHTRONIC IND ---> $ 6903.274135824227
BABA-SW vs YIHAI INTL ---> $ -4201.018185919497
BABA-SW vs PHARMARON ---> $ 7584.42161635362
TENCENT vs MEITUAN-W ---> $ -286.63763623058276
MEITUAN-W vs TENCENT ---> $ 370.1306188378876
PING AN vs ASM PACIFIC ---> $ 2920.0918565642987
SHK PPT vs GANFENGLITHIUM ---> $ 3712.0467892696015
SHK PPT vs LINK REIT ---> $ -1569.28980099999
ANTA SPORTS vs GANFENGLITHIUM ---> $ 153.90612333868103
CLP HOLDINGS vs HENGAN INT'L ---> $ 1784.9875626017838
SUNNY OPTICAL vs GANFENGLITHIUM ---> $ -5739.76280855715
TECHTRONIC IND vs BABA-SW ---> $ 6404.192130910105
YIHAI INTL vs BABA-SW ---> $ -495.8548428427712
HENGAN INT'L vs CLP HOLDINGS ---> $ 2580.123027147608
ASM PACIFIC vs PING AN ---> $ 2478.607119123586
PHARMARON vs BABA-SW ---> $ 7584.42161635362
GANFENGLITHIUM vs SHK PPT ---> $ 3712.0467892696015
GANFENGLITHIUM vs ANTA SPORTS ---> $ -1556.5968022074612
GANFENGLITHIUM vs SUNNY OPTICAL ---> $ -4722.728600454877
LINK REIT vs SHK PPT ---> $ 178.63

Unnamed: 0,stockA,stocksB,Pnl
0,BABA-SW,TECHTRONIC IND,6903.274136
1,BABA-SW,YIHAI INTL,-4201.018186
2,BABA-SW,PHARMARON,7584.421616
3,TENCENT,MEITUAN-W,-286.637636
4,MEITUAN-W,TENCENT,370.130619
...,...,...,...
763,FLAT GLASS,XIAOMI-W,5367.980718
764,FLAT GLASS,GENSCRIPT BIO,1023.951583
765,FLAT GLASS,FUYAO GLASS,-687.924751
766,HEC PHARM,GENSCRIPT BIO,23858.934045
