# Build a Long/Short Pairs Portfolio to maximize the PnL

1.   Based on **stocksInfo** and **researchData** only, **identify Pairs** and calculate **trading parameters**. No additional data is allowed.
2.   Based on **testData** to **backtest** the Pairs portfolio with **signals** and **dollarValue**
3.   Calculate the **PnL** of the backtested Pairs portfolio



# Rules
 

*   **No look-ahead bias**: The testData cannot be used for Pairs identification or for the trading parameter calculations. During the backtest, only **look-back indicators** may be used.
*   **No overfitting**: The selected Pairs cannot be hand-picked and must be based on rules. Similarly for the trading parameters and dollarValue, apart from the obvious numbers (e.g. 0.05, 0.1, 0.5, 1, 100, 1000 etc.), they must be based on rules also.

In [None]:
!pip install --upgrade openpyxl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint, adfuller

%load_ext google.colab.data_table 
%matplotlib inline

# Download and import pairslib for calculating PnL
!wget https://github.com/kenwkliu/ideas/raw/master/colab/pairslib.py
import pairslib

# Load the stockInfo, researchData and testData
stocksInfo = pd.read_excel('https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/hkStocksQuotes.xlsx')
researchData = pd.read_csv('https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/researchHKStocksAdjClosePx.csv', index_col=0)
testData = pd.read_csv('https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/testHKStocksAdjClosePx.csv', index_col=0)


The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_table
--2022-02-05 13:57:15--  https://github.com/kenwkliu/ideas/raw/master/colab/pairslib.py
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/pairslib.py [following]
--2022-02-05 13:57:16--  https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/pairslib.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2949 (2.9K) [text/plain]
Saving to: ‘pairslib.py.3’


2022-02-05 13:57:16 (23.7 MB/s) - ‘pairslib.py.3’ saved [2949/2949]



In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

from datetime import datetime
import numpy as np
import pandas as pd
import pandas_datareader.data as web

# Download the font to display Chinese
!wget https://github.com/kenwkliu/ideas/raw/master/colab/data/simhei.ttf
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
CNFont = FontProperties(fname='/content/simhei.ttf')

# Yahoo Finance
!pip install yfinance
import yfinance as yf

# Google colab interactive table
%load_ext google.colab.data_table 
%matplotlib inline


### Helper functions

CHART_SIZE_X, CHART_SIZE_Y = 12, 8
SMALL_CHART_SIZE_X, SMALL_CHART_SIZE_Y = 8, 6


# Plot stock pair chart
def plotPair(df, stockA, stockB, sizeX, sizeY):
  """Draw two columns of df on one chart, stockB on a secondary y-axis.

  df: DataFrame holding a column for each of the two names.
  stockA, stockB: column names to plot.
  sizeX, sizeY: figure size passed to pandas' plot as figsize.
  """
  chartSize = (sizeX, sizeY)
  seriesA = df[stockA]
  seriesB = df[stockB]

  # stockA on the primary axis, y-range clamped to its own min/max
  leftAxis = seriesA.plot(label=stockA, legend=True, figsize=chartSize)
  leftAxis.set_ylim(seriesA.min(), seriesA.max())

  # stockB on the secondary axis, y-range clamped to its own min/max
  rightAxis = seriesB.plot(secondary_y=True, label=stockB, legend=True, figsize=chartSize)
  rightAxis.set_ylim(seriesB.min(), seriesB.max())

  # CNFont is the downloaded font used to display Chinese names in the legends
  leftAxis.legend(prop=CNFont, loc=2)
  rightAxis.legend(prop=CNFont, loc=1)

  plt.show()

def find_cointegrated_pairs(data, significance=0.1):
    """Return the column pairs of data whose cointegration p-value is below significance.

    Every unordered pair of columns (first column before second, in column
    order) is tested with statsmodels' coint; a pair is kept when the
    p-value (second element of the coint result) is strictly below
    significance.

    data: DataFrame with one price series per column.
    significance: p-value cut-off; defaults to 0.1.

    Returns a list of (columnA, columnB) tuples.

    NOTE: one test is run per pair and no multiple-testing adjustment is
    applied, so the number of tests grows quadratically with the columns.
    """
    names = list(data.keys())
    pairs = []
    for position, nameA in enumerate(names):
        seriesA = data[nameA]
        # only pair with columns to the right so each pair is tested once
        for nameB in names[position + 1:]:
            pvalue = coint(seriesA, data[nameB])[1]
            if pvalue < significance:
                pairs.append((nameA, nameB))
    return pairs

def gainCorrelatedPairs(researchData, df):
    """Add a 'corr' column holding each pair's own price correlation.

    researchData: DataFrame with one price series per column.
    df: pairs table with 'stockA' and 'stockB' columns naming columns of
        researchData; it is modified in place.

    Returns df (the same object) with the new 'corr' column.

    The correlation is computed row by row, stockA against the stockB of the
    same row. The values are assigned positionally, so df does not need a
    default 0..n-1 index.
    """
    df['corr'] = [
        researchData[stockA].corr(researchData[stockB])
        for stockA, stockB in zip(df['stockA'], df['stockB'])
    ]
    return df

def calparams(stockA,stockB,Data):
    """Regress stockA on stockB (with an intercept) and return the fitted parameters.

    Rows of Data containing a NaN in any column are dropped before fitting.

    stockA: name of the dependent-variable column.
    stockB: name of the explanatory-variable column.
    Data: DataFrame with one price series per column.

    Returns the params of the fitted statsmodels OLS model.
    """
    cleaned = Data.dropna()
    # add_constant supplies the intercept column for the regression
    design = sm.add_constant(cleaned[stockB])
    fitted = sm.OLS(cleaned[stockA], design).fit()
    return fitted.params

def calres(researchData,df):
    """Add a 'std' column holding each pair's OLS residual standard deviation.

    For every row, stockA is regressed on stockB (with an intercept) and the
    standard deviation of the residuals is stored for that row.

    researchData: DataFrame with one price series per column; rows containing
        a NaN in any column are dropped before fitting.
    df: pairs table with 'stockA' and 'stockB' columns; modified in place.

    Returns df (the same object) with the new 'std' column.

    The regression is run row by row, on the stockB of the same row. The
    values are assigned positionally, so df does not need a default index.
    """
    cleaned = researchData.dropna()
    residualStd = []
    for stockA, stockB in zip(df['stockA'], df['stockB']):
        design = sm.add_constant(cleaned[stockB])
        fitted = sm.OLS(cleaned[stockA], design).fit()
        residualStd.append(fitted.resid.std())
    df['std'] = residualStd
    return df


# Filter the correlated stock pairs with the THRESHOLD
def getCorrelatedPairs(stocksCorr, THRESHOLD=0.95):
  """List the stock pairs whose correlation is in [THRESHOLD, 1), with their sectors.

  stocksCorr: square correlation DataFrame (stocks on both axes).
  THRESHOLD: minimum correlation to keep; values equal to 1 are excluded.

  Returns a DataFrame with columns stockA, stockB, corr, sector_A, sector_B
  and sameSector, ordered by descending correlation.

  Sector names are looked up in the module-level stocksFilteredInfo table.
  """
  # mask everything outside [THRESHOLD, 1), then flatten the matrix to one row per pair
  inBand = (stocksCorr >= THRESHOLD) & (stocksCorr < 1)
  ranked = stocksCorr[inBand].unstack().sort_values(ascending=False)
  # duplicates are dropped by correlation value
  # NOTE(review): presumably this removes the mirrored (B, A) entries; it would
  # also drop distinct pairs that happen to share a value -- confirm
  ranked = ranked.drop_duplicates().dropna()

  highCorrDf = ranked.to_frame().reset_index()
  highCorrDf = highCorrDf.rename(columns = {'level_0':'stockA', 'level_1':'stockB', 0:'corr'})

  # look up the sector of stockA, then of stockB
  sectorLookup = stocksFilteredInfo[['shortName', 'sector']]
  withSectorA = highCorrDf.merge(sectorLookup, how='left', left_on='stockA', right_on='shortName')
  withBothSectors = withSectorA.merge(sectorLookup, how='left', left_on='stockB', right_on='shortName', suffixes=('_A', '_B'))

  pairsDf = withBothSectors[['stockA', 'stockB', 'corr', 'sector_A', 'sector_B']]
  pairsDf['sameSector'] = (pairsDf['sector_A'] == pairsDf['sector_B'])

  return pairsDf


### back test related functions

# Based on the research data, determine the trading params (Enter/Exit Points)
def researchTradingParams(researchData, stockA, stockB, dollarValue=20000,
                          shortEntryMult=0.75, longEntryMult=0.35, bandPercentile=25):
  """Derive the entry/exit price-ratio levels for one pair from the research data.

  The price ratio stockA / stockB is computed on the rows where both prices
  are present. Its mean is the exit (convergence) level; the two entry
  (divergence) levels sit above and below the mean by a multiple of the
  bandPercentile-th percentile of the ratio.

  researchData: DataFrame with one price series per column.
  stockA, stockB: column names of the pair.
  dollarValue: passed through unchanged into the result.
  shortEntryMult: multiplier for the upper (short A / long B) entry level.
  longEntryMult: multiplier for the lower (long A / short B) entry level.
  bandPercentile: percentile of the ratio used as the band unit.

  Returns a dict with keys 'dollarValue', 'avgPxRatio', 'shortA_longB_ratio'
  and 'longA_shortB_ratio'.

  NOTE(review): the band unit is a percentile of the ratio's level, not a
  measure of its dispersion, so the bands widen with the ratio itself --
  confirm this is the intended rule.
  """
  research_df = researchData[[stockA, stockB]].dropna()
  ratio = research_df[stockA] / research_df[stockB]

  # Exit (convergence) level
  avgPxRatio = ratio.mean()

  # Band unit shared by both entry levels (computed once)
  bandUnit = np.percentile(ratio, bandPercentile)

  return {
    'dollarValue': dollarValue,
    'avgPxRatio': avgPxRatio,
    # Entry (divergence) above the mean: short A, long B
    'shortA_longB_ratio': avgPxRatio + shortEntryMult*bandUnit,
    # Entry (divergence) below the mean: long A, short B
    'longA_shortB_ratio': avgPxRatio - longEntryMult*bandUnit,
  }


# Determine the signal and dollarValue in the test data
# signal == -1: Long stockA Short stockB
# signal == 1: Short stockA Long stockB
# signal == 0: flat position
def backTest(testData, tradingParams, stockA, stockB):
  """Compute the position signal for one pair on every row of the test data.

  testData: DataFrame with one price series per column.
  tradingParams: dict with keys 'dollarValue', 'avgPxRatio',
      'shortA_longB_ratio' and 'longA_shortB_ratio'.
  stockA, stockB: column names of the pair.

  Returns a new DataFrame (rows where both prices are present) with columns
  stockA, stockB, 'pxRatio', 'dollarValue' and 'signal', where signal is
     1: short stockA, long stockB
    -1: long stockA, short stockB
     0: flat

  Rows are processed in order and each row only uses its own price ratio and
  the previous signal. Signals are written positionally, so repeated index
  labels do not overwrite one another.
  """
  backTest_df = testData[[stockA, stockB]].dropna().copy()

  # Get the tradingParams
  dollarValue = tradingParams['dollarValue']
  avgPxRatio = tradingParams['avgPxRatio']
  shortA_longB_ratio = tradingParams['shortA_longB_ratio']
  longA_shortB_ratio = tradingParams['longA_shortB_ratio']

  # Calculate the Price ratio in backTest_df
  backTest_df['pxRatio'] = backTest_df[stockA] / backTest_df[stockB]
  backTest_df['dollarValue'] = dollarValue

  # Walk the rows in order, carrying the previous signal forward
  signals = []
  signal = 0
  for pxRatio in backTest_df['pxRatio']:
    if pxRatio > shortA_longB_ratio:
      # Diverged outside the upper band
      signal = 1
    elif pxRatio < longA_shortB_ratio:
      # Diverged outside the lower band
      signal = -1
    elif signal == 1 and pxRatio > avgPxRatio:
      # Short-A trade entered and not converged back yet: keep holding
      pass
    elif signal == -1 and pxRatio < avgPxRatio:
      # Long-A trade entered and not converged back yet: keep holding
      pass
    else:
      # Converged back to the average (or never entered): flat
      signal = 0
    signals.append(signal)

  # Single positional assignment; int64 matches a column initialised with 0
  backTest_df['signal'] = np.asarray(signals, dtype=np.int64)

  return backTest_df


# determine pSignal and nSignal for up/down markers in plot
# pSignal and nSignal is for displaying the up/down markers in plotting chart only, they're not required for backtest calculation
def addSignalMarker(backTest_df):
  """Add the 'pSignal' and 'nSignal' marker columns to backTest_df and return it.

  pSignal carries pxRatio on rows where signal == 1, nSignal carries pxRatio
  on rows where signal == -1; every other row is NaN. backTest_df is modified
  in place.
  """
  # marker column -> signal value that switches it on
  for markerColumn, side in (('pSignal', 1), ('nSignal', -1)):
    isActive = backTest_df['signal'] == side
    backTest_df[markerColumn] = np.where(isActive, backTest_df['pxRatio'], np.nan)

  return backTest_df


# Combine the research and backtest for a Portfolio of Pairs
def researchAndBackTestPortfolio(pairsDf, researchData, testData, printOutput=True):
  """Research and backtest every pair listed in pairsDf.

  pairsDf: DataFrame with 'stockA' and 'stockB' columns.
  researchData: prices used by researchTradingParams to derive the trading params.
  testData: prices used by backTest to generate the signals.
  printOutput: when true, print "<stockA> vs <stockB>" for each pair.

  Returns a list with one DataFrame per pair, restricted to the columns
  stockA, stockB, 'signal' and 'dollarValue'.
  """
  backTestedPairs = []

  for stockA, stockB in zip(pairsDf['stockA'], pairsDf['stockB']):
    if printOutput:
      print(stockA, 'vs', stockB)

    # trading params come from the research data only; signals from the test data
    params = researchTradingParams(researchData, stockA, stockB)
    pairResult = backTest(testData, params, stockA, stockB)
    backTestedPairs.append(pairResult[[stockA, stockB, 'signal', 'dollarValue']])

  return backTestedPairs


# Download and import pairslib for calculating PnL
!wget https://github.com/kenwkliu/ideas/raw/master/colab/pairslib.py
import pairslib

--2022-02-05 13:57:33--  https://github.com/kenwkliu/ideas/raw/master/colab/data/simhei.ttf
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/simhei.ttf [following]
--2022-02-05 13:57:33--  https://raw.githubusercontent.com/kenwkliu/ideas/master/colab/data/simhei.ttf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9751960 (9.3M) [application/octet-stream]
Saving to: ‘simhei.ttf.2’


2022-02-05 13:57:33 (72.8 MB/s) - ‘simhei.ttf.2’ saved [9751960/9751960]

The google.colab.data_table extension is already loaded. To reload it, use:
  %reload_ext google.colab.data_tab

In [None]:
stocksInfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2735 entries, 0 to 2734
Data columns (total 70 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   code                               2735 non-null   object 
 1   shortName                          2699 non-null   object 
 2   industry                           2735 non-null   object 
 3   sector                             2732 non-null   object 
 4   language                           2735 non-null   object 
 5   region                             2735 non-null   object 
 6   quoteType                          2735 non-null   object 
 7   quoteSourceName                    2735 non-null   object 
 8   triggerable                        2735 non-null   bool   
 9   currency                           2735 non-null   object 
 10  marketState                        2735 non-null   object 
 11  exchange                           2735 non-null   objec

In [None]:
researchData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 249 entries, 2020-01-02 to 2020-12-31
Columns: 173 entries, BABA-SW to LINK REIT
dtypes: float64(173)
memory usage: 338.5+ KB


In [None]:
# list the unique industries names
industryNames = stocksInfo['industry'].unique()
print('Count:', len(industryNames))
print(industryNames)

Count: 23
['Information Technology' 'Financial' 'Telecommunications' 'Energy'
 'Food and Beverage' 'Real Estate Construction' 'Meida and Communications'
 'Medicine' 'Betting' 'Transportation, logistics and infrastructure'
 'Consumer Goods Manufacturing' 'Conglomerate' 'Retail' 'Electricity'
 'Automobile production and distribution' 'Industry'
 'Environmental protection' 'Consumer Service' 'Industrial Products'
 'Raw Materials' 'Index leveraged products' 'Fund' 'ETF']


In [None]:
# list the unique sector names
sectorNames = stocksInfo['sector'].unique()
print('Count:', len(sectorNames))
print(sectorNames)

Count: 123
['Telecommunications Equipment' 'Semi-conductor' 'Technology and Software'
 'E-commerce and Internet' 'Online and Mobile Games'
 'State-owned China banks' 'China Telecom' 'International Insurance'
 'Bank' 'China Insurance' 'Other financial services' 'Oil and gas'
 'Alcoholic Beverages' 'Real Estate Development'
 'Media Entertainment Culture Publishing' 'Pharmaceutical'
 'China Properties' 'Macau Gaming' 'Public Transport' 'Toys'
 'Conglomerate' 'Biotechnology' 'Sporting Goods'
 'Gas and Public Utilities' 'Catering' 'Electric utility' 'Supermarket'
 'small and medium China banks' 'Automobile' 'Mobile phone related'
 'Machinery' 'Dairy related' 'Textile and Clothing' 'Healthcare'
 'Property Management and Agent' 'Auto sales' 'Infrastructure'
 'Agricultural and Fishery products'
 'Food and beverage Production and Wholesale' 'Local Telecom'
 'Real Estate Investment' 'Water utilities' 'Jewelry Watches' 'Snack Food'
 'Cement' 'Clothing Accessories' 'Photovoltaic Solar' 'Packaged f

In [None]:
testData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 184 entries, 2021-01-04 to 2021-09-29
Columns: 173 entries, BABA-SW to LINK REIT
dtypes: float64(173)
memory usage: 250.1+ KB


2. Stock Selection:

Choose the set of stocks that we are interested in trading in our portfolio

In [None]:
p = find_cointegrated_pairs(researchData.dropna())

In [None]:
dfpair = pd.DataFrame(p,columns = ['stockA', 'stockB'])

In [None]:
# Attach the sector of stockA and of stockB to every candidate pair
cols = ['stockA', 'stockB', 'sector_A', 'sector_B']
sectorLookup = stocksInfo[['shortName', 'sector']]
# NOTE(review): a left merge repeats a pair once per matching shortName --
# confirm shortName is unique in stocksInfo
dfpair = dfpair.merge(sectorLookup, how='left', left_on='stockA', right_on='shortName')
dfpair = dfpair.merge(sectorLookup, how='left', left_on='stockB', right_on='shortName', suffixes=('_A', '_B'))[cols]

In [None]:
dfpair['sameSector'] = (dfpair['sector_A'] == dfpair['sector_B'])

In [None]:
# Pairs whose two stocks share a sector, renumbered from 0
same = dfpair[dfpair['sameSector']].reset_index(drop=True)

In [None]:
same

Unnamed: 0,stockA,stockB,sector_A,sector_B,sameSector
0,CCB,ICBC,State-owned China banks,State-owned China banks,True
1,CCB,ABC,State-owned China banks,State-owned China banks,True
2,HSBC HOLDINGS,HANG SENG BANK,Bank,Bank,True
3,ICBC,ABC,State-owned China banks,State-owned China banks,True
4,EVERGRANDE,LONGFOR GROUP,Real Estate Development,Real Estate Development,True
5,ALI HEALTH,CSPC PHARMA,Pharmaceutical,Pharmaceutical,True
6,ALI HEALTH,UNITED LAB,Pharmaceutical,Pharmaceutical,True
7,CHINA OVERSEAS,SUNAC,China Properties,China Properties,True
8,HANG SENG BANK,BOC HONG KONG,Bank,Bank,True
9,BANK OF CHINA,ABC,State-owned China banks,State-owned China banks,True


In [None]:
gainCorrelatedPairs(researchData.dropna(),same)

Unnamed: 0,stockA,stockB,sector_A,sector_B,sameSector,corr
0,CCB,ICBC,State-owned China banks,State-owned China banks,True,0.954065
1,CCB,ABC,State-owned China banks,State-owned China banks,True,0.93918
2,HSBC HOLDINGS,HANG SENG BANK,Bank,Bank,True,0.674302
3,ICBC,ABC,State-owned China banks,State-owned China banks,True,0.93918
4,EVERGRANDE,LONGFOR GROUP,Real Estate Development,Real Estate Development,True,-0.196241
5,ALI HEALTH,CSPC PHARMA,Pharmaceutical,Pharmaceutical,True,-0.283774
6,ALI HEALTH,UNITED LAB,Pharmaceutical,Pharmaceutical,True,-0.47226
7,CHINA OVERSEAS,SUNAC,China Properties,China Properties,True,0.478319
8,HANG SENG BANK,BOC HONG KONG,Bank,Bank,True,0.616108
9,BANK OF CHINA,ABC,State-owned China banks,State-owned China banks,True,0.93918


In [None]:
same['corr'].describe()

count    30.000000
mean      0.284494
std       0.399720
min      -0.472260
25%       0.004323
50%       0.313982
75%       0.510491
max       0.954065
Name: corr, dtype: float64

In [None]:
# Keep the same-sector pairs whose correlation is at least 0.348184.
# NOTE(review): this literal is none of the quartiles printed by
# same['corr'].describe() in the previous cell -- confirm which rule produced
# it, or derive the cut-off from the data.
sel_same = same[( (same['corr']>=0.348184))]

In [None]:
sel_same

Unnamed: 0,stockA,stockB,sector_A,sector_B,sameSector,corr
0,CCB,ICBC,State-owned China banks,State-owned China banks,True,0.954065
1,CCB,ABC,State-owned China banks,State-owned China banks,True,0.93918
2,HSBC HOLDINGS,HANG SENG BANK,Bank,Bank,True,0.674302
3,ICBC,ABC,State-owned China banks,State-owned China banks,True,0.93918
7,CHINA OVERSEAS,SUNAC,China Properties,China Properties,True,0.478319
8,HANG SENG BANK,BOC HONG KONG,Bank,Bank,True,0.616108
9,BANK OF CHINA,ABC,State-owned China banks,State-owned China banks,True,0.93918
11,SINO BIOPHARM,3SBIO,Pharmaceutical,Pharmaceutical,True,0.382754
13,CHINA LIFE,PICC P&C,China Insurance,China Insurance,True,0.802969
17,NEWWORLDDEV-NEW,CHINA JINMAO,Real Estate Development,Real Estate Development,True,0.420359


In [None]:
#print(sel_same)

In [None]:
# Pairs whose two stocks are in different sectors, renumbered from 0
dif = dfpair[~dfpair['sameSector']].reset_index(drop=True)

In [None]:
# Use the same NaN-free research rows as the same-sector call, so both groups'
# correlations are computed over the same dates
gainCorrelatedPairs(researchData.dropna(), dif)

Unnamed: 0,stockA,stockB,sector_A,sector_B,sameSector,corr
0,BABA-SW,CK ASSET,E-commerce and Internet,Real Estate Development,False,-0.452740
1,BABA-SW,WH GROUP,E-commerce and Internet,Agricultural and Fishery products,False,-0.380816
2,BABA-SW,CHINA RES GAS,E-commerce and Internet,Gas and Public Utilities,False,-0.698156
3,BABA-SW,GUANGDONG INV,E-commerce and Internet,Water utilities,False,-0.745080
4,BABA-SW,PETROCHINA,E-commerce and Internet,Oil and gas,False,-0.326450
...,...,...,...,...,...,...
1532,ZHAOJIN MINING,LINK REIT,Precious metals,REIT,False,-0.350250
1533,HKTV,HEC PHARM,Film and TV,Pharmaceutical,False,-0.703022
1534,COMEC,EB SECURITIES,Shipbuilding related,Other financial services,False,0.695398
1535,EB SECURITIES,LINK REIT,Other financial services,REIT,False,-0.350250


In [None]:
dif['corr'].describe()

count    1537.000000
mean        0.313551
std         0.502998
min        -0.745080
25%        -0.081778
50%         0.428448
75%         0.751079
max         0.906888
Name: corr, dtype: float64

In [None]:
# Keep the different-sector pairs in the top quartile of correlation. The
# cut-off is computed from the data rather than pasted from describe() output,
# so it stays correct when the inputs change.
sel_dif = dif[dif['corr'] >= dif['corr'].quantile(0.75)]

In [None]:
sel_dif

Unnamed: 0,stockA,stockB,sector_A,sector_B,sameSector,corr
10,BABA-SW,HENGTEN NET,E-commerce and Internet,Photovoltaic Solar,False,0.906109
17,TENCENT,ZHONGSHENG HLDG,Online and Mobile Games,Auto sales,False,0.845569
23,TENCENT,CHINA SHENHUA,Online and Mobile Games,Coal industry related,False,0.756483
24,TENCENT,ND PAPER,Online and Mobile Games,Paper stocks,False,0.775353
32,TENCENT,HENGTEN NET,Online and Mobile Games,Photovoltaic Solar,False,0.906109
...,...,...,...,...,...,...
1524,ZHAOJIN MINING,COMEC,Precious metals,Shipbuilding related,False,0.751079
1526,ZHAOJIN MINING,CHINA DONGXIANG,Precious metals,Sporting Goods,False,0.874980
1527,ZHAOJIN MINING,FLAT GLASS,Precious metals,Glass strands,False,0.779549
1529,ZHAOJIN MINING,BROAD HOMES,Precious metals,Machinery,False,0.863473


In [None]:
final = pd.concat([sel_same,sel_dif],axis = 0).reset_index(drop = True)

In [None]:
final

Unnamed: 0,stockA,stockB,sector_A,sector_B,sameSector,corr
0,CCB,ICBC,State-owned China banks,State-owned China banks,True,0.954065
1,CCB,ABC,State-owned China banks,State-owned China banks,True,0.939180
2,HSBC HOLDINGS,HANG SENG BANK,Bank,Bank,True,0.674302
3,ICBC,ABC,State-owned China banks,State-owned China banks,True,0.939180
4,CHINA OVERSEAS,SUNAC,China Properties,China Properties,True,0.478319
...,...,...,...,...,...,...
401,ZHAOJIN MINING,COMEC,Precious metals,Shipbuilding related,False,0.751079
402,ZHAOJIN MINING,CHINA DONGXIANG,Precious metals,Sporting Goods,False,0.874980
403,ZHAOJIN MINING,FLAT GLASS,Precious metals,Glass strands,False,0.779549
404,ZHAOJIN MINING,BROAD HOMES,Precious metals,Machinery,False,0.863473


In [None]:
final = calres(researchData,final)

In [None]:
'''
# Pick 2 stocks price chart to to show the price correlation in research period
stockA = 'EVERGRANDE'
stockB = 'TECHTRONIC IND'
plotPair(researchData, stockA, stockB, CHART_SIZE_X, CHART_SIZE_Y)
'''

"\n# Pick 2 stocks price chart to to show the price correlation in research period\nstockA = 'EVERGRANDE'\nstockB = 'TECHTRONIC IND'\nplotPair(researchData, stockA, stockB, CHART_SIZE_X, CHART_SIZE_Y)\n"

In [None]:
final['std'].describe()

count    406.000000
mean       0.288973
std        0.026953
min        0.093562
25%        0.276227
50%        0.292601
75%        0.308839
max        0.312287
Name: std, dtype: float64

In [None]:
# Idea: keep favouring pairs with a small residual spread -- keep those at or
# below the lower quartile (25%) of 'std'. The cut-off is computed from the
# data rather than pasted from describe() output; <= keeps the boundary row,
# which the rounded-up literal used previously also let through.
test = final[final['std'] <= final['std'].quantile(0.25)]

In [None]:
test

Unnamed: 0,stockA,stockB,sector_A,sector_B,sameSector,corr,std
0,CCB,ICBC,State-owned China banks,State-owned China banks,True,0.954065,0.093562
1,CCB,ABC,State-owned China banks,State-owned China banks,True,0.939180,0.107248
2,HSBC HOLDINGS,HANG SENG BANK,Bank,Bank,True,0.674302,0.230612
3,ICBC,ABC,State-owned China banks,State-owned China banks,True,0.939180,0.107248
4,CHINA OVERSEAS,SUNAC,China Properties,China Properties,True,0.478319,0.274248
...,...,...,...,...,...,...,...
399,MEILAN AIRPORT,CHINA DONGXIANG,Aviation Services,Sporting Goods,False,0.874980,0.271492
400,ZHAOJIN MINING,HKTV,Precious metals,Film and TV,False,0.877041,0.276227
402,ZHAOJIN MINING,CHINA DONGXIANG,Precious metals,Sporting Goods,False,0.874980,0.271492
404,ZHAOJIN MINING,BROAD HOMES,Precious metals,Machinery,False,0.863473,0.247957


In [None]:
# Select every pair that passed the residual-std filter into the portfolio
# (same-sector and different-sector pairs alike)
selectedPairsDf = test
print("Total pairs selected for backtesting:", len(selectedPairsDf))
print('----------------------------------------------------------')

# Research the trading params and back test the selected Pairs in test period
pairsPortfolioBackTest = researchAndBackTestPortfolio(selectedPairsDf, researchData, testData, printOutput=False)

# look at one of the pairs backtest results
# Each backtested Pairs is a dataframe with these columns (Date, stockA, stockB, signal, dollarValue)
#pairsPortfolioBackTest[0]

# Calculate the PnL of the Pairs portfolio
pnl, pnlDf = pairslib.calcPortfolio(pairsPortfolioBackTest)
pnlDf

Total pairs with same sectors for backtesting: 104
----------------------------------------------------------
CCB vs ICBC ---> $ 0
CCB vs ABC ---> $ 0
HSBC HOLDINGS vs HANG SENG BANK ---> $ 0
ICBC vs ABC ---> $ 0
CHINA OVERSEAS vs SUNAC ---> $ 457.59002521525326
HANG SENG BANK vs BOC HONG KONG ---> $ 0
BANK OF CHINA vs ABC ---> $ 0
CHINA LIFE vs PICC P&C ---> $ -999.0993308214165
HAITONG SEC vs CGS ---> $ 0
MEITUAN-W vs HKTV ---> $ 0
CNOOC vs HAIDILAO ---> $ 12739.652385828493
CNOOC vs BYD ELECTRONIC ---> $ 13472.87304447143
CNOOC vs ESR ---> $ 10046.805792996456
CNOOC vs HKTV ---> $ 9616.737598459891
EVERGRANDE vs HAIDILAO ---> $ -5552.33167677918
EVERGRANDE vs BYD ELECTRONIC ---> $ -9907.833953651327
EVERGRANDE vs ESR ---> $ -13333.9012051263
ALI HEALTH vs BROAD HOMES ---> $ 19929.39641753295
HANG SENG BANK vs BROAD HOMES ---> $ 11874.327372207104
CHINA RES LAND vs BROAD HOMES ---> $ 12723.833968978519
BANK OF CHINA vs YIHAI INTL ---> $ 5549.326157756494
COUNTRY GARDEN vs HAIDILAO --

Unnamed: 0,stockA,stocksB,Pnl
0,CCB,ICBC,0.000000
1,CCB,ABC,0.000000
2,HSBC HOLDINGS,HANG SENG BANK,0.000000
3,ICBC,ABC,0.000000
4,CHINA OVERSEAS,SUNAC,457.590025
...,...,...,...
99,MEILAN AIRPORT,CHINA DONGXIANG,16039.945335
100,ZHAOJIN MINING,HKTV,-5727.054562
101,ZHAOJIN MINING,CHINA DONGXIANG,-3958.254574
102,ZHAOJIN MINING,BROAD HOMES,7640.301677


In [None]:
# Implement your logic to construct "pairsPortfolioBackTest"
# pairsPortfolioBackTest needs to be same format as in
# https://colab.research.google.com/github/kenwkliu/ideas/blob/master/colab/HKStocksCorrelation.ipynb
# It is a list of backtested Pairs
# Each backtested Pairs is a dataframe with at least 5 columns (Date, stockA, stockB, signal, dollarValue)

# signal is -1, 0, 1
# signal == -1: Long stockA Short stockB
# signal == 1: Short stockA Long stockB
# signal == 0: flat position
# dollarValue if not specified, it will default to 10000. 
# Can customize each pair with different dollarValue and the allowable range is 5000 to 20000. 

In [None]:
pairsPortfolioBackTest

[                 CCB      ICBC  signal  dollarValue
 Date                                               
 2021-01-04  5.402717  4.540923       0        20000
 2021-01-05  5.383990  4.503472       0        20000
 2021-01-06  5.393354  4.512835       0        20000
 2021-01-07  5.505715  4.597099       0        20000
 2021-01-08  5.552532  4.643913       0        20000
 ...              ...       ...     ...          ...
 2021-09-21  5.400000  4.210000       0        20000
 2021-09-23  5.480000  4.230000       0        20000
 2021-09-24  5.420000  4.190000       0        20000
 2021-09-27  5.410000  4.190000       0        20000
 2021-09-28  5.410000  4.240000       0        20000
 
 [183 rows x 4 columns],                  CCB       ABC  signal  dollarValue
 Date                                               
 2021-01-04  5.402717  2.606579       0        20000
 2021-01-05  5.383990  2.615855       0        20000
 2021-01-06  5.393354  2.652959       0        20000
 2021-01-07  5.50571