In [0]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from datetime import datetime

import numpy as np
import pandas as pd
import pandas_datareader.data as web

# webscraping library
import requests
from bs4 import BeautifulSoup

# Download the font to display Chinese
!wget https://github.com/kenwkliu/ideas/raw/master/colab/data/simhei.ttf
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
%matplotlib inline
CNFont = FontProperties(fname='/content/simhei.ttf')


In [0]:
# Use web scraping to extract HK stock codes and names from the
# zh-yue Wikipedia page addressed by hk_url.
hk_url = 'https://zh-yue.wikipedia.org/wiki/%E9%A6%99%E6%B8%AF%E4%B8%8A%E5%B8%82%E5%85%AC%E5%8F%B8%E4%B8%80%E8%A6%BD'
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the stock list.
html = requests.get(hk_url, timeout=30)
html.raise_for_status()
soup = BeautifulSoup(html.text, 'html.parser')
# Show only the page title as a sanity check; echoing the whole soup would
# dump the full page HTML into the cell output.
soup.title

In [0]:
# Walk the list items inside the article body and split each one into a
# stock code (its first 4 characters) and a name (the remainder).
code = []
name = []

a_tags = soup.find("div", attrs={"id":"mw-content-text"})
if a_tags is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError("Could not find the 'mw-content-text' container in the page")
all_li = a_tags.find_all("li", attrs={"class":""})

for li in all_li:
    content = li.text.strip()
    stock_code = content[:4]
    # Skip list items that do not start with a 4-digit code: they are not
    # stock entries and would otherwise end up as bogus tickers.
    if len(stock_code) < 4 or not stock_code.isdigit():
        continue
    code.append(stock_code)
    name.append(content[4:].strip())

# Build the frame from named columns so that an empty scrape still yields a
# (zero-row) frame with the expected Code / Name columns.
df = pd.DataFrame({"Code": code, "Name": name})
df

In [0]:
# Fetch 2019 prices from Yahoo for the first NUM_OF_STOCKS scraped stocks
NUM_OF_STOCKS = 100
#NUM_OF_STOCKS = len(df) # All stocks
STOCK_SUFFIX = '.HK'
SOURCE = 'yahoo'
start = '2019' # accepts strings
end = datetime(2019, 12, 31) # or datetime objects

# columnNames[i] is the display name of the stock held in stocksDownload[i];
# both lists only grow when a download succeeds, so they stay aligned.
columnNames = []
stocksDownload = []
selected = df.head(NUM_OF_STOCKS)
for code, name in zip(selected.Code, selected.Name):
  try:
    prices = web.DataReader(code + STOCK_SUFFIX, SOURCE, start=start, end=end)
  except Exception as err:
    # Best effort: report the failure and move on to the next stock
    print(name, ": error:", err)
  else:
    stocksDownload.append(prices)
    columnNames.append(name)
    print("Completed:", name)

print("All Completed")

In [0]:
# Peek at one download: print its name, then display its price table.
# Change `index` to inspect a different stock.
index = 0
peekName, peekPrices = columnNames[index], stocksDownload[index]
print(peekName)
peekPrices

In [0]:
# Pull the "Adj Close" column out of every downloaded price table
adjCloses = []
for stock in stocksDownload:
    adjCloses.append(stock['Adj Close'])

# Place the series side by side in one frame, one column per stock,
# labelled with the stock names collected during the download
stocks = pd.concat(adjCloses, axis='columns')
stocks.columns = columnNames
stocks

In [0]:
# Pairwise Pearson correlation between the stock price columns
stocksCorr = stocks.corr(method='pearson')
# Shade the matrix as a heat map; axis=None applies the colour scale to the
# whole table at once rather than per row or per column
corrStyled = stocksCorr.style.background_gradient(axis=None, cmap='coolwarm')
corrStyled

In [0]:
# Find the stock pairs whose correlation is at least THRESHOLD and plot each
# pair exactly once.
THRESHOLD = 0.97

# Correlations of exactly 1 stay excluded. On top of that, keep only the
# strict upper triangle (column > row) of the matrix: this removes the
# diagonal (a stock against itself) even when rounding leaves it slightly
# below 1, and it removes the mirrored (b, a) copy of every (a, b) pair.
# No string keys are needed for de-duplication, so index pairs can no longer
# collide the way "1"+"23" and "12"+"3" do.
aboveThreshold = ((stocksCorr >= THRESHOLD) & (stocksCorr < 1)).values
highCorr = np.where(np.triu(aboveThreshold, k=1))

# Plot the highly correlated stock pairs
# pairs maps "a_b" -> [a, b], where a < b are column positions in `stocks`
pairs = {}

for a, b in zip(highCorr[0], highCorr[1]):
  a, b = int(a), int(b)
  pairs[str(a) + "_" + str(b)] = [a, b]
  # Select by position so two stocks sharing a display name still plot as a pair
  ax = stocks.iloc[:, [a, b]].plot()
  # Use the Chinese-capable font so the stock names render in the legend
  ax.legend(prop = CNFont)

In [0]:
# These pairs can be used for a pairs-trading strategy:
# calculate the average price ratio of each pair, then
# trade when the current price ratio differs significantly from that average.