# Scrape HSI Constituents from official website

This script is used to construct a time series Hang Seng Index universe based on customized date list

Three steps are involved:

1. Scrape raw files from Hang Seng website (<a href="https://www.hsi.com.hk/eng/indexes/all-indexes/hsi">HSI Official Website</a>)
2. Clean the raw files
3. Construct a time series universe based on the cleaned raw files.

Scrapy is preferred for web scraping in most cases but not in this case because it can't easily scrape with JavaScript. In this case, we need to click 2 JavaScript buttons to download below 2 raw files.

1. Latest Constituents
2. Historical Change in HSI

With above 2 files, we could easily construct the time series universe. Cleaned and uncleaned raw files will be showing as well. 

Another key note is HK Date List. Our date list must in HK trading calendar. Only HK trading date will be considered as valid input when downloading latest constituent file from the website.

Finally, we will generate a csv file named hsiUniverseDf.

- We must put <a href="https://stackoverflow.com/questions/41190989/how-do-i-install-geckodriver">geckodriver.exe</a> into the bin folder under our python folder in order to use Firefox as a webdriver under selenium. (I'm using Mac)

In [2]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from dateutil.relativedelta import relativedelta
from datetime import date
import os
import re
import shutil
import time
import holidays
import glob

In [3]:
def getHKDateList(startDate, endDate, frequency):
    # Get dateList according to HK Calendar
    dateList = pd.date_range(start=startDate, end=endDate, freq=frequency)
    hkHolidays = holidays.HK()
    dateList = [date if date not in hkHolidays else 
                max([loopDate for loopDate in pd.date_range(start=date-relativedelta(days=14), end=date, freq='D') if 
                     loopDate not in hkHolidays and loopDate.weekday() != 5 and loopDate.weekday() != 6]) 
                for date in dateList]
    del hkHolidays
    return dateList
    
def downloadRawFiles(downloadPath):
    print(downloadPath)
    print('Start downloading HSI files to above path.')
    
    # Create dowload folder if it's not exist
    if os.path.isdir(downloadPath) == False:
        os.mkdir(downloadPath)
        print('Created ' + downloadPath + ' as it is not exist')
    
    # Clean download folder if past files exist
    oldFileList = [downloadPath + "/" + f for f in os.listdir(downloadPath) if f == 'hsiCons.csv' or f == 'hsiConsChange.xlsx']
    
    if oldFileList != []:
        for file in oldFileList:
            os.remove(file)
    
    # Get Latest Month HK Date List
    latestDate = date.today() - relativedelta(days=1)
    dateList = getHKDateList(startDate=latestDate - relativedelta(months=1), endDate=latestDate, frequency='D')
    
    # Firefox Setting -> Headless
    options = Options()
    options.headless = True
    
    # Firefox Setting -> Change download path for this task
    profile = webdriver.FirefoxProfile()
    profile.set_preference("browser.download.folderList", 2)
    profile.set_preference("browser.download.manager.showWhenStarting", False)
    profile.set_preference("browser.download.dir", downloadPath)
    profile.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv")
    
    # Link to the website
    driver = webdriver.Firefox(firefox_profile=profile, options=options)
    driver.get('https://www.hsi.com.hk/eng/indexes/all-indexes/hsi')
    timeout = 20
    try:
        element_present = EC.presence_of_element_located((By.XPATH, "//input[@aria-label='Calendar input field']"))
        WebDriverWait(driver, timeout).until(element_present)
        time.sleep(5)
    except TimeoutException:
        print("Timed out waiting for page to load")
        
    ##### Download Last Date Constituents CSV #####
    
    # Select report type in daily report
    selectReport = Select(driver.find_element_by_id('reportType'))
    selectReport.select_by_visible_text('Constituent Daily Performance (csv format)')
    
    # Input date
    latestDate = dateList[-1]
    inputDate = latestDate.strftime('%d%m%Y') 
    datePicker = driver.find_element(By.XPATH, "//input[@aria-label='Calendar input field']")
    datePicker.send_keys(inputDate)

    # Click download button
    downloadButton = driver.find_element(By.XPATH, "//button[@class='commonBtn' and @href='javascript:void(0);']")
    downloadButton.click()
    
    # Rename file - it should take a few seconds to download the file
    time.sleep(3)
    filename = max([downloadPath + "/" + f for f in os.listdir(downloadPath)],key=os.path.getctime)
    shutil.move(filename,os.path.join(downloadPath,"hsiCons.csv"))
    
    ##### Download Constituents Change File #####
    
    # Input blank date to disable constituents download button
    datePicker.send_keys('')
    
    # Select input in historical change of constituents
    selectInput = Select(driver.find_element_by_id('constituentsSelect'))
    selectInput.select_by_visible_text('Hang Seng Index and Sub-indexes')
    
    # Click View button
    viewButton = driver.find_element(By.XPATH, "//button[@class='commonBtn' and @href='javascript:void(0);']")
    viewButton.click()
    
    # Rename file - it should take a few seconds to download the file
    time.sleep(3)
    filename = max([downloadPath + "/" + f for f in os.listdir(downloadPath)],key=os.path.getctime)
    shutil.move(filename,os.path.join(downloadPath,"hsiConsChange.xlsx"))
    
    driver.quit()
    return print('Finished Downloading Raw Files for Date: ' + startDate + ' - ' + endDate)

# Input

In [4]:
# downloadPath = "/Users/Your Download Path"
downloadPath = "/Users/kayinman2/Desktop/Github/HS Factors/HSI Cons"
startDate = "20090930"
endDate = "20190930"
frequency = "BM"

# Download Raw Files from Web

In [5]:
downloadRawFiles(downloadPath)

/Users/kayinman2/Desktop/Github/HS Factors/HSI Cons
Start downloading HSI files to above path.
Finished Downloading Raw Files for Date: 20090930 - 20190930


# Read Raw Files and Clean Data

#### HK Date List

In [6]:
hkDateList = getHKDateList(startDate=startDate, endDate=endDate, frequency=frequency)

In [7]:
hkDateList[:5]

[Timestamp('2009-09-30 00:00:00', freq='BM'),
 Timestamp('2009-10-30 00:00:00', freq='BM'),
 Timestamp('2009-11-30 00:00:00', freq='BM'),
 Timestamp('2009-12-31 00:00:00', freq='BM'),
 Timestamp('2010-01-29 00:00:00', freq='BM')]

#### HSI Constituents

In [8]:
hsiConsRaw = pd.read_csv(downloadPath + '/hsiCons.csv', low_memory=False, encoding="utf-16")

In [9]:
hsiConsRaw.head(3)

Unnamed: 0,"交易日\t""指數""\t""股份代號""\t""股份名稱""\t""股份名稱""\t""上市交易所""\t""行業""\t""交易貨幣""\t""收市價""\t""百分比變動""\t""對指數升跌影響""\t""比重 (%)""\t""於恒生金融分類指數比重 (%)""\t""於恒生公用事業分類指數比重 (%)""\t""於恒生地產分類指數比重 (%)""\t""於恒生工商業分類指數比重 (%)"""
0,"Trade Date\t""Index""\t""Stock Code""\t""Stock Name..."
1,"20191106\t""Hang Seng Index 恒生指數""\t""0001.HK""\t""..."
2,"20191106\t""Hang Seng Index 恒生指數""\t""0002.HK""\t""..."


In [10]:
def reconstructHSICons(hsiConsRaw):
    consDf = hsiConsRaw.iloc[:,0].str.split("\t", expand = True)
    consDf = consDf.apply(lambda x: x.str.replace('"', ''))
    consDf = consDf.apply(lambda x: x.str.strip())
    
    consDf.columns = consDf.iloc[0,:]
    consDf = consDf.iloc[1:,:]
    
    return consDf

In [11]:
hsiCons = reconstructHSICons(hsiConsRaw=hsiConsRaw)

In [12]:
hsiCons.head(3)

Unnamed: 0,Trade Date,Index,Stock Code,Stock Name,Stock Name.1,Exchange Listed,Industry,Trading Currency,Closing Price,% Change,Index Point Contribution,Weighting (%),Weighting in HSI - Finance (%),Weighting in HSI - Utilities (%),Weighting in HSI - Properties (%),Weighting in HSI - Commerce & Industry (%)
1,20191106,Hang Seng Index 恒生指數,0001.HK,CKH HOLDINGS,長和,Hong Kong 香港,80 Conglomerates 綜合企業,HKD,74.75,0.95,5.74,2.21,,,,6.34
2,20191106,Hang Seng Index 恒生指數,0002.HK,CLP HOLDINGS,中電控股,Hong Kong 香港,40 Utilities 公用事業,HKD,83.3,0.06,0.29,1.73,,36.14,,
3,20191106,Hang Seng Index 恒生指數,0003.HK,HK & CHINA GAS,香港中華煤氣,Hong Kong 香港,40 Utilities 公用事業,HKD,15.34,-0.65,-3.08,1.71,,35.67,,


In [13]:
def finalHSICons(hsiCons):
    finalHSICons = hsiCons[['Trade Date', 'Stock Code', 'Stock Name', 'Industry']].copy()  
    finalHSICons['Industry'] = finalHSICons['Industry'].apply(lambda x: x.split(' ')[1])
    finalHSICons['English Name'] = finalHSICons['Stock Name'].iloc[:,0]
    finalHSICons = finalHSICons.drop(columns='Stock Name')
    finalHSICons = finalHSICons.rename(columns={'English Name': 'Stock Name'})
    finalHSICons['Trade Date'] = finalHSICons['Trade Date'].astype('datetime64[ns]')
    return finalHSICons

In [14]:
finalHSICons = finalHSICons(hsiCons=hsiCons)

In [15]:
finalHSICons.head(3)

Unnamed: 0,Trade Date,Stock Code,Industry,Stock Name
1,2019-11-06,0001.HK,Conglomerates,CKH HOLDINGS
2,2019-11-06,0002.HK,Utilities,CLP HOLDINGS
3,2019-11-06,0003.HK,Utilities,HK & CHINA GAS


In [16]:
finalHSICons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 1 to 50
Data columns (total 4 columns):
Trade Date    50 non-null datetime64[ns]
Stock Code    50 non-null object
Industry      50 non-null object
Stock Name    50 non-null object
dtypes: datetime64[ns](1), object(3)
memory usage: 1.7+ KB


#### HSI Constituents Historical Change

In [17]:
consHistChg = pd.read_excel(downloadPath + '/hsiConsChange.xlsx')

In [18]:
consHistChg.head(7)

Unnamed: 0,Constituent Changes in Hang Seng Index,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,恒生指數之成份股變動,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,Effective Date 生效日期,No. of Constituents,Change 變動,Count 數目,Stock Code 股份代號,Listing Place 上市地點,Stock Name,股份名稱
4,,(After Change) 成份股數目 (變動後),,,,,,
5,2019-03-11,50,Add 加入,+1,669,Hong Kong 香港,TECHTRONIC IND,創科實業
6,2019-03-11,50,Delete 刪除,-1,836,Hong Kong 香港,CHINA RES POWER,華潤電力


In [19]:
def finalHistChgDf(consHistChg):
    histChgDf = consHistChg.iloc[3:,:]
    histChgDf.columns = histChgDf.iloc[0,:]
    histChgDf = histChgDf[[column for column in histChgDf.columns if 
                           'Effective Date' in column or 'Change' in column or 'Stock' in column]].copy()
    histChgDf.columns = ['Effective Date' if 'Effective Date' in column else 
                         'Change' if 'Change' in column else 
                         'Stock Code' if 'Stock Code' in column else 
                         'Stock Name' if 'Stock Name' in column else None for 
                         column in histChgDf.columns]
    histChgDf = histChgDf.iloc[2:,:]
    histChgDf = histChgDf.dropna(subset=['Stock Code'])
    histChgDf['Change'] = [1 if 'Add' in change else -1 if 'Delete' in change else np.nan 
                           for change in histChgDf['Change']]
    histChgDf['Effective Date'] = histChgDf['Effective Date'].astype('datetime64[ns]')
    histChgDf['Stock Code'] = [(str(code) + '.HK').rjust(7,str(0)) for code in histChgDf['Stock Code']]
    return histChgDf

In [20]:
histChgDf = finalHistChgDf(consHistChg=consHistChg)

In [21]:
histChgDf.head()

Unnamed: 0,Effective Date,Change,Stock Code,Stock Name
5,2019-03-11,1,0669.HK,TECHTRONIC IND
6,2019-03-11,-1,0836.HK,CHINA RES POWER
7,2018-09-10,1,1177.HK,SINO BIOPHARM
8,2018-09-10,1,2313.HK,SHENZHOU INTL
9,2018-09-10,-1,0023.HK,BANK OF E ASIA


# Generate universe from Date List and Cleaned Raw Files

In [22]:
def generateUniverse(hkDateList, finalHSICons, histChgDf):
    universeDfList = []
    latestUniverseDf = finalHSICons[['Stock Code', 'Trade Date']].copy().rename(columns={'Trade Date': 'Update Date'})
    reverseDateList = sorted(hkDateList, reverse=True)
    for date in reverseDateList:
        latestDate = pd.Timestamp(latestUniverseDf['Update Date'].drop_duplicates().values[0])
        thisPeriodChgDf = histChgDf[(histChgDf['Effective Date'] >= date) & 
                                    (histChgDf['Effective Date'] <= latestDate)].copy()
        if thisPeriodChgDf.empty == True:
            latestUniverseDf['Update Date'] = date
            thisAppendDf = latestUniverseDf.copy()
            universeDfList.append(thisAppendDf)
        else:
            dropList = [stockCode for stockCode, change in 
                        np.array(thisPeriodChgDf[['Stock Code', 'Change']]) if change == 1]
            addList = [stockCode for stockCode, change in 
                        np.array(thisPeriodChgDf[['Stock Code', 'Change']]) if change == -1]
            
            latestUniverseDf = latestUniverseDf[latestUniverseDf['Stock Code'].isin(dropList) == False].copy()
            latestUniverseDf['Update Date'] = date
            addDf = pd.DataFrame.from_dict({'Stock Code': addList})
            addDf['Update Date'] = date
            latestUniverseDf = pd.concat([latestUniverseDf, addDf])
            thisAppendDf = latestUniverseDf.copy()
            universeDfList.append(thisAppendDf)
    universeDf = pd.concat(universeDfList)
    return universeDf

In [23]:
universeDf = generateUniverse(hkDateList, finalHSICons, histChgDf)

In [24]:
universeDf.head()

Unnamed: 0,Stock Code,Update Date
1,0001.HK,2019-09-30
2,0002.HK,2019-09-30
3,0003.HK,2019-09-30
4,0005.HK,2019-09-30
5,0006.HK,2019-09-30


In [25]:
universeDf.tail()

Unnamed: 0,Stock Code,Update Date
0,0001.HK,2009-09-30
0,1199.HK,2009-09-30
0,0330.HK,2009-09-30
0,2600.HK,2009-09-30
0,2038.HK,2009-09-30


In [26]:
universeDf.to_csv('hsiUniverseDf.csv', index=False)