Hong Kong Monthly Digest of Statistics Helper
====

This notebook makes analysis of monthly digest of statistics from Hong Kong Census and Statistics Department easier.

Specifically, it:

    a) pulls the statistics from the 11 monthly CSVs, from all available monthly releases, into one dataset
    b) works out which statistics have a monthly frequency
    c) collects all the available monthly CSV datasets to make longer monthly analysis possible
    d) it grabs the latest edition to check if its figures match previous editions
    c) instantly shows the largest movements for the current month
    
Caveats:

    *it ignores figures that aren't monthly and datasets with percentages
    *statistics codes deeper than "class1" are excluded
    *statistics with a date range are also excluded
    *some datasets like imports from Vietnam and Philippines have no 2020 data



In [1]:
# a) pulls the statistics from the 11 monthly CSVs, from all available monthly releases, into one dataset

import requests
from bs4 import BeautifulSoup

page = "https://www.censtatd.gov.hk/hkstat/sub/sp140.jsp?productCode=B1010002"
req = requests.get(page)
soup = BeautifulSoup(req.content)
df = soup.find("table",id="df")
links = df.find_all("a")
csvs = set()
for link in links: # start from the second link to remove the duplicate
    if link.text == 'CSV Datasets':
        csvs.update([link.get('href')])
for link in links: # start from the second link to remove the duplicate
    if link.text == 'CSV Datasets':
        latest = link.get('href').replace("/fd.jsp?file=","").replace(".zip&product_id=B1010002&lang=1","")
        print(latest)
        break

import datetime
import zipfile

# create folders for today's downloads
today = datetime.datetime.now().strftime("%Y-%m-%d") 
!mkdir download
!mkdir data
!mkdir download/$today
!mkdir data/$today

def download_url(url, save_path, chunk_size=128):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)

for csv in csvs:
    name = csv.replace("/fd.jsp?file=","").replace(".zip&product_id=B1010002&lang=1","")
    download_url("https://www.censtatd.gov.hk"+csv,"download/"+today+"/"+name+".zip")
    
    with zipfile.ZipFile("download/"+today+"/"+name+".zip", 'r') as zip_ref:
        zip_ref.extractall("data/"+today+"/"+name)


B10100022020MM12B
mkdir: download: File exists
mkdir: data: File exists
mkdir: download/2021-01-03: File exists
mkdir: data/2021-01-03: File exists


In [144]:
# start here if you downloaded the files earlier today
import datetime
import pandas as pd

today = datetime.datetime.now().strftime("%Y-%m-%d") 

folders = !cd data/$today;ls

import pandas as pd
bigdf = pd.DataFrame()
bigdesc = pd.DataFrame()

for folder in folders:
    files = !cd data/$today/$folder;ls
    dfiles = []
    for file in files:
        if "master" in file:
            dfiles.append(file)
    for file in dfiles:
        #print(folder,file)
        df = pd.read_csv("data/%s/%s/%s"%(today,folder,file))
        df['file'] = file
        df['month'] = folder
        bigdf = bigdf.append(df)
        bigdf['period'] = bigdf['period'].astype(str)
        desc = pd.read_csv("data/%s/%s/%s"%(today,folder,file.replace("master","description")))
        desc['file'] = file
        desc['month'] = folder
        bigdesc = bigdesc.append(desc)

bigdesc = bigdesc.reset_index()
bigdesc = bigdesc.drop_duplicates(subset='code')
bigdesc = bigdesc.set_index('code')['desc_e']
descdict = bigdesc.to_dict()

month = bigdf[bigdf['period'].isin(['01','02','03','04','05','06','07','08','09','10','11','12'])]
month = month[month['year'].notnull()]
check = pd.DataFrame(month.groupby(['stat','class1_code','class2_code','class3_code','class4_code','file'],dropna=False).size())
checked = check[check[0]==15]
checked.columns = ['count']
checked = checked.reset_index()
mstats = checked['stat'].unique()

month = month.sort_values('file',ascending=False)
month = month[month['stat'].isin(mstats)]
month = month.drop_duplicates(subset=['stat','year','period','class1_code','class2_code','class3_code','class4_code'])
month['indexcol'] = pd.to_datetime(month['year'].astype(int).astype(str) + ['-']*len(month) + month['period'].astype(int).astype(str) + ['-01']*len(month))
month = month.set_index('indexcol')

In [None]:
!mkdir charts
!mkdir charts/$today
!mkdir charts/$today/1mth
!mkdir charts/$today/12mth
!mkdir charts/$today/24mth


import numpy as np
import matplotlib.pyplot as plt
for i,(label,group) in enumerate(month.groupby(['stat','class1_code','class2_code','class3_code','class4_code'],dropna=False)):
    neatlabel = ""
    for part in label:
        if pd.isnull(part) == True:
            pass
        else:
            neatlabel+=descdict[part.replace("Conttype_01","conttype_01")\
                                .replace("Conttype_02","conttype_02")\
                               .replace("Conttype_tt","conttype_tt")]+"\n"
    neatlabel = neatlabel[:-1].replace("/","|")
    
    if "%" not in neatlabel:
        if "Percentage" not in neatlabel:
            print(neatlabel)
            fig = plt.figure()
            #display(label,group[['year','period','figure','file']].values)
            group['indexcol'] = pd.to_datetime(group['year'].astype(int).astype(str) + ['-']*len(group) + group['period'] + ['-01']*len(group))
            group = group.set_index('indexcol')
            group = group.replace("‡",np.nan)
            group = group.replace("〜",np.nan)
            group = group.replace("***",np.nan)
            group = group.replace("N.A.",np.nan)
            group = group.replace("§",0)
            group = group.replace("+§",0)
            group = group.replace("-§",0)
            group = group.sort_index()[['figure']]
            group['figure'] = group['figure'].astype(float,errors='ignore')
            group['3mth rav'] = group['figure'].rolling(3).mean()
            group['mth ch'] = group['figure'].diff(1)
            group['mth ch%'] = (group['mth ch'])/group['figure'].shift(1)*100
            group['12mth ch'] = group['figure'].diff(12)
            group['12mth ch%'] = (group['12mth ch'])/group['figure'].shift(12)*100
            group['24mth ch'] = group['figure'].diff(24)
            group['24mth ch%'] = (group['24mth ch'])/group['figure'].shift(24)*100
            if " " in neatlabel:
                #display(label,group)
                if len(group)>0:
                    try:
                        if int(group[-1:]['mth ch%'].values[0])<0:
                            change = str(int(group[-1:]['mth ch%'].values[0])).zfill(3)
                        else:
                            change = str(int(group[-1:]['mth ch%'].values[0])).zfill(2)
                        if int(group[-1:]['12mth ch%'].values[0])<0:
                            change12 = str(int(group[-1:]['12mth ch%'].values[0])).zfill(3)
                        else:
                            change12 = str(int(group[-1:]['12mth ch%'].values[0])).zfill(2)
                        if int(group[-1:]['24mth ch%'].values[0])<0:
                            change24 = str(int(group[-1:]['24mth ch%'].values[0])).zfill(3)
                        else:
                            change24 = str(int(group[-1:]['24mth ch%'].values[0])).zfill(2)

                        group['figure'].plot(title=neatlabel)
                        plt.tight_layout()
                        plt.savefig("charts/"+today+"/1mth/"+change+"pp | "+neatlabel+".png")
                        plt.savefig("charts/"+today+"/12mth/"+change12+"pp | "+neatlabel+".png")
                        plt.savefig("charts/"+today+"/24mth/"+change24+"pp | "+neatlabel+".png")
                        plt.close(fig)
                    except Exception as e:
                        print('ERROR',e)
        if i == 10:
            pass

mkdir: charts: File exists
Registered births
Male
Registered births
Female
Registered births
Total
Registered deaths (note 1)(note 2)
Male
Registered deaths (note 1)(note 2)
Female
Registered deaths (note 1)(note 2)
Total
Registered marriages (note 1)
Average daily wages of workers engaged in Public Sector Construction Projects [$]
Concretor
Average daily wages of workers engaged in Public Sector Construction Projects [$]
Bricklayer
Average daily wages of workers engaged in Public Sector Construction Projects [$]
Drainlayer
Average daily wages of workers engaged in Public Sector Construction Projects [$]
Mason
Average daily wages of workers engaged in Public Sector Construction Projects [$]
Bar bender and fixer
Average daily wages of workers engaged in Public Sector Construction Projects [$]
Metal worker
Average daily wages of workers engaged in Public Sector Construction Projects [$]
General welder
Average daily wages of workers engaged in Public Sector Construction Projects [$]
Struc

Composite Consumer Price Index (October 2014 to September 2015=100)
Miscellaneous services (note 3)
Composite Consumer Price Index (October 2014 to September 2015=100)
Miscellaneous services (note 3) - Educational services
Composite Consumer Price Index (October 2014 to September 2015=100)
Miscellaneous services (note 3) - Information and communications services
Composite Consumer Price Index (October 2014 to September 2015=100)
Miscellaneous services (note 3) - Medical services
Consumer Price Index (A) (October 2014 to September 2015=100)
All items
Consumer Price Index (A) (October 2014 to September 2015=100)
Food
Consumer Price Index (A) (October 2014 to September 2015=100)
Food - Meals bought away from home
Consumer Price Index (A) (October 2014 to September 2015=100)
Food - Food, excluding meals bought away from home
Consumer Price Index (A) (October 2014 to September 2015=100)
Housing (note 1)
Consumer Price Index (A) (October 2014 to September 2015=100)
Housing (note 1) - Private