### This notebook covers data scraping, cleaning, processing and updating of the [Public Places Visited by Singapore Covid-19 Cases dataset](https://data.world/hxchua/public-places-visited-in-singapore-by-covid-19-cases).

It updates the dataset by comparing the previous day's PDF with the current day's, and adding the newly added places to the existing dataset. 

Steps to run:
1. Update date (This day is used to compare with the previous day to find newly added places) 

Additional steps if you want to update a Google Sheet:
1. Start a Google Cloud Platform Project. Get/download the service account credentials, copy the downloaded json into the same directory as this file and update the `KEYS_FILE` variable with the json name.
2. Update Config for Google Sheets
3. Set `update_google_sheets` to True

## !!! CHANGE THIS !!!

In [1]:
date = "25 Jun 2021"
update_google_sheets = False # if True, will update Google Sheet directly

### Config for Google Sheets
(Only required if update_google_sheets is True)

In [2]:
SPREADSHEET_ID = '1KIyCmEaum6YBzlri4vc-aG1EnLhowqv2MelaqtPudVg'
KEYS_FILE = 'project-gsheets-c30a9f47e838.json'
SHEET_NAME = "Sheet1"
SCOPES = ['https://www.googleapis.com/auth/spreadsheets',
          'https://www.googleapis.com/auth/drive'] # If modifying these scopes, delete the file token.pickle.

### More options

In [3]:
landing_page_url = 'https://www.moh.gov.sg/covid-19/past-updates'
date_tag = "span"
secondary_link_string_to_find = "New Cases of Locally Transmitted COVID-19 Infection"
pdf_string_to_find = "Annexes"
pdf_table_loc_string_to_find = "Location (Address)"
table_column_index_to_select = [0,3,6]
raw_table_header = ['Date', 'Time', 'Location (Address)']
year = '2021'
data_dot_world_url = 'https://query.data.world/s/7xnvegtdm5t2zx4rvqmqwk3eaizs6m' # raw data to append new places to

## Import required packages

In [4]:
import sys
from bs4 import BeautifulSoup
import requests
import urllib3
import io
import pandas as pd
from datetime import datetime, timedelta
import re
from google.oauth2.service_account import Credentials

try: 
    import gspread
except ModuleNotFoundError:
    !{sys.executable} -m pip install gspread
    import gspread
    
try: 
    import pdfplumber
except ModuleNotFoundError:
    !{sys.executable} -m pip install pdfplumber    
    import pdfplumber

## Web Scraping

In [5]:
def get_secondary_page_url(date_string):
    landing_page = requests.get(landing_page_url)
    landing_page_soup = BeautifulSoup(landing_page.text, "html.parser")
    
    for linebreak in landing_page_soup.find_all('br'):
        linebreak.extract()
        
    date_elems_list = landing_page_soup.find_all(date_tag, text=re.compile('^.*{}.*$'.format(date_string)))
    
    entries_list = [elem.find_parent("tr") for elem in date_elems_list if secondary_link_string_to_find in str(elem.find_parent("tr"))]
    
    if len(entries_list) < 2:
        return None
    
#     if len(entries_list) >= 2:
    entry = entries_list[0]

    secondary_page_url = entry.find(href=True).get("href")
    print(secondary_page_url)
    
    return secondary_page_url.strip()

In [6]:
def get_pdf_link(date_string):
    secondary_page_url = get_secondary_page_url(date_string)
    
    if not secondary_page_url:
        print('Unable to get secondary page url for {}'.format(date_string))
        return None
    
    secondary_page = requests.get(secondary_page_url)
    secondary_page_soup = BeautifulSoup(secondary_page.text, "html.parser")

    pdf_link = secondary_page_soup.find("a", string = pdf_string_to_find).get("href")
    print(pdf_link)
    
    return pdf_link.strip()

In [7]:
# https://stackoverflow.com/questions/62075033/read-pdf-from-url-to-memory-omitting-saving-file-to-local-file

def get_raw_data(date_string):
    pdf_link = get_pdf_link(date_string)
        
    if not pdf_link:
        print('Unable to get pdf link for {}.'.format(date_string))
        return None, None
        
    http = urllib3.PoolManager()
    temp = io.BytesIO()
    temp.write(http.request("GET", pdf_link).data)

    raw_df = pd.DataFrame()

    with pdfplumber.open(temp) as pdf: # pdf = pdfplumber.open(temp) 
        for pdf_page in pdf.pages:
            table = pdf_page.extract_table()
            if not table or pdf_table_loc_string_to_find not in table[1]:
                continue
            raw_df = raw_df.append(pd.DataFrame(table[3:]), ignore_index=True) 

    raw_df = raw_df.iloc[:, table_column_index_to_select].dropna()
    
    page_break_problematic_rows_idx = raw_df.index[raw_df[0]=='']
    col_iloc = 2
    
    for row_idx in page_break_problematic_rows_idx:
        value = raw_df.iloc[row_idx - 1, col_iloc] + raw_df.iloc[row_idx, col_iloc]
        raw_df.iloc[row_idx - 1, col_iloc] = value
    
    raw_df = raw_df.drop(page_break_problematic_rows_idx).reset_index(drop = True)
    
    raw_df.columns = raw_table_header
    return raw_df, pdf_link

## Data Cleaning

In [8]:
# https://github.com/hxchua/datadoubleconfirm/blob/master/notebooks/covid_public_places.py
def clean_data(raw_df, pdf_link):
    df = raw_df.copy(deep=True)
    df['Location (Address)'] = df['Location (Address)'].str.replace('\n','•')
    df['Location (Address)'] = df['Location (Address)'].str.replace('','•')
    df['Location (Address)'] = df['Location (Address)'].str.replace('\n•','•')
    df[['Location','Sub-location']] = df['Location (Address)'].str.split('•',expand=True)
    df['Location'] = df['Location'].str.replace('\n',' ')
    df['Location'] = df['Location'].str.title()
    df['Location'] = df['Location'].str.strip()
    df['Sub-location'] = df['Sub-location'].str.replace('\n',', ')
    df['Sub-location'] = df['Sub-location'].str.strip()
    df['Sub-location'] = df['Sub-location'].str.title()
    df['Notes'] = "Added/Updated on " + datetime.today().strftime('%Y-%m-%d')
    df['Source'] = pdf_link
    df['Time'] = df['Time'].str.replace('to','-')

    df['Date'] = df['Date'].str.replace(r'\s? to \d+', '', regex=True)
    df['Date'] = df['Date'].str.replace('\n','')
    df['Date'] = (df['Date']+' {}'.format(year)).astype(str) # TODO
    df['Date'] = df['Date'].str.replace(' ','-')
    df['Date'] = pd.to_datetime(df['Date'],format='%d-%b-%Y')

    df = df[['Date','Time','Location','Sub-location','Source','Notes']]

    return df

## Compare and get new data

In [9]:
prev_date = (datetime.strptime(date, "%d %b %Y").date() - timedelta(days=1)).strftime("%d %b %Y")

new_raw_data, new_pdf_link = get_raw_data(date)
old_raw_data, old_pdf_link = get_raw_data(prev_date)

if not new_pdf_link or not old_pdf_link:
    print('Data for the day is not available (yet).')
    #return None
    
new_clean_data = clean_data(new_raw_data, new_pdf_link)
old_clean_data = clean_data(old_raw_data, old_pdf_link)

https://www.moh.gov.sg/news-highlights/details/15-new-cases-of-locally-transmitted-covid-19-infection_25JunUpdate
https://www.moh.gov.sg/docs/librariesprovider5/pressroom/annexes-25-jun.pdf?sfvrsn=b0c3657d_2
https://www.moh.gov.sg/news-highlights/details/14-new-cases-of-locally-transmitted-covid-19-infection24june
https://www.moh.gov.sg/docs/librariesprovider5/default-document-library/annexesa25fa689b5a8454fae0f5c8d795cca2f.pdf?sfvrsn=4bf7e3e9_0


In [10]:
combined_df = new_clean_data.append(old_clean_data)
start_date = str(datetime.strptime(date, "%d %b %Y").date() - timedelta(days = 13))

new_fourteen_days_df = combined_df.drop_duplicates(subset=['Date', 'Time', 'Location', 'Sub-location'], keep = False, inplace = False)
new_entries_df = new_fourteen_days_df[new_fourteen_days_df['Date'] >= start_date].sort_values(by=['Time'], ascending=True).sort_values(by=['Date'], ascending=False).reset_index(drop=True)

new_entries_df

Unnamed: 0,Date,Time,Location,Sub-location,Source,Notes
0,2021-06-24,1400h - 1430h,Ntuc Fairprice (166 Bukit Merah Central),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
1,2021-06-23,1200h - 1230h,Depot Heights Shopping Centre (108 Depot Road),Yue Hua Food Court,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
2,2021-06-23,1425h - 1540h,Sheng Siong Supermarket (19 Serangoon North Av...,,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
3,2021-06-22,2015h - 2200h,Courts Megastore (50 Tampines North Drive 2),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
4,2021-06-22,1820h - 1850h,Tiong Bahru Plaza (302 Tiong Bahru Road),Ntuc Fairprice,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
5,2021-06-22,1345h - 1510h,Hotel 81 Selegie (161 Selegie Road),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
6,2021-06-22,1750h - 2020h,Ikea Tampines (60 Tampines North Drive 2),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
7,2021-06-21,0800h - 0955h,Eastpoint Mall (3 Simei Street 6),Ntuc Fairprice,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
8,2021-06-20,0810h - 0900h,Eastpoint Mall (3 Simei Street 6),Ntuc Fairprice,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
9,2021-06-20,1820h - 1855h,Hotel 81 Selegie (161 Selegie Road),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26


## Combine with existing data

In [11]:
past_data_df = pd.read_excel(data_dot_world_url)

In [12]:
def get_updated_df(past_data_df, new_entries_df):
    new_data_df = past_data_df.append(new_entries_df)
    
    new_data_df['Date'] = pd.to_datetime(new_data_df['Date'],format='%Y-%m-%d')
    new_data_df['Location'] = new_data_df['Location'].str.title()
    new_data_df['Sub-location'] = new_data_df['Sub-location'].str.title()
    new_data_df = new_data_df.sort_values(by=['Time'], ascending=True).sort_values(by=['Date'], ascending=False).reset_index(drop=True)
    
    counts_df = new_data_df.groupby(['Date', 'Time', 'Location', 'Sub-location']).size().reset_index(name="counts")

    if len(counts_df[counts_df['counts'] > 1]) > 0:
        print('There are no new additions.')
        return past_data_df
        
    return new_data_df

In [13]:
updated_df = get_updated_df(past_data_df, new_entries_df)
updated_df.head(10)

There are no new additions.


Unnamed: 0,Date,Time,Location,Sub-location,Source,Notes
0,2021-06-24,1400h - 1430h,Ntuc Fairprice (166 Bukit Merah Central),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
1,2021-06-23,1200h - 1230h,Depot Heights Shopping Centre (108 Depot Road),Yue Hua Food Court,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
2,2021-06-23,1425h - 1540h,Sheng Siong Supermarket (19 Serangoon North Av...,,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
3,2021-06-22,1905h - 1940h,Plaza Singapura (68 Orchard Road),Kopitiam,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-25
4,2021-06-22,1345h - 1510h,Hotel 81 Selegie (161 Selegie Road),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
5,2021-06-22,2010h - 2100h,Lucky Plaza (304 Orchard Road),J Star Mobile Electronic,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-25
6,2021-06-22,2145h - 2225h,Orchard Central (181 Orchard Road),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-25
7,2021-06-22,1750h - 2020h,Ikea Tampines (60 Tampines North Drive 2),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26
8,2021-06-22,2110h - 2145h,313@Somerset (313 Orchard Road),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-25
9,2021-06-22,2015h - 2200h,Courts Megastore (50 Tampines North Drive 2),,https://www.moh.gov.sg/docs/librariesprovider5...,Added/Updated on 2021-06-26


## Push new data to Google sheet

In [14]:
if update_google_sheets:

    credentials = Credentials.from_service_account_file(KEYS_FILE, scopes=SCOPES)
    gc = gspread.authorize(credentials)

    ws = gc.open_by_key(SPREADSHEET_ID).worksheet(SHEET_NAME)

    updated_df['Date'] = updated_df['Date'].astype(str)
    updated_df.fillna('', inplace=True)

    ws.update([updated_df.columns.values.tolist()] + updated_df.values.tolist())