# Data Extraction From PDF

## 1. Check Java Version

In [1]:
# Print the installed Java runtime version. tabula.environment_info() further
# down reports the same Java details, and the PDF parsing step logs Java
# (org.apache.pdfbox) messages — presumably tabula-py needs Java; TODO confirm.
!java -version

openjdk version "1.8.0_222"
OpenJDK Runtime Environment (Zulu 8.40.0.25-CA-linux64) (build 1.8.0_222-b10)
OpenJDK 64-Bit Server VM (Zulu 8.40.0.25-CA-linux64) (build 25.222-b10, mixed mode)


## 2. Install and import libraries

In [2]:
!pip install -q tabula-py

You should consider upgrading via the 'pip install --upgrade pip' command.


In [3]:
import tabula

tabula.environment_info()

Python version:
    3.6.9 (default, Aug 14 2019, 12:46:29) 
[GCC 8.3.0]
Java version:
    openjdk version "1.8.0_222"
OpenJDK Runtime Environment (Zulu 8.40.0.25-CA-linux64) (build 1.8.0_222-b10)
OpenJDK 64-Bit Server VM (Zulu 8.40.0.25-CA-linux64) (build 25.222-b10, mixed mode)
tabula-py version: 2.1.1
platform: Linux-4.15.0-1077-aws-x86_64-with-debian-10.0
uname:
    uname_result(system='Linux', node='4220029f5519', release='4.15.0-1077-aws', version='#81-Ubuntu SMP Wed Jun 24 16:48:15 UTC 2020', machine='x86_64', processor='')
linux_distribution: ('Debian GNU/Linux', '10', 'buster')
mac_ver: ('', ('', '', ''), '')
    


In [4]:
import pandas as pd

## 3. Helper functions for this notebook

In [5]:
def extract_hk_situation_en_new_df(pdf=None, pdf_all=None):
  """Collect the "new cases" tables of the situation PDF into one DataFrame.

  The new-case tables are taken to be the leading run of parsed tables that
  have exactly 8 columns and whose first header is 'Case no.' (the header is
  also accepted with a carriage return in place of the space). Scanning stops
  at the first table that does not match.

  Args:
    pdf: PDF input passed straight to ``tabula.read_pdf``. Only used when
      ``pdf_all`` is None.
    pdf_all: Already parsed list of pandas DataFrames (one per table), e.g.
      the result of ``tabula.read_pdf(pdf, pages="all", lattice=True)``.

  Returns:
    pandas.DataFrame with the 8 human-readable column names. Each page keeps
    its own row index, so index labels repeat across pages. When no table
    matches, an empty DataFrame with those 8 columns is returned.

  Raises:
    ValueError: if both ``pdf`` and ``pdf_all`` are None.
  """
  human_columns_new = ['Case no.', 'Report date', 'Date of onset', 'Gender', 'Age', 'HK/Non-HK resident', 'Case classification', 'Confirmed/probable']
  first_header_variants = ('Case no.', 'Case\rno.')

  if pdf_all is None:
    if pdf is None:
      raise ValueError("Either pdf or pdf_all must be provided")
    # Elements are pandas.core.frame.DataFrame
    pdf_all = tabula.read_pdf(pdf, pages="all", lattice=True)

  new_pages = []
  for page in pdf_all:
    is_new_page = (page.shape[1] == len(human_columns_new)) and (page.columns[0] in first_header_variants)
    if not is_new_page:
      break
    # Rename positionally per page *before* concatenating: header text may
    # differ between pages (both variants above are accepted), and frames with
    # different labels would otherwise be concatenated into extra NaN-filled
    # columns. Work on a copy so the caller's frames are left untouched.
    # Ref: https://stackoverflow.com/a/44296858
    renamed_page = page.copy()
    renamed_page.columns = human_columns_new
    new_pages.append(renamed_page)
  print("The first %s pages are new cases" % len(new_pages))

  if new_pages:
    df_new_only = pd.concat(new_pages)
  else:
    # pd.concat raises on an empty list; return an empty, well-formed frame.
    df_new_only = pd.DataFrame(columns=human_columns_new)
  print("There are %s new cases" % df_new_only.shape[0])
  return df_new_only

In [6]:
def extract_hk_situation_en_old_df(pdf=None, pdf_all=None):
  """Collect the "old cases" tables of the situation PDF into one DataFrame.

  Every parsed table that has exactly 9 columns and whose first header is
  'Case no.' (also accepted with a carriage return in place of the space) is
  treated as part of the old-case list, wherever it appears in the document.

  Args:
    pdf: PDF input passed straight to ``tabula.read_pdf``. Only used when
      ``pdf_all`` is None.
    pdf_all: Already parsed list of pandas DataFrames (one per table), e.g.
      the result of ``tabula.read_pdf(pdf, pages="all", lattice=True)``.

  Returns:
    pandas.DataFrame with the 9 human-readable column names. Each page keeps
    its own row index, so index labels repeat across pages. When no table
    matches, an empty DataFrame with those 9 columns is returned.

  Raises:
    ValueError: if both ``pdf`` and ``pdf_all`` are None.
  """
  human_columns_full = ['Case no.', 'Report date', 'Date of onset', 'Gender', 'Age', 'Hospitalised/Discharged/Deceased', 'HK/Non-HK resident', 'Case classification', 'Confirmed/probable']
  first_header_variants = ('Case no.', 'Case\rno.')

  if pdf_all is None:
    if pdf is None:
      raise ValueError("Either pdf or pdf_all must be provided")
    pdf_all = tabula.read_pdf(pdf, pages="all", lattice=True)

  old_pages = []
  for page in pdf_all:
    if (page.shape[1] == len(human_columns_full)) and (page.columns[0] in first_header_variants):
      # Rename positionally per page *before* concatenating: header text may
      # differ between pages (both variants above are accepted), and frames
      # with different labels would otherwise be concatenated into extra
      # NaN-filled columns. Work on a copy so the caller's frames stay intact.
      # Ref: https://stackoverflow.com/a/44296858
      renamed_page = page.copy()
      renamed_page.columns = human_columns_full
      old_pages.append(renamed_page)

  print("The old list is %s pages long" % len(old_pages))

  if old_pages:
    df_old_only = pd.concat(old_pages)
  else:
    # pd.concat raises on an empty list; return an empty, well-formed frame.
    df_old_only = pd.DataFrame(columns=human_columns_full)
  print("There are %s old cases" % df_old_only.shape[0])

  return df_old_only

In [7]:
## 4. Data Extraction

### 4.1 Obtain the data in PDF

In [8]:
!curl -O https://raw.githubusercontent.com/hkkenneth/covid-19-hk-data/2020-08-14/2020-08-14/local_situation_covid19_en.pdf

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 9536k  100 9536k    0     0  53.2M      0 --:--:-- --:--:-- --:--:-- 53.2M


In [9]:
### 4.2 Use Tabula to extract data

In [10]:
pdf = 'local_situation_covid19_en.pdf'

In [11]:
# Parse every page once and keep the resulting list of tables; both extractor
# functions below receive it via pdf_all so the PDF is not re-read per call.
pdf_all = tabula.read_pdf(pdf, pages='all', lattice=True)

Got stderr: Aug 15, 2020 3:19:42 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider loadDiskCache
Aug 15, 2020 3:19:42 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Aug 15, 2020 3:19:42 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Aug 15, 2020 3:19:42 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Aug 15, 2020 3:19:42 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Aug 15, 2020 3:19:42 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Aug 15, 2020 3:19:43 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Aug 15, 2020 3:19:43 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font TimesNewRomanPSMT are not implemented in PDFBox and will be ignored



In [12]:
hk_situation_en_new_df = extract_hk_situation_en_new_df(pdf_all=pdf_all)

The first 3 pages are new cases
There are 48 new cases


In [13]:
hk_situation_en_old_df = extract_hk_situation_en_old_df(pdf_all=pdf_all)

The old list is 180 pages long
There are 4313 old cases


## 5. Sanity Check

In [14]:
hk_situation_en_new_df.shape

(48, 8)

In [15]:
hk_situation_en_new_df.head()

Unnamed: 0,Case no.,Report date,Date of onset,Gender,Age,HK/Non-HK resident,Case classification,Confirmed/probable
0,4314,14/08/2020,Asymptomatic,M,31,Unknown,Imported case,Confirmed
1,4315,14/08/2020,Asymptomatic,F,13,HK Resident,Epidemiologically linked with local case,Confirmed
2,4316,14/08/2020,03/08/2020,F,38,HK Resident,Local case,Confirmed
3,4317,14/08/2020,Asymptomatic,M,62,HK Resident,Epidemiologically linked with local case,Confirmed
4,4318,14/08/2020,31/07/2020,M,47,HK Resident,Local case,Confirmed


In [16]:
hk_situation_en_new_df['Gender'].value_counts()

F    24
M    24
Name: Gender, dtype: int64

In [17]:
hk_situation_en_new_df['HK/Non-HK resident'].value_counts()

HK Resident    47
Unknown         1
Name: HK/Non-HK resident, dtype: int64

In [18]:
hk_situation_en_new_df['Case classification'].value_counts()

Epidemiologically linked with local case    35
Local case                                  11
Imported case                                2
Name: Case classification, dtype: int64

In [19]:
hk_situation_en_new_df['Confirmed/probable'].value_counts()

Confirmed    48
Name: Confirmed/probable, dtype: int64

In [20]:
hk_situation_en_old_df.shape

(4313, 9)

In [21]:
hk_situation_en_old_df.head()

Unnamed: 0,Case no.,Report date,Date of onset,Gender,Age,Hospitalised/Discharged/Deceased,HK/Non-HK resident,Case classification,Confirmed/probable
0,1,23/01/2020,21/01/2020,M,39,Discharged,Non-HK resident,I,Confirmed
1,2,23/01/2020,18/01/2020,M,56,Discharged,HK resident,I,Confirmed
2,3,24/01/2020,20/01/2020,F,62,Discharged,Non-HK resident,I,Confirmed
3,4,24/01/2020,23/01/2020,F,62,Discharged,Non-HK resident,I,Confirmed
4,5,24/01/2020,23/01/2020,M,63,Discharged,Non-HK resident,I,Confirmed


In [22]:
hk_situation_en_old_df['Gender'].value_counts()

F    2171
M    2142
Name: Gender, dtype: int64

In [23]:
hk_situation_en_old_df['Hospitalised/Discharged/Deceased'].value_counts()

Discharged      3392
Hospitalised     829
Deceased          66
No admission      26
Name: Hospitalised/Discharged/Deceased, dtype: int64

In [24]:
# NOTE: the counts show the same category under different capitalisation
# ('HK Resident' vs 'HK resident', 'Non-HK resident' vs 'Non-HK Resident');
# normalise the case before aggregating on this column.
hk_situation_en_old_df['HK/Non-HK resident'].value_counts()

HK Resident        2817
HK resident        1377
Unknown              98
Non-HK resident      20
Non-HK Resident       1
Name: HK/Non-HK resident, dtype: int64

In [25]:
# NOTE: the old list uses short codes in this column ('Epi-L', 'L', 'I', ...)
# whereas the new list uses full text ('Local case', 'Imported case', ...).
# The two need a mapping before they can be combined; the meaning of each
# code is not defined in this notebook — TODO confirm against the PDF legend.
hk_situation_en_old_df['Case classification'].value_counts()

Epi-L     1844
L         1189
I         1084
PL         103
Epi-PL      62
Epi-I       31
Name: Case classification, dtype: int64

In [26]:
hk_situation_en_old_df['Confirmed/probable'].value_counts()

Confirmed    4312
Probable        1
Name: Confirmed/probable, dtype: int64