# Data Extraction From PDF

## 1. Check Java version

In [1]:
!java -version

openjdk version "11.0.8" 2020-07-14
OpenJDK Runtime Environment (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)


## 2. Install and import libraries

In [2]:
!pip install -q tabula-py

[K     |████████████████████████████████| 10.4MB 2.8MB/s 
[?25h

In [4]:
import tabula

tabula.environment_info()

Python version:
    3.6.9 (default, Jul 17 2020, 12:50:27) 
[GCC 8.4.0]
Java version:
    openjdk version "11.0.8" 2020-07-14
OpenJDK Runtime Environment (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)
tabula-py version: 2.1.1
platform: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic
uname:
    uname_result(system='Linux', node='e9d7091df8e1', release='4.19.112+', version='#1 SMP Thu Jul 23 08:00:38 PDT 2020', machine='x86_64', processor='x86_64')
linux_distribution: ('Ubuntu', '18.04', 'bionic')
mac_ver: ('', ('', '', ''), '')
    


In [5]:
import pandas as pd

## 3. Helper functions for this notebook

In [25]:
def extract_hk_situation_en_new_df(pdf=None, pdf_all=None):
  """Build a single DataFrame of the newly reported cases.

  The new-case tables are taken to be the leading run of extracted tables
  that have 8 columns and whose first header is the case-number header
  (either on one line or wrapped with a carriage return).

  Args:
    pdf: Path handed to ``tabula.read_pdf``; only used when ``pdf_all`` is None.
    pdf_all: Already-extracted tables (a sequence of pandas DataFrames, one per
      table). Pass this to avoid parsing the PDF again.

  Returns:
    A DataFrame with the 8 human-readable column names. Row indices are kept
    as they were within each source table. If no leading table matches, an
    empty DataFrame with those 8 columns is returned.
  """
  human_columns_new = ['Case no.', 'Report date', 'Date of onset', 'Gender', 'Age', 'HK/Non-HK resident', 'Case classification', 'Confirmed/probable']
  # The first header is extracted either on one line or wrapped with a carriage return.
  case_no_headers = ('Case no.', 'Case\rno.')

  # Elements are pandas.core.frame.DataFrame
  if pdf_all is None:
    pdf_all = tabula.read_pdf(pdf, pages="all", lattice=True)

  # Count the leading tables that look like new-case tables; stop at the first one that does not.
  guess_new_end = 0
  for page_df in pdf_all:
    if page_df.shape[1] == len(human_columns_new) and page_df.columns[0] in case_no_headers:
      guess_new_end += 1
    else:
      break
  print("The first %s pages are new cases" % guess_new_end)

  if guess_new_end == 0:
    # pd.concat raises on an empty list; return an empty frame with the expected schema instead.
    df_new_only = pd.DataFrame(columns=human_columns_new)
  else:
    # Normalise the headers page by page *before* concatenating: pages whose
    # headers differ (e.g. 'Case no.' vs 'Case\rno.') would otherwise be
    # aligned by name and end up in separate, NaN-padded columns.
    pages = []
    for page_df in pdf_all[:guess_new_end]:
      page_df = page_df.copy()  # do not rename the caller's tables in place
      page_df.columns = human_columns_new
      pages.append(page_df)
    df_new_only = pd.concat(pages)
  print("There are %s new cases" % df_new_only.shape[0])

  return df_new_only

In [26]:
def extract_hk_situation_en_old_df(pdf=None, pdf_all=None):
  """Build a single DataFrame of the previously reported ("old") cases.

  Every extracted table that has 9 columns and whose first header is the
  case-number header (either on one line or wrapped with a carriage return)
  is treated as part of the old-case list, wherever it appears.

  Args:
    pdf: Path handed to ``tabula.read_pdf``; only used when ``pdf_all`` is None.
    pdf_all: Already-extracted tables (a sequence of pandas DataFrames, one per
      table). Pass this to avoid parsing the PDF again.

  Returns:
    A DataFrame with the 9 human-readable column names. Row indices are kept
    as they were within each source table. If no table matches, an empty
    DataFrame with those 9 columns is returned.
  """
  human_columns_full = ['Case no.', 'Report date', 'Date of onset', 'Gender', 'Age', 'Hospitalised/Discharged/Deceased', 'HK/Non-HK resident', 'Case classification', 'Confirmed/probable']
  # The first header is extracted either on one line or wrapped with a carriage return.
  case_no_headers = ('Case no.', 'Case\rno.')

  if pdf_all is None:
    pdf_all = tabula.read_pdf(pdf, pages="all", lattice=True)

  # --- old: positions of all tables that look like old-case tables
  guess_old = [
    i for i, page_df in enumerate(pdf_all)
    if page_df.shape[1] == len(human_columns_full) and page_df.columns[0] in case_no_headers
  ]
  print("The old list is %s pages long" % len(guess_old))

  if guess_old:
    # Normalise the headers page by page *before* concatenating: pages whose
    # headers differ (e.g. 'Case no.' vs 'Case\rno.') would otherwise be
    # aligned by name and end up in separate, NaN-padded columns.
    pages = []
    for i in guess_old:
      page_df = pdf_all[i].copy()  # do not rename the caller's tables in place
      page_df.columns = human_columns_full
      pages.append(page_df)
    df_old_only = pd.concat(pages)
  else:
    # pd.concat raises on an empty list; return an empty frame with the expected schema instead.
    df_old_only = pd.DataFrame(columns=human_columns_full)
  print("There are %s old cases" % df_old_only.shape[0])

  return df_old_only

In [30]:
## 4. Data Extraction

### 4.1 Obtain the data in PDF

In [19]:
!curl -O https://raw.githubusercontent.com/hkkenneth/covid-19-hk-data/2020-08-14/2020-08-14/local_situation_covid19_en.pdf

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 9536k  100 9536k    0     0  12.3M      0 --:--:-- --:--:-- --:--:-- 12.3M


In [31]:
### 4.2 Use Tabula to extract data

In [20]:
pdf = 'local_situation_covid19_en.pdf'

In [21]:
# Parse the whole PDF once and reuse the result for both extract_* helpers,
# which index into it and expect each element to be a pandas DataFrame.
# lattice=True requests tabula's lattice extraction mode
# (presumably because the tables are drawn with ruling lines — confirm against the PDF).
pdf_all = tabula.read_pdf(pdf, pages='all', lattice=True)

Got stderr: Aug 15, 2020 1:54:34 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider loadDiskCache
Aug 15, 2020 1:54:34 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Aug 15, 2020 1:54:34 PM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider <init>
Aug 15, 2020 1:54:34 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Aug 15, 2020 1:54:34 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Aug 15, 2020 1:54:34 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Aug 15, 2020 1:54:36 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font TimesNewRomanPSMT are not implemented in PDFBox and will be ignored



In [27]:
hk_situation_en_new_df = extract_hk_situation_en_new_df(pdf_all=pdf_all)

The first 3 pages are new cases
There are 48 new cases


In [29]:
hk_situation_en_old_df = extract_hk_situation_en_old_df(pdf_all=pdf_all)

The old list is 180 pages long
There are 4313 old cases


## 5. Sanity Check

In [32]:
hk_situation_en_new_df.shape

(48, 8)

In [34]:
hk_situation_en_new_df.head()

Unnamed: 0,Case no.,Report date,Date of onset,Gender,Age,HK/Non-HK resident,Case classification,Confirmed/probable
0,4314,14/08/2020,Asymptomatic,M,31,Unknown,Imported case,Confirmed
1,4315,14/08/2020,Asymptomatic,F,13,HK Resident,Epidemiologically linked with local case,Confirmed
2,4316,14/08/2020,03/08/2020,F,38,HK Resident,Local case,Confirmed
3,4317,14/08/2020,Asymptomatic,M,62,HK Resident,Epidemiologically linked with local case,Confirmed
4,4318,14/08/2020,31/07/2020,M,47,HK Resident,Local case,Confirmed


In [36]:
hk_situation_en_new_df['Gender'].value_counts()

M    24
F    24
Name: Gender, dtype: int64

In [40]:
hk_situation_en_new_df['HK/Non-HK resident'].value_counts()

HK Resident    47
Unknown         1
Name: HK/Non-HK resident, dtype: int64

In [39]:
hk_situation_en_new_df['Case classification'].value_counts()

Epidemiologically linked with local case    35
Local case                                  11
Imported case                                2
Name: Case classification, dtype: int64

In [38]:
hk_situation_en_new_df['Confirmed/probable'].value_counts()

Confirmed    48
Name: Confirmed/probable, dtype: int64

In [41]:
hk_situation_en_old_df.shape

(4313, 9)

In [42]:
hk_situation_en_old_df.head()

Unnamed: 0,Case no.,Report date,Date of onset,Gender,Age,Hospitalised/Discharged/Deceased,HK/Non-HK resident,Case classification,Confirmed/probable
0,1,23/01/2020,21/01/2020,M,39,Discharged,Non-HK resident,I,Confirmed
1,2,23/01/2020,18/01/2020,M,56,Discharged,HK resident,I,Confirmed
2,3,24/01/2020,20/01/2020,F,62,Discharged,Non-HK resident,I,Confirmed
3,4,24/01/2020,23/01/2020,F,62,Discharged,Non-HK resident,I,Confirmed
4,5,24/01/2020,23/01/2020,M,63,Discharged,Non-HK resident,I,Confirmed


In [43]:
hk_situation_en_old_df['Gender'].value_counts()

F    2171
M    2142
Name: Gender, dtype: int64

In [44]:
hk_situation_en_old_df['Hospitalised/Discharged/Deceased'].value_counts()

Discharged      3392
Hospitalised     829
Deceased          66
No admission      26
Name: Hospitalised/Discharged/Deceased, dtype: int64

In [45]:
# NOTE(review): the counts below show the same category under different capitalisation
# ('HK Resident' vs 'HK resident', 'Non-HK resident' vs 'Non-HK Resident');
# normalise this column before grouping or aggregating on it.
hk_situation_en_old_df['HK/Non-HK resident'].value_counts()

HK Resident        2817
HK resident        1377
Unknown              98
Non-HK resident      20
Non-HK Resident       1
Name: HK/Non-HK resident, dtype: int64

In [46]:
hk_situation_en_old_df['Case classification'].value_counts()

Epi-L     1844
L         1189
I         1084
PL         103
Epi-PL      62
Epi-I       31
Name: Case classification, dtype: int64

In [47]:
hk_situation_en_old_df['Confirmed/probable'].value_counts()

Confirmed    4312
Probable        1
Name: Confirmed/probable, dtype: int64