# Data Extraction From PDF

## 1. Check Java Version

In [116]:
!java -version

openjdk version "11.0.8" 2020-07-14
OpenJDK Runtime Environment (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)


## 2. Import libraries

In [117]:
!pip install -q tabula-py

In [118]:
import tabula

tabula.environment_info()

Python version:
    3.6.9 (default, Jul 17 2020, 12:50:27) 
[GCC 8.4.0]
Java version:
    openjdk version "11.0.8" 2020-07-14
OpenJDK Runtime Environment (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.8+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)
tabula-py version: 2.1.1
platform: Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic
uname:
    uname_result(system='Linux', node='e9d7091df8e1', release='4.19.112+', version='#1 SMP Thu Jul 23 08:00:38 PDT 2020', machine='x86_64', processor='x86_64')
linux_distribution: ('Ubuntu', '18.04', 'bionic')
mac_ver: ('', ('', '', ''), '')
    


In [119]:
import pandas as pd

In [120]:
import pickle

## 3. Helper functions for this notebook

In [121]:
def extract_hk_situation_en_new_df(pdf=None, pdf_all=None):
  """Extract the "new cases" table (English) from the local-situation PDF.

  The new-case table is taken to be the leading run of extracted tables:
  tables are collected from the start for as long as each has 8 columns and
  a first header of 'Case no.' (or 'Case\\rno.' when the header text wraps).
  The first table that does not match ends the run.

  Args:
    pdf: Path of the PDF to parse. Only used when `pdf_all` is None.
    pdf_all: List of DataFrames already produced by
      `tabula.read_pdf(pdf, pages="all", lattice=True)`; pass it to avoid
      parsing the same PDF again.

  Returns:
    A DataFrame with the 8 human-readable columns listed in
    `human_columns_new`. Each page keeps its own row index (the result is
    not re-indexed). When no table matches, an empty DataFrame with those
    columns is returned instead of raising.
  """
  if pdf_all is None:
    pdf_all = tabula.read_pdf(pdf, pages="all", lattice=True)

  human_columns_new = ['Case no.', 'Report date', 'Date of onset', 'Gender', 'Age', 'HK/Non-HK resident', 'Case classification', 'Confirmed/probable']
  case_no_headers = ('Case no.', 'Case\rno.')

  new_pages = []
  for page in pdf_all:
    if page.shape[1] != len(human_columns_new) or page.columns[0] not in case_no_headers:
      break
    # Rename positionally *per page* before concatenating: pages whose header
    # text differs (e.g. 'Case no.' vs 'Case\rno.') would otherwise be
    # unioned into extra, NaN-filled columns by pd.concat.
    # Ref: https://stackoverflow.com/a/44296858
    new_pages.append(page.rename(columns=dict(zip(page.columns, human_columns_new))))
  print("The first %s pages are new cases" % len(new_pages))

  if new_pages:
    df_new_only = pd.concat(new_pages)
  else:
    # pd.concat([]) raises ValueError; hand back an empty, labelled frame.
    df_new_only = pd.DataFrame(columns=human_columns_new)
  print("There are %s new cases" % df_new_only.shape[0])

  return df_new_only

In [122]:
def extract_hk_situation_en_old_df(pdf=None, pdf_all=None):
  """Extract the full ("old") case list (English) from the local-situation PDF.

  Every extracted table that has 9 columns and a first header of 'Case no.'
  (or 'Case\\rno.' when the header text wraps) is treated as one page of the
  list, wherever it appears in the document.

  Args:
    pdf: Path of the PDF to parse. Only used when `pdf_all` is None.
    pdf_all: List of DataFrames already produced by
      `tabula.read_pdf(pdf, pages="all", lattice=True)`; pass it to avoid
      parsing the same PDF again.

  Returns:
    A DataFrame with the 9 human-readable columns listed in
    `human_columns_full`. Each page keeps its own row index (the result is
    not re-indexed). When no table matches, an empty DataFrame with those
    columns is returned instead of raising.
  """
  if pdf_all is None:
    pdf_all = tabula.read_pdf(pdf, pages="all", lattice=True)

  human_columns_full = ['Case no.', 'Report date', 'Date of onset', 'Gender', 'Age', 'Hospitalised/Discharged/Deceased', 'HK/Non-HK resident', 'Case classification', 'Confirmed/probable']
  case_no_headers = ('Case no.', 'Case\rno.')

  old_pages = []
  for page in pdf_all:
    if page.shape[1] == len(human_columns_full) and page.columns[0] in case_no_headers:
      # Rename positionally *per page* before concatenating: pages whose
      # header text differs (e.g. 'Case no.' vs 'Case\rno.') would otherwise
      # be unioned into extra, NaN-filled columns by pd.concat.
      # Ref: https://stackoverflow.com/a/44296858
      old_pages.append(page.rename(columns=dict(zip(page.columns, human_columns_full))))

  print("The old list is %s pages long" % len(old_pages))

  if old_pages:
    df_old_only = pd.concat(old_pages)
  else:
    # pd.concat([]) raises ValueError; hand back an empty, labelled frame.
    df_old_only = pd.DataFrame(columns=human_columns_full)
  print("There are %s old cases" % df_old_only.shape[0])

  return df_old_only

In [123]:
def extract_hk_situation_en_cluster_df(pdf=None, pdf_all=None):
  """Extract the "large clusters" table (English) from the local-situation PDF.

  A 3-column table belongs to the cluster list when its first header is
  'Cluster', or when it directly follows a table already identified as part
  of the list (a continuation page).

  Args:
    pdf: Path of the PDF to parse. Only used when `pdf_all` is None.
    pdf_all: List of DataFrames already produced by
      `tabula.read_pdf(pdf, pages="all", lattice=True)`; pass it to avoid
      parsing the same PDF again.

  Returns:
    A DataFrame with columns 'Cluster', 'Involved case number' and
    'Number of cases'. Each page keeps its own row index (the result is not
    re-indexed). When no cluster table is found, an empty DataFrame with
    those columns is returned.
  """
  if pdf_all is None:
    pdf_all = tabula.read_pdf(pdf, pages="all", lattice=True)

  human_columns_full = ['Cluster', 'Involved case number', 'Number of cases']

  guess_cluster = []
  for i, page in enumerate(pdf_all):
    if page.shape[1] != len(human_columns_full):
      continue
    follows_cluster_page = bool(guess_cluster) and guess_cluster[-1] == i - 1
    if page.columns[0] == 'Cluster' or follows_cluster_page:
      guess_cluster.append(i)

  print("The cluster list is %s pages long" % len(guess_cluster))

  if not guess_cluster:
    # Previously this empty frame was built but never assigned, so the
    # function returned None and callers failed on `.columns`.
    return pd.DataFrame(data={name: [] for name in human_columns_full})

  cluster_pages = []
  for position, i in enumerate(guess_cluster):
    page = pdf_all[i]
    if position > 0 and page.columns[0] != 'Cluster':
      # On a continuation page the first data row was consumed as the table
      # header, so put it back as a one-row frame. Pages that start with the
      # genuine 'Cluster' header have nothing to recover.
      cluster_pages.append(pd.DataFrame(data=dict(zip(human_columns_full, [[c] for c in page.columns]))))
    # Ref: https://stackoverflow.com/a/44296858
    cluster_pages.append(page.rename(columns=dict(zip(page.columns, human_columns_full))))

  df_cluster = pd.concat(cluster_pages)
  print("There are %i clusters" % df_cluster.shape[0])
  return df_cluster

## 4. Data Extraction

### 4.1 Obtain the data in PDF

In [124]:
!curl -O https://raw.githubusercontent.com/hkkenneth/covid-19-hk-data/2020-08-14/2020-08-14/local_situation_covid19_en.pdf

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 9536k  100 9536k    0     0  12.5M      0 --:--:-- --:--:-- --:--:-- 12.5M


### 4.2 Use Tabula to extract data

In [125]:
pdf = 'local_situation_covid19_en.pdf'

In [126]:
pdf_all = tabula.read_pdf(pdf, pages='all', lattice=True)

Got stderr: Aug 15, 2020 5:45:41 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Aug 15, 2020 5:45:41 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Aug 15, 2020 5:45:41 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Aug 15, 2020 5:45:43 PM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 <init>
INFO: OpenType Layout tables used in font TimesNewRomanPSMT are not implemented in PDFBox and will be ignored



### 4.3 New cases (English) from local situation PDF

In [127]:
hk_situation_en_new_df = extract_hk_situation_en_new_df(pdf_all=pdf_all)

The first 3 pages are new cases
There are 48 new cases


### 4.4 Old cases (English) from local situation PDF

In [128]:
hk_situation_en_old_df = extract_hk_situation_en_old_df(pdf_all=pdf_all)

The old list is 180 pages long
There are 4313 old cases


### 4.5 Mode of detection for confirmed and probable cases in Hong Kong (English) from local situation PDF

In [129]:
# Index of the two-column table headed 'Category' /
# 'Number of cases (percentage)'. If several tables match, the last one is
# kept; -1 means no such table was found.
guess_detection_mode = max(
  (
    page_no
    for page_no, page in enumerate(pdf_all)
    if page.shape[1] == 2
    and page.columns[0] == 'Category'
    and page.columns[1] == 'Number of cases (percentage)'
  ),
  default=-1,
)

guess_detection_mode


108

In [130]:
# Use the located table, or an empty frame with the same two columns when
# the search above found nothing (-1).
detection_mode_df = (
  pdf_all[guess_detection_mode]
  if guess_detection_mode != -1
  else pd.DataFrame(data={'Category': [], 'Number of cases (percentage)': []})
)

### 4.6 Large clusters with 10 or more cases (English) from local situation PDF

In [131]:
hk_situation_en_cluster_df = extract_hk_situation_en_cluster_df(pdf_all=pdf_all)

The cluster list is 3 pages long
There are 26 clusters


## 5. Sanity Check

### 5.1 New cases (English)

In [132]:
hk_situation_en_new_df.shape

(48, 8)

In [133]:
hk_situation_en_new_df.head()

Unnamed: 0,Case no.,Report date,Date of onset,Gender,Age,HK/Non-HK resident,Case classification,Confirmed/probable
0,4314,14/08/2020,Asymptomatic,M,31,Unknown,Imported case,Confirmed
1,4315,14/08/2020,Asymptomatic,F,13,HK Resident,Epidemiologically linked with local case,Confirmed
2,4316,14/08/2020,03/08/2020,F,38,HK Resident,Local case,Confirmed
3,4317,14/08/2020,Asymptomatic,M,62,HK Resident,Epidemiologically linked with local case,Confirmed
4,4318,14/08/2020,31/07/2020,M,47,HK Resident,Local case,Confirmed


In [134]:
hk_situation_en_new_df['Gender'].value_counts()

M    24
F    24
Name: Gender, dtype: int64

In [135]:
hk_situation_en_new_df['HK/Non-HK resident'].value_counts()

HK Resident    47
Unknown         1
Name: HK/Non-HK resident, dtype: int64

In [136]:
hk_situation_en_new_df['Case classification'].value_counts()

Epidemiologically linked with local case    35
Local case                                  11
Imported case                                2
Name: Case classification, dtype: int64

In [137]:
hk_situation_en_new_df['Confirmed/probable'].value_counts()

Confirmed    48
Name: Confirmed/probable, dtype: int64

### 5.2 Old cases (English)

In [138]:
hk_situation_en_old_df.shape

(4313, 9)

In [139]:
hk_situation_en_old_df.head()

Unnamed: 0,Case no.,Report date,Date of onset,Gender,Age,Hospitalised/Discharged/Deceased,HK/Non-HK resident,Case classification,Confirmed/probable
0,1,23/01/2020,21/01/2020,M,39,Discharged,Non-HK resident,I,Confirmed
1,2,23/01/2020,18/01/2020,M,56,Discharged,HK resident,I,Confirmed
2,3,24/01/2020,20/01/2020,F,62,Discharged,Non-HK resident,I,Confirmed
3,4,24/01/2020,23/01/2020,F,62,Discharged,Non-HK resident,I,Confirmed
4,5,24/01/2020,23/01/2020,M,63,Discharged,Non-HK resident,I,Confirmed


In [140]:
hk_situation_en_old_df['Gender'].value_counts()

F    2171
M    2142
Name: Gender, dtype: int64

In [141]:
hk_situation_en_old_df['Hospitalised/Discharged/Deceased'].value_counts()

Discharged      3392
Hospitalised     829
Deceased          66
No admission      26
Name: Hospitalised/Discharged/Deceased, dtype: int64

In [142]:
hk_situation_en_old_df['HK/Non-HK resident'].value_counts()

HK Resident        2817
HK resident        1377
Unknown              98
Non-HK resident      20
Non-HK Resident       1
Name: HK/Non-HK resident, dtype: int64

In [143]:
hk_situation_en_old_df['Case classification'].value_counts()

Epi-L     1844
L         1189
I         1084
PL         103
Epi-PL      62
Epi-I       31
Name: Case classification, dtype: int64

In [144]:
hk_situation_en_old_df['Confirmed/probable'].value_counts()

Confirmed    4312
Probable        1
Name: Confirmed/probable, dtype: int64

### 5.3 Mode of detection for confirmed and probable cases in Hong Kong (English)

In [145]:
detection_mode_df.head(n=20)

Unnamed: 0,Category,Number of cases (percentage)
0,(1) Cases fulfilling the reporting criteria of...,336 (7.7%)
1,(2) Enhanced laboratory surveillance in public...,490 (11.2%)
2,(3) Enhanced surveillance at Accident and Emer...,1289 (29.6%)
3,(4) Diagnosis / Enhanced surveillance in priva...,624 (14.3%)
4,(5) Medical surveillance / contact tracing by ...,952 (21.8%)
5,(6) Enhanced surveillance for asymptomatic inb...,618 (14.2%)
6,(7) Enhanced surveillance for persons with hig...,15 (0.3%)
7,(8) Private testing for the four high-risk gro...,37 (0.8%)
8,Total,4361 (100.0%)


### 5.4 Large clusters with 10 or more cases (English)

In [146]:
hk_situation_en_cluster_df.columns

Index(['Cluster', 'Involved case number', 'Number of cases'], dtype='object')

In [147]:
hk_situation_en_cluster_df.head(n=30)

Unnamed: 0,Cluster,Involved case number,Number of cases
0,Diamond Princess Cruise Ship,"14, 71, 78, 79, 80, 81, 87, 88, 94, 96, 97",11
1,Hotpot dinner gathering at Kwun Tong,"27, 29, 30, 31, 32, 33, 34, 35, 36, 37, 41, 44...",13
2,Fook Wai Ching Che in Maylun\rApartments in No...,"64, 65, 70, 73, 74, 76, 77, 83, 84, 86, 89, 91...",19
3,Travel tour to Egypt / Heng Tai House\rof Fu H...,"119, 120, 121, 123, 124, 125, 126, 127, 129, 140",10
4,Bar and band cluster,"182, 227, 276, 343, 360, 362, 367, 384, 386, 3...",103
5,Wedding party at Lantau Island,"191, 217, 233, 237, 250, 253, 258, 259, 287, 2...",15
6,Kerry Logistics/ Luk Chuen House,"1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091...",12
7,Bun Kee/ Sun Fat/ Kin Wing,"1269, 1283, 1289, 1291, 1295, 1297, 1301, 1304...",46
8,Kong Tai Care for the Aged Centre,"1298, 1302, 1307, 1310, 1311, 1312, 1316, 1321...",46
9,Ming Chuen House of Shui Chuen O\rEstate,"1294, 1377, 1378, 1379, 1380, 1381, 1382, 1383...",12


## 6. Export to pickle and Parquet

In [148]:
PIK = 'raw-en.dat'

# Write the four extracted tables into one pickle file, one dump per table.
# The order is part of the file format: a reader must call pickle.load()
# four times and receives new cases, old cases, detection mode and clusters,
# in that order.
# The `with` block closes the file even when a dump fails; previously the
# handle was opened in one cell and only closed five cells later.
with open(PIK, 'wb') as pickle_file:
  for frame in (
    hk_situation_en_new_df,
    hk_situation_en_old_df,
    detection_mode_df,
    hk_situation_en_cluster_df,
  ):
    pickle.dump(frame, pickle_file)

In [None]:
# Parquet export. Each table has its columns cast to str in place and is then
# written to its own file. For the two case tables the first column
# ('Case no.') is skipped and keeps its parsed dtype.
# NOTE(review): the str cast is presumably there because columns such as
# 'Date of onset' mix dates with text like 'Asymptomatic' — confirm.
parquet_exports = [
  # (frame, index of the first column to cast to str, output file)
  (hk_situation_en_new_df, 1, 'raw-new-en.parquet'),
  (hk_situation_en_old_df, 1, 'raw-old-en.parquet'),
  (detection_mode_df, 0, 'raw-mode-en.parquet'),
  (hk_situation_en_cluster_df, 0, 'raw-cluster-en.parquet'),
]

for frame, first_text_column, parquet_path in parquet_exports:
  for c in frame.columns[first_text_column:]:
    frame[c] = frame[c].astype('str')
  frame.to_parquet(parquet_path)