In [1]:

import requests
import pandas as pd
from bs4 import BeautifulSoup
import unicodedata
import io

In [2]:
# URL of the weekly report page (week 01, ending January 5, 2019) scraped below.
# Split across lines for readability; adjacent literals concatenate into one string.
TABLE_URL = (
    "https://www.canada.ca/en/public-health/services/surveillance/"
    "respiratory-virus-detections-canada/2018-2019/"
    "respiratory-virus-detections-isolations-week-01-ending-january-5-2019.html"
)


In [3]:
# Download the report page with a browser-like User-Agent and a 30 s timeout.
# NOTE(review): presumably the custom User-Agent is needed because the site
# rejects the default `requests` one — confirm.
resp = requests.get(
    TABLE_URL,
    timeout=30,
    headers={"User-Agent": "Mozilla/5.0"},
)
# Stop immediately on any 4xx/5xx answer instead of parsing an error page.
resp.raise_for_status()

In [4]:
# Show the response object; its repr confirms the HTTP status of the fetch.
resp

<Response [200]>

In [5]:
# Parse the downloaded HTML into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=resp.text, features="lxml")

In [8]:
# Select the first <table> whose <caption> text contains "Table 2".
table_tag = None
for t in soup.select("table"):
    cap = t.find("caption")  # None when the table has no caption
    if cap is not None and "Table 2" in cap.get_text():
        table_tag = t
        break

# Fail here with a clear message: a leftover None would otherwise only surface
# later as a confusing parse error when the tag is converted to a DataFrame.
if table_tag is None:
    raise RuntimeError('No <table> with a caption containing "Table 2" was found on the page.')

In [9]:
# Inspect the matched <table> element to confirm it is the intended Table 2.
table_tag

<table class="table table-condensed table-striped table-hover table-bordered small" id="t2">
<caption class="text-left wb-inv">
      Table 2: Respiratory Virus Detections/Isolations for the period August 26, 2018 - January 5, 2019 (Reporting Weeks 201835-201901)
      </caption>
<thead>
<tr class="bg-primary">
<th scope="col">Reporting Laboratory</th>
<th scope="col">Flu Tested</th>
<th scope="col">A(H1N1)pdm09 Positive</th>
<th scope="col">A(H3) Positive</th>
<th scope="col">A(UnS) Positive</th>
<th scope="col">Total Flu A Positive</th>
<th scope="col">Total Flu B Positive</th>
<th scope="col">RSV Tested</th>
<th scope="col">RSV Positive</th>
<th scope="col">PIV Tested</th>
<th scope="col">PIV 1 Positive</th>
<th scope="col">PIV 2 Positive</th>
<th scope="col">PIV 3 Positive</th>
<th scope="col">PIV 4 Positive</th>
<th scope="col">Other PIV Positive</th>
<th scope="col">Adeno Tested</th>
<th scope="col">Adeno Positive</th>
<th scope="col">hMPV Tested</th>
<th scope="col">hMPV Positiv

In [11]:
# pd.read_html never returns an empty list: it raises ValueError when the
# markup contains no parsable table. Convert that error so the failure carries
# the intended message (an `if not dfs` check after the call is unreachable).
try:
    # The markup is wrapped in StringIO because passing a literal HTML string
    # to read_html is deprecated in recent pandas versions.
    dfs = pd.read_html(io.StringIO(str(table_tag)))
except ValueError as exc:
    raise RuntimeError("Failed to parse Table 2 into a DataFrame.") from exc


In [14]:
# Number of DataFrames parsed from the table markup.
len(dfs)

1

In [18]:
# Exactly one table was parsed, so the first list entry is the frame to work with.
df = dfs[0]
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Reporting Laboratory,Flu Tested,A(H1N1)pdm09 Positive,A(H3) Positive,A(UnS) Positive,Total Flu A Positive,Total Flu B Positive,RSV Tested,RSV Positive,PIV Tested,...,PIV 4 Positive,Other PIV Positive,Adeno Tested,Adeno Positive,hMPV Tested,hMPV Positive,Entero/Rhino Tested,Entero/Rhino Positive,Coron Tested,Coron Positive
0,Newfoundland,1299,1,0,113,114,1,1299,91,1299,...,0,0,1299,12,1299,8,1299,200,N.A.,N.A.
1,Prince Edward Island,307,38,0,0,38,0,305,5,53,...,3,0,48,5,48,0,48,21,48,0
2,Nova Scotia,864,0,0,52,52,1,869,45,322,...,1,0,322,0,322,3,322,53,322,1
3,New Brunswick,4271,42,1,715,758,2,4274,131,1185,...,29,0,1185,84,1185,7,1185,201,1185,6
4,Atlantic,6741,81,1,880,962,4,6747,272,2859,...,33,0,2854,101,2854,18,2854,475,1555,7


In [19]:
# List the parsed column labels.
df.columns

Index(['Reporting Laboratory', 'Flu Tested', 'A(H1N1)pdm09 Positive',
       'A(H3) Positive', 'A(UnS) Positive', 'Total Flu A Positive',
       'Total Flu B Positive', 'RSV Tested', 'RSV Positive', 'PIV Tested',
       'PIV 1 Positive', 'PIV 2 Positive', 'PIV 3 Positive', 'PIV 4 Positive',
       'Other PIV Positive', 'Adeno Tested', 'Adeno Positive', 'hMPV Tested',
       'hMPV Positive', 'Entero/Rhino Tested', 'Entero/Rhino Positive',
       'Coron Tested', 'Coron Positive'],
      dtype='object')

In [20]:
# The first column holds the reporting laboratory / region label.
region_col = df.columns[0]

# Normalise the labels: cast to str, turn non-breaking spaces (U+00A0) into
# plain spaces, then trim leading/trailing whitespace.
df[region_col] = (
    df[region_col]
    .astype(str)
    .str.replace("\xa0", " ", regex=False)
    .str.strip()
)

In [21]:
# Show the cleaned label column.
df[region_col]

Unnamed: 0,Reporting Laboratory
0,Newfoundland
1,Prince Edward Island
2,Nova Scotia
3,New Brunswick
4,Atlantic
5,Région Nord-Est
6,Québec-Chaudière-Appalaches
7,Centre-du-Québec
8,Montréal-Laval
9,Ouest du Québec


In [24]:
# Enforce the one-row-per-label invariant instead of only eyeballing the counts,
# so a re-run against changed data fails loudly rather than silently.
assert df[region_col].is_unique, "duplicate labels found in the region column"
# Still display the per-label counts as the cell output.
df[region_col].value_counts()

Unnamed: 0_level_0,count
Reporting Laboratory,Unnamed: 1_level_1
Newfoundland,1
Prince Edward Island,1
Nova Scotia,1
New Brunswick,1
Atlantic,1
Région Nord-Est,1
Québec-Chaudière-Appalaches,1
Centre-du-Québec,1
Montréal-Laval,1
Ouest du Québec,1


### Every region now appears in exactly one row