# Scrape the UTK COVID-19 dashboard data

This notebook scrapes UTK's COVID-19 dashboard data, currently located at https://veoci.com/veoci/p/form/4jmds5x4jj4j#tab=entryForm

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

In [2]:
# Address of the public dashboard form.
URL = 'https://veoci.com/veoci/p/form/4jmds5x4jj4j#tab=entryForm'

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of quietly parsing an error page.
response = requests.get(URL, timeout=30)
response.raise_for_status()

# Parse the raw response bytes with the html5lib parser.
soup = BeautifulSoup(response.content, 'html5lib')

### Following the Quick Start guide in the Beautiful Soup documentation

https://beautiful-soup-4.readthedocs.io/en/latest/

In [3]:
# Show the document's <title> tag.
soup.title

<title> Dashboard for Temporary Space Closures - Veoci (Public Form)</title>

In [4]:
# Show the name of the <title> tag itself.
soup.title.name

'title'

In [5]:
# Show the text held inside the <title> tag.
soup.title.string

' Dashboard for Temporary Space Closures - Veoci (Public Form)'

In [6]:
# Show the name of the tag that encloses <title>.
soup.title.parent.name

'head'

In [7]:
# Attribute access by tag name: show a <p> tag from the document.
soup.p

<p>Desktop notifications let you know when you have a new message or alert. When you enable these notifications, you will see a pop-up when new messages arrive. </p>

In [8]:
# Attribute access by tag name: show an <a> tag from the document.
soup.a

<a class="skip-content-link" href="#mainContent" target="_self">Skip masthead links</a>

In [9]:
# List the <a> tags found in the parsed document.
soup.find_all('a')

[<a class="skip-content-link" href="#mainContent" target="_self">Skip masthead links</a>,
 <a href="/veoci" target="_top" title="Logout of Veoci">Sign Out</a>,
 <a id="header_home_link" style="float:left;"></a>,
 <a class="masthead-login-link" href="#" style="color: white">Sign In</a>,
 <a href="https://veoci.com/legal/privacy-policy" title="Privacy Policy">Privacy Policy</a>,
 <a href="https://veoci.com/legal/terms-and-conditions" title="Terms &amp; Conditions">Terms &amp; Conditions</a>,
 <a class="white-login-link" href="/privacy.html" style="font-size:12px;">Privacy Policy</a>,
 <a class="white-login-link" href="/terms.html" style="font-size:12px;">Terms &amp; Conditions</a>,
 <a class="white-login-link" href="/contact.html" style="font-size:12px;">Contact Us</a>,
 <a class="white-login-link" href="http://www.veoci.com" style="color:#333;" target="_blank" title="http://www.veoci.com">© Veoci / Grey Wall Software, LLC  <img align="baseline" border="0" height="9" src="https://static-

In [10]:
# Count the top-level nodes of the parsed document.
len(soup.contents)

2

In [11]:
# Name of the first top-level node (the recorded run displayed nothing for this cell).
soup.contents[0].name

In [12]:
# Name of the second top-level node.
soup.contents[1].name

'html'

In [13]:
# Count the nodes yielded by soup.children.
len(list(soup.children))

2

In [14]:
# Count the nodes yielded by soup.descendants.
len(list(soup.descendants))

467

In [15]:
# `class_` treats its argument as a literal class value, so the dotted string
# 'field-data-wrapper.edit-mode' is not read as "both classes". A CSS selector
# expresses the intended query: <div> elements carrying both classes.
soup.select('div.field-data-wrapper.edit-mode')

[]

In [16]:
# Look up a <script> tag by its onerror handler and its type attribute.
clean_this_up = soup.find(
    'script',
    attrs={
        'onerror': "onScriptError(event)",
        'type': 'text/javascript',
    },
)

In [17]:
# Check what kind of object the find() call returned.
type(clean_this_up)

bs4.element.Tag

In [18]:
import pandas as pd

In [19]:
from selenium import webdriver
from PIL import Image

In [20]:
# Load the dashboard in Safari and save a screenshot of the browser window
# to 'screenshot.png'.
with webdriver.Safari() as driver:
    driver.get(URL)
    driver.save_screenshot('screenshot.png')

In [21]:
# xpath for entire image -- must be preceded by driver.maximize_window()
# to be sure the data we want is displayed
# /html/body/div[2]/div[2]
# xpath for closed building table
# //*[@id="entryWrapper"]/div[3]/div[2]
# xpath for Active Covid data
# //*[@id="entryWrapper"]/div[3]/div[3]

In [22]:
# Same capture as before, but with the window maximized first; the XPath notes
# say this is needed for the wanted data to be displayed.
# Writes to the same 'screenshot.png' file name.
with webdriver.Safari() as driver:
    driver.get(URL)
    driver.maximize_window()
    driver.save_screenshot('screenshot.png')

In [23]:
# Capture just the element the XPath notes label "Active Covid data" as PNG bytes.
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which was removed
# in Selenium 4.3; the By-based form also works on Selenium 3.
with webdriver.Safari() as driver:
    driver.get(URL)
    element = driver.find_element(By.XPATH, '//*[@id="entryWrapper"]/div[3]/div[3]')
    image_data = element.screenshot_as_png

In [24]:
import io
# Wrap the PNG bytes in an in-memory file object.
dataBytesIO = io.BytesIO(image_data)

In [25]:
# Open the in-memory PNG as a PIL image.
image = Image.open(dataBytesIO)

In [26]:
# Display the captured image for a visual check.
image.show()

In [27]:
import cv2
import numpy as np

In [28]:
# Convert the PIL image to an OpenCV-style BGR array.
# The isinstance guard makes the cell safe to re-run: without it, a second run
# would push the already-converted array through cvtColor again and swap the
# colour channels back.
if isinstance(image, Image.Image):
    image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

In [29]:
import pytesseract

In [30]:
# Run OCR over the image and keep the recognised text.
text = pytesseract.image_to_string(image)


In [31]:
# Print the raw OCR result.
print(text)

Active COVID-19 Cases

Students 18
Faculty 4
Staff 4

Cumulative since August 9, 2020: 4
Cumulative June 8, 2020 thru August 8, 2020: 113

Updated: 8/11/20 1336 tl



In [37]:
# Smoke test: check that a Firefox WebDriver session can be started.
# quit() ends the whole driver session, whereas close() only closes the
# current window and can leave the driver process running.
driver = webdriver.Firefox()
driver.quit()

In [35]:
# Local setup step: move the downloaded geckodriver binary into the virtualenv's
# bin directory. NOTE(review): both paths are specific to one machine.
!mv /Users/dlisla/Downloads/geckodriver /Users/dlisla/.virtualenvs/labtest1/bin/

In [51]:
# Load the dashboard in Firefox at a requested window size and save a
# screenshot to 'screenshot.png'.
options = webdriver.FirefoxOptions()
#options.add_argument('--headless')
#options.headless = True
#options.add_argument("window-size=5120,2880")
driver = webdriver.Firefox(options=options)
try:
    driver.get(URL)
    driver.set_window_size(1000, 2000)
    # Report the size actually in effect: the recorded run printed a height of
    # 1417px, so the requested height is not guaranteed to be honoured.
    size = driver.get_window_size()
    print("Window size: width = {}px, height = {}px".format(size["width"], size["height"]))
    driver.save_screenshot('screenshot.png')
finally:
    # Always end the session, even if a step above raised; quit() rather than
    # close() so the driver process is shut down as well as the window.
    driver.quit()

Window size: width = 1000px, height = 1417px


In [116]:
# Load the dashboard in Firefox and capture the element that the XPath notes
# describe as the entire image, as PNG bytes.
options = webdriver.FirefoxOptions()
#options.add_argument('--headless')
#options.headless = True
#options.add_argument("window-size=5120,2880")
driver = webdriver.Firefox(options=options)
try:
    driver.get(URL)
    driver.set_window_size(1000, 1500)
    # Report the size actually in effect; the recorded run printed 1417px for
    # the height rather than the requested 1500px.
    size = driver.get_window_size()
    print("Window size: width = {}px, height = {}px".format(size["width"], size["height"]))
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3.
    element = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]')
    image_data = element.screenshot_as_png
finally:
    # Always end the session, even if the lookup or capture raised; quit()
    # rather than close() so the driver process is shut down too.
    driver.quit()

Window size: width = 1000px, height = 1417px


In [128]:
# Load the dashboard in Safari at a fixed window size and capture the element
# that the XPath notes describe as the entire image, as PNG bytes.
with webdriver.Safari() as driver:
    driver.get(URL)
    driver.set_window_size(1000, 1500)
    # driver.maximize_window()
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3.
    element = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]')
    image_data = element.screenshot_as_png

In [129]:
# Wrap the PNG bytes in an in-memory file, open them as a PIL image,
# and run OCR over the colour image.
image_bytes = io.BytesIO(image_data)
image = Image.open(image_bytes)
text = pytesseract.image_to_string(image)

In [130]:
# Convert the capture to mode 'L' (single-channel grayscale).
image_gray = image.convert('L')
# image_gray.show()

In [101]:
# Binarise the grayscale image: values above the cutoff map to 255,
# everything else maps to 0. Then display the result.
threshold = 170
image_bitonal_170 = image_gray.point(
    lambda pixel: 255 if pixel > threshold else 0
)
image_bitonal_170.show()

In [97]:
# Split the capture into its colour channels and display the red one.
# Converting to RGB first lets the unpacking work whether or not the image
# carries an alpha channel, and avoids assigning to `_`, which IPython uses
# for the previous cell's output.
r, g, b = image.convert('RGB').split()
r.show()

In [90]:
# Convert the red channel to mode '1' (1-bit) with dithering set to NONE,
# then display it.
bitonal_r = r.convert('1', dither=Image.NONE)
bitonal_r.show()

In [98]:
# Inspect the info dictionary attached to the red-channel image.
r.info

{}

In [131]:
# Re-run OCR, this time on the grayscale image; replaces the earlier `text`.
text = pytesseract.image_to_string(image_gray)

In [132]:
# Print only the part of the OCR text that lies between the page's intro
# sentence and the 'View Acknowledgement' footer.
START_MARKER = 'Learn more about what happens with a COVID-19 case is reported.'
END_MARKER = 'View Acknowledgement'

# OCR output is noisy, so the start marker may not be recognised verbatim.
# Fall back to the full text in that case instead of raising IndexError.
sections = text.split(START_MARKER)
dashboard_text = sections[1] if len(sections) > 1 else text
print(dashboard_text.split(END_MARKER)[0].strip())

Building Room Reason Reopen Date

Alumni Memorial 33 Symptomatic and Self-lsolating August 12, 2020
Student Union 368 Symptomatic and Self-lsolating August 15, 2020
Austin Peay B104, 109, 113 and 125 Symptomatic and Self Isolating August 12, 2020

Active COVID-19 Cases

Students 18
Faculty 4
Staff 4

Cumulative since August 9, 2020: 4
Cumulative June 8, 2020 thru August 8, 2020: 113

Updated: 8/11/20 1336 tl
