In [1]:
# Import libraries necessary
import datetime as dt
import os
import re
import threading
import time

import numpy as np
import pandas as pd
import progressbar
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from tqdm import tqdm
# Request headers sent with every HTTP call made by the scraper
headers = {'User-Agent': 'Chrome/73.0.3683.103'}

# Connect to a local database to directly store the web-scraped data.
# The connection URLs can be overridden with the EWG_DB_URL / PROJECT_DB_URL
# environment variables so that real credentials do not have to be written
# into the notebook; when the variables are unset the original local URLs are used.
EWG_DB_URL = os.environ.get('EWG_DB_URL', 'postgresql+psycopg2://postgres:password@localhost/EWG_new')
PROJECT_DB_URL = os.environ.get('PROJECT_DB_URL', 'postgresql+psycopg2://postgres:password@localhost/Project')

engine = create_engine(EWG_DB_URL, client_encoding='utf 8')
engine1 = create_engine(PROJECT_DB_URL, client_encoding='utf 8')

In [2]:
# Load the CSV listing the systems that should be scraped
df = pd.read_csv('../Data/ListofSystems.csv')

In [3]:
# Distinct values of the CSV column labelled '0' (the system ids to scrape)
systems = df['0'].unique()

In [3]:
# Define a function to scrape for sampling records
def scrape_contam(systemids, timeout=60, delay=10):
    """Scrape EWG tap-water sampling records for each system and store them.

    For every system id the system page is downloaded; the testing-summary
    link of each ``contaminant-grid-item`` and each link inside the first
    ``slide-toggle`` section is then followed, and the Date / Lab ID / Result
    cells found on those pages are inserted into the ``contaminants`` table
    through the module-level ``engine``.

    Parameters
    ----------
    systemids : iterable of str
        System ids; each one is appended to the ``system.php?pws=`` URL.
    timeout : float, optional
        Seconds to wait for each HTTP response. Without a timeout a stalled
        connection would block the calling thread indefinitely.
    delay : float, optional
        Seconds to sleep after each system (default 10, the original pause).

    Returns
    -------
    None

    Notes
    -----
    Network/HTTP failures are reported with ``print`` and the loop moves on,
    so one bad response no longer terminates the thread and abandons the rest
    of ``systemids``. Database errors are not caught and still propagate.
    """
    for system_id in tqdm(systemids):
        try:
            _scrape_system(system_id, timeout)
        except requests.RequestException as exc:
            # Systems that end up with no stored rows are found again by the
            # later set-difference check against the database.
            print("Skipping system %s: %s" % (system_id, exc))
        time.sleep(delay)


def _scrape_system(system_id, timeout):
    """Download one system page and store the records of every contaminant it links to."""
    response = requests.get('https://www.ewg.org/tapwater/system.php?pws=' + system_id,
                            headers=headers, timeout=timeout)
    response.raise_for_status()
    system_page = BeautifulSoup(response.content, 'html.parser')

    # (contaminant name, relative URL of its sampling-record page)
    targets = []

    # Contaminants shown as grid items: name in an <h3>, link in the summary button
    for grid_item in system_page.find_all('div', {'class': 'contaminant-grid-item'}):
        name_box = grid_item.find('div', {'class': "contaminant-name"})
        heading = name_box.find('h3') if name_box is not None else None
        button_box = grid_item.find('div', {'class': "testing-summary-btn-wrapper"})
        link = button_box.find('a') if button_box is not None else None
        href = link.get('href') if link is not None else None
        if heading is None or href is None:
            # Incomplete grid item: skip it instead of raising and ending the thread
            continue
        targets.append((heading.text, href))

    # Contaminants listed only as links inside the first slide-toggle section
    toggles = system_page.find_all('div', {'class': 'slide-toggle'})
    if toggles:
        for link in toggles[0].find_all('a'):
            href = link.get('href')
            if href is not None:
                targets.append((link.text, href))

    for contaminant_name, href in targets:
        try:
            _store_sampling_records(system_id, contaminant_name, href, timeout)
        except requests.RequestException as exc:
            print("Skipping %s for system %s: %s" % (contaminant_name, system_id, exc))


def _store_sampling_records(system_id, contaminant_name, href, timeout):
    """Fetch one contaminant page and insert its Date / Lab ID / Result cells."""
    response = requests.get('https://www.ewg.org/tapwater/' + href, headers=headers, timeout=timeout)
    response.raise_for_status()
    detail_page = BeautifulSoup(response.content, 'html.parser')

    # Records are read from the first <tr> of the second <tbody>; pages that
    # lack either are skipped (previously done by swallowing IndexError).
    tables = detail_page.find_all('tbody')
    if len(tables) < 2:
        return
    rows = tables[1].find_all('tr')
    if not rows:
        return
    first_row = rows[0]

    dates = first_row.find_all('td', {"data-label": "Date"})
    labs = first_row.find_all('td', {"data-label": "Lab ID"})
    values = first_row.find_all('td', {"data-label": "Result"})

    # Only store rows when dates and results line up one-to-one
    if len(dates) != len(values):
        return
    if len(labs) == len(dates):
        lab_ids = [cell.text for cell in labs]
    elif len(labs) == 0:
        # No lab-id cells on the page: store NULL for labid
        lab_ids = [None] * len(dates)
    else:
        return

    for date_cell, lab_id, value_cell in zip(dates, lab_ids, values):
        engine.execute("INSERT INTO contaminants (utilityid, contaminant_name, date, labid, value) VALUES (%s, %s, %s, %s, %s)", (system_id, contaminant_name, date_cell.text, lab_id, value_cell.text))

In [5]:
# Cut the system ids into ten subsets, one per scraping thread:
# nine subsets of 4800 ids each, and a tenth holding everything that is left
CHUNK_SIZE = 4800
a, b, c, d, e, f, g, h, i = (
    systems[k * CHUNK_SIZE:(k + 1) * CHUNK_SIZE] for k in range(9)
)
j = systems[9 * CHUNK_SIZE:]

In [None]:
# Scrape all ten subsets concurrently, one thread per subset
t1, t2, t3, t4, t5, t6, t7, t8, t9, t10 = (
    threading.Thread(target=scrape_contam, args=(subset,))
    for subset in (a, b, c, d, e, f, g, h, i, j)
)
workers = (t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)

# Launch every thread before waiting on any of them
for worker in workers:
    worker.start()

# Wait for the threads in order, reporting each subset as it completes
for label, worker in zip("ABCDEFGHIJ", workers):
    worker.join()
    print(label + " is done")

## Checking for excluded systems

In [4]:
# Fetch the utility id of every row stored so far, i.e. the systems that produced records
round1 = pd.read_sql_query(
    "select utilityid from contaminants",
    con=engine,
)

In [6]:
# Systems on the full list that have no row in the database yet
scraped_ids = round1['utilityid'].unique()
exclude = np.setdiff1d(systems, scraped_ids)

In [7]:
# How many systems are still missing from the database
exclude.size

1646

In [16]:
# Keep a record of the systems that were not scraped in the first pass
excluded_df = pd.DataFrame({'utilityid': exclude})
excluded_df.to_csv('Systems_excluded.csv', index=False)

In [8]:
# Split the excluded systems into ten subsets for a second, concurrent scrape:
# nine subsets of 170 ids each, and a tenth holding the remainder
RETRY_CHUNK_SIZE = 170
a, b, c, d, e, f, g, h, i = (
    exclude[k * RETRY_CHUNK_SIZE:(k + 1) * RETRY_CHUNK_SIZE] for k in range(9)
)
j = exclude[9 * RETRY_CHUNK_SIZE:]

In [None]:
# Re-run the scraper concurrently, one thread per subset of excluded systems
retry_subsets = [a, b, c, d, e, f, g, h, i, j]
retry_threads = [
    threading.Thread(target=scrape_contam, args=(subset,))
    for subset in retry_subsets
]
t1, t2, t3, t4, t5, t6, t7, t8, t9, t10 = retry_threads

# Start everything first so the subsets are scraped at the same time
for thread in retry_threads:
    thread.start()

# Then block on each thread in turn and announce its subset
for label, thread in zip("ABCDEFGHIJ", retry_threads):
    thread.join()
    print(label + " is done")