# Municode Scraper
This scraper has 3 functions:
- list_of_towns() scrapes the State-wide webpage and returns a single-column dataframe containing the links to each town/county's webpage on Municode
- identify_comparison_table_URL_part1() scrapes the town/county webpage and identifies the part of the webpage that contains information about state vs local law codes
    - This needs to be improved for accuracy. It currently just takes the last URL on the page, but it should be amended to actually search for the subpage that holds the table comparing state and local law codes. The subpages that hold this table have different names depending on the county.
- scraper() takes in the subpage url that was identified in identify_comparison_table_URL_part1() and scrapes the webpage, returning a two-column dataframe with the State law codes in the first column and the corresponding town/county law codes in the second column
    - This needs to be improved for specificity around what is being scraped. Some subpages contain multiple tables, some contain tables of different dimensions (4 columns instead of 2 columns) -- these differences need to be accounted for so that the right information is being returned in the dataframe.

In [3]:
import pandas as pd 
import os 
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import re
import requests


In [6]:
def list_of_towns():
    """Scrape the Municode Virginia landing page for municipality links.

    Opens https://library.municode.com/VA in a Selenium-driven Chrome
    session, grabs the JavaScript-rendered listing container, and extracts
    every ``https://library.municode.com/VA/<name>`` link from its HTML.

    Returns:
        pandas.DataFrame: one column (labelled ``0``) holding one
        de-duplicated municipality URL per row, in order of first appearance.

    Raises:
        IndexError: if the listing container is not found on the page.
    """
    # url of the state-wide index
    url = "https://library.municode.com/VA"
    # xpath of the table in the webpage created by javascript
    xpathHOME = "/html/body/div[1]/div[2]/ui-view/div[2]/section/div/div"

    # NOTE: no custom user-agent is configured; Chrome's default is used.
    driver = webdriver.Chrome('/Users/holdenbruce/Downloads/chromedriver3')
    try:
        # set implicit wait time so that apis/javascript load before we scrape
        driver.implicitly_wait(5)  # seconds

        driver.get(url)

        # use xpath to get to the table
        data = driver.find_elements_by_xpath(xpathHOME)

        # give the page 2 more seconds to finish populating the container
        time.sleep(2)

        # use outerHTML to keep the html/css/javascript pulled from the webpage
        html = data[0].get_attribute("outerHTML")
    finally:
        # always shut the browser down, even when the scrape fails, so
        # repeated calls do not pile up Chrome processes
        driver.quit()

    # dots in the host name are escaped so they only match a literal "."
    r1 = re.findall(r"https://library\.municode\.com/VA/[\w\.-]+", html)

    # convert to dataframe and drop duplicates
    towns_list = pd.DataFrame(r1)
    return towns_list.drop_duplicates().reset_index(drop=True)

In [7]:
# Scrape the state-wide index once and give its only column a readable name.
town_urls = list_of_towns().rename(columns={0: "urls"})
town_urls

Unnamed: 0,urls
0,https://library.municode.com/VA/accomack_county
1,https://library.municode.com/VA/albemarle_county
2,https://library.municode.com/VA/alexandria
3,https://library.municode.com/VA/alleghany_county
4,https://library.municode.com/VA/altavista
...,...
142,https://library.municode.com/VA/winchester
143,https://library.municode.com/VA/windsor
144,https://library.municode.com/VA/wise_county
145,https://library.municode.com/VA/woodstock


In [10]:
def identify_comparison_table_URL_part1(url):
    """Return the last link listed in a municipality page's contents section.

    Loads ``url`` in a Selenium-driven Chrome session, takes the outer HTML of
    the section located by a fixed XPath, and returns the final http(s) link
    found in it. This is a heuristic: the last link is only assumed to be the
    state-law reference table, so callers should check the result.

    Args:
        url: Municode landing page of one town/county.

    Returns:
        str: the last URL found in the section, with its final character
        removed.

    Raises:
        IndexError: if the section is not found or contains no links.
    """
    # xpath of the table in the webpage created by javascript
    xpath = "/html/body/div[1]/div[2]/ui-view/mcc-codes/div[2]/section[1]/div[2]"

    driver = webdriver.Chrome('/Users/holdenbruce/Downloads/chromedriver3')
    try:
        # set implicit wait time so that apis/javascript load before we scrape
        driver.implicitly_wait(5)  # seconds

        driver.get(url)

        # use xpath to get to the table
        data = driver.find_elements_by_xpath(xpath)

        # give the page 2 more seconds to finish populating the section
        time.sleep(2)

        # use outerHTML to keep the html/css/javascript pulled from the webpage
        html = data[0].get_attribute("outerHTML")
    finally:
        # always shut the browser down; this runs once per municipality, so a
        # leaked driver would leave many Chrome processes behind
        driver.quit()

    r2 = re.findall(r'(https?://[^\s]+)', html)

    # The pattern runs up to the next whitespace, so the match carries one
    # trailing character (presumably the href's closing quote); drop it.
    return r2[-1][:-1]

In [11]:
# Try the lookup on the first municipality in the list.
identify_comparison_table_URL_part1(town_urls.at[0, 'urls'])

'https://library.municode.com/va/accomack_county/codes/code_of_ordinances?nodeId=STLARETA'

In [40]:
def scraper(url,town):
    """Scrape one municipality's state-law reference table and save it as CSV.

    Resolves the candidate subpage with
    ``identify_comparison_table_URL_part1``. If its ``nodeId`` is one of the
    known reference-table ids, the first table at a fixed XPath is read into a
    DataFrame; otherwise an empty DataFrame is used. Either way the result is
    written to ``countyCSV/<town>.csv``.

    Args:
        url: Municode landing page of the town/county.
        town: name used for the CSV file name and the renamed local column.

    Returns:
        pandas.DataFrame: the scraped table, or an empty frame with a single
        ``town`` column when the subpage is not a known reference table.

    Raises:
        IndexError: if no table is found at the expected XPath.
    """
    # local import keeps this block self-contained
    from io import StringIO

    # resolve the subpage that should hold the comparison table
    url = identify_comparison_table_URL_part1(url)
    print("url:",url)

    # define the nodeID by taking the last piece of the URL after the "="
    nodeID = url.split("=")[-1]

    # define the nodeIDs that we care about (discovered by team through manual check of Municode)
    nodeIDs = [
        'STLARETA',
        'STATE_LAW_REFERENCE_TABLE',
        'COOR_STRETA',
        'STRETA',
        'THCH_STRETA'
    ]

    # make sure the output folder exists so to_csv cannot fail on a fresh checkout
    os.makedirs('countyCSV', exist_ok=True)

    # if nodeID not what we want, record an empty result and stop
    if nodeID not in nodeIDs:
        print(f"pass: {town} empty")
        # define an empty dataframe
        df_empty = pd.DataFrame({'town' : []})

        # write the empty dataframe itself; `df` does not exist on this path
        df_empty.to_csv(f'countyCSV/{town}.csv', index=False)
        return df_empty

    # xpath of the table in the webpage created by javascript
    xpath = "/html/body/div[1]/div[2]/ui-view/mcc-codes/div[2]/section/div[1]/mcc-codes-content/div/div[2]/ul/li/mcc-codes-content-chunk/div/div/div[2]/div/div/div/div[2]/table"

    # the browser is only launched once we know there is a table to fetch
    driver = webdriver.Chrome('/Users/holdenbruce/Downloads/chromedriver3')
    try:
        # set implicit wait time so that apis/javascript load before we scrape
        driver.implicitly_wait(5)  # seconds

        driver.get(url)

        # use xpath to get to the table
        data = driver.find_elements_by_xpath(xpath)

        # use outerHTML to keep the html/css/javascript pulled from the webpage
        table = data[0].get_attribute("outerHTML")
    finally:
        # always shut the browser down, even when the table is missing
        driver.quit()

    # https://stackoverflow.com/questions/41214702/parse-html-and-read-html-table-with-selenium-python

    # convert that table into a pandas dataframe; the markup is wrapped in a
    # file-like object because passing raw HTML strings is deprecated
    df = pd.read_html(StringIO(table))[0]

    # rename the columns
    # NOTE(review): in the recorded run the parsed columns are 0 and 1 (the
    # header text lands in row 0), so this rename has no effect there.
    df = df.rename(columns={'Code of Virginia  Section': "Virginia", "Section this Code":town})

    # now write to CSV
    df.to_csv(f'countyCSV/{town}.csv', index=False)

    return df

In [41]:
# scraper('https://library.municode.com/va/accomack_county',"accomack_county")

# Could also import municode.py

In [42]:
# loop through all of the towns
for ind in town_urls.index:
    url = town_urls.loc[ind,'urls']
    # the town name is the last path segment of its Municode URL
    town = url.split("/")[-1]

    try:
        print(scraper(url,town))
    except Exception as exc:
        # Catch Exception rather than using a bare except, so Ctrl-C and
        # SystemExit can still stop the long run.
        print(f'This municipality {town} does not host the necessary documents online.')
        # Show the underlying cause so real bugs are not mistaken for
        # missing documents.
        print(f'  reason: {type(exc).__name__}: {exc}')
        

url: https://library.municode.com/va/accomack_county/codes/code_of_ordinances?nodeId=STLARETA
                             0                  1
0    Code of Virginia  Section  Section this Code
1                 1-13 et seq.              Ch. 1
2                       1-13.3                1-2
3               1-13.6, 1-13.7                1-2
4                       1-13.9                1-4
..                         ...                ...
401              62.1-44.15:46              38-98
402           62.1-148 et seq.   Ch. 102, Art. II
403                   62.1-255            106-376
404                   63.1-164              22-88
405          68.1-1200 et seq.              22-88

[406 rows x 2 columns]
This municipality albemarle_county does not host the necessary documents online.
This municipality alexandria does not host the necessary documents online.
url: https://library.municode.com/va/alleghany_county/codes/code_of_ordinances?nodeId=STLARETA
                              0