# Scraping Virginia Courts Website

In [3]:
# Import:
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support.ui import Select
import datetime
from selenium.webdriver.common.keys import Keys

### Setting Dates to Iterate Over

In [4]:
# Build `dates`: every weekday (Mon-Fri) from 2019-01-01 through 2019-02-28,
# formatted as MM/DD/YYYY strings.
# Date-series approach adapted from
# https://stackoverflow.com/questions/4479800/python-generate-dates-series

start = datetime.datetime(2019, 1, 1)
end = datetime.datetime(2019, 2, 28)
delta = datetime.timedelta(days=1)

# every calendar day in the inclusive range [start, end]
span_days = (end - start).days + 1
calendar_days = [start + offset * delta for offset in range(span_days)]

# isoweekday(): Monday == 1 ... Sunday == 7, so anything below 6 is a weekday
dates = [day.strftime('%m/%d/%Y') for day in calendar_days if day.isoweekday() < 6]

In [6]:
# 43 days for scraping first 2 months in 2019
# (i.e. the number of weekdays between 2019-01-01 and 2019-02-28, inclusive)
len(dates)

43

### List of Districts

In [None]:
# Names of the courts to scrape. The scraping loop types each string into the
# site's court autocomplete field, so the spelling presumably has to match the
# site's own option text exactly -- confirm before editing an entry.
court_districts = [
    "Accomack General District Court",
    "Albemarle General District Court",
    "Alexandria General District Court",
    "Alleghany General District Court",
    "Amelia General District Court",
    "Amherst General District Court",
    "Appomattox General District Court",
    "Arlington General District Court",
    "Augusta General District Court",
    "Bath General District Court",
    "Bedford General District Court",
    "Bland General District Court",
    "Botetourt General District Court",
    "Bristol General District Court",
    "Brunswick General District Court",
    "Buchanan General District Court",
    "Buckingham General District Court",
    "Buena Vista General District Court",
    "Campbell General District Court",
    "Caroline General District Court",
    "Carroll General District Court",
    "Charles City General District Court",
    "Charlotte General District Court",
    "Charlottesville General District Court",
    "Chesapeake General District Court",
    "Chesterfield General District Court",
    "Clarke General District Court",
    "Colonial Heights General District Court",
    "Craig General District Court",
    "Culpeper General District Court",
    "Cumberland General District Court",
    "Danville General District Court",
    "Dickenson General District Court",
    "Dinwiddie General District Court",
    "Emporia General District Court",
    "Essex General District Court",
    "Fairfax City General District Court",
    "Fairfax County General District Court",
    "Falls Church General District Court",
    "Fauquier General District Court",
    "Floyd General District Court",
    "Fluvanna General District Court",
    "Franklin City General District Court",
    "Franklin County General District Court",
    "Frederick General District Court",
    "Fredericksburg General District Court",
    "Galax General District Court",
    "Giles General District Court",
    "Gloucester General District Court",
    "Goochland General District Court",
    "Grayson General District Court",
    "Greene General District Court",
    "Greensville General District Court",
    "Halifax General District Court",
    "Hampton General District Court",
    "Hanover General District Court",
    "Harrisonburg/Rockingham General District Court",
    "Henrico General District Court",
    "Henry General District Court",
    "Highland General District Court",
    "Hopewell General District Court",
    "Isle of Wight General District Court",
    "King George General District Court",
    "King William General District Court",
    "King and Queen General District Court",
    "Lancaster General District Court",
    "Lee General District Court",
    "Lexington/Rockbridge General District Court",
    "Loudoun General District Court",
    "Louisa General District Court",
    "Lunenburg General District Court",
    "Lynchburg General District Court",
    "Madison General District Court",
    "Martinsville General District Court",
    "Mathews General District Court",
    "Mecklenburg General District Court",
    "Middlesex General District Court",
    "Montgomery/Blacksburg General District Court",
    "Montgomery/Christiansburg General District Court",
    "Nelson General District Court",
    "New Kent General District Court",
    "Newport News-Civil General District Court",
    "Newport News-Criminal General District Court",
    "Newport News-Traffic General District Court",
    "Norfolk General District Court",
    "Norfolk General District-Criminal Division",
    "Norfolk General District-Traffic Division",
    "Norfolk General District-Civil Division",
    "Northampton General District Court",
    "Northumberland General District Court",
    "Nottoway General District Court",
    "Orange General District Court",
    "Page General District Court",
    "Patrick General District Court",
    "Petersburg General District Court",
    "Pittsylvania General District Court",
    "Portsmouth General District Court",
    "Powhatan General District Court",
    "Prince Edward General District Court",
    "Prince George General District Court", 
    "Prince William General District Court",
    "Pulaski General District Court",
    "Radford General District Court",
    "Rappahannock General District Court",
    "Richmond County General District Court",
    "Richmond-Civil General District Court",
    "Richmond-Marsh Criminal/Traffic General District Court at Manchester",
    "Richmond-John Marshall Criminal/Traffic General District Court",
    "Richmond Manchester General District Court",
    "Roanoke City General District Court",
    "Roanoke County General District Court",
    "Russell General District Court",
    "Salem General District Court",
    "Scott General District Court",
    "Shenandoah General District Court",
    "Smyth General District Court",
    "Southampton General District Court",
    "Spotsylvania General District Court",
    "Stafford General District Court",
    "Staunton General District Court",
    "Suffolk General District Court",
    "Surry General District Court",
    "Sussex General District Court",
    "Tazewell General District Court",
    "Virginia Beach General District Court",
    "Warren General District Court",
    "Washington General District Court",
    "Waynesboro General District Court",
    "Westmoreland General District Court",
    "Williamsburg/James City County General District Court",
    "Winchester General District Court",
    "Wise/Norton General District Court",
    "Wythe General District Court",
    "York General District Court"]

In [1]:
# Court names split into numbered buckets. The buckets grow in size from
# BUCKET1 (one court) upward -- presumably so that each bucket represents a
# comparable amount of scraping work; TODO confirm how they were derived.
# No other cell in this notebook reads the BUCKET* names; presumably one is
# substituted for `court_districts` in the scraping loop when splitting a run.
# NOTE(review): a few entries of `court_districts` (e.g. "Richmond-Civil General
# District Court", "Galax General District Court") appear in no bucket --
# confirm that is intentional.
BUCKET1 = ["Fairfax County General District Court"]
BUCKET2 = ["Arlington General District Court", 
           "Chesapeake General District Court"]
BUCKET3 = ["Prince William General District Court",
           "Virginia Beach General District Court",
           "Alexandria General District Court"]
BUCKET4 = ["Henrico General District Court",
           "Norfolk General District Court",
           "Hampton General District Court"]
BUCKET5 = ["Loudoun General District Court",
           "Richmond-John Marshall Criminal/Traffic General District Court",
           "Harrisonburg/Rockingham General District Court",
           "Chesterfield General District Court"]
BUCKET6 = ["Wythe General District Court",
           "Hopewell General District Court",
           "Newport News-Traffic General District Court",
           "Smyth General District Court",
           "Albemarle General District Court",
           "Hanover General District Court",
           "Portsmouth General District Court"]
BUCKET7 = ["Brunswick General District Court",
           "Roanoke County General District Court",
           "Spotsylvania General District Court",
           "Wise/Norton General District Court",
           "Isle of Wight General District Court",
           "Greensville General District Court",
           "Richmond-Marsh Criminal/Traffic General District Court at Manchester",
           "Northampton General District Court",
           "Lynchburg General District Court",
           "Montgomery/Christiansburg General District Court"]
BUCKET8 = ["Suffolk General District Court",
           "Colonial Heights General District Court",
           "Roanoke City General District Court",
           "Frederick General District Court",
           "Danville General District Court",
           "Bedford General District Court",
           "Southampton General District Court",
           "Newport News-Criminal General District Court",
           "Augusta General District Court",
           "Fauquier General District Court",
           "Salem General District Court",
           "Washington General District Court",
           "Petersburg General District Court",
           "York General District Court"]
# Ninth bucket of court names (18 courts).
# Every entry needs its own trailing comma: two adjacent string literals with no
# comma between them are silently concatenated by Python into a single
# (non-existent) court name, dropping both real courts from the bucket.
BUCKET9 = ["Winchester General District Court",
           "Fredericksburg General District Court",
           "Accomack General District Court",
           "Gloucester General District Court",
           "Clarke General District Court",
           "Williamsburg/James City County General District Court",
           "Russell General District Court",
           "Prince Edward General District Court",
           "Pulaski General District Court",
           "Franklin County General District Court",
           "Shenandoah General District Court",
           "Orange General District Court",
           "Bland General District Court",
           "Lexington/Rockbridge General District Court",
           "Warren General District Court",
           "Halifax General District Court",
           "Sussex General District Court",
           "Tazewell General District Court"]
# Tenth and largest bucket of court names.
# presumably the courts with the smallest expected workload each -- TODO confirm
BUCKET10 = ["Culpeper General District Court",
            "Charlottesville General District Court",
            "Dinwiddie General District Court",
            "Scott General District Court",
            "Amherst General District Court",
            "Madison General District Court",
            "Pittsylvania General District Court",
            "Prince George General District Court",
            "Caroline General District Court",
            "Giles General District Court",
            "Stafford General District Court",
            "Amelia General District Court",
            "Nelson General District Court",
            "Carroll General District Court",
            "Goochland General District Court",
            "New Kent General District Court",
            "Botetourt General District Court",
            "Staunton General District Court",
            "Nottoway General District Court",
            "Greene General District Court",
            "Mecklenburg General District Court",
            "Westmoreland General District Court",
            "Rappahannock General District Court",
            "Bristol General District Court",
            "Appomattox General District Court",
            "Radford General District Court",
            "Louisa General District Court",
            "Campbell General District Court",
            "Page General District Court",
            "Waynesboro General District Court",
            "Buckingham General District Court",
            "Fluvanna General District Court",
            "Buchanan General District Court",
            "Patrick General District Court",
            "Alleghany General District Court",
            "Craig General District Court",
            "Emporia General District Court",
            "King and Queen General District Court",
            "Fairfax City General District Court",
            "King William General District Court",
            "Dickenson General District Court",
            "Cumberland General District Court",
            "Falls Church General District Court",
            "Richmond County General District Court",
            "Charlotte General District Court",
            "Henry General District Court",
            "Bath General District Court",
            "Charles City General District Court",
            "Franklin City General District Court",
            "Floyd General District Court",
            "Lancaster General District Court",
            "Lunenburg General District Court",
            "Essex General District Court",
            "Surry General District Court",
            "Norfolk General District-Criminal Division",
            "Lee General District Court",
            "Powhatan General District Court",
            "King George General District Court",
            "Norfolk General District-Traffic Division",
            "Richmond Manchester General District Court",
            "Newport News-Civil General District Court",
            "Grayson General District Court",
            "Highland General District Court",
            "Martinsville General District Court",
            "Buena Vista General District Court"]

## Setting up Webdriver

In [None]:
# Create a driver called "driver"
# Starts Chrome through the chromedriver binary expected at './chromedriver'
# (relative to the notebook's working directory).
# NOTE(review): the `executable_path` keyword belongs to the Selenium 3 API that
# the rest of this notebook uses; newer Selenium releases may reject it -- confirm
# the pinned Selenium version.
driver = webdriver.Chrome(executable_path='./chromedriver')

In [None]:
# landing page url
# (the CAPTCHA verification page of the General District Courts application;
# it is the first page loaded into the driver)
url = 'https://eapps.courts.state.va.us/gdcourts/captchaVerification.do?landing=landing'

In [None]:
# open it up
# (loads the landing page in the Chrome window controlled by `driver`)
driver.get(url)

In [None]:
# click on "I Agree"
# The element is located by its "g-recaptcha" class; clicking it presumably
# brings up the CAPTCHA challenge, which has to be solved by hand -- confirm.
accept_btn = driver.find_element_by_class_name("g-recaptcha")
accept_btn.click()

(Complete the CAPTCHA manually in the browser window before running the cells below.)

## Full Scraping Code

In [None]:
# Full scrape: for every court in `court_districts` and every hearing date in
# `dates`, run a "Hearing Date Search", open each case on every results page and
# append one dict per case to `cases_list`.
#
# Assumes `driver` is already past the CAPTCHA and showing a page that contains
# the court autocomplete field -- confirm before running.
# Uses the Selenium 3 `find_element_by_*` API, like the rest of this notebook.
# `cases_list` is filled incrementally, so whatever was scraped before a crash
# is still available afterwards.


def clean_case_link_text(raw_text):
    """Return a case link's text trimmed for a partial-link-text lookup.

    Surrounding whitespace is removed and a literal trailing '-00' is dropped.
    Only that exact suffix is removed: `str.strip('-00')` would treat its
    argument as a *set of characters* and also eat trailing zeros that belong
    to the case number, making the lookup ambiguous.
    """
    text = raw_text.strip()
    suffix = '-00'
    if text.endswith(suffix):
        text = text[:-len(suffix)]
    return text


def strip_field_label(text, label):
    """Return `text` without a leading `label` (and the ':' that follows it).

    Replaces `text.strip(label + ...)`, which strips any of the label's
    *characters* from both ends and therefore mangles values such as
    'Dismissed' (-> 'missed'). Surrounding whitespace (including non-breaking
    spaces) is removed from the result.
    """
    value = text.strip()
    if value.startswith(label):
        value = value[len(label):].lstrip()
        if value.startswith(':'):
            value = value[1:]
    return value.strip()


def parse_case_details(soup, court, date):
    """Build one case record (a dict) from a parsed case-details page.

    soup  -- BeautifulSoup of the case-details page
    court -- court name the search was run for (stored under 'court')
    date  -- hearing date searched for (stored under 'hearing_date_date')

    The <tbody>/<td>/<tr> positions below are fixed offsets into the page's
    table layout; an IndexError is raised if the page has fewer tables, cells
    or rows than expected.
    """
    tbodies = soup.findAll('tbody')
    case_cells = tbodies[7].findAll('td')
    charge_cells = tbodies[9].findAll('td')
    hearing_cells = tbodies[10].findAll('td')
    disposition_rows = tbodies[16].findAll('td')[1].findAll('tr')

    case = {}

    # case info (odd cell indices are read; presumably the even ones are labels)
    case['case_number'] = case_cells[1].text.strip('\xa0')
    case['filed_date'] = case_cells[3].text.strip('\xa0')
    case['locality'] = case_cells[5].text.strip('\xa0')
    case['name'] = case_cells[7].text.strip('\xa0')
    case['status'] = case_cells[9].text.strip('\xa0')
    case['defense_attn'] = case_cells[11].text.strip('\xa0')
    case['address'] = case_cells[13].text.strip('\xa0').strip()
    case['gender'] = case_cells[19].text.strip('\xa0').strip()
    case['race'] = case_cells[21].text.strip('\xa0').strip()
    case['dob'] = case_cells[23].text.strip('\xa0').strip()

    # charge info
    case['charge'] = charge_cells[1].text.strip('\xa0').strip()
    case['code'] = charge_cells[3].text.strip('\xa0').strip()
    case['charge_type'] = charge_cells[5].text.strip('\xa0').strip()
    case['charge_class'] = charge_cells[7].text.strip('\xa0').strip()
    case['offense_date'] = charge_cells[9].text.strip('\xa0').strip()
    case['arrest_date'] = charge_cells[11].text.strip('\xa0').strip()
    case['complainant'] = charge_cells[13].text.strip('\xa0').strip()
    case['amended_charge'] = charge_cells[15].text.strip('\xa0').strip()

    # hearing info
    case['hearing_date_date'] = date
    case['court'] = court
    case['all_hearings'] = hearing_cells[1].text

    # disposition info
    case['verdict'] = strip_field_label(disposition_rows[1].text, 'Final Disposition')
    case['sentence_time'] = disposition_rows[2].text.strip()
    case['probation_time'] = disposition_rows[3].text.strip()
    case['license_suspension'] = disposition_rows[4].text.strip()
    case['fines_costs'] = disposition_rows[6].text

    return case


def open_case_link(driver, link_text):
    """Click the results-page link whose text contains `link_text`.

    If the first attempt fails, scroll down one page and retry once; a failure
    of the retry propagates to the caller.
    """
    try:
        driver.find_element_by_partial_link_text(link_text).click()
    except Exception:
        # `except Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) still stops a long-running scrape.
        driver.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
        sleep(1)
        driver.find_element_by_partial_link_text(link_text).click()


def scrape_search_results(driver, court, date, cases_list):
    """Scrape every case on every results page of the current hearing-date search.

    Appends one dict per case to `cases_list` (mutated in place) and returns
    when the page reports an error ("No hearings") or there is no 'Next' button.
    """
    while True:
        # parse the results page that is showing right now
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if soup.find('td', {'class': 'errorFont'}):
            print("No hearings")
            return

        case_links = soup.findAll('tbody')[10].findAll('a')
        for anchor in case_links:
            link = clean_case_link_text(anchor.text)
            print(link)
            open_case_link(driver, link)

            # case-details page: scroll down twice before reading the source
            sleep(.5)
            body = driver.find_element_by_tag_name('body')
            body.send_keys(Keys.PAGE_DOWN)
            sleep(1)
            body.send_keys(Keys.PAGE_DOWN)

            case_soup = BeautifulSoup(driver.page_source, 'html.parser')
            cases_list.append(parse_case_details(case_soup, court, date))

            # return to the results page
            sleep(.5)
            driver.find_element_by_xpath("//input[@value='Back to Search Results']").click()
            sleep(1)

        # move to the next results page, or stop on the last one
        next_buttons = driver.find_elements_by_xpath("//input[@value='Next']")
        if not next_buttons:
            print("Finished scraping for {}".format(date))
            return
        next_buttons[0].click()
        sleep(2)


# setting up empty case list to add to
cases_list = []

# iterating through courts
for court in court_districts:
    court_field = driver.find_element_by_css_selector(".ui-autocomplete-input")
    court_field.clear()
    sleep(2)
    # the field is looked up again after clearing, as in the original flow
    court_field = driver.find_element_by_css_selector(".ui-autocomplete-input")
    court_field.send_keys(court)
    # click outside the field (presumably to commit the autocomplete choice)
    driver.find_element_by_css_selector(".navigationtop").click()
    sleep(1)
    # clicking on 'hearing date search' after picking court
    driver.find_element_by_link_text("Hearing Date Search").click()
    sleep(1)

    # iterate through dates (`dates` is the weekday list built earlier in the notebook)
    for date in dates:
        hearing_field = driver.find_element_by_id("txthearingdate")
        hearing_field.clear()
        hearing_field.send_keys(date)
        sleep(.5)  # short pause before submitting the search
        driver.find_element_by_class_name("submitbox").click()
        sleep(1)

        scrape_search_results(driver, court, date, cases_list)

In [None]:
# check length of scraped list
# (one entry per case-details page scraped so far)
len(cases_list)

In [None]:
# save the scraped cases to a CSV file
# NOTE: 'NAME' looks like a placeholder -- presumably it is meant to be replaced
# with the real output file name before running; confirm.
cases_df = pd.DataFrame(cases_list)
cases_df.to_csv('NAME')

### If the scrape breaks

In [None]:
# Start a fresh, empty list for the recovery run.
# NOTE: this rebinds `cases_list`; anything collected so far is no longer
# reachable through that name, so save it first if it has not been written out.
cases_list = []

In [None]:
## BEFORE YOU RUN THIS: manually set RESUME_DATE and RESUME_COURT below to the
## hearing date and court of the search-results page currently open in the browser.
## They are stored with every case scraped by this cell.
RESUME_DATE = "DATE"
RESUME_COURT = "COURT"

# Resume scraping from the results page that `driver` is currently showing:
# open every case on it, then follow 'Next' until there are no more pages.
# Each case is appended to `cases_list` as it is scraped.
while True:
    # parse the results page that is showing right now
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    if soup.find('td', {'class': 'errorFont'}):
        print("No hearings")
        break

    case_links = soup.findAll('tbody')[10].findAll('a')

    # clicking on a link
    for anchor in case_links:
        link = anchor.text.strip()
        print(link)
        try:
            driver.find_element_by_partial_link_text(link).click()
        except Exception:
            # `except Exception` rather than a bare `except:` so that Ctrl-C
            # (KeyboardInterrupt) still stops the scrape; scroll down and retry once.
            driver.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
            sleep(1)
            driver.find_element_by_partial_link_text(link).click()

        # case-details page: scroll down twice before reading the source
        sleep(.5)
        body = driver.find_element_by_tag_name('body')
        body.send_keys(Keys.PAGE_DOWN)
        sleep(.5)
        body.send_keys(Keys.PAGE_DOWN)

        # setting up BeautifulSoup for new page
        case_soup = BeautifulSoup(driver.page_source, 'html.parser')
        tbodies = case_soup.findAll('tbody')

        # locating tables with specific info (fixed offsets into the page layout;
        # an IndexError here means the page did not have the expected tables)
        case_cells = tbodies[7].findAll('td')
        charge_cells = tbodies[9].findAll('td')
        hearing_cells = tbodies[10].findAll('td')
        disposition_rows = tbodies[16].findAll('td')[1].findAll('tr')

        # setting empty dictionary for each case
        case = {}

        # setting up fields to pull from case info
        case['case_number'] = case_cells[1].text.strip('\xa0')
        case['filed_date'] = case_cells[3].text.strip('\xa0')
        case['locality'] = case_cells[5].text.strip('\xa0')
        case['name'] = case_cells[7].text.strip('\xa0')
        case['status'] = case_cells[9].text.strip('\xa0')
        case['defense_attn'] = case_cells[11].text.strip('\xa0')
        case['address'] = case_cells[13].text.strip('\xa0').strip()
        case['gender'] = case_cells[19].text.strip('\xa0').strip()
        case['race'] = case_cells[21].text.strip('\xa0').strip()
        case['dob'] = case_cells[23].text.strip('\xa0').strip()

        # setting fields to pull from charge info
        case['charge'] = charge_cells[1].text.strip('\xa0').strip()
        case['code'] = charge_cells[3].text.strip('\xa0').strip()
        case['charge_type'] = charge_cells[5].text.strip('\xa0').strip()
        case['charge_class'] = charge_cells[7].text.strip('\xa0').strip()
        case['offense_date'] = charge_cells[9].text.strip('\xa0').strip()
        case['arrest_date'] = charge_cells[11].text.strip('\xa0').strip()
        case['complainant'] = charge_cells[13].text.strip('\xa0').strip()
        case['amended_charge'] = charge_cells[15].text.strip('\xa0').strip()

        # setting fields to pull from hearings info
        case['hearing_date_date'] = RESUME_DATE
        case['court'] = RESUME_COURT
        case['all_hearings'] = hearing_cells[1].text

        # setting fields to pull from disposition info
        # Remove only the leading "Final Disposition :" label. str.strip() with
        # the label as argument strips any of its *characters* from both ends
        # and would mangle values such as 'Dismissed' (-> 'missed').
        verdict = disposition_rows[1].text.strip()
        if verdict.startswith('Final Disposition'):
            verdict = verdict[len('Final Disposition'):].lstrip()
            if verdict.startswith(':'):
                verdict = verdict[1:]
        case['verdict'] = verdict.strip()
        case['sentence_time'] = disposition_rows[2].text.strip()
        case['probation_time'] = disposition_rows[3].text.strip()
        case['license_suspension'] = disposition_rows[4].text.strip()
        case['fines_costs'] = disposition_rows[6].text

        # adding to list of cases
        cases_list.append(case)

        # return to previous page
        sleep(.5)
        driver.find_element_by_xpath("//input[@value='Back to Search Results']").click()
        sleep(1)

    # checking if last page / moving to next page
    next_buttons = driver.find_elements_by_xpath("//input[@value='Next']")
    if not next_buttons:
        print("Finished scraping for {}".format(RESUME_DATE))
        break
    next_buttons[0].click()
    sleep(2)

In [None]:
# number of cases collected by the recovery run above
len(cases_list)

In [None]:
# save the cases collected by the recovery run to a CSV file
# NOTE: 'NAME' looks like a placeholder -- presumably it should be replaced with
# a file name different from the one used for the main scrape; confirm.
cases_df = pd.DataFrame(cases_list)
cases_df.to_csv('NAME')