## Import Libraries

In [30]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import re
import time

## Generate URLs For Each State

In [41]:
# Close any browser left over from a previous run of this notebook.
# On a fresh kernel `driver` has not been created yet, so there is nothing to close.
try:
    driver.quit()
except NameError:
    pass

In [42]:
# Open a Chrome browser and go to the parent page that lists every state
driver = webdriver.Chrome()
driver.get('https://www.payscale.com/college-salary-report/best-schools-by-state')

# Prepare the csv that stores college information, per state.
# The file is intentionally left open here: rows are written through `writer`
# after the state URLs have been generated.
csv_file = open('college_by_state.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(csv_file)

# Write the header row of the csv
header_dict = {'state': 'state', 'college_name': 'college_name', 'early_pay': 'early_pay', 'mid_pay': 'mid_pay', 'per_stem': 'per_stem'}
writer.writerow(header_dict.values())

# Every state-specific url is this prefix followed by a slug of the state name
base_url = 'https://www.payscale.com/college-salary-report/best-schools-by-state/bachelors/'

# Isolate the elements holding each state's info.
# Selenium 4 removed the find_element(s)_by_* helpers, so use the By locator API.
states = driver.find_elements(By.XPATH, '//*[@class="space-bottom"]/div/div')

state_urls = []
for state in states:
    # Isolate the state name for this entry
    state_name = state.find_element(By.XPATH, './/div[@class="grid__item-header"]').text

    # Build the url slug: lowercase, punctuation removed, spaces turned into hyphens
    # (e.g. "Washington, D.C." -> "washington-dc")
    state_slug = re.sub(r'[^\w\s]', '', state_name.lower()).replace(" ", "-")
    state_urls.append([state_name, base_url + state_slug])

    # Show progress
    print(state_name)
    print('='*50)
    

Alabama
Alaska
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
Florida
Georgia
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
Michigan
Minnesota
Mississippi
Missouri
Montana
Nebraska
Nevada
New Hampshire
New Jersey
New Mexico
New York
North Carolina
North Dakota
Ohio
Oklahoma
Oregon
Pennsylvania
Rhode Island
South Carolina
South Dakota
Tennessee
Texas
Utah
Vermont
Virginia
Washington
Washington, D.C.
West Virginia
Wisconsin
Wyoming


## Process URL Per State

In [43]:
def _strip_symbols(text):
    """Return *text* with every '$', ',' and '%' character removed."""
    return re.sub(r'[$,%]', '', text)


# try/finally guarantees the csv is flushed to disk and the browser is closed
# even when a page fails to load or an element is missing part-way through.
try:
    # NOTE: only the first five states are processed; widen or drop the slice
    # to scrape every state.
    for state_name, state_url in state_urls[:5]:
        # Go to the state-specific url
        driver.get(state_url)

        # Determine the page numbers from the pagination buttons at the bottom of the page.
        # Only numeric labels are kept: empty or non-numeric button text cannot
        # form a valid "/page/<n>" url.
        # Selenium 4 removed find_elements_by_xpath, so use the By locator API.
        pagination_buttons = driver.find_elements(By.XPATH, '//*[@class="pagination__btn--inner"]')
        button_labels = [button.text for button in pagination_buttons]
        pages = [label for label in button_labels if label.isdigit()]

        # States whose colleges fit on one page show no pagination buttons
        if not pages:
            pages = ['1']

        # The state column is identical for every college of this state
        state_label = _strip_symbols(state_name.lower())

        # For each page, isolate and process each college's information
        for page in pages:
            # Generate the page url and open the list of colleges
            colleges_url = state_url + '/page/' + page
            driver.get(colleges_url)

            # Wait (up to 10 seconds) for the table rows to be present, one row per college
            wait_row = WebDriverWait(driver, 10)
            rows = wait_row.until(EC.presence_of_all_elements_located(
                (By.XPATH, '//*[@class="container csr-gridpage__grid"]/table/tbody/tr')))

            # For each row (i.e. college), scrape name, pay info, and percent STEM
            for row in rows:
                college_name = _strip_symbols(row.find_element(By.XPATH, './td[2]/span[2]/a').text)
                early_pay = _strip_symbols(row.find_element(By.XPATH, './td[4]/span[2]').text)
                mid_pay = _strip_symbols(row.find_element(By.XPATH, './td[5]/span[2]').text)
                per_stem = _strip_symbols(row.find_element(By.XPATH, './td[7]/span[2]').text)

                # Just to keep track of where we are in the program
                print(state_name)
                print('=' * 50)

                # Write one csv row per college: state, college_name, early_pay, mid_pay, per_stem
                writer.writerow([state_label, college_name, early_pay, mid_pay, per_stem])
finally:
    csv_file.close()
    driver.quit()

Alabama
Auburn University
Alabama
University of Alabama in Huntsville
Alabama
The University of Alabama
Alabama
Tuskegee University
Alabama
Samford University
Alabama
Spring Hill College
Alabama
Birmingham Southern College
Alabama
University of Alabama at Birmingham
Alabama
University of South Alabama
Alabama
Alabama A & M University
Alabama
Troy University
Alabama
Jacksonville State University
Alabama
Auburn University at Montgomery
Alabama
Huntingdon College
Alabama
Oakwood University
Alabama
Athens State University
Alabama
University of West Alabama
Alabama
University of North Alabama
Alabama
University of Mobile
Alabama
Alabama State University
Alabama
Faulkner University
Alabama
University of Montevallo
Alabama
Judson College
Alabama
Amridge University
Alabama
Miles College
Alabama
Stillman College
Alabama
Talladega College
Alaska
University of Alaska Fairbanks
Alaska
University of Alaska Anchorage
Alaska
Alaska Pacific University
Arizona
Embry-Riddle Aeronautical University-Presc

California
Westmont College
California
University of Redlands
California
California State University-Long Beach
California
California State University-Sacramento
California
Notre Dame de Namur University
California
San Francisco State University
California
Chapman University
California
Scripps College
California
California State University-Fullerton
California
Southern California Institute of Architecture
California
Mount Saint Mary's University
California
Pacific Union College
California
California State University-Fresno
California
California State University-Northridge
California
Otis College of Art and Design
California
Sonoma State University
California
Pitzer College
California
John F. Kennedy University
California
Dominican University of California
California
Whittier College
California
California Lutheran University
California
California State University-San Marcos
California
National University
California
Laguna College of Art and Design
California
California State University-