## Import Packages

In [244]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import re
import time
import numpy as np
import pandas as pd 

## Get List of State URLs

In [249]:
# Start a Chrome session and load the parent page that indexes every state
state_index_url = 'https://www.payscale.com/college-salary-report/best-schools-by-state'
driver = webdriver.Chrome()
driver.get(state_index_url)

In [245]:
# Open a Chrome browser and go to the parent page that lists every state
driver = webdriver.Chrome()
driver.get('https://www.payscale.com/college-salary-report/best-schools-by-state')

# Isolate the elements that each hold one state's info.
# find_elements(By.XPATH, ...) is used instead of find_elements_by_xpath
# because the *_by_* helpers were removed in Selenium 4.3, while the
# By-based form is available in both Selenium 3 and Selenium 4.
states = driver.find_elements(By.XPATH, '//*[@class="space-bottom"]/div/div')

# State-specific urls are the base url followed by a slug of the state name
associate_base_url = 'https://www.payscale.com/college-salary-report/best-schools-by-state/2-year-colleges/'
bachelor_base_url = 'https://www.payscale.com/college-salary-report/best-schools-by-state/bachelors/'

# Each entry is [state_name, associate_url, bachelor_url]
state_urls = []
for state in states:
    # Read the state's display name from its header element
    state_name = state.find_element(By.XPATH, './/div[@class="grid__item-header"]').text

    # Slug: lowercase, drop everything that is not a word character or
    # whitespace, then turn spaces into hyphens (computed once, used twice)
    state_slug = re.sub(r'[^\w\s]', '', state_name.lower()).replace(" ", "-")
    associate_url = associate_base_url + state_slug
    bachelor_url = bachelor_base_url + state_slug

    state_urls.append([state_name, associate_url, bachelor_url])

## Prep Repository DF For State ROI Input

In [246]:
# Empty repository frame: one empty column per field recorded for each college
repo_columns = [
    'state_name',
    'degree_type',
    'college_name',
    'early_salary',
    'mid_salary',
    'per_stem',
]
repo_df = pd.DataFrame({column: [] for column in repo_columns})

## Scrape Each State

In [247]:
# Scrape every state's bachelors and associates listings into repo_df.
# Reads state_urls, driver and repo_df defined by earlier cells.

# Characters stripped from salary / percentage cells; compiled once for reuse
_SYMBOLS = re.compile(r'[)$,%(-]')


def _cell_number(row, xpath):
    """Return the text found at *xpath* inside *row* with _SYMBOLS characters removed."""
    return _SYMBOLS.sub('', row.find_element(By.XPATH, xpath).text)


counter = 0  # running total of colleges scraped so far
try:
    for state_name, associates_url, bachelors_url in state_urls:
        print(f"{state_name}: initiate")

        # Rows are collected per state and appended to repo_df in one concat,
        # instead of one concat per college (which re-copies the whole frame).
        state_rows = []

        # Bachelors listing first, then associates
        for degree_type, degree_url in (('bachelors', bachelors_url),
                                        ('associates', associates_url)):

            # Go to the state-specific url
            driver.get(degree_url)

            # Determine the number of pages from the pagination buttons at the
            # bottom of the page. find_elements(By.XPATH, ...) replaces the
            # find_elements_by_xpath helper that Selenium 4.3 removed.
            button_labels = [
                el.text for el in
                driver.find_elements(By.XPATH, '//*[@class="pagination__btn--inner"]')
            ]
            pages = [label for label in button_labels if label != '']

            # For states with <25 colleges, pages will be an empty list
            if not pages:
                pages = ['1']

            # For each page, isolate and process each college's information
            for page in pages:
                page_url = degree_url + r'/page/' + page
                driver.get(page_url)

                # Wait (up to 10 s) for the table rows to be present
                wait_row = WebDriverWait(driver, 10)
                colleges = wait_row.until(EC.presence_of_all_elements_located((
                    By.XPATH,
                    '//*[@class="container csr-gridpage__grid"]/table/tbody/tr')))

                # Scrape info per row (i.e. per college)
                for college in colleges:
                    state_rows.append({
                        'state_name': state_name,
                        'degree_type': degree_type,
                        'college_name': college.find_element(By.XPATH, './td[2]/span[2]/a').text,
                        'early_salary': _cell_number(college, './td[4]/span[2]'),
                        'mid_salary': _cell_number(college, './td[5]/span[2]'),
                        'per_stem': _cell_number(college, './td[7]/span[2]'),
                    })

        # Add this state's colleges to the repository df; ignore_index gives
        # every row a unique index instead of repeating 0 for each college.
        if state_rows:
            repo_df = pd.concat([repo_df, pd.DataFrame(state_rows)],
                                axis=0, ignore_index=True)

        # Keep track of number of colleges
        counter += len(state_rows)

        print(f"{state_name}: complete")
        print('='*25,counter, '='*25)
finally:
    # quit() ends the whole browser session, and runs even when a page load
    # or wait raises part-way through the scrape.
    driver.quit()

Alabama: initiate
Alabama: complete
Alaska: initiate
Alaska: complete
Arizona: initiate
Arizona: complete
Arkansas: initiate
Arkansas: complete
California: initiate
California: complete
Colorado: initiate
Colorado: complete
Connecticut: initiate
Connecticut: complete
Delaware: initiate
Delaware: complete
Florida: initiate
Florida: complete
Georgia: initiate
Georgia: complete
Hawaii: initiate
Hawaii: complete
Idaho: initiate
Idaho: complete
Illinois: initiate
Illinois: complete
Indiana: initiate
Indiana: complete
Iowa: initiate
Iowa: complete
Kansas: initiate
Kansas: complete
Kentucky: initiate
Kentucky: complete
Louisiana: initiate
Louisiana: complete
Maine: initiate
Maine: complete
Maryland: initiate
Maryland: complete
Massachusetts: initiate
Massachusetts: complete
Michigan: initiate
Michigan: complete
Minnesota: initiate
Minnesota: complete
Mississippi: initiate
Mississippi: complete
Missouri: initiate
Missouri: complete
Montana: initiate
Montana: complete
Nebraska: initiate
Nebrask

## Export Repository DF As CSV

In [248]:
# Write the scraped data with a header row and without the index column.
# index=False is the documented boolean form; the previous index=None only
# suppressed the index because None is falsy.
repo_df.to_csv('college_salary_bystate.csv', index=False, header=True)