## Import Packages

In [238]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import re
import time
import numpy as np

## Get List of State URLs

In [239]:
# Open a Chrome browser and load the parent page that lists every state
driver = webdriver.Chrome()
base_url = 'https://www.payscale.com/college-roi/state'
driver.get(base_url)

# Isolate the <p> elements holding one state link each.
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
# Selenium releases no longer provide; this form works on old and new versions.
states = driver.find_elements(By.XPATH, '//*[@class="col-xs-12 col-sm-4"]/p')

# state_urls is a list of [state_name, state_url] pairs
state_urls = []
for state in states:
    # Isolate state name per state (text of the link inside the <p>)
    state_name = state.find_element(By.XPATH, './/a').text

    # Build the URL slug: lowercase, drop punctuation, spaces become hyphens
    # (e.g. "New York" -> "new-york")
    state_slug = re.sub(r'[^\w\s]', '', state_name.lower()).replace(" ", "-")
    state_urls.append([state_name, base_url + '/' + state_slug])

## Prep CSV For State ROI Input

In [240]:
#Prepare csv for storage of college information, per state.
#The handle stays open on purpose: the scraping cell writes one row per college through `writer`.
csv_file = open('college_roi_bystate.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(csv_file)

#Setup headers for csv (insertion order matches the order of the values written per college)
header_dict = {'state_name': 'state_name', 'college_name': 'college_name', 'college_type': 'college_type',
               'roi': 'roi', 'total_cost': 'total_cost', 'grad_rate': 'grad_rate',
               'yrs2grad': 'yrs2grad', 'financial_aid': 'financial_aid',
               'loan_amt': 'loan_amt', 'perc_grant': 'perc_grant'}

#Actually emit the header row: previously header_dict was built but never written,
#so the csv had no column names. Flush so the header is on disk immediately.
writer.writerow(header_dict.values())
csv_file.flush()

## Scrape Each State

In [241]:
# Scrape every state page twice (with and without financial aid) and write one
# csv row per table row. Relies on `driver`, `state_urls`, `writer` and
# `csv_file` created by the earlier cells.
counter = 0

try:
    for url in state_urls:

        #Unpack state-specific list containing name and url
        state_name, state_url = url

        #Go to state-specific url
        driver.get(state_url)

        # Click the three page controls in order. Presumably these pick on-campus
        # costs, pick the ROI type and expand the full list -- confirm on the live page.
        oncampus_button = driver.find_element(By.XPATH, '//*[@id="roi-list-root"]/div/div/div/div/button')
        oncampus_button.click()

        roi_type_button = driver.find_element(By.XPATH, '//div[@id="roi-list-root"]/div/div/div[3]//button[1]')
        roi_type_button.click()

        load_more_button = driver.find_element(By.XPATH, '//*[@class="col-md-4 offset-md-2"]')
        load_more_button.click()

        #Must run twice to account for financial aid
        for aid in [True, False]:

            #button[2] is clicked for data WITH financial aid, button[1] for data WITHOUT
            aid_button_xpath = '//*[@id="roi-list-root"]/div/div/div[2]//button[{}]'.format(2 if aid else 1)
            financial_aid_button = driver.find_element(By.XPATH, aid_button_xpath)
            financial_aid_button.click()

            #Wait (up to 10 s) until the table rows are present
            wait_colleges = WebDriverWait( driver, 10 )
            colleges = wait_colleges.until( EC.presence_of_all_elements_located((By.XPATH,
                '//*[@class="table college-roi-table table-bordered table-striped table-condensed"]/tbody/tr')) )

            #Scrape info per row (ie. college)
            for college in colleges:

                #ROI is read from column 9 in the financial-aid view and column 7 otherwise.
                #Only $ , % are stripped here, so a leading minus sign is kept.
                #NOTE(review): rows whose ROI is 'N/A' are still written; the single-state
                #scratch cell further down skips them -- confirm which is intended.
                roi_xpath = './td[9]/span' if aid else './td[7]/span'
                roi = re.sub( r'[$,%]','', college.find_element(By.XPATH, roi_xpath).text )

                #Cell text is split on "("; the last piece is treated as the college type
                #and everything before it as the name.
                #NOTE(review): the character class below also strips hyphens from names/types.
                text = college.find_element(By.XPATH, './td[2]//div').text.split("(") #roughly extract text
                college_name =  re.sub( r'[)$,%(-]','', " ".join(text[:-1]) ) #join college name
                college_type = re.sub( r'[)$,%(-]','', text[-1] ) #clean up college type
                total_cost = re.sub( r'[)$,%(-]','', college.find_element(By.XPATH, './td[11]/span').text )
                grad_rate = re.sub( r'[)$,%(-]','', college.find_element(By.XPATH, './td[13]/span').text )
                yrs2grad = re.sub( r'[)$,%(-]','', college.find_element(By.XPATH, './td[14]/span').text )

                # Scrape loan information if relevant (done once; the duplicated
                # copy of this block only repeated the same browser lookups)
                if aid:
                    loan_amt = re.sub( r'[$,%]','', college.find_element(By.XPATH, './td[15]/span').text )
                    perc_grant = re.sub( r'[$,%]','', college.find_element(By.XPATH, './td[16]/span').text )
                else:
                    loan_amt = np.nan
                    perc_grant = np.nan

                #Store all info to dict; key order defines the csv column order
                college_dict = {
                    'state_name': state_name.lower(),
                    'college_name': college_name.lower(),
                    'college_type': college_type.lower(),
                    'roi': roi,
                    'total_cost': total_cost,
                    'grad_rate': grad_rate,
                    'yrs2grad': yrs2grad,
                    'financial_aid': aid,
                    'loan_amt': loan_amt,
                    'perc_grant': perc_grant,
                }

                #Write single college dict to csv
                writer.writerow(college_dict.values())

                print(college_name)

                #Keep track of number of colleges
                counter += 1
finally:
    #Close the csv even if a page fails, so rows buffered so far reach disk.
    #Re-run the csv prep cell before re-running this one.
    csv_file.close()

driver.close()

ALABAMA
ALASKA
ARIZONA
ARKANSAS
CALIFORNIA
COLORADO
CONNECTICUT
DELAWARE
FLORIDA
GEORGIA
HAWAII
IDAHO
ILLINOIS
INDIANA
IOWA
KANSAS
KENTUCKY
LOUISIANA
MAINE
MARYLAND
MASSACHUSETTS
MICHIGAN
MINNESOTA
MISSISSIPPI
MISSOURI
MONTANA
NEBRASKA
NEVADA
NEW HAMPSHIRE
NEW JERSEY
NEW MEXICO
NEW YORK
NORTH CAROLINA
NORTH DAKOTA
OHIO
OKLAHOMA
OREGON
PENNSYLVANIA
RHODE ISLAND
SOUTH CAROLINA
SOUTH DAKOTA
TENNESSE
TEXAS
UTAH
VERMONT
VIRGINIA
WASHINGTON
WEST VIRGINIA
WISCONSIN
WYOMING
WASHINGTON, D.C


In [242]:
# Display the row counter; the scraping loop increments it once per college row written
counter




8

[1, 2, 3, 4]

In [219]:
#Scratch cell: re-run the scrape for ONE state to debug the row parsing.
#Rows go to a separate debug file; opening 'college_roi_bystate.csv' with 'w' here
#would truncate the results of the full scrape when the notebook is run top to bottom.
csv_file = open('college_roi_bystate_debug.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(csv_file)

#Setup headers for csv
header_dict = {'state_name': 'state_name', 'college_name': 'college_name', 'college_type': 'college_type',
               'roi': 'roi', 'total_cost': 'total_cost', 'grad_rate': 'grad_rate',
               'yrs2grad': 'yrs2grad', 'financial_aid': 'financial_aid',
               'loan_amt': 'loan_amt', 'perc_grant': 'perc_grant'}
writer.writerow(header_dict.values())

try:
    #Pick a single state to inspect
    url = state_urls[9]

    #Unpack state-specific list containing name and url
    state_name, state_url = url

    print(state_name)

    #Start from a fresh browser session and go to the state-specific url
    driver.quit()
    driver = webdriver.Chrome()
    driver.get( state_url )

    # Click the three page controls in order. Presumably these pick on-campus
    # costs, pick the ROI type and expand the full list -- confirm on the live page.
    oncampus_button = driver.find_element(By.XPATH, '//*[@id="roi-list-root"]/div/div/div/div/button')
    oncampus_button.click()

    roi_type_button = driver.find_element(By.XPATH, '//div[@id="roi-list-root"]/div/div/div[3]//button[1]')
    roi_type_button.click()

    load_more_button = driver.find_element(By.XPATH, '//*[@class="col-md-4 offset-md-2"]')
    load_more_button.click()

    #Must run twice to account for financial aid
    for aid in [True, False]:

        #button[2] is clicked for data WITH financial aid, button[1] for data WITHOUT
        aid_button_xpath = '//*[@id="roi-list-root"]/div/div/div[2]//button[{}]'.format(2 if aid else 1)
        financial_aid_button = driver.find_element(By.XPATH, aid_button_xpath)
        financial_aid_button.click()

        #Wait (up to 10 s) until the table rows are present
        wait_colleges = WebDriverWait( driver, 10 )
        colleges = wait_colleges.until( EC.presence_of_all_elements_located((By.XPATH,
            '//*[@class="table college-roi-table table-bordered table-striped table-condensed"]/tbody/tr')) )

        print(len(colleges))

        #Scrape info per row (ie. college)
        #NOTE(review): counter is reset here but never incremented in this cell
        counter = 0
        for college in colleges:

            #Scrape ROI first to determine if it is worth scraping rest of college info
            #(column 9 in the financial-aid view, column 7 otherwise)
            roi_xpath = './td[9]/span' if aid else './td[7]/span'
            roi = re.sub( r'[$,%]','', college.find_element(By.XPATH, roi_xpath).text )

            if roi == 'N/A':
                continue #roi doesn't exist, no need to proceed

            #Cell text is split on "("; the last piece is treated as the college type
            #and everything before it as the name.
            #NOTE(review): the character class below also strips hyphens from names/types.
            text = college.find_element(By.XPATH, './td[2]//div').text.split("(") #roughly extract text
            print(text)
            college_name =  re.sub( r'[)$,%(-]','', " ".join(text[:-1]) ) #join college name
            college_type = re.sub( r'[)$,%(-]','', text[-1] ) #clean up college type
            total_cost = re.sub( r'[)$,%(-]','', college.find_element(By.XPATH, './td[11]/span').text )
            grad_rate = re.sub( r'[)$,%(-]','', college.find_element(By.XPATH, './td[13]/span').text )
            yrs2grad = re.sub( r'[)$,%(-]','', college.find_element(By.XPATH, './td[14]/span').text )

            # Scrape loan information if relevant
            if aid:
                loan_amt = re.sub( r'[$,%]','', college.find_element(By.XPATH, './td[15]/span').text )
                perc_grant = re.sub( r'[$,%]','', college.find_element(By.XPATH, './td[16]/span').text )
            else:
                loan_amt = np.nan
                perc_grant = np.nan

            #Store all info to dict; key order defines the csv column order
            college_dict = {
                'state_name': state_name.lower(),
                'college_name': college_name.lower(),
                'college_type': college_type.lower(),
                'roi': roi,
                'total_cost': total_cost,
                'grad_rate': grad_rate,
                'yrs2grad': yrs2grad,
                'financial_aid': aid,
                'loan_amt': loan_amt,
                'perc_grant': perc_grant,
            }

            print(college_name)
            print(college_type)
            print(aid)
            print('='*50)

            #Write single college dict to csv. This now happens per row; at the outer
            #level only the last college was ever written.
            writer.writerow(college_dict.values())
finally:
    #Close the debug csv even on failure so the rows written so far reach disk
    csv_file.close()

GEORGIA
57
['Georgia Institute of Technology-Main Campus', 'In-State)']
Georgia Institute of Technology-Main Campus
In-State
True
['Georgia Institute of Technology-Main Campus', 'Out-of-State)']
Georgia Institute of Technology-Main Campus
Out-of-State
True
['Emory University', 'Private)']
Emory University
Private
True
['Oglethorpe University', 'Private)']
Oglethorpe University
Private
True
['University of Georgia', 'In-State)']
University of Georgia
In-State
True
['Morehouse College', 'Private)']
Morehouse College
Private
True
['University of Georgia', 'Out-of-State)']
University of Georgia
Out-of-State
True
['Georgia State University', 'In-State)']
Georgia State University
In-State
True
['Augusta University', 'In-State)']
Augusta University
In-State
True
['Clayton State University', 'In-State)']
Clayton State University
In-State
True
['Middle Georgia State University', 'In-State)']
Middle Georgia State University
In-State
True
['Clayton State University', 'Out-of-State)']
Clayton Stat

University of Georgia
Out-of-State
False
['Morehouse College', 'Private)']
Morehouse College
Private
False
['Georgia State University', 'In-State)']
Georgia State University
In-State
False
['Clayton State University', 'In-State)']
Clayton State University
In-State
False
['Augusta University', 'In-State)']
Augusta University
In-State
False
['Middle Georgia State University', 'In-State)']
Middle Georgia State University
In-State
False
['Clayton State University', 'Out-of-State)']
Clayton State University
Out-of-State
False
['Georgia Southern University', 'In-State)']
Georgia Southern University
In-State
False
['Augusta University', 'Out-of-State)']
Augusta University
Out-of-State
False
['Georgia State University', 'Out-of-State)']
Georgia State University
Out-of-State
False
['University of North Georgia', 'In-State)']
University of North Georgia
In-State
False
['Middle Georgia State University', 'Out-of-State)']
Middle Georgia State University
Out-of-State
False
['Georgia Southern Univer

2

In [143]:
# Inspect a single entry of `colleges`, the list of table-row elements returned by the wait
colleges[7]


<selenium.webdriver.remote.webelement.WebElement (session="8e56ddb635881748db623dbcd5bbf5ef", element="b83cda25-404c-46a1-8d8b-9f3a6172172f")>

In [208]:
#Start from a fresh browser session and go to the state-specific url
#(`url` is the [state_name, state_url] pair chosen in an earlier cell)
driver.quit()
driver = webdriver.Chrome()
driver.get( url[1] ) #enter state-specific url

# Click the three page controls in order. Presumably these pick on-campus
# costs, pick the ROI type and expand the full list -- confirm on the live page.
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
# Selenium releases no longer provide.
oncampus_button = driver.find_element(By.XPATH, '//*[@id="roi-list-root"]/div/div/div/div/button')
oncampus_button.click()

roi_type_button = driver.find_element(By.XPATH, '//div[@id="roi-list-root"]/div/div/div[3]//button[1]')
roi_type_button.click()

load_more_button = driver.find_element(By.XPATH, '//*[@class="col-md-4 offset-md-2"]')
load_more_button.click()

#button[2] is the control the scraping loop clicks for the WITH-financial-aid view
financial_aid_button = driver.find_element(By.XPATH, '//*[@id="roi-list-root"]/div/div/div[2]//button[2]')
financial_aid_button.click()

#Wait (up to 10 s) until the table rows are present
wait_colleges = WebDriverWait( driver, 10 )
colleges = wait_colleges.until( EC.presence_of_all_elements_located((By.XPATH,
    '//*[@class="table college-roi-table table-bordered table-striped table-condensed"]/tbody/tr')) )


In [210]:
# Check that the name cell of row 51 splits into exactly [name, type] on "(".
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
# Selenium releases no longer provide.
college1 = colleges[51]
test = college1.find_element(By.XPATH, './td[2]//div').text.split(r'(')
name, ctype = test  # raises ValueError if the text contains zero or several "("

In [196]:
#College Name: split the name cell of row 8 into name and type on "(".
#find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
#Selenium releases no longer provide.
college1 = colleges[8]
test = college1.find_element(By.XPATH, './td[2]//div').text.split(r"(")
name, ctype = test
name   # not displayed: only the last expression of a cell is echoed
ctype

'In-State)'

In [189]:
# Column 11 of the row: the scraping loop stores this value as total_cost
college1.find_element(By.XPATH, './td[11]/span').text
        

'$82,500'

In [190]:
# Column 13 of the row: the scraping loop stores this value as grad_rate
college1.find_element(By.XPATH, './td[13]/span').text


'26%'

In [191]:
# Column 14 of the row: the scraping loop stores this value as yrs2grad
college1.find_element(By.XPATH, './td[14]/span').text

'5'

In [192]:
# Column 15 of the row: the scraping loop stores this value as loan_amt (financial-aid view)
college1.find_element(By.XPATH, './td[15]/span').text


'$20,300'

In [193]:
# Column 16 of the row: the scraping loop stores this value as perc_grant (financial-aid view)
college1.find_element(By.XPATH, './td[16]/span').text

'40%'

In [215]:
# Quick check that index -1 returns the last element of a list
test = list(range(1, 4))
test[-1]

3