## Import Packages

In [74]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import re
import time
import numpy as np
import pandas as pd 

## Get List of Major URLs

In [75]:
# Open a Chrome browser and load the parent page that lists every major
driver = webdriver.Chrome()
driver.get('https://www.payscale.com/college-roi/major')

# Each matching <p> element wraps the link to one major-specific page.
# The By-locator form is used because the find_element(s)_by_* helpers
# were removed in Selenium 4.3; this spelling works on older releases too.
majors = driver.find_elements(By.XPATH, '//*[@class="col-xs-12 col-sm-6"]/p')

# Build a nested list of [major_name, major_url] pairs
major_urls = []
for major in majors:
    # Locate the anchor once and read both the label and the target from it
    link = major.find_element(By.XPATH, './/a')
    major_name = link.text.lower().replace('majors', '').strip()
    major_url = link.get_attribute("href")

    # Store items
    major_urls.append([major_name, major_url])

## Prep Repository DF For Major ROI Input

In [76]:
# Empty repository frame: one zero-length column per field, in output order
repo_columns = [
    'major',
    'college_name',
    'college_type',
    'financial_aid',
    'roi',
    'total_cost',
]
repo_df = pd.DataFrame({column: [] for column in repo_columns})

## Scrape Each Major

In [77]:
# Running total of scraped table rows across all majors
counter = 0

# Compiled once instead of on every table row.
# strip_symbols removes ) $ , % ( and - ; strip_roi_symbols leaves "-" in
# place, presumably so a negative ROI keeps its sign.
strip_symbols = re.compile(r'[)$,%(-]')
strip_roi_symbols = re.compile(r'[$,%]')

# Rows of the ROI table on a major-specific page
table_rows_xpath = '//*[@class="table college-roi-table table-bordered table-striped table-condensed"]/tbody/tr'

# Controls clicked once per page, in this order, before any scraping:
# the "on campus" button, the ROI-type button, then the "load more" control.
setup_button_xpaths = (
    '//*[@id="roi-list-root"]/div/div/div/div/button',
    '//div[@id="roi-list-root"]/div/div/div[3]//button[1]',
    '//*[@class="col-md-4 offset-md-2"]',
)

try:
    for major_name, major_url in major_urls:
        print(f"{major_name}: initiate")

        # Go to the major-specific url
        driver.get(major_url)

        # Locate and click each control in turn so the full table is shown
        for button_xpath in setup_button_xpaths:
            driver.find_element(By.XPATH, button_xpath).click()

        # Rows for this major are buffered and concatenated once below,
        # rather than growing repo_df with one pd.concat per table row.
        major_rows = []

        # Collect ROI data with and without financial aid
        for aid in (True, False):

            # button[2] selects the "with financial aid" view, button[1] the
            # "without financial aid" view
            aid_button_index = 2 if aid else 1
            driver.find_element(
                By.XPATH,
                f'//*[@id="roi-list-root"]/div/div/div[2]//button[{aid_button_index}]',
            ).click()

            # Wait (up to 10 s) for the ROI table rows to be present.
            # NOTE(review): this returns as soon as any rows exist, so it may
            # not guarantee the table has re-rendered after the click above --
            # confirm against the live page.
            colleges = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, table_rows_xpath))
            )

            # The ROI figure is read from a different cell depending on aid
            roi_cell_xpath = './td[9]/span' if aid else './td[7]/span'

            # Scrape info per row (i.e. per college)
            for college in colleges:
                college_name = college.find_element(By.XPATH, './td[2]//div').text

                # College type is whatever follows the last "(" in the name
                # cell, lower-cased and stripped of punctuation
                college_type = strip_symbols.sub('', college_name.lower().split('(')[-1])
                total_cost = strip_symbols.sub(
                    '', college.find_element(By.XPATH, './td[11]/span').text
                )
                roi = strip_roi_symbols.sub(
                    '', college.find_element(By.XPATH, roi_cell_xpath).text
                )

                # Record the aid flag too: every college is scraped twice and
                # the two rows are indistinguishable without it.
                major_rows.append({
                    'major': major_name,
                    'college_name': college_name,
                    'college_type': college_type,
                    'financial_aid': aid,
                    'roi': roi,
                    'total_cost': total_cost,
                })

                # Keep track of number of colleges
                counter += 1

        # Add this major's rows to the repository dataframe in one step
        repo_df = pd.concat([repo_df, pd.DataFrame(major_rows)], ignore_index=True, axis=0)

        print(f"{major_name}: complete")
        print('='*25,counter, '='*25)
finally:
    # Always end the browser session, even if a page fails part-way through
    driver.quit()

art: initiate


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




art: complete
business: initiate
business: complete
computer science and math: initiate
computer science and math: complete
economics: initiate
economics: complete
education: initiate
education: complete
engineering: initiate
engineering: complete
humanities: initiate
humanities: complete
nursing: initiate
nursing: complete
philosophy and religious studies: initiate
philosophy and religious studies: complete
political science: initiate
political science: complete
psychology: initiate
psychology: complete
science: initiate
science: complete
social work & criminal justice: initiate
social work & criminal justice: complete


## Export Repository DF As CSV

In [78]:
# Write the repository frame to CSV with a header row and no index column.
# index=False is the documented boolean form; index=None only worked because
# None is falsy.
repo_df.to_csv('college_roi_bymajor.csv', index=False, header=True)