In [None]:
import pandas as pd
import numpy as np
import time
import os

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup

## Get Parent PLUS loan usage data from collegedata.com -- step 1

This notebook collects the links to the "Money Matters" page for each public and private college on collegedata.com. The links are stored as pickle files, one per state, in the subdirectory step1.

The process is mostly automated, but the search form must be viewed at the lowest magnification supported by the browser. The selectors at the top of the form and the 
"find" button at the bottom of the form must both be visible for Selenium to run the searches. 

After the page is shrunk, the download loop runs unattended.

In [None]:
# Landing page that hosts the college search form
SEARCH_URL = "https://www.collegedata.com/cs/search/college/college_search_tmpl.jhtml"

# Option values of the "institutionType" drop-down on the search form
ITYPE_PUBLIC, ITYPE_PRIVATE = "1", "2"


In [None]:
# Build a Firefox profile for the scraping session.
profile = webdriver.FirefoxProfile()

# Setting this preference to False switches off Firefox's pop-up blocking
# while a page is loading.
profile.set_preference("dom.disable_open_during_load", False)

# Create the driver exactly once. Calling webdriver.Firefox() a second time
# would open another browser without the profile and orphan this one.
driver = webdriver.Firefox(firefox_profile=profile)
driver.get(SEARCH_URL)

# You MUST shrink the web page before continuing the rest of the notebook!

The entire form must be visible with no scrolling required!

In [None]:
# Build the list of state abbreviations.
# Option values of length two are state abbreviations;
# the length check skips the 'No Preference' entry.

In [None]:
# Read every <option> of the "states" selector and keep only the values
# that are exactly two characters long.
STATES = [
    option_value
    for option_value in (
        entry.get_attribute('value')
        for entry in driver.find_element_by_name("states").find_elements_by_tag_name("option")
    )
    if len(option_value) == 2
]

In [None]:
# Choose the search options for the given state and institution type
def select_state_intype(state, intype):
    """Pick `state` and `intype` in the search form of the current page.

    state  -- option value to click in the "states" selector
    intype -- option value to click in the "institutionType" selector

    If a value is not present among a selector's options, nothing is
    clicked for that selector.
    """
    # "states" is a multi-select, so wipe any existing selection first.
    Select(driver.find_element_by_name("states")).deselect_all()

    # Both selectors are handled the same way: click the first option
    # whose value matches, then stop looking.
    for field_name, wanted in (("states", state), ("institutionType", intype)):
        dropdown = driver.find_element_by_name(field_name)
        for entry in dropdown.find_elements_by_tag_name("option"):
            if entry.get_attribute('value') == wanted:
                entry.click()
                break
    
     

In [None]:
# We're searching for a button that doesn't have a name
# and can move location when the page redraws itself in 
# a shrunken window (or maybe when showing a different ad?)
# There are two buttons labeled find on the page, but
# one searches by name and the other by the drop-downs
# We need to press the second one.
def press_the_search_button():
    """Click the "find" button that submits the drop-down search.

    The button is identified by its markup: an <input> whose outerHTML
    mentions 'find.gif' but not 'checkCollegeName' (the latter marks the
    search-by-name button).

    Raises:
        RuntimeError: if no such button is on the current page. Without
            this check the function would click whichever <input> happened
            to be last on the page.
    """
    for candidate in driver.find_elements_by_tag_name("input"):
        html = candidate.get_attribute('outerHTML')
        if 'find.gif' in html and 'checkCollegeName' not in html:
            candidate.click()
            return

    raise RuntimeError(
        "could not locate the drop-down search 'find' button on the page"
    )

In [None]:
# Pandas read_html method reads the college name and the dropdown menu items as one field.
# This strips off the dropdown menu to just the college name.
def fix_college_name(broken):
    """Return the college name without the trailing drop-down menu text.

    Everything from the first ' Overview' onward is cut off. If the marker
    is not present the input is returned unchanged; slicing with the -1
    that str.find returns would otherwise drop the last character.
    """
    marker_at = broken.find(' Overview')
    if marker_at == -1:
        return broken
    return broken[:marker_at]

In [None]:
# Find the link with the href equal to the input college name
# Then, instead of that link (Overview), switch to item 3 (Money Matters)
def get_college_link(college_name):
    """Return the "Money Matters" URL for the college shown on the results page.

    Looks up the link whose text is `college_name`, reads its href, and
    swaps the 'pg01_tmpl' page id for 'pg03_tmpl'.
    """
    overview_href = driver.find_element_by_link_text(college_name).get_attribute('href')
    return overview_href.replace('pg01_tmpl', 'pg03_tmpl')

In [None]:
# Main download loop --

# For each state, create a dataframe of college data that includes
# college identification (collegedata doesn't use the government IPEDS id,
# so I grab name, city, state, and public/private as possible ways of doing a join
# later) plus the link to that school's "Money Matters" page on collegedata.com

df1_columns = ['College_Name', 'City', 'State', 'Money_url', 'InType' ]

# The per-state pickles go into ./step1; create it up front so pd.to_pickle
# does not fail when the directory is missing.
os.makedirs("./step1", exist_ok=True)

for state in STATES:
    # One frame per institution type that returned results for this state.
    # They are combined once with pd.concat, because DataFrame.append was
    # removed in pandas 2.0.
    state_frames = []
    for itype in [ITYPE_PRIVATE, ITYPE_PUBLIC]:
        # Make sure we're on the search page
        driver.get(SEARCH_URL)
        select_state_intype(state, itype)
        press_the_search_button()

        # Some searches return nothing (there are no universities on Guam,
        # for example); such a search contributes no rows.
        if driver.page_source.find("no matches") != -1:
            print("no matches", state, itype)
            continue

        # Suck down the results
        result = pd.read_html(driver.page_source)
        college_names = result[1].loc[:,'College Name'].dropna().map(fix_college_name)
        college_links = college_names.map(get_college_link)

        # Table 2 has a two-level column header; drop the second level
        # so 'City' and 'State' can be addressed directly.
        result[2].columns = result[2].columns.droplevel(1)
        college_cities = result[2].loc[:,'City'].dropna()
        college_states = result[2].loc[:,'State'].dropna()

        d = { df1_columns[0]:college_names,
            df1_columns[1]:college_cities,
            df1_columns[2]:college_states,
            df1_columns[3]:college_links}
        this_iter = pd.DataFrame(columns=df1_columns, data=d)

        if itype == ITYPE_PUBLIC:
            this_iter['InType'] = 'Public'
        else:
            this_iter['InType'] = 'Private'

        # NOTE(review): 25 is presumably the number of rows on one results
        # page, so exactly 25 rows hints at a second page that was not read.
        if this_iter.shape[0] == 25:
            print("possible missed data", state, itype)
        state_frames.append(this_iter)

    if state_frames:
        this_state = pd.concat(state_frames, ignore_index=True)
    else:
        # No results for either institution type: keep the empty schema.
        this_state = pd.DataFrame(columns=df1_columns)

    path = "./step1/" + state +  ".pickle"
    # reset_index() without drop=True adds an 'index' column; kept so the
    # pickle layout stays the same as before.
    this_state = this_state.reset_index()
    pd.to_pickle(this_state, path)
        

In [None]:
# End the WebDriver session. quit() closes every window and shuts down the
# driver process, whereas close() only closes the current window.
driver.quit()