# Summary
This python script reads in a csv file downloaded from https://nces.ed.gov/collegenavigator/ and searches on Google for parking / transportation for those colleges and finds email contact info

In [1]:
# Import selenium to control web browser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException
import re
import pandas as pd
import time

## Read in CSV

In [2]:
# Read in CSV containing college information
colleges = pd.read_csv('sample_college_list.csv')
colleges.head()

Unnamed: 0,Name,Address,Website,Type,Awards offered,Campus setting,Campus housing,Student population,Undergraduate students,Graduation Rate,Transfer-Out Rate,Cohort Year *,Net Price **,Largest Program,IPEDS ID,OPE ID
0,Bard College,"Annandale Road, Annandale-On-Hudson, New York ...",www.bard.edu,"4-year, Private not-for-profit",Associate's degree|Bachelor's degree|Master's ...,Town: Fringe,Yes,2289,1957,77%,14%,Fall 2013,"$27,652",-,189088,267100
1,Barnard College,"3009 Broadway, New York, New York 10027-6598",www.barnard.edu,"4-year, Private not-for-profit",Bachelor's degree,City: Large,Yes,2631,2631,92%,6%,Fall 2013,"$26,649",-,189097,270800
2,CUNY LaGuardia Community College,"31-10 Thomson Ave, Long Island City, New York ...",www.lagcc.cuny.edu,"2-year, Public",Less than one year certificate|One but less th...,City: Large,No,18555,18555,28%,13%,Fall 2016,"$7,340",-,190628,1005100
3,CUNY Queens College,"65-30 Kissena Blvd, Queens, New York 11367",https://www.qc.cuny.edu,"4-year, Public",Bachelor's degree|Postbaccalaureate certificat...,City: Large,Yes,19923,16866,56%,23%,Fall 2013,"$4,471",-,190664,269000
4,Hamilton College,"198 College Hill Rd, Clinton, New York 13323",www.hamilton.edu,"4-year, Private not-for-profit",Bachelor's degree,Suburb: Midsize,Yes,2012,2012,93%,4%,Fall 2013,"$29,117",-,191515,272800


In [3]:
# Define the google search that needs to be conducted
colleges['google_search'] = '[parking OR transportation] ' + colleges['Name']
colleges.head()

Unnamed: 0,Name,Address,Website,Type,Awards offered,Campus setting,Campus housing,Student population,Undergraduate students,Graduation Rate,Transfer-Out Rate,Cohort Year *,Net Price **,Largest Program,IPEDS ID,OPE ID,google_search
0,Bard College,"Annandale Road, Annandale-On-Hudson, New York ...",www.bard.edu,"4-year, Private not-for-profit",Associate's degree|Bachelor's degree|Master's ...,Town: Fringe,Yes,2289,1957,77%,14%,Fall 2013,"$27,652",-,189088,267100,[parking OR transportation] Bard College
1,Barnard College,"3009 Broadway, New York, New York 10027-6598",www.barnard.edu,"4-year, Private not-for-profit",Bachelor's degree,City: Large,Yes,2631,2631,92%,6%,Fall 2013,"$26,649",-,189097,270800,[parking OR transportation] Barnard College
2,CUNY LaGuardia Community College,"31-10 Thomson Ave, Long Island City, New York ...",www.lagcc.cuny.edu,"2-year, Public",Less than one year certificate|One but less th...,City: Large,No,18555,18555,28%,13%,Fall 2016,"$7,340",-,190628,1005100,[parking OR transportation] CUNY LaGuardia Com...
3,CUNY Queens College,"65-30 Kissena Blvd, Queens, New York 11367",https://www.qc.cuny.edu,"4-year, Public",Bachelor's degree|Postbaccalaureate certificat...,City: Large,Yes,19923,16866,56%,23%,Fall 2013,"$4,471",-,190664,269000,[parking OR transportation] CUNY Queens College
4,Hamilton College,"198 College Hill Rd, Clinton, New York 13323",www.hamilton.edu,"4-year, Private not-for-profit",Bachelor's degree,Suburb: Midsize,Yes,2012,2012,93%,4%,Fall 2013,"$29,117",-,191515,272800,[parking OR transportation] Hamilton College


In [4]:
# Open up a new chrome browser
driver = webdriver.Chrome(executable_path='/Users/ktang/Downloads/chromedriver')

In [5]:
## Define constants

# Maximum wait time before time out error
WAIT_TIME = 10

In [6]:
def get_clickable_links(results):
    # Check which of the links are actually clickable
    clickable_list = []
    for index, result in enumerate(results):
        if result.is_displayed() and result.is_enabled():
            clickable_list.append(index)
#     print('\t{} clickable links found on Google'.format(len(clickable_list)))
    return clickable_list

In [7]:
def click_thru_links(clickable_list, results):
    list_length = len(clickable_list)
    # Click through each of the links
    for i in range(list_length):
        link = clickable_list[i]
        # Check to make sure that we don't go beyond the size of results
        if link < len(results):
            print('\tSearching {} link'.format(i + 1))
            results[link].click()
            print(driver.current_url)

            # Make sure website loads
#             element = WebDriverWait(driver, WAIT_TIME).until(
#                 EC.presence_of_element_located(
#                     (By.CSS_SELECTOR, 'html'))
#             )

            # Go back after clicking the link
            driver.back()

            # Make sure Google search loads
            element = WebDriverWait(driver, WAIT_TIME).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="r"]/a/h3'))
            )

            # Refresh the results page because it is not static
            results = driver.find_elements_by_xpath('//div[@class="r"]/a/h3')
            clickable_list = get_clickable_links(results)

In [8]:
def search_google(colleges, driver):
    # Iterate through all colleges
    for index, row in colleges.iterrows():
        print(
            'Searching on Google for parking / transportation email for {}'.format(row['Name']))
        # Search Google
        driver.get('https://www.google.com/search?q=' + row['google_search'])
        # Make sure Google search loads
        element = WebDriverWait(driver, WAIT_TIME).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="r"]/a/h3'))
        )
        # Get all the Google search results
        results = driver.find_elements_by_xpath('//div[@class="r"]/a/h3')
        print('\t{} potential links to click'.format(len(results)))

        # Check which of the links are actually clickable
        clickable_list = get_clickable_links(results)
        click_thru_links(clickable_list, results)

In [9]:
search_google(colleges, driver)

Searching on Google for parking / transportation email for Bard College
	10 potential links to click
	Searching 1 link
https://blogs.bard.edu/transportation/
	Searching 2 link
https://blogs.bard.edu/transportation/?page_id=5
	Searching 3 link
https://www.bard.edu/dosa/handbook/index.php?aid=10779&sid=669112
	Searching 4 link
https://www.bard.edu/security/vehicles/
	Searching 5 link
https://lli.bard.edu/lli-programs/current-members/parking-on-bard-campus/
	Searching 6 link
https://blogs.bard.edu/transportation/?page_id=6
	Searching 7 link
https://blogs.bard.edu/transportation/?page_id=73
	Searching 8 link
https://blogs.bard.edu/transportation/?page_id=8
	Searching 9 link
https://blogs.bard.edu/transportation/?page_id=78
	Searching 10 link
https://www.bard.edu/dosa/handbook/index.php?aid=1292&sid=716
Searching on Google for parking / transportation email for Barnard College
	10 potential links to click
	Searching 1 link
https://barnard.edu/parking
	Searching 2 link
https://barnard.edu/ad

ElementClickInterceptedException: Message: element click intercepted: Element <h3 class="LC20lb DKV0Md">...</h3> is not clickable at point (329, 635). Other element would receive the click: <div class="TbwUpd NJjxre">...</div>
  (Session info: chrome=84.0.4147.125)


In [None]:
# Close browser
driver.quit()

# Read in text from selenium

In [None]:
driver.get('http://bamaparking.ua.edu/')
elem = driver.find_element_by_css_selector('html')

In [None]:
elem.text

In [None]:
emails = set(re.findall(r'[\w\.-]+@[\w\.-]+', elem.text, re.I))
emails

In [None]:
# Create a function to find emails in the text of a website
def find_emails_website(driver, email_regex):
    # Read the entire wesite
    elem = driver.find_element_by_css_selector('html')
    # Create a set of all emails that were found based on the regex (use re.I to ignore case)
    emails = set(re.findall(email_regex, elem.text, re.I))
    return emails