# Summary
This Python notebook reads a CSV file downloaded from https://nces.ed.gov/collegenavigator/, searches Google for each college's parking / transportation department, and collects the email contact info found on the resulting web pages

- Thank you to https://www.pingshiuanchua.com/blog/post/scraping-search-results-from-google-search for providing helpful code related to web scraping

## Instructions

1. Download this Python notebook onto your computer (into folder of your choice)
2. Go to https://nces.ed.gov/collegenavigator/ and search for colleges / universities based on criteria (e.g., New York State) and click "Show Results"
3. Click "Export Results" and choose CSV to download the data
4. Delete any rows in the CSV file that you don't want contact info for (e.g., colleges with small campus setting). Make sure to delete rows at the end of the file that don't represent info for any colleges (e.g., "* Cohort Year ...") and save CSV file in the same folder as this Python notebook
5. Edit inputs / constants in the cell below
    - Ensure that COLLEGE_DATA_FILE has the name of the CSV file that was downloaded in the previous step
    - Note on current settings: code searches for "parking" or "transportation" department at each college and filters for only edu and org emails
    - There's generally no need to edit NUMBER_OF_RESULTS, USER_AGENT, and TIMEOUT_LIMIT
6. Run notebook and results will be saved in the same folder as this notebook when web scraping completes
7. In the output CSV, each college's contact info is listed with one line per website on which emails were found, in the following format: email(s) (website where email was found);

In [1]:
"""
Section containing inputs / constants needed for code
"""

# File name of the CSV file downloaded from College Navigator
COLLEGE_DATA_FILE = 'CollegeNavigator_Search_2020-08-17_14.11.02.csv'

# File name of the CSV output written after web scraping completes
OUTPUT_FILE = 'new_york_state_colleges.csv'

# Google search text placed before the name of each college
SEARCH_QUERY = '[parking OR transportation] department '

# Regular expression for .edu / .org email addresses.
# The domain part accepts one or more dot-separated labels, so addresses on
# subdomains (e.g. someone@parking.example.edu) are matched as well, and the
# trailing \b keeps longer endings such as ".organic" from being reported as
# a truncated ".org" address. Only non-capturing groups are used so that
# re.findall() keeps returning whole addresses.
EMAIL_REGEX = r"[\w\.-]+@(?:[\w-]+\.)+(?:edu|org)\b"

# Number of Google search results to request per search
NUMBER_OF_RESULTS = 10

# User agent string describing a desktop browser
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"

# Number of seconds you are willing to spend on each step (opening a website,
# searching its text for email addresses) before giving up on that website
TIMEOUT_LIMIT = 10

In [2]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import time
from multiprocessing import Process, Pool, TimeoutError

## Define functions for searching Google and finding email addresses

In [3]:
"""
Return all links found after conducting a Google search

Parameters
----------
query: String
    Google search query to be conducted
NUMBER_OF_RESULTS: int
    Number of search results that Google should return for the query
USER_AGENT: String
    User agent string. Note: it is handed to requests.get as `params`, so it is sent as a
    `User-Agent` query-string parameter rather than as an HTTP request header

Returns
-------
links: List
    List of all links returned from Google search
"""


def search_google(query, NUMBER_OF_RESULTS, USER_AGENT):
    """
    Return all links found after conducting a Google search

    Parameters
    ----------
    query: String
        Google search query to be conducted
    NUMBER_OF_RESULTS: int
        Number of search results that Google should return for the query
    USER_AGENT: String
        User agent string; sent as a `User-Agent` query-string parameter
        (see the note in the code below)

    Returns
    -------
    links: List
        The href of the first anchor inside every result block of the page
    """
    google_request = requests.get(
        "https://www.google.com/search",
        # Let requests build and percent-encode the query string. Pasting the
        # query into the URL by hand breaks as soon as a college name contains
        # '&' or '#', because those characters end the `q` parameter early.
        params={
            "q": query,
            "num": NUMBER_OF_RESULTS,
            # NOTE(review): the user agent travels in the query string, not in
            # the HTTP headers. The original call passed this value as the
            # second positional argument of requests.get, which is `params`.
            # That behavior is kept on purpose: the 'ZINbbc' markup parsed
            # below is presumably what Google serves for such requests, and
            # switching to `headers=` may change the returned HTML - verify
            # before changing.
            "User-Agent": USER_AGENT,
        },
    )
    google_results_soup = BeautifulSoup(google_request.text, "html.parser")

    links = []
    for result in google_results_soup.find_all('div', attrs={'class': 'ZINbbc'}):
        # find() returns None when the block holds no anchor with an href;
        # such blocks are skipped
        anchor = result.find('a', href=True)
        if anchor is not None:
            links.append(anchor['href'])

    return links

In [4]:
"""
Cleans Google search results from search_google() function to just return websites

Parameters
----------
links: List
    List of all links returned from Google search
    
Returns
-------
clean_links: List
    List of all links returned from Google search (cleaned to only include web addresses)
"""


def clean_google_results(links):
    """
    Cleans Google search results from search_google() to just return websites

    Google wraps each result as '/url?q=<target>&sa=...'. Links that do not
    follow this pattern are dropped. The target is percent-decoded because it
    is carried as a query-string value (e.g. '%3F' stands for '?').

    Parameters
    ----------
    links: List
        List of all links returned from Google search

    Returns
    -------
    clean_links: List
        Web addresses extracted from the links, in their original order

    Side effects
    ------------
    Prints how many web addresses were kept.
    """
    from urllib.parse import unquote

    clean_links = []
    for href in links:
        # Raw string: the pattern previously relied on invalid escape
        # sequences ('\/', '\=', '\&') inside a normal string literal
        match = re.search(r'/url\?q=(.*)&sa', href)

        # Anything that doesn't fit the above pattern is left out
        if match is not None:
            clean_links.append(unquote(match.group(1)))

    print('  {0} google search result(s) found'.format(len(clean_links)))
    return clean_links

In [5]:
"""
Extracts all text elements from a website

Parameters
----------
web_address: String
    Website address to extract text from

Returns
-------
website_text: String
    All text elements from the website (concatenated together with commas as delimiters)
"""


def extract_text(web_address):
    """
    Extracts all text elements from a website

    Parameters
    ----------
    web_address: String
        Website address to extract text from

    Returns
    -------
    website_text: String
        All text elements from the website, joined together with ', '
    """
    # Send the user agent as an HTTP header. Passed positionally it is taken
    # as requests.get's `params` argument, which appends the user agent string
    # to the query string of the address being fetched.
    website_html = requests.get(web_address, headers={"User-Agent": USER_AGENT})

    # Parse the page and collect every text node
    website_soup = BeautifulSoup(website_html.text, "html.parser")
    website_text_list = website_soup.find_all(text=True)

    # Convert list of text elements into a single string
    return ', '.join(website_text_list)

In [6]:
"""
Extracts all email addresses from string

Parameters
----------
text: String
    Text to search for email addresses

Returns
-------
emails: Set
    Set containing all unique email addresses that were found in the text
"""


def extract_emails(text, EMAIL_REGEX):
    """
    Collect the unique email addresses that appear in a piece of text

    Parameters
    ----------
    text: String
        Text to search for email addresses
    EMAIL_REGEX: String
        Regular expression describing an email address (matched ignoring case)

    Returns
    -------
    emails: Set
        Every distinct match of EMAIL_REGEX found in the text
    """
    email_pattern = re.compile(EMAIL_REGEX, flags=re.IGNORECASE)
    all_matches = email_pattern.findall(text)
    return set(all_matches)

## Read in CSV from College Navigator and search for parking / transportation emails

In [None]:
# Load the College Navigator export (one row per college) into a dataframe
colleges = pd.read_csv(filepath_or_buffer=COLLEGE_DATA_FILE)

In [None]:
# Build the Google query for every college: the shared search text followed by the college's name
colleges['google_search'] = colleges['Name'].radd(SEARCH_QUERY)
# Every college starts out with blank contact info; the scraping loop fills it in
colleges['contact_info_found'] = ''

In [None]:
# Iterate through all colleges, searching google


# For every college: run the Google search, open each result in a worker
# process (so a slow website can be abandoned after TIMEOUT_LIMIT seconds),
# scan the page text for email addresses and store what was found in the
# 'contact_info_found' column.
for index, row in colleges.iterrows():
    print(
        'Searching on Google for parking / transportation emails for {0}'.format(row['Name']))

    links = search_google(row['google_search'], NUMBER_OF_RESULTS, USER_AGENT)
    clean_links = clean_google_results(links)

    # One entry per website on which at least one email was found
    contact_entries = []

    # Iterate over clean links to find email addresses
    for link in clean_links:
        # Read the text of the website in a single-worker pool. Leaving the
        # `with` block terminates the pool, so the worker is cleaned up on
        # success, on timeout and on error alike.
        try:
            with Pool(processes=1) as fetch_pool:
                link_text = fetch_pool.apply_async(
                    extract_text, (link,)).get(timeout=TIMEOUT_LIMIT)
        # `Exception` rather than a bare except, so that Ctrl-C still stops the run
        except Exception:
            print('    Took too long / error with loading {0}'.format(link))
            continue

        # Search the text for emails in a second pool with the same time limit
        try:
            with Pool(processes=1) as regex_pool:
                email_set = regex_pool.apply_async(
                    extract_emails, (link_text, EMAIL_REGEX)).get(timeout=TIMEOUT_LIMIT)
        # If the search takes too long, carry on with an empty email set
        except TimeoutError:
            print('    Took too long to search text for emails in {0}'.format(link))
            email_set = set()
        # Any other failure: skip this link and move on to the next one
        except Exception:
            print('    Took too long / error with loading {0}'.format(link))
            continue

        # Report how many emails were found
        emails_found = len(email_set)
        print('    Found {0} email(s) at {1}'.format(emails_found, link))

        # Sorted so that the saved output does not depend on set ordering
        if emails_found > 0:
            contact_entries.append(', '.join(sorted(email_set)) + ' (' + link + ');\n')

    # Add any emails that were found to the dataframe
    colleges.at[index, 'contact_info_found'] = ''.join(contact_entries).strip()
    print()

print('Scraping complete')

In [10]:
# Write the dataframe, including the contact info that was found, to the output CSV (row index left out)
colleges.to_csv(path_or_buf=OUTPUT_FILE, index=False)
print('Saved data to csv')

Saved data to csv
