In [1]:
import re
import urllib
import urllib.parse
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

In [56]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session; functions further down assign columns on filtered DataFrames.
pd.options.mode.chained_assignment = None

In [70]:
def process_raw_links(tags, base_url="http://www.collegesimply.com/guides/admission-data/"):
    '''
    Turn scraped anchor tags into a table of school names and absolute links.

    Parameters
    ----------
    tags : iterable
        Anchor-like objects exposing ``.string`` (the link text) and
        ``['href']`` (the link target).
    base_url : str, optional
        URL of the page the tags were scraped from. Each href is resolved
        against it, so root-relative hrefs such as ``/college/...`` map to the
        site root instead of being glued onto the page path.

    Returns
    -------
    pandas.DataFrame
        One row per tag with the columns ``School`` and ``Link``.
    '''
    records = [
        (str(tag.string), urllib.parse.urljoin(base_url, tag['href']))
        for tag in tags
    ]
    return pd.DataFrame.from_records(records, columns=['School', 'Link'])

In [82]:
def write_out_school_ids(links):
    '''
    Assign a sequential integer id to every school and persist the mapping.

    Takes the ``School`` column of ``links``, adds an ``id`` column numbered
    0..n-1 in row order, writes the result to ``school_ids.csv`` (without the
    index) and returns the resulting data frame.
    '''
    names = links['School'].to_frame()
    id_table = names.assign(id=range(len(names)))
    id_table.to_csv("school_ids.csv", index=False)
    return id_table

In [131]:
def create_school_id_map():
    '''
    Read ``school_ids.csv`` and return a dict mapping each value of its
    ``School`` column to the matching value of its ``id`` column.
    '''
    id_table = pd.read_csv('school_ids.csv', index_col=False)
    names = id_table['School']
    numbers = id_table['id']
    # If a name appears more than once, the last row wins.
    return dict(zip(names, numbers))

In [140]:
def get_stats_from_college_simply(ids):
    '''
    Fetch the admission statistics table from collegesimply.com.

    Reads the HTML table whose ``id`` attribute is ``applicationData``,
    collapses runs of whitespace in the ``School`` column, and adds an ``ID``
    column by looking each school name up in ``ids``.

    Raises ``KeyError`` if a scraped school name is missing from ``ids``.
    '''
    def collapse_whitespace(name):
        return ' '.join(name.split())

    def lookup_id(name):
        return ids[name]

    source_url = "http://www.collegesimply.com/guides/admission-data/"
    tables = pd.read_html(source_url, attrs={'id': 'applicationData'})
    stats = tables[0]
    # Normalise the names first so they match the keys of ``ids``.
    stats['School'] = stats['School'].apply(collapse_whitespace)
    stats['ID'] = stats['School'].apply(lookup_id)
    return stats

In [141]:
def get_links_from_college_simply(ids, url="http://www.collegesimply.com/guides/admission-data/"):
    '''
    Scrape the per-school links from the collegesimply.com admission page.

    Parameters
    ----------
    ids : mapping
        School name -> id; every scraped school name must be a key.
    url : str, optional
        Page to scrape. Defaults to the admission-data guide.

    Returns
    -------
    pandas.DataFrame
        Columns ``School``, ``Link`` and ``ID``, one row per school link.

    Raises
    ------
    KeyError
        If a scraped school name is not present in ``ids``.
    '''
    # Close the HTTP response as soon as the document has been parsed.
    with urllib.request.urlopen(urllib.request.Request(url)) as response:
        soup = BeautifulSoup(response, 'html.parser')
    raw_links = process_raw_links(soup.find_all('a', href=re.compile('^/college')))
    # Drop the navigation anchors that match the href pattern but are not schools.
    navigation_labels = ['By State', 'Find Colleges Nearby']
    # Work on an explicit copy so the column assignments below never target a
    # view of ``raw_links``.
    links = raw_links[~raw_links['School'].isin(navigation_labels)].copy()
    # Remove extra whitespace from some of the items.
    links['School'] = links['School'].apply(lambda s: ' '.join(s.split()))
    links['ID'] = links['School'].apply(lambda key: ids[key])
    return links

In [156]:
def get_full_stats_from_college_simply(ids):
    '''
    Combine the scraped statistics and the scraped links into one table.

    Both tables are joined on ``ID``. The school names from the two sources
    must agree row for row (checked with an assertion); the duplicated name
    columns are then replaced by a single ``School`` column, which ends up as
    the last column of the result.
    '''
    stats_table = get_stats_from_college_simply(ids)
    link_table = get_links_from_college_simply(ids)
    merged = stats_table.merge(link_table, on='ID')
    names_agree = merged['School_x'] == merged['School_y']
    assert names_agree.all()
    combined = merged.assign(School=merged['School_y'])
    return combined.drop(['School_x', 'School_y'], axis=1)

In [238]:
_NON_DECIMAL = re.compile(r'[^\d.]+')


def _parse_acceptance_rate(value):
    '''Convert a percentage such as ``'45.3%'`` to the fraction ``0.453``; NaN if unparseable.'''
    if pd.isnull(value):
        # Rows that exist on only one side of the outer merge carry NaN here.
        return float('nan')
    try:
        return float(_NON_DECIMAL.sub('', str(value))) / 100
    except ValueError:
        # Nothing numeric left (e.g. 'N/A') or a malformed number like '1.2.3'.
        return float('nan')


def add_ranking_data(ids, data):
    '''
    Merge the ranking data from ``college_stats.csv`` into ``data``.

    Parameters
    ----------
    ids : mapping
        School name -> id. Ranking rows whose name is not a key are dropped.
    data : pandas.DataFrame
        Table with an ``ID`` column to merge the rankings into.

    Returns
    -------
    pandas.DataFrame
        Outer merge of ``data`` and the rankings on ``ID``, restricted to a
        fixed column order and sorted by ascending ``Acceptance Rate``.
        ``Acceptance Rate`` is converted from a percentage string to a
        fraction; missing or unparseable values become NaN and sort last.

    Raises
    ------
    KeyError
        If any of the expected output columns is missing after the merge.
    '''
    rankings = pd.read_csv('college_stats.csv')
    rankings['College/University'] = rankings['College/University'].apply(lambda s: ' '.join(s.split()))
    # Filter out ranked data for which we have no school ids. Copy the result
    # so that adding the ID column never writes into a view of the raw frame.
    has_id = rankings['College/University'].apply(lambda key: key in ids).astype(bool)
    rankings = rankings[has_id].copy()
    rankings['ID'] = rankings['College/University'].apply(lambda key: ids[key])
    res = data.merge(rankings, on='ID', how='outer')
    # Assume College/University is redundant
    del res['College/University']
    # Reorder columns
    cols = ["ID","School","School Type","Region","US World Ranking",
            "Acceptance Rate","Average GPA",
            "New EBRW 25th","New EBRW 75th","New Math 25th","New Math 75th",
            "SAT Total 25th","SAT TOTAL 75th","SAT 25","SAT 75",
            "ACT 25","ACT 75","ACT Comp 25th","ACT Comp 75th",
            "Test Optional or Test Flexible", "State","Link"]
    res['Acceptance Rate'] = res['Acceptance Rate'].apply(_parse_acceptance_rate)
    return res[cols].sort_values(['Acceptance Rate'])

In [164]:
# School name -> id mapping, loaded from school_ids.csv.
schools = create_school_id_map()

In [158]:
# Scrape the statistics table and the school links, joined on school id.
college_simply = get_full_stats_from_college_simply(schools)

In [239]:
# Merge in the ranking data read from college_stats.csv.
total_set = add_ranking_data(schools, college_simply)

In [241]:
# Persist the combined table without the DataFrame index.
total_set.to_csv('final_rankings.csv', index=False)