## Running all web-scraping scripts
Importing modules:

In [182]:
import requests

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import re

import os
from datetime import datetime

The file `placement_univ_acronym.xlsx` lists, for each university: its full name, its location, its US News economics ranking, the link to its economics placement page, free-form notes, and the acronym used to identify it.

In [183]:
# Load the university metadata spreadsheet and preview its first rows.
links = pd.read_excel("placement_univ_acronym.xlsx")
links.head()

Unnamed: 0,university,place,usnews,link,note,acronym
0,Harvard University,"Cambridge, MA",#1 in Economics (tie),https://economics.harvard.edu/placement,,harvard
1,Massachusetts Institute of Technology,"Cambridge, MA",#1 in Economics (tie),https://economics.mit.edu/graduate/career,pdf,mit
2,Princeton University,"Princeton , NJ",#1 in Economics (tie),https://economics.princeton.edu/graduate-progr...,pdf,princeton
3,Stanford University,"Stanford, CA",#1 in Economics (tie),https://economics.stanford.edu/graduate/studen...,,stanford
4,University of California--Berkeley,"Berkeley, CA",#1 in Economics (tie),https://www.econ.berkeley.edu/grad/program/pla...,noname,berkley


Helper functions used for web scraping:

In [184]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        the HTTP request). Without a timeout a single unresponsive site
        would block the whole scraping run indefinitely.

    Returns
    -------
    bytes or None
        The raw response body (bytes, not decoded text) when
        `is_good_response` accepts the response; None when it does not, or
        when the request raised a RequestException (the error is reported
        through `log_error`).
    """
    try:
        # stream=True defers downloading the body until `.content` is read,
        # so rejected responses are never downloaded.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
    
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (case-insensitive). A response without a
    Content-Type header is treated as not HTML instead of raising KeyError.
    """
    # Use .get() so a missing header yields None rather than an exception.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()


def log_error(e):
    """
    Report an error.

    The message is simply printed to standard output; replace the body
    with real logging if something more durable is needed.
    """
    print(e)
    
def find_pattern(s, srt, end):
    """
    Return every piece of text in `s` that sits between `srt` and `end`.

    `srt` and `end` are inserted into the regular expression as-is (they are
    not escaped): `srt` becomes a look-behind and `end` a look-ahead, so
    neither is included in the returned strings. The match in between is
    greedy and never crosses a line break, so when a line holds several
    `srt`/`end` pairs a single match spans from the first `srt` to the last
    `end` on that line.
    """
    regex = ''.join(['(?<=', srt, ').*(?=', end, ')'])
    return re.findall(regex, s, flags=re.M)

def assign_year(s, names, year):
    """
    Label each entry of `names` with the year marker whose section contains it.

    Parameters
    ----------
    s : str
        Full text that both the names and the year markers were taken from.
    names : list of str
        Strings to label, ideally in the order they occur in `s`.
    year : list of str
        Year markers, ideally in the order they occur in `s`. Marker j
        "owns" the text from its own position up to the next marker (or the
        end of `s` for the last marker).

    Returns
    -------
    list of str
        One element of `year` per name that falls inside some marker's
        section. A name located before the first marker belongs to no
        section and contributes nothing, so the result can be shorter than
        `names`.
    """
    def _locate(tokens):
        # Search forward from the end of the previous hit so that a repeated
        # token (e.g. the same employer listed under two different years)
        # resolves to its own occurrence instead of always the first one.
        positions = []
        cursor = 0
        for token in tokens:
            idx = s.find(token, cursor)
            if idx == -1:
                # Token is not ahead of the cursor (input not in document
                # order): fall back to a whole-text search.
                idx = s.find(token)
            if idx != -1:
                cursor = idx + len(token)
            positions.append(idx)
        return positions

    name_idx = _locate(names)
    year_idx = _locate(year)

    # Sentinel closing the section of the last year marker.
    year_idx.append(len(s))

    name_year = []
    for pos in name_idx:
        # Iterate over the real markers only; the sentinel has no successor.
        name_year.extend(
            year[j] for j in range(len(year))
            if year_idx[j] <= pos < year_idx[j + 1]
        )

    return name_year

def fast_parse_1(html, name_parser, placement_parser, year_parser):
    """
    Scrape a placement page on which entries are grouped under year headings.

    Parameters
    ----------
    html : str
        URL of the placement page.
    name_parser : sequence of two items
        [start, end] regex fragments surrounding each name (see
        `find_pattern`), or [None, None] when the page lists no names.
    placement_parser : sequence of two str
        [start, end] regex fragments surrounding each placement.
    year_parser : sequence of two str
        [start, end] regex fragments surrounding each year heading.

    Returns
    -------
    pandas.DataFrame or list
        A DataFrame with columns 'Year', 'Name', 'Placement' (Name is None
        when no name parser is given). If names are requested but their
        count differs from the number of placements, the raw lists
        [placement, names] are returned instead so they can be inspected.

    Raises
    ------
    ValueError
        If the page could not be retrieved as HTML.
    """
    # read the content of the placement website
    raw_html = simple_get(html)
    if raw_html is None:
        # simple_get returns None for failed requests and non-HTML responses.
        raise ValueError('No HTML content could be retrieved from {0}'.format(html))
    s = str(BeautifulSoup(raw_html, 'html.parser'))

    srt, end = placement_parser
    placement = find_pattern(s, srt, end)

    # list(...) lets a tuple (None, None) be treated like [None, None].
    if list(name_parser) != [None, None]:
        srt, end = name_parser
        names = find_pattern(s, srt, end)
        if len(names) != len(placement):
            return [placement, names]
        # With names available, their positions decide the year.
        anchors = names
    else:
        names = [None] * len(placement)
        # Without names, the placements' positions decide the year.
        anchors = placement

    srt, end = year_parser
    # Re-attach the start marker so the heading itself (not any identical
    # text elsewhere on the page) is located, then strip it again afterwards.
    year_new = [srt + i for i in find_pattern(s, srt, end)]
    name_year = [i.replace(srt, '') for i in assign_year(s, anchors, year_new)]

    # NOTE(review): rows are paired by position; if assign_year skips an
    # entry (one located before the first year heading), the following rows
    # shift relative to names/placement -- confirm against real pages.
    rows = [[name_year[i], names[i], placement[i]] for i in range(len(name_year))]

    # Passing columns= keeps the three columns even when no rows were found.
    return pd.DataFrame(rows, columns=['Year', 'Name', 'Placement'])
    
def fast_parse_2(html, name_parser, placement_parser, year_parser):
    """
    Scrape a placement page on which every entry carries its own year.

    Parameters
    ----------
    html : str
        URL of the placement page.
    name_parser, placement_parser, year_parser : sequence of two str
        [start, end] regex fragments surrounding each name, placement and
        year respectively (see `find_pattern`).

    Returns
    -------
    pandas.DataFrame or list
        A DataFrame with columns 'Year', 'Name', 'Placement' when the three
        extracted lists have equal length (entries are paired by position).
        Otherwise the raw lists [placement, names, year] are returned so the
        mismatch can be inspected.

    Raises
    ------
    ValueError
        If the page could not be retrieved as HTML.
    """
    # read the content of the placement website
    raw_html = simple_get(html)
    if raw_html is None:
        # simple_get returns None for failed requests and non-HTML responses.
        raise ValueError('No HTML content could be retrieved from {0}'.format(html))
    s = str(BeautifulSoup(raw_html, 'html.parser'))

    srt, end = placement_parser
    placement = find_pattern(s, srt, end)

    srt, end = name_parser
    names = find_pattern(s, srt, end)

    srt, end = year_parser
    year = find_pattern(s, srt, end)

    if not (len(names) == len(placement) == len(year)):
        return [placement, names, year]

    rows = [[year[i], names[i], placement[i]] for i in range(len(names))]

    # Passing columns= keeps the three columns even when no rows were found.
    return pd.DataFrame(rows, columns=['Year', 'Name', 'Placement'])
    
    
def print_out_html(html):
    """
    Download the page at URL `html` and return its parsed markup as a string.

    Despite the name nothing is printed; the string is returned so it can be
    inspected when working out the start/end markers for a new page.

    Raises
    ------
    ValueError
        If the page could not be retrieved as HTML.
    """
    # read the content of the placement website
    raw_html = simple_get(html)
    if raw_html is None:
        # simple_get returns None for failed requests and non-HTML responses.
        raise ValueError('No HTML content could be retrieved from {0}'.format(html))

    return str(BeautifulSoup(raw_html, 'html.parser'))

For each university that has a working web-scraping script, run the script and save its economics placement data as a CSV file.

In [211]:
# Collect the web-scraping scripts. Only files directly inside the folder are
# used (sub-folders such as notebook checkpoints are ignored), hidden files
# are skipped, and the list is sorted so runs are reproducible.
dirpath = 'websracp_univ'
fn_li = sorted(
    fn for fn in os.listdir(dirpath)
    if os.path.isfile(os.path.join(dirpath, fn)) and not fn.startswith('.')
)

# Make sure the output folder exists before the first CSV is written.
os.makedirs('data_by_univ', exist_ok=True)

for fn in fn_li:

    # The part of the file name before the first '.' is the university acronym.
    univ = fn.split('.')[0]
    print('Processing university: {}'.format(univ))

    script_path = os.path.join(dirpath, fn)
    with open(script_path) as script_file:
        script_source = script_file.read()

    # NOTE: exec runs whatever code the script file contains with full access
    # to this notebook's namespace -- only keep trusted scripts in the folder.
    # Compiling with the path makes tracebacks point at the offending script.
    exec(compile(script_source, script_path, 'exec'))

    # Each script is expected to leave behind a variable named after its
    # acronym holding the scraped table.
    univ_df = globals()[univ]
    univ_df['Acronym'] = univ
    univ_df.to_csv(os.path.join('data_by_univ', univ + '.csv'), index = False)

Processing university: american
Processing university: bc
Processing university: bu
Processing university: clemson
Processing university: columbia
Processing university: cub
Processing university: duke
Processing university: fsu
Processing university: gsu
Processing university: harvard
Processing university: iub
Processing university: lsu
Processing university: missouri
Processing university: msu
Processing university: nyu
Processing university: stanford
Processing university: ucb
Processing university: uci
Processing university: ucla
Processing university: uconn
Processing university: ucsd
Processing university: uh
Processing university: uiuc
Processing university: ukansas
Processing university: uky
Processing university: umn
Processing university: upenn
Processing university: uta
Processing university: utah
Processing university: utk
Processing university: uva
Processing university: vandy
Processing university: vt


Combine all saved CSV files into a single dataset:

In [212]:
# Collect the per-university CSV files. Only files directly inside the folder
# are used, and the list is sorted so the row order is reproducible.
dirpath = 'data_by_univ'
fn_li = sorted(
    fn for fn in os.listdir(dirpath)
    if fn.endswith('.csv') and os.path.isfile(os.path.join(dirpath, fn))
)

# Read every file first and concatenate once: DataFrame.append no longer
# exists in pandas 2.x, and growing a frame inside a loop is quadratic.
frames = [pd.read_csv(os.path.join(dirpath, fn)) for fn in fn_li]

# pd.concat refuses an empty list, so fall back to an empty frame.
df = pd.concat(frames) if frames else pd.DataFrame()

print('Number of obs: {}'.format(len(df)))

Number of obs: 4196


Save the combined placement dataset:

In [213]:
# Toggle: when True, today's date (YYYY-MM-DD) is added to the file name.
add_date = True

filename = (
    'data//data' + datetime.today().strftime('%Y-%m-%d') + '.csv'
    if add_date
    else "data//data.csv"
)

# Write the combined placement data without the index column.
df.to_csv(filename, index = False)

This is a list of universities contained in our sample:

In [214]:
# Distinct university acronyms present in the combined dataset.
df['Acronym'].unique()

array(['american', 'bc', 'bu', 'clemson', 'columbia', 'cub', 'duke',
       'fsu', 'gsu', 'harvard', 'iub', 'lsu', 'missouri', 'msu', 'nyu',
       'stanford', 'ucb', 'uci', 'ucla', 'uconn', 'ucsd', 'uh', 'uiuc',
       'ukansas', 'uky', 'umn', 'upenn', 'uta', 'utah', 'utk', 'uva',
       'vandy', 'vt'], dtype=object)