In [1]:
import bs4
from IPython.display import IFrame, HTML
import os
import pandas as pd
import posixpath
import re
import requests
from requests.compat import urljoin, urlparse

# CS 109A/STAT 121A/AC 209A/CSCI E-109A: Final

**Harvard University**<br/>
**Fall 2017**<br/>
**Instructors**: Pavlos Protopapas, Kevin Rader, Rahul Dave, Margo Levine

----

_Web scraping component of Crime project._

In order to develop a predictive model for crime in the US, we must first digest the data available to convert it to a usable, standardized format. Our first goal, then, is to scrape data from UCI crime tables from each yearly report published as html online.

In [2]:
baseurl = 'https://ucr.fbi.gov/ucr-publications'
req = requests.get(baseurl)
soup = bs4.BeautifulSoup(req.text, 'html.parser')

In [3]:
# Get links matching a 4-digit year and check if they are for the "Crime in the US" Report
year_links = soup.find_all(text=re.compile('\d{4}'), name='a')

# Keep track of the top-level links
crime_main_urls = []
for link in year_links:
    if 'crime-in-the-u.s' in link['href'] or 'cius' in link['href']:
        crime_main_urls.append(link['href'])
crime_main_urls

['https://ucr.fbi.gov/crime-in-the-u.s/2016/crime-in-the-u.s.-2016',
 'https://ucr.fbi.gov/crime-in-the-u.s/2015/crime-in-the-u.s.-2015',
 'https://ucr.fbi.gov/crime-in-the-u.s/2014/crime-in-the-u.s.-2014/cius-home',
 'https://ucr.fbi.gov/crime-in-the-u.s/2013/crime-in-the-u.s.-2013/cius-home',
 'https://ucr.fbi.gov/crime-in-the-u.s/2012/crime-in-the-u.s.-2012',
 'https://ucr.fbi.gov/crime-in-the-u.s/2011/crime-in-the-u.s.-2011',
 'https://ucr.fbi.gov/crime-in-the-u.s/2010/crime-in-the-u.s.-2010',
 'http://www2.fbi.gov/ucr/cius2009/rankingmessage.htm',
 'http://www2.fbi.gov/ucr/cius2008/index.html',
 'http://www2.fbi.gov/ucr/cius2007/index.html',
 'http://www2.fbi.gov/ucr/cius2006/index.html',
 'http://www2.fbi.gov/ucr/05cius/',
 'http://www2.fbi.gov/ucr/cius_04/',
 'https://ucr.fbi.gov/crime-in-the-u.s/2003',
 'https://ucr.fbi.gov/crime-in-the-u.s/2002',
 'https://ucr.fbi.gov/crime-in-the-u.s/2001',
 'https://ucr.fbi.gov/crime-in-the-u.s/2000',
 'https://ucr.fbi.gov/crime-in-the-u.s/1

In [4]:
def navigate_to_msa_table(year_url):
    '''Follow a prescribed garden path to find the MSA table url for a given year report'''

    # Strip the current link to just the home page (special handling of warning)
    if year_url.endswith('rankingmessage.htm'):
        year_url = year_url.replace('/rankingmessage.htm', '')
    if year_url.endswith('index.html'):
        year_url = year_url.replace('/index.html', '')
    
    # Fetch and parse the page for this year
    req = requests.get(year_url)
    soup = bs4.BeautifulSoup(req.text, 'html.parser')
    
    # Pull out the (first) link that mentions Violent Crime and clean the link
    crime_url = soup.find(string=re.compile('Violent'), name='a')['href']
    if not crime_url.startswith('http'):
        crime_url = '/'.join([year_url, crime_url])
    if crime_url.endswith('index.html'):
        crime_url = crime_url.replace('/index.html', '/')
    
    # Fetch and parse the Violent Crime page
    req = requests.get(crime_url)
    soup = bs4.BeautifulSoup(req.text, 'html.parser')
    
    # Pull out the link for metropolitain statistical areas
    msa_url = soup.find(string=re.compile('Metropoliti?an Statistical Area'), name='a')['href']
    if msa_url.startswith('.'):
        msa_url = crime_url + msa_url

    return resolveComponents(msa_url)


def resolveComponents(url):
    """
    Mostly copied from https://stackoverflow.com/a/4317446/218118
    
    >>> resolveComponents('http://www.example.com/foo/bar/../../baz/bux/')
    'http://www.example.com/baz/bux/'
    >>> resolveComponents('http://www.example.com/some/path/../file.ext')
    'http://www.example.com/some/file.ext'
    """

    parsed = urlparse(url)
    new_path = posixpath.normpath(parsed.path)
    if parsed.path.endswith('/'):
        # Compensate for issue1707768
        new_path += '/'
    cleaned = parsed._replace(path=new_path)
    return cleaned.geturl()

msa_urls = []

for year_url in crime_main_urls:
    try:
        msa_urls.append(navigate_to_msa_table(year_url))
    except TypeError:
        print('-'*10 + ' Error with top-level url - {}'.format(year_url))
msa_urls

---------- Error with top-level url - http://www2.fbi.gov/ucr/05cius/
---------- Error with top-level url - http://www2.fbi.gov/ucr/cius_04/
---------- Error with top-level url - https://ucr.fbi.gov/crime-in-the-u.s/2003
---------- Error with top-level url - https://ucr.fbi.gov/crime-in-the-u.s/2002
---------- Error with top-level url - https://ucr.fbi.gov/crime-in-the-u.s/2001
---------- Error with top-level url - https://ucr.fbi.gov/crime-in-the-u.s/2000
---------- Error with top-level url - https://ucr.fbi.gov/crime-in-the-u.s/1999
---------- Error with top-level url - https://ucr.fbi.gov/crime-in-the-u.s/1998
---------- Error with top-level url - https://ucr.fbi.gov/crime-in-the-u.s/1997
---------- Error with top-level url - https://ucr.fbi.gov/crime-in-the-u.s/1996


['https://ucr.fbi.gov/crime-in-the-u.s/2016/crime-in-the-u.s.-2016/tables/table-4',
 'https://ucr.fbi.gov/crime-in-the-u.s/2015/crime-in-the-u.s.-2015/tables/table-6',
 'https://ucr.fbi.gov/crime-in-the-u.s/2014/crime-in-the-u.s.-2014/tables/table-6',
 'https://ucr.fbi.gov/crime-in-the-u.s/2013/crime-in-the-u.s.-2013/tables/6tabledatadecpdf/table-6',
 'https://ucr.fbi.gov/crime-in-the-u.s/2012/crime-in-the-u.s.-2012/tables/6tabledatadecpdf',
 'https://ucr.fbi.gov/crime-in-the-u.s/2011/crime-in-the-u.s.-2011/tables/table-6',
 'https://ucr.fbi.gov/crime-in-the-u.s/2010/crime-in-the-u.s.-2010/tables/table-6',
 'http://www2.fbi.gov/ucr/cius2009/data/table_06.html',
 'http://www2.fbi.gov/ucr/cius2008/data/table_06.html',
 'http://www2.fbi.gov/ucr/cius2007/data/table_06.html',
 'http://www2.fbi.gov/ucr/cius2006/data/table_06.html']

Now that we have links to the report tables for each year, parse them to grab relevant details. The format of the table is actually variable, but in general:

* "Major" rows are 1 per MSA. This takes many differnt forms in the html. 

Per MSA:

* Row 1: the header (MSA name) and total population
* Row 2: Empty except counties' names in column 2 (but this changes depending on format of html headers)
* Row 3-X: full stats for major cities (one row per city)
* 4 (generally): full stats for total reporting
* 5 (when not 100% fully reporting): full stats for estimated 100% reporting
* 5: Normalized rate per 100,000 of total reporting


In [5]:
year_pat = re.compile('(\d{4})')
def parse_msa_table(table_url):
    # Retrieve the table
    req = requests.get(table_url)
    soup = bs4.BeautifulSoup(req.text, 'html.parser')
    tab1 = soup.find(name='table', class_='data')
    
    # Extract column names. There are lots of "presentation" tricks that the html reports'
    # devlopers used (including spaces and new-line characters in the column headers) that
    # are interpeted strangely by the beautifulsoup parser and must be normalized;
    # otherwise columns that appear to be identical strings are actaully saved separately.
    header_cells = tab1.find('thead').find_all('th')[1:]
    table_cols = [re.sub('\s+', ' ', t.get_text(separator=' ').strip()) for t in header_cells]

    # Parse the MSA info and create a dataframe
    rows = tab1.find('tbody').find_all('tr')
    msa_list = []
    current_msa = None
    for row in rows:
        try:
            current_msa = parse_msa_row(row, current_msa, msa_list, table_cols) 
        except TypeError as e:
            # Some rows are empty, containing only empty tds with no th cells.
            # Continue to the next row in that case.
            print(e)
            continue


    # Combine each MSA's stats into a single large dataframe
    all_msa_df = pd.concat(msa_list)

    # Read the year from the url and add it to the df
    year = year_pat.search(table_url).groups()[0]
    all_msa_df['year'] = year
    all_msa_df = all_msa_df.convert_objects(convert_numeric=True)
    
    # Collapse and rename some known problem columns (inconsistent across years)
    sanitize_columns(all_msa_df, 'Larceny', 'Larceny')
    sanitize_columns(all_msa_df, 'Rape', 'Rape')

    return all_msa_df


def sanitize_columns(df, query_label, new_column_label, inplace=True):
    '''Search for a column and rename it to a new saner column name'''
    cols = df.columns
    query_cols = cols[cols.to_series().str.contains(query_label, case=False)]
    if len(query_cols) > 1:
        len_msg = ('Unexpected columns found - should only have one {} '
                   'column, but got {}'.format(query_label, ', '.join(query_cols)))
        raise StandardError(len_msg)
    query_col = query_cols[0]
    updated = df.rename(columns={query_col: new_column_label}, inplace=inplace)
    
    # Updated contains the return value of df.rename, which will be either
    # the new dataframe or None depending on the value of inplace
    return updated


def parse_msa_row(row, current_msa, msa_list, table_cols):
    '''Parse a row and adjust the current msa as appropriate.
    
    Since the MSA object is really a set of rows, and those rows aren't well defined (MDs) interspersed,
    the row parser needs the current row, the current msa (to adjust attributes), and the list of 
    all MSAs (MSA dicts are only added to the list when the previous MSA has been completed and a new
    header has been found). Updates MSA list and current MSA in place; returns None '''
    row_type = determine_row_type(row)
    
    if row_type == 'header':
        if current_msa:
            msa_df = msa_to_df(current_msa, table_cols)
            msa_list.append(msa_df)
        current_msa = parse_header_row(row)
    elif row_type == 'county_list':
        counties = parse_county_row(row)
        current_msa['counties'] = counties
    elif row_type == 'table_row':
        counts = parse_numerical_row(row)
        current_msa['counts'].append(counts)
    else:
        raise StandardError('Unrecognized row type: {}'.format(row_type))
    return current_msa


def determine_row_type(row):
    '''Scan a html row and determine which function should parse it.
    
    Primary row types:
    - 
    - 
    '''
    th = row.find('th')
    if len(th['class']):
        if in_list(th['class'], ['subguide1', 'subguide2', 'group0']) and len(th.text.strip()):
            # subguide1 is for MSA; subguide2 is for MD (Metropolitan Division), smaller breakdowns of MSAs
            # In later years, "groupN" is the 0-indexed column number. Group 0 is the first column, or header
            # In 2010, a th in col0 appears for some numeric columns.
            row_type = 'header'
        # Check if any item in the header's classes is a member of the following header groups
        # This list comprehension is the fastest way to check intersection
        else:  # in_list(th['class'], ['subguide1e', 'group1']):
            if 'Includes' in th.text or 'Includes' in row.find('td').text:
                row_type = 'county_list'
            else:
                row_type = 'table_row'
    return row_type


def in_list(query_list, target_list):
    '''Return True if any member of ListA is in ListB'''
    return bool(len(list([i for i in query_list if i in target_list])))


def parse_header_row(row):
    th = row.find('th')
    msa_label = th.text.strip()
    try:
        pop_idx = 0
        msa_pop = parse_pop(row, pop_idx)
    except ValueError:
        pop_idx = 1
        msa_pop = parse_pop(row, pop_idx)

    new_msa = dict(msa_label=msa_label, msa_pop=msa_pop, counts=[])
    return new_msa


def parse_pop(row, pop_idx):
    return int(row.find_all('td')[pop_idx].text.strip().replace(',', ''))


def parse_county_row(row):
    counties = row.find('th').text.strip()
    return counties


def parse_numerical_row(row):
    label = row.find('th').text.strip()
    cols = [column_value.text.replace(',', '').replace('%', '').strip() for column_value in row.find_all('td')]
    if len(row.find_all('th')) == 2:
        cols.insert(0, '')
    if len(label):
        cols.insert(0, label)
    return cols


def msa_to_df(current_msa, table_cols):
    df = pd.DataFrame([c for c in current_msa['counts']], columns=table_cols)
    for k in set(current_msa.keys()) - set(['counts']):
        df[k] = current_msa[k]
    return df.convert_objects(convert_numeric=True)

msa_dfs = []
for url in msa_urls:
    print(url)
    msa_dfs.append(parse_msa_table(url))

all_year_dfs = pd.concat(msa_dfs, ignore_index=True)
all_year_dfs.describe()

https://ucr.fbi.gov/crime-in-the-u.s/2016/crime-in-the-u.s.-2016/tables/table-4




https://ucr.fbi.gov/crime-in-the-u.s/2015/crime-in-the-u.s.-2015/tables/table-6
https://ucr.fbi.gov/crime-in-the-u.s/2014/crime-in-the-u.s.-2014/tables/table-6
https://ucr.fbi.gov/crime-in-the-u.s/2013/crime-in-the-u.s.-2013/tables/6tabledatadecpdf/table-6
https://ucr.fbi.gov/crime-in-the-u.s/2012/crime-in-the-u.s.-2012/tables/6tabledatadecpdf
https://ucr.fbi.gov/crime-in-the-u.s/2011/crime-in-the-u.s.-2011/tables/table-6
https://ucr.fbi.gov/crime-in-the-u.s/2010/crime-in-the-u.s.-2010/tables/table-6
http://www2.fbi.gov/ucr/cius2009/data/table_06.html
http://www2.fbi.gov/ucr/cius2008/data/table_06.html
'NoneType' object is not subscriptable
http://www2.fbi.gov/ucr/cius2007/data/table_06.html
http://www2.fbi.gov/ucr/cius2006/data/table_06.html


Unnamed: 0,Aggravated assault,Burglary,Larceny,Motor vehicle theft,Murder and nonnegligent manslaughter,Population,Property crime,Rape,Robbery,Violent crime,msa_pop,year
count,16092.0,16146.0,16136.0,16187.0,16258.0,12155.0,15976.0,16015.0,16255.0,15865.0,16262.0,16262.0
mean,1129.884004,2686.503029,8598.836211,1258.15387,24.734279,84367.47,12415.848579,129.201611,659.827579,1919.413508,1305731.0,2011.046796
std,3142.516418,6228.53273,20757.650086,3824.970104,74.903191,318227.1,30048.212887,293.164424,2297.988234,5627.500873,2620349.0,3.143743
min,0.0,4.0,6.0,0.0,0.0,75.2,94.0,0.0,0.0,1.0,33156.0,2006.0
25%,145.2,446.125,1664.15,115.0,2.0,100.0,2312.45,22.7,48.0,240.8,154139.0,2008.0
50%,292.5,815.95,2536.15,236.0,5.0,18742.0,3622.7,41.0,106.0,454.0,337011.0,2011.0
75%,709.0,1888.0,5824.5,615.0,13.0,76079.0,8239.25,95.15,296.95,1127.0,1006870.0,2014.0
max,44100.0,68227.0,264374.0,75245.0,1092.0,8566917.0,357850.0,4968.0,37129.0,82359.0,20177990.0,2016.0


In [6]:
print(all_year_dfs.shape)
all_year_dfs.head()

(16262, 15)


Unnamed: 0,Aggravated assault,Burglary,Counties/principal cities,Larceny,Motor vehicle theft,Murder and nonnegligent manslaughter,Population,Property crime,Rape,Robbery,Violent crime,counties,msa_label,msa_pop,year
0,335.0,905.0,City of Abilene,3113.0,251.0,8.0,122523.0,4269.0,70.0,133.0,546.0,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
1,387.0,1104.0,Total area actually reporting,3530.0,298.0,11.0,100.0,4932.0,79.0,137.0,614.0,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
2,227.8,649.9,"Rate per 100,000 inhabitants",2077.9,175.4,6.5,,2903.1,46.5,80.6,361.4,"Includes Callahan, Jones, and Taylor Counties","Abilene, TX M.S.A.",169885,2016
3,555.0,2483.0,City of Akron,5483.0,680.0,34.0,197257.0,8646.0,212.0,415.0,1216.0,Includes Portage and Summit Counties,"Akron, OH M.S.A.2",703561,2016
4,833.0,3865.0,Total area actually reporting,12599.0,989.0,41.0,96.2,17453.0,335.0,530.0,1739.0,Includes Portage and Summit Counties,"Akron, OH M.S.A.2",703561,2016


The primary output of the scraping is a **AC209_RawScraped.csv**, a single csv table where all year's data is stored. This is more or less a mirror of all the tables in all the reports, regardless of row type (raw, 100% estimated, or normalized values, Metro/Micro statistical area, etc.)

In [7]:
all_year_dfs.to_csv('data/derived/AC209_RawScraped.csv', encoding='utf_8')