In [None]:
import math
import os
import urllib.error
from os.path import join as opj

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests import HTTPError, ConnectionError
from sodapy import Socrata

I don't currently know why I have to do the following to get `lxml` to work inside the container, but a variety of changes to the `jupyter/Dockerfile` specification did not work and the first five StackOverflow suggestions didn't obviate the need for this.

In [None]:
!pip install lxml

It's a total hack; I need (for my own sake) to find out what's preventing `lxml` from working without it, but it works for now.

### Data Sources

In [None]:
# CSV of 2010 Census population counts keyed by ZIP code (ZCTA).
POPULATION_DATASET_URL = (
    'https://s3.amazonaws.com/SplitwiseBlogJB/'
    '2010+Census+Population+By+Zipcode+(ZCTA).csv'
)

# New York State Department of Health page with the table of NYC ZIP codes.
NYC_ZIPCODES_DATASET_URL = (
    'https://www.health.ny.gov/statistics/cancer/'
    'registry/appendix/neighborhoods.htm'
)

# Socrata domain (a bare host name, not a full URL) and the id of the
# dataset that is queried for the 311 complaints.
COMPLAINTS_DATASET_URL = 'data.cityofnewyork.us'
DATASET_ID = 'fhrw-4uyv'

### 311 Data Utility Functions

In [None]:
def get_n_records(year=2017):
    """Count the 311 complaints created in a given calendar year.

    Args:
        year (int): Calendar year to count records for.

    Returns:
        int: Number of records whose `created_date` falls in `year`.
    """
    # int() keeps anything non-numeric out of the query text.
    query = f"""
    select count(unique_key)
    where date_extract_y(created_date) = {int(year)}
    """
    # Open a dedicated client, like `download_311_data` does, instead of
    # relying on a global `client` that this notebook never defines.
    with Socrata(COMPLAINTS_DATASET_URL, None, timeout=90) as client:
        return int(client.get(DATASET_ID, query=query)[0]['count_unique_key'])

get_n_records()

We need to download about 3 million records for 2017.

In [None]:
def download_311_data():
    """Download the 2017 NYC 311 complaints as a series of CSV batches.

    Each batch of up to 2,000 records is written to
    `../data/raw/nyc-311-complaints-2017-<batch>.csv`. Paging stops as soon
    as the API returns no more rows, or after 1,500 batches.
    """
    loop_size = 2000
    n_loops = 1500
    # Make sure the target directory exists before the first write.
    os.makedirs('../data/raw', exist_ok=True)
    with Socrata(COMPLAINTS_DATASET_URL, None, timeout=90) as client:
        for i in range(n_loops):
            if i % 10 == 0:
                print(f"Retrieving batch {i}...")
            results = client.get(
                DATASET_ID,
                # `created_date` is used by the cleaning and export cells
                # later in the notebook, so it has to be downloaded too.
                select=('unique_key,created_date,complaint_type,descriptor,'
                        'borough,city,incident_zip,latitude,longitude'),
                where='date_extract_y(created_date)=2017',
                # A stable sort keeps limit/offset pages from overlapping
                # or skipping rows.
                order='unique_key',
                limit=loop_size,
                offset=loop_size*i
            )
            if not results:
                # Past the last record: stop instead of writing empty files.
                break
            pd.DataFrame.from_records(results).to_csv(
                f'../data/raw/nyc-311-complaints-2017-{i}.csv', index=False)

In [None]:
def get_population_by_zip(url):
    """Retrieve the 2010 Census population-by-ZIP-code table.

    Args:
        url (str): Location of the CSV file to load.

    Returns:
        pandas.DataFrame or None: The parsed CSV, or None when the download
        fails (the error is printed).
    """
    try:
        return pd.read_csv(url)
    # pandas fetches URLs through urllib, so download failures surface as
    # urllib errors; the requests HTTPError alone would never match here.
    except (HTTPError, urllib.error.URLError) as e:
        print("Population dataset could not be retrieved:", e)
        
def scrape_nyc_zips(url):
    """Fetch the page holding the table of NYC ZIP codes.

    The page is published by the New York State Department of Health.

    Args:
        url (str): Address of the page to download.

    Returns:
        requests.models.Response or None: The successful response, or None
        when the server answers with an HTTP error (the error is printed).
    """
    try:
        # A timeout keeps the cell from hanging forever on a stalled server.
        r = requests.get(url, timeout=30)
        # requests does not raise on 4xx/5xx by itself; without this call
        # the HTTPError handler below could never run.
        r.raise_for_status()
        return r
    except HTTPError as e:
        print("NYC neighborhood ZIP code lookup table not found:", e)


# TODO: Refactor to have utility functions for, for example, the
# "tidying" aspects and the conversion aspects ... and rename this
# function to something more sensible
def tidy_nyc_zips(html):
    """ Wrangle HTML table of NYC ZIP codes into a "tidy" data frame

    The first table on the page is flattened so that every output row
    holds exactly one (borough, ZIP code) pair.

    Args:
        html (requests.models.Response): Response whose `content` holds
            the HTML page with the ZIP code table.

    Returns:
        pandas.DataFrame: One row per ZIP code, with the columns
            `borough` and `zip_code` (values converted to int).
    """

    # TODO: This seems too ugly and hacky so find a more elegant
    # solution
    # Parse the first table on the page, using its first row as the
    # header; reset_index() adds an explicit `index` column that serves
    # as a row key further down.
    borough_zips = (
        pd.read_html(html.content, header=0)[0]
          .reset_index()
    )

    # Rows without a 'ZIP Codes' value get their 'Borough' blanked here
    # and take their ZIP codes from 'Neighborhood' below.
    # NOTE(review): presumably those rows are shifted one column to the
    # left because the borough cell spans several rows in the source
    # HTML -- confirm against the page.
    borough_zips.loc[borough_zips['ZIP Codes'].isnull(), 'Borough'] = np.nan
    # Remove the spaces inside the comma-separated ZIP code strings.
    borough_zips.loc[:, 'ZIP Codes'] = \
        borough_zips.loc[:, 'ZIP Codes'].str.replace(' ', '')

    # Where 'ZIP Codes' is missing, fall back to the 'Neighborhood' value.
    borough_zips.loc[:, 'ZIP Codes'] = (
        borough_zips.loc[:, 'ZIP Codes']
                    .combine_first(borough_zips['Neighborhood'])
    )

    # TODO: keep the neighborhood information, even though it's not
    # currently necessary for this analysis
    borough_zips.drop('Neighborhood', axis=1, inplace=True)
    # Propagate each borough name down over the rows blanked above.
    borough_zips.loc[:, 'Borough'] = \
        borough_zips.loc[:, 'Borough'].ffill()

    # Overwrite the comma-separated string "list" in the cell
    # with an actual list of strings (converted to int at the end)
    borough_zips.loc[:, 'ZIP Codes'] = (
        borough_zips.loc[:, 'ZIP Codes']
                    .apply(lambda x: x.split(','))
    )

    # TODO: Write utility function for this pattern
    # Explode the lists: one column per list element, then stack those
    # columns into rows so each ZIP code ends up on its own row.
    borough_zips = (
        borough_zips.set_index(['index', 'Borough'])
                    .loc[:, 'ZIP Codes']
                    .apply(pd.Series) # Expand each list into columns
                    .stack()
                    .reset_index()
    )

    # Drop the helper keys left over from the stack, keeping only the
    # borough and the ZIP code value.
    borough_zips.drop(['index', 'level_2'], axis=1, inplace=True)
    borough_zips.columns = \
        'borough zip_code'.split(' ')
    # NOTE(review): on pandas >= 2.0, `.loc[:, col] = ...` writes into the
    # existing column, so the column dtype may stay object even though the
    # values are ints -- confirm with the pandas version in use.
    borough_zips.loc[:, 'zip_code'] = \
        borough_zips.loc[:, 'zip_code'].astype(int)

    return borough_zips

## Data Wrangling

### NYC 311 Complaints 2017

In [None]:
# Fetch the raw 2017 complaints and write them as CSV batches under
# ../data/raw/. This issues one API request per batch, so it can take a while.
download_311_data()

In [None]:
# List the batch files from the directory they are read from (not from the
# notebook's working directory), in pure Python so it works on any platform.
raw_dir = '../data/raw/'
files = sorted(
    name for name in os.listdir(raw_dir)
    if name.startswith('nyc-311-complaints-2017-') and name.endswith('.csv')
)
frames = {}
for file_ in files:
    try:
        frames[file_] = pd.read_csv(opj(raw_dir, file_))
    except pd.errors.EmptyDataError:
        # A batch file with nothing to parse contributes no rows; skip it.
        continue

complaints = pd.concat(frames.values(), ignore_index=True)

In [None]:
# Assign whole columns (rather than writing through `.loc[:, col]`) so each
# column really takes on its new dtype; `.loc[:, col] = ...` writes into the
# existing column and can leave its dtype unchanged.
complaints = complaints.assign(
    unique_key=lambda df: df['unique_key'].astype(int),
    borough=lambda df: df['borough'].str.title().astype('category'),
    city=lambda df: df['city'].str.title().astype('category'),
    complaint_type=lambda df: df['complaint_type'].astype('category'),
    # Vectorised conversions instead of a per-element apply; ZIP codes that
    # cannot be parsed as numbers become NaN.
    incident_zip=lambda df: pd.to_numeric(df['incident_zip'], errors='coerce'),
    created_date=lambda df: pd.to_datetime(df['created_date']),
)

In [None]:
# 'Unspecified' is a placeholder value, so treat it as missing everywhere.
complaints = complaints.replace(to_replace='Unspecified', value=np.nan)

# Blank out every `city` value that is not one of the five borough names.
borough_names = ['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island']
is_borough = complaints['city'].isin(borough_names)
complaints.loc[~is_borough, 'city'] = np.nan

# Fill the gaps in `borough` from what is left in `city`; after that the
# `city` column is no longer needed.
complaints.loc[:, 'borough'] = (
    complaints['borough'].combine_first(complaints['city'])
)

complaints.drop(columns='city', inplace=True)

### 2010 Census Population by ZIP Code

In [None]:
# Load the census table and give its two columns snake_case names.
population_by_zip = get_population_by_zip(POPULATION_DATASET_URL)
population_by_zip.columns = ['zip_code', 'population']

### NYC ZIP Codes

There are some non-NYC ZIP codes in the dataset, so we'd like to safely filter those out.

In [None]:
# Download the page that holds the table of NYC ZIP codes.
html = scrape_nyc_zips(NYC_ZIPCODES_DATASET_URL)  

In [None]:
# `html` was already fetched by the cell above, so reuse it here instead of
# requesting the same page a second time.
nyc_zips = tidy_nyc_zips(html)

In [None]:
# Fixed seed so the rows shown are the same on every run.
nyc_zips.sample(10, random_state=42)

In [None]:
# Keep only the ZIP codes that appear in both the NYC lookup table and the
# census table.
population_by_zip_nyc = pd.merge(
    nyc_zips, population_by_zip, how='inner', on='zip_code'
)

In [None]:
# Fixed seed so the rows shown are the same on every run.
population_by_zip_nyc.sample(10, random_state=42)

In [None]:
# Total population per borough, with `borough` kept as a regular column.
population_by_borough = (
    population_by_zip_nyc
    .groupby('borough', as_index=False)['population']
    .sum()
)
population_by_borough

According to Google, NYC had a population of 8.194 million in 2010. The scraped and merged dataset says that the 2010 population is:

In [None]:
# City-wide total: the sum over all boroughs.
population_by_borough.loc[:, 'population'].sum()

It isn't exactly the same, but it's close enough for the present purposes that further investigation isn't needed.

In [None]:
# Persist the NYC-only population table (without the row index).
population_by_zip_nyc.to_csv(
    '../data/cleaned/population-by-zip-nyc-2010.csv',
    index=False,
)

In [None]:
# Attach the population (and a fallback borough) to every complaint by ZIP
# code. `zip_code` has to be part of the right-hand frame; without it the
# `right_on` key does not exist and the merge raises a KeyError.
complaints = complaints.merge(
    population_by_zip_nyc[['zip_code', 'borough', 'population']],
    left_on='incident_zip',
    right_on='zip_code',
    how='inner'
)

# Both frames carry a `borough` column, so the merge suffixes them with
# _x (complaints) and _y (ZIP lookup); prefer the complaint's own value and
# fall back to the borough implied by the ZIP code.
complaints['borough'] = (
    complaints['borough_x'].combine_first(complaints['borough_y'])
)

complaints = complaints.drop(columns=['borough_x', 'borough_y'])

In [None]:
# Columns to export, in the order they should appear in the output file.
column_order = [
    'unique_key', 'created_date', 'borough', 'zip_code',
    'latitude', 'longitude', 'complaint_type', 'descriptor'
]

# Written next to the other cleaned output (../data/cleaned/), with an
# extension that matches the Parquet format actually produced; read it
# back with pd.read_parquet.
complaints[column_order].to_parquet(
    '../data/cleaned/nyc-311-complaints-raw.parquet',
    index=False
)