In [1]:
# List the files in the finance sales data folder
!ls data/finance_sales

2003_bronx.xls        2007_statenisland.xls 2012_queens.xls
2003_brooklyn.xls     2008_bronx.xls        2012_statenisland.xls
2003_manhattan.xls    2008_brooklyn.xls     2013_bronx.xls
2003_queens.xls       2008_manhattan.xls    2013_brooklyn.xls
2003_statenisland.xls 2008_queens.xls       2013_manhattan.xls
2004_bronx.xls        2008_statenisland.xls 2013_queens.xls
2004_brooklyn.xls     2009_bronx.xls        2013_statenisland.xls
2004_manhattan.xls    2009_brooklyn.xls     2014_bronx.xls
2004_queens.xls       2009_manhattan.xls    2014_brooklyn.xls
2004_statenisland.xls 2009_queens.xls       2014_manhattan.xls
2005_bronx.xls        2009_statenisland.xls 2014_queens.xls
2005_brooklyn.xls     2010_bronx.xls        2014_statenisland.xls
2005_manhattan.xls    2010_brooklyn.xls     2015_bronx.xls
2005_queens.xls       2010_manhattan.xls    2015_brooklyn.xls
2005_statenisland.xls 2010_queens.xls       2015_manhattan.xls
2006_bronx.xls        2010_statenisland.xls 2015_queens

In [2]:
# Print the version of the Python interpreter running this notebook
import platform
print(platform.python_version())

3.5.2


In [3]:
import os

import numpy as np
import pandas as pd

In [4]:
# Borough names and the range of years (2003-2016) used in this notebook
boros = ['manhattan', 'bronx', 'brooklyn', 'queens', 'statenisland']
years = range(2003, 2017)
# Number the boroughs 1 through 5 in the order they are listed above
boro_codes = {name: code for code, name in enumerate(boros, start = 1)}

In [5]:
def read_in_boro_year_data(boro, year, data_dir = "data/finance_sales"):
    """
    Fetches data file for a specified boro and year, and returns the data as a
    Pandas dataframe. Checks integrity of boro/year arguments.

    Args:
        string boro: name of boro for desired data; one of 'manhattan',
            'bronx', 'brooklyn', 'queens', 'statenisland' ('si' is accepted
            as shorthand for 'statenisland')
        int year: year of desired data, 2003 through 2016; anything int()
            accepts is allowed, and values below 100 are treated as
            two-digit years (e.g. '15' becomes 2015)
        string data_dir: path to the folder containing the
            "{year}_{boro}.xls" files
    Returns:
        Pandas DataFrame whose column names are stripped and lower-cased
    Raises:
        ValueError: if year cannot be converted to an integer
        AssertionError: if year or boro is not one of the supported values
    """
    # Acceptable inputs
    boros = ['manhattan', 'bronx', 'brooklyn', 'queens', 'statenisland']
    years = range(2003, 2017)

    # Format input arguments appropriately. int() raises ValueError for
    # unparseable strings and TypeError for things like None; fail right here
    # with one clear error instead of printing and carrying on with a bad year.
    try:
        year = int(year)
    except (TypeError, ValueError) as err:
        raise ValueError(
            "inappropriate year for data: {!r}".format(year)) from err
    if year < 100:
        year = year + 2000
    # Raised explicitly (rather than through `assert`) so the checks still run
    # under `python -O`; AssertionError is kept so existing callers see the
    # same exception type and message as before.
    if year not in years:
        raise AssertionError("inappropriate year for data")
    if boro == "si":
        boro = "statenisland"
    if boro not in boros:
        raise AssertionError("inappropriate boro for data")

    # Reads in Excel file skipping appropriate number of junk rows at the
    # beginning of file, keeping the header row as a header. Files for years
    # after 2010 are read with one more skipped row than earlier ones.
    filename = "{data_dir}/{year}_{boro}.xls".format(data_dir = data_dir,
        year = year, boro = boro)
    skip_rows = 4 if year > 2010 else 3
    data = pd.read_excel(filename, skiprows = skip_rows)
    # Strip surrounding whitespace/newlines from column headers and lower-case
    # them; str() guards against non-string headers
    data.columns = [str(col).strip().lower() for col in data.columns]
    return data

In [6]:
# Load the 2015 Brooklyn sales file and show its column names and dimensions
brooklyn_15 = read_in_boro_year_data(boro = 'brooklyn', year = '15')
print(brooklyn_15.columns, brooklyn_15.shape, sep = "\n")

Index(['borough', 'neighborhood', 'building class category',
       'tax class at present', 'block', 'lot', 'ease-ment',
       'building class at present', 'address', 'apartment number', 'zip code',
       'residential units', 'commercial units', 'total units',
       'land square feet', 'gross square feet', 'year built',
       'tax class at time of sale', 'building class at time of sale',
       'sale price', 'sale date'],
      dtype='object')
(25362, 21)


In [9]:
def add_BBL_and_price_per_ft(data, copy = True):
    """
    Takes a raw dataframe and adds the BBL code (Borough, Block, Lot), and
    price per square foot. Uses same 10-digit BBL format as PLUTO:
    1 digit for Borough, 5 digits for Block, 4 digits for Lot.

    Args:
        Pandas DataFrame data: raw data frame to append the "bbl" and "price
            per sqft" columns to; must contain the "borough", "block", "lot",
            "sale price" and "gross square feet" columns
        boolean copy: whether to make a copy or alter the dataframe in place
    Returns:
        Pandas DataFrame: the copy (or `data` itself when copy is False) with
            an int64 "bbl" column and a "price per sqft" column added
    """
    # Copy the data frame to a new object if desired
    processed_data = data.copy() if copy else data

    # Build the code arithmetically: borough in the leading digit, block in
    # the next five digits, lot in the last four. This is the same number the
    # zero-padded "%01d%05d%04d" string gives for in-range values, but it is
    # vectorised and keeps the frame's own index, so rows stay aligned even
    # when `data` does not have a default 0..n-1 index.
    borough = data["borough"].astype("int64")
    block = data["block"].astype("int64")
    lot = data["lot"].astype("int64")
    processed_data["bbl"] = borough * 10**9 + block * 10**4 + lot

    # NOTE: no guard for zero square footage here; with numeric columns a zero
    # "gross square feet" produces inf (or NaN for 0/0) rather than an error.
    processed_data["price per sqft"] = data["sale price"] / data[
        "gross square feet"]
    return processed_data

In [10]:
# Add the "bbl" and "price per sqft" columns and list the resulting columns
brooklyn_15 = add_BBL_and_price_per_ft(data = brooklyn_15)
print(brooklyn_15.columns)

Index(['borough', 'neighborhood', 'building class category',
       'tax class at present', 'block', 'lot', 'ease-ment',
       'building class at present', 'address', 'apartment number', 'zip code',
       'residential units', 'commercial units', 'total units',
       'land square feet', 'gross square feet', 'year built',
       'tax class at time of sale', 'building class at time of sale',
       'sale price', 'sale date', 'bbl', 'price per sqft'],
      dtype='object')


In [11]:
def read_in_pluto(boros, data_dir = "data/nyc_pluto_16v1"):
    """
    Takes a list of boroughs and extracts PLUTO data for each borough,
    appending each subset to create a single data frame for all boroughs.

    Args:
        list(string) boros: list of all the boroughs to pull pluto data for;
            each must be one of 'manhattan', 'brooklyn', 'bronx', 'queens',
            'statenisland'
        string data_dir: a relative path as a string to folder containing the
            PLUTO data for all boroughs
    Returns:
        Pandas DataFrame with the rows of every requested borough stacked in
            the order given (each file's own row index is kept); an empty
            DataFrame when boros is empty
    Raises:
        ValueError: if a borough name has no known PLUTO file
    """
    # mapping of how boroughs are referred in PLUTO filenames
    initials = {"manhattan" : "MN", "brooklyn" : "BK", "bronx" : "BX",
        "queens" : "QN", "statenisland" : "SI"}

    # Collect one frame per borough and concatenate once at the end; growing
    # a DataFrame with .append() inside the loop re-copies all rows on every
    # iteration, and DataFrame.append no longer exists in pandas 2.x
    frames = []
    for borough in boros:
        # Fail clearly on an unknown name rather than trying to open
        # "None.csv"
        if borough not in initials:
            raise ValueError(
                "no PLUTO file known for borough {!r}".format(borough))
        filename = "{data_dir}/{boro}.csv".format(data_dir = data_dir,
            boro = initials[borough])
        data = pd.read_csv(filename, low_memory = False)
        data.columns = [col.strip().lower() for col in data.columns]
        frames.append(data)

    # pd.concat raises on an empty list, so keep returning an empty frame
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [12]:
def read_in_finance(boros, years, data_dir = "data/finance_sales"):
    """
    Takes a list of boroughs and years and extracts finance data for each year,
    appending each subset to create a single data frame for all years/boroughs.

    Args:
        list(string) boros: list of all the boroughs to pull finance data for
        list(int) years: list of all the years to pull finance data for
        string data_dir: a relative path as a string to folder containing the
            department of finance sales price data for all boroughs
    Returns:
        Pandas DataFrame with the rows of every year/borough file stacked
            year by year (each file's own row index is kept), including the
            "bbl" and "price per sqft" columns; an empty DataFrame when there
            is nothing to read
    """
    # Collect one frame per year/borough and concatenate once at the end;
    # growing a DataFrame with .append() inside the loop re-copies all rows on
    # every iteration, and DataFrame.append no longer exists in pandas 2.x
    frames = []
    for year in years:
        for borough in boros:
            print("Pulling Finance data for {}_{}".format(year, borough))
            boro_year = read_in_boro_year_data(borough, year, data_dir)
            # The frame was just read and is not shared, so the new columns
            # can be added in place without an extra copy
            frames.append(add_BBL_and_price_per_ft(boro_year, copy = False))

    # pd.concat raises on an empty list, so keep returning an empty frame
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [13]:
def merge_pluto_finance(pluto, finance, boros, years,
    output_dir = "data/merged"):
    """
    Performs a right join of PLUTO onto Dept of Finance data using BBL as the
    join key, returning a single dataframe: every finance row is kept, and
    the PLUTO columns are left empty where no PLUTO row shares its BBL.
    Column names present in both inputs get a "_pluto" / "_finance" suffix.
    Also writes merged output to file.

    Args:
        Pandas DataFrame pluto: contains PLUTO data and "bbl" join key
        Pandas DataFrame finance: contains finance data and "bbl" join key
        list(string) boros: list of boroughs to use in filename of merged data
        list(int) years: list of years to use in filename of merged data
        string output_dir: directory to store merged output data; created if
            it does not exist yet
    Returns:
        Pandas DataFrame
    """
    buildings = pd.merge(pluto, finance, how='right', on='bbl',
                        suffixes=('_pluto', '_finance'))
    # Make sure the destination folder exists so to_csv does not fail on a
    # fresh checkout
    if output_dir:
        os.makedirs(output_dir, exist_ok = True)
    # File is named "<boro>_<boro>..._<first year>_<last year>.csv"
    output = "{output_dir}/{boros_joined}_{min_year}_{max_year}.csv".format(
        boros_joined = "_".join(boros), min_year = min(years),
        max_year = max(years), output_dir = output_dir)
    buildings.to_csv(output, index = False)
    return buildings

In [14]:
# Restrict the run to a single borough and a single year
boros, years = ['bronx'], [2010]

In [15]:
# Load the PLUTO data for the selected boroughs
pluto = read_in_pluto(boros = boros)

In [16]:
# Load the finance sales data for the selected boroughs and years
finance = read_in_finance(boros = boros, years = years)

Pulling Finance data for 2010_bronx


In [17]:
# Join the PLUTO and finance frames on "bbl" and write the result to file
buildings = merge_pluto_finance(pluto = pluto, finance = finance,
    boros = boros, years = years)

In [18]:
# Count merged rows whose PLUTO borough is missing (presumably sales with no
# matching PLUTO record), then show the merged frame's dimensions
print(buildings.borough_pluto.isnull().sum())
print(buildings.shape)

639
(4755, 108)


In [19]:
# Preview the three key columns next to the BBL code built from them
finance.loc[:, ["borough", "block", "lot", "bbl"]].head(5)

Unnamed: 0,borough,block,lot,bbl
0,2,3030,70,2030300070
1,2,3035,2,2030350002
2,2,3037,42,2030370042
3,2,3039,64,2030390064
4,2,3046,34,2030460034


In [20]:
# Column counts of the two inputs and of the merged frame, one per line
print(len(pluto.columns), len(finance.columns), len(buildings.columns),
      sep = "\n")

86
23
108


In [None]:
#bmq_2010_2013 = pd.read_csv("data/merged/brooklyn_manhattan_queens_2010_2013.csv")

In [None]:
#bmq_2010_2013.head()