In [1]:
# IPython shell escape: list the contents of the data/finance_sales directory
!ls data/finance_sales

2003_bronx.xls        2007_statenisland.xls 2012_queens.xls
2003_brooklyn.xls     2008_bronx.xls        2012_statenisland.xls
2003_manhattan.xls    2008_brooklyn.xls     2013_bronx.xls
2003_queens.xls       2008_manhattan.xls    2013_brooklyn.xls
2003_statenisland.xls 2008_queens.xls       2013_manhattan.xls
2004_bronx.xls        2008_statenisland.xls 2013_queens.xls
2004_brooklyn.xls     2009_bronx.xls        2013_statenisland.xls
2004_manhattan.xls    2009_brooklyn.xls     2014_bronx.xls
2004_queens.xls       2009_manhattan.xls    2014_brooklyn.xls
2004_statenisland.xls 2009_queens.xls       2014_manhattan.xls
2005_bronx.xls        2009_statenisland.xls 2014_queens.xls
2005_brooklyn.xls     2010_bronx.xls        2014_statenisland.xls
2005_manhattan.xls    2010_brooklyn.xls     2015_bronx.xls
2005_queens.xls       2010_manhattan.xls    2015_brooklyn.xls
2005_statenisland.xls 2010_queens.xls       2015_manhattan.xls
2006_bronx.xls        2010_statenisland.xls 2015_queens

In [2]:
import platform

# Show which Python interpreter version is executing this notebook
python_version = platform.python_version()
print(python_version)

3.5.2


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Borough names as used in the data file names
boros = ['manhattan', 'bronx', 'brooklyn', 'queens', 'statenisland']
# Years accepted by the loader (upper bound is exclusive)
years = range(2003, 2017)
# Numeric code for each borough: 1-based position in `boros`
boro_codes = {name: code for code, name in enumerate(boros, start=1)}

In [5]:
def read_in_boro_year_data(boro, year):
    """
    Fetches data file for a specified boro and year, and returns the data as a Pandas dataframe.
    
    Args:
        string boro: name of boro for desired data; "si" is accepted as shorthand
            for "statenisland"
        int year: year of desired data; anything int() can convert is accepted
            (e.g. the string '15'), and values below 100 are shifted into the 2000s
    Returns:
        Pandas dataframe whose column headers are stripped of surrounding
        whitespace and lower-cased
    Raises:
        ValueError: if year cannot be converted to an integer
        AssertionError: if the year is not in `years` or the boro is not in `boros`
    """
    # Format input arguments appropriately
    try:
        year = int(year)
    except (TypeError, ValueError) as err:
        # int() raises TypeError for None and ValueError for text such as 'abc'.
        # Fail here with one clear message instead of printing and then crashing
        # on the comparison below.
        raise ValueError("inappropriate year for data") from err
    # Two-digit shorthand: 15 -> 2015
    if year < 100:
        year = year + 2000
    assert(year in years), "inappropriate year for data"
    if boro == "si":
        boro = "statenisland"
    assert(boro in boros), "inappropriate boro for data"
    
    # Reads in Excel file skipping appropriate number of junk rows at the beginning
    # (4 rows for files after 2010, 3 rows for 2010 and earlier)
    filename = 'data/finance_sales/{year}_{boro}.xls'.format(year = year, boro = boro)
    skip_rows = 4 if year > 2010 else 3
    data = pd.read_excel(filename, skiprows = skip_rows)
    # Normalize column headers: strip surrounding whitespace (including trailing
    # newlines) and lower-case them
    data.columns = [col.strip().lower() for col in data.columns]
    return data

In [6]:
# Load a single borough/year and inspect its column headers and dimensions
brooklyn_15 = read_in_boro_year_data(boro='brooklyn', year='15')
print(brooklyn_15.columns, brooklyn_15.shape, sep="\n")

Index(['borough', 'neighborhood', 'building class category',
       'tax class at present', 'block', 'lot', 'ease-ment',
       'building class at present', 'address', 'apartment number', 'zip code',
       'residential units', 'commercial units', 'total units',
       'land square feet', 'gross square feet', 'year built',
       'tax class at time of sale', 'building class at time of sale',
       'sale price', 'sale date'],
      dtype='object')
(25362, 21)


In [7]:
def add_BBL_and_price_per_ft(data, copy = True):
    """
    Takes a raw dataframe and adds the BBL code (Borough, Block, Lot), and price per square foot.
    Uses same 10-digit BBL format as PLUTO: 1 digit for Borough, 5 digits for Block, 4 digits for Lot.
    
    Args:
        Pandas data: raw data frame to append "bbl" and "price per sqft" columns to;
            must contain the columns "borough", "block", "lot", "sale price"
            and "gross square feet"
        boolean copy: whether to make a copy or alter the dataframe in place
    Returns:
        Pandas dataframe (a new object when copy is True, otherwise `data` itself)
        with the two added columns. "price per sqft" is NaN wherever
        "gross square feet" is 0.
    """
    # copy the data frame to a new object if desired
    processed_data = data.copy() if copy else data
    
    # extract the borough, block, and lot, and create a 10-digit zero-padded code from these
    processed_data["bbl"] = [
        "%01d%05d%04d" % bbl_parts
        for bbl_parts in zip(data["borough"], data["block"], data["lot"])
    ]

    # A zero square footage would make the division yield inf, which distorts any
    # later aggregate; mask zeros to NaN so those rows read as "unknown" instead.
    square_feet = data["gross square feet"]
    processed_data["price per sqft"] = data["sale price"] / square_feet.where(square_feet != 0)
    return processed_data

In [8]:
# Add the derived columns and confirm they show up in the header list
brooklyn_15 = add_BBL_and_price_per_ft(brooklyn_15, copy = True)
column_index = brooklyn_15.columns
print(column_index)

Index(['borough', 'neighborhood', 'building class category',
       'tax class at present', 'block', 'lot', 'ease-ment',
       'building class at present', 'address', 'apartment number', 'zip code',
       'residential units', 'commercial units', 'total units',
       'land square feet', 'gross square feet', 'year built',
       'tax class at time of sale', 'building class at time of sale',
       'sale price', 'sale date', 'bbl', 'price per sqft'],
      dtype='object')


In [10]:
# Subset of years and boroughs to load for this analysis
ys, bs = [2014, 2015], ['brooklyn', 'manhattan']

In [None]:
# Load and process every requested borough/year combination, collecting the
# pieces in a list and concatenating once at the end. Growing a DataFrame with
# .append() inside the loop recopies all accumulated rows on every iteration,
# and DataFrame.append no longer exists in pandas 2.0.
boro_year_frames = []
for year in ys:
    for borough in bs:
        boro_year = read_in_boro_year_data(borough, year)
        boro_year = add_BBL_and_price_per_ft(boro_year)
        boro_year_frames.append(boro_year)
# pd.concat raises on an empty list, so fall back to an empty frame
finance = pd.concat(boro_year_frames) if boro_year_frames else pd.DataFrame()

In [11]:
# Two-letter abbreviation for each borough name
initials = dict([
    ("manhattan", "MN"),
    ("brooklyn", "BK"),
    ("bronx", "BX"),
    ("queens", "QN"),
    ("statenisland", "SI"),
])
# Quick lookup check; displayed as the cell's output
initials.get("manhattan")

'MN'

In [13]:
# Announce which boroughs the PLUTO data is about to be fetched for
status_message = "Getting pluto data for {}".format(bs)
print(status_message)

Getting pluto data for ['brooklyn', 'manhattan']
