## Data exploration of NYC property assessments public datasets (2009, 2014).
#####Downloaded using enigma.io API, which provides access to free public datasets.

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

## Download 2 datasets making use of enigma.io API.

Go to last cell in this notebook, which shows an initial version of a Python client to enigma.io.
To download a dataset, the module containing the client would be invoked like this...

      ENV_ENIGMA_APIKEY=xyz ./enigma_client.py us.states.ny.cities.nyc.property.assessment-valuation.tc1.2009

## What did we just download? File sizes, number of rows...

(Tried to load the csv file into a Pandas DataFrame and it was taking a very long time.)

In [2]:
import os

nyc_props_2009_fname = "nyc_property_assessments/nyc.prop_assess_2009.csv"
nyc_props_2014_fname = "nyc_property_assessments/nyc.prop_assess_2014.csv"
print("File size 2009: {:,}".format(os.stat(nyc_props_2014_fname).st_size))
print("File size 2014: {:,}".format(os.stat(nyc_props_2009_fname).st_size))

print()

num_lines_09 = !wc -l nyc_property_assessments/nyc.prop_assess_2009.csv
num_lines_14 = !wc -l nyc_property_assessments/nyc.prop_assess_2014.csv
print("Num lines 2009: {:,}".format(int(num_lines_09[0].split()[0])))
print("Num lines 2014: {:,}".format(int(num_lines_14[0].split()[0])))



File size 2009: 164,461,696
File size 2014: 453,161,784

Num lines 2009: 705,621
Num lines 2014: 1,088,351


## Quite large (160MB and 450MB).

### Hmmm, importing these as Pandas DataFrames may thrash my humble laptop/VM<br>Let's do some calculations...

### Do 4 tests:<br> Read [100, 500, 1000, 5000] rows of the 2009 table.<br>See if memory consumption increases linearly.<br>Project mem usage for max number of rows.

In [3]:
# choose chunks of lines from different sections of the csv file
# so that they are not always from the beginning 
# (just in case row sizes progressively increase/decrease/change)

# 2009 file: 700k lines
from collections import OrderedDict
rows_base_range = OrderedDict([(1024*10, 100), (1024*100, 500), 
                               (1024*400, 1000), (1024*600, 5000)])

#for kk,vv in rows_base_range.items(): 
#   print(kk, vv)
skip = 1024*10
num = rows_base_range[skip]
df = pd.read_csv(nyc_props_2009_fname, skiprows=skip, nrows=num)



In [4]:
df.shape


(100, 118)

In [5]:
### Let's use the 2009 property assessment files.

In [6]:
# quite large files (0.5Gig, 160M); create a subset to play/experiment with...

# note: zip code values are floats (11231 -> 11231.00)
# create a subset file; from 2009, zip code 11231 (grep will capture a few other "11231" matches)

#!egrep -i "11231.00|fv_chg" nyc_property_assessments/nyc.prop_assess_2009.csv > nyc_property_assessments/nyc.prop_assess_2009__jdsample_11231.csv


In [7]:
# read into dataframe
df09 = pd.read_csv("nyc_property_assessments/nyc.prop_assess_2009__jdsample_11231.csv", parse_dates=True)
df09.head(3)


Unnamed: 0,bble,borough,block,lot,ease,secvol,district,year4,cur_fv_l,cur_fv_t,...,o_applic,reuc,geo_rc,coop_num,ex_inds,ex_count,ex_chgdt,dchgdt,sm_chgdt,serialid
0,1002461042,MANHATTAN,246,1042,,106,33,2007,84162,380678,...,,,0,0,EEE,0,,1994-09-13,2007-08-01,103
1,2029650129,BRONX,2965,129,,1105,88,2007,72000,465000,...,,,0,0,EEEE,1,,2005-11-22,2007-12-27,14304
2,2029650236,BRONX,2965,236,,1105,88,2007,72000,423000,...,,,0,0,EEEE,2,,2007-11-21,2007-12-27,14335


In [8]:

print("Num elems in table: df.size = {}".format(df09.size))

# confirm...
from operator import mul
print("Num elems in table: shape = {}; rows x cols = {}".format(df09.shape, mul(*df09.shape)))

# what are the names of the columns?
def prettify_colnames(cols, cols_per_row=7):
    colsize = 2 + max(len(colname) for colname in cols)
    outstr = ""
    for ix, colname in enumerate(df09.columns):
        outstr += "{nl}{cn:{sz}}".format(
            nl = "" if (ix+1) % cols_per_row else "\n", cn=colname, sz=colsize)
    return outstr
print("\nColumn names...\n\n{}".format(prettify_colnames(df09.columns, 9)))



Num elems in table: df.size = 335002
Num elems in table: shape = (2839, 118); rows x cols = 335002

Column names...

bble        borough     block       lot         ease        secvol      district    year4       
cur_fv_l    cur_fv_t    new_fv_l    new_fv_t    fv_chgdt    curavl      curavt      curexl      curext      
curavl_a    curavt_a    curexl_a    curext_a    chgdt       tn_avl      tn_avt      tn_exl      tn_ext      
tn_avl_a    tn_avt_a    tn_exl_a    tn_ext_a    fchgdt      fn_avl      fn_avt      fn_exl      fn_ext      
fn_avl_a    fn_avt_a    fn_exl_a    fn_ext_a    txcl        o_txcl      cbn_txcl    bldgcl      exmtcl      
owner       hnum_lo     hnum_hi     str_name    zip         tot_unit    res_unit    lfrt_dec    ldep_dec    
l_acre      irreg       bfrt_dec    bdep_dec    bld_var     ext         story       bldgs       corner      
lnd_area    gr_sqft     zoning      yrb         yrb_flag    yrb_rng     yra1        yra1_rng    yra2        
yra2_rng    cp_boro    

In [9]:
# Even better...
df09.describe()



Unnamed: 0,bble,block,lot,ease,secvol,district,year4,cur_fv_l,cur_fv_t,new_fv_l,...,protest2,at_grp2,applic2,o_at_grp,o_applic,reuc,geo_rc,coop_num,ex_count,serialid
count,2839.0,2839.0,2839.0,0.0,2839.0,2839.0,2839,2839.0,2839.0,2839.0,...,0.0,2839,0.0,2839.0,7,0.0,2839.0,2839,2839.0,2839.0
mean,3019658000.0,592.012681,146.926383,,276.144417,80.958436,2007,203993.901726,1021630.684396,274050.754843,...,,0,,0.065516,1,,0.213455,0,0.715041,86165.313138
std,164334900.0,1290.013133,315.872292,,544.967603,19.402247,0,173379.265394,599821.186848,196658.898501,...,,0,,2.854087,0,,1.740235,0,0.663258,54357.896942
min,1002461000.0,246.0,1.0,,106.0,25.0,2007,0.0,0.0,0.0,...,,0,,0.0,1,,0.0,0,0.0,103.0
25%,3003471000.0,348.0,18.0,,202.0,88.0,2007,94700.0,546344.0,151000.0,...,,0,,0.0,1,,0.0,0,0.0,78114.5
50%,3004140000.0,414.0,36.0,,203.0,88.0,2007,162000.0,909000.0,243000.0,...,,0,,0.0,1,,0.0,0,1.0,79387.0
75%,3004600000.0,460.0,54.0,,204.0,88.0,2007,248000.0,1392000.0,360000.0,...,,0,,0.0,1,,0.0,0,1.0,80251.5
max,5015481000.0,15773.0,1103.0,,6003.0,89.0,2007,1980000.0,3534000.0,2970000.0,...,,0,,147.0,1,,42.0,0,5.0,619429.0


In [10]:
# let's explore the zip column...
print("\n\n########## Zip code section")

print("\n")
uniques = set(df09['zip'])
print("set of zips: {}".format(uniques))

# groupby zip...
print("\n")
print("groupby...".format(set(df09['zip'])))
grp_zip = df09.groupby("zip")
for key,grp in grp_zip:
    print("{:>8}   {:>4}".format(key, len(grp)))

# let's create a new dataframe with just zip==11231
print("\nlet's create a new dataframe with just zip==11231...")
df11231 = df09[df09["zip"] == 11231]
# confirm...
assert len(set(df11231['zip'])) == 1
print("df11231.shape={} (reminder df09.shape={})".format(df11231.shape, df09.shape))




########## Zip code section


set of zips: {0.0, 10466.0, 11235.0, 11106.0, 11364.0, 11429.0, 11368.0, 11434.0, 11691.0, 11212.0, 10314.0, 11375.0, 10002.0, 11221.0, 11413.0, 11414.0, 10459.0, 11420.0, 11230.0, 11231.0}


groupby...
     0.0      2
 10002.0      1
 10314.0      4
 10459.0      6
 10466.0      1
 11106.0      1
 11212.0     15
 11221.0      2
 11230.0      1
 11231.0   2766
 11235.0      1
 11364.0      1
 11368.0      2
 11375.0      1
 11413.0      1
 11414.0      1
 11420.0      1
 11429.0     30
 11434.0      1
 11691.0      1

let's create a new dataframe with just zip==11231...
df11231.shape=(2766, 118) (reminder df09.shape=(2839, 118))


In [11]:
df11231.head(3)


Unnamed: 0,bble,borough,block,lot,ease,secvol,district,year4,cur_fv_l,cur_fv_t,...,o_applic,reuc,geo_rc,coop_num,ex_inds,ex_count,ex_chgdt,dchgdt,sm_chgdt,serialid
9,3005870050,BROOKLYN,587,50,,207,86,2007,115000,115000,...,,,12,0,,0,,1992-12-07,2007-07-02,76502
10,3003190074,BROOKLYN,319,74,,201,88,2007,0,0,...,,,16,0,,0,,2008-05-17,2008-05-16,76749
11,3003550115,BROOKLYN,355,115,,202,86,2007,19700,19700,...,,,12,0,,0,,1992-12-07,2007-08-01,76802


In [12]:
# display all columns...
pd.set_option('display.max_columns', 120)
df11231.head(3)

Unnamed: 0,bble,borough,block,lot,ease,secvol,district,year4,cur_fv_l,cur_fv_t,new_fv_l,new_fv_t,fv_chgdt,curavl,curavt,curexl,curext,curavl_a,curavt_a,curexl_a,curext_a,chgdt,tn_avl,tn_avt,tn_exl,tn_ext,tn_avl_a,tn_avt_a,tn_exl_a,tn_ext_a,fchgdt,fn_avl,fn_avt,fn_exl,fn_ext,fn_avl_a,fn_avt_a,fn_exl_a,fn_ext_a,txcl,o_txcl,cbn_txcl,bldgcl,exmtcl,owner,hnum_lo,hnum_hi,str_name,zip,tot_unit,res_unit,lfrt_dec,ldep_dec,l_acre,irreg,bfrt_dec,bdep_dec,bld_var,ext,story,bldgs,corner,lnd_area,gr_sqft,zoning,yrb,yrb_flag,yrb_rng,yra1,yra1_rng,yra2,yra2_rng,cp_boro,cp_dist,limitation,o_limit,status1,status2,newlot,droplot,delchg,corchg,nodesc,noav,valref,mbldg,condo_nm,condo_s1,condo_s2,condo_s3,condo_a,comint_l,comint_b,aptno,ap_boro,ap_block,ap_lot,ap_ease,ap_date,ap_time,protest,at_grp,applic,protest2,at_grp2,applic2,o_protst,o_at_grp,o_applic,reuc,geo_rc,coop_num,ex_inds,ex_count,ex_chgdt,dchgdt,sm_chgdt,serialid
9,3005870050,BROOKLYN,587,50,,207,86,2007,115000,115000,173000,173000,2007-12-22,1773,1773,0,0,1773,1773,0,0,2007-07-02,1879,1879,0,0,1879,1879,0,0,,1879,1879,0,0,1879,1879,0,0,1B,1B,1B,V3,,RED HOOK BUILDING COM,,,COFFEY STREET,11231,0,0,22.0,100.0,,,0.0,0.0,,,0,0,,2200,0,R5,0,,0,0,0,0,0,3.0,6.0,1.0,1.0,1,0,0,0,0,0,0,0,0,0,0,,,,,0,0,,0,0,0,,,0,,0,,,0,,,0,,,12,0,,0,,1992-12-07,2007-07-02,76502
10,3003190074,BROOKLYN,319,74,,201,88,2007,0,0,50200,764200,2008-05-16,0,0,0,0,0,0,0,0,2008-05-16,3012,45852,0,0,3012,45852,0,0,,3012,45852,0,0,3012,45852,0,0,1,,1,A5,,,00000000129C,00000000129C,DE GRAW STREET,11231,1,1,20.25,24.75,,I,20.25,23.81,,,3,1,,502,1447,NZS,1931,E,0,2006,0,2006,0,,,,,1,0,0,0,0,0,0,0,0,0,0,,,,,0,0,,3,319,72,,2008-05-16,120103,,0,,,0,,,0,,,16,0,,0,,2008-05-17,2008-05-16,76749
11,3003550115,BROOKLYN,355,115,,202,86,2007,19700,19700,19700,19700,,36,36,0,0,36,36,0,0,2007-08-01,37,37,0,0,37,37,0,0,,37,37,0,0,37,37,0,0,1B,1B,1B,V0,,DCAS,,,CARROLL STREET,11231,0,0,20.0,10.0,,,0.0,0.0,,,0,0,,200,0,R6,0,,0,0,0,0,0,3.0,6.0,1.0,1.0,1,0,0,0,0,0,0,0,0,0,0,,,,,0,0,,0,0,0,,,0,,0,,,0,,,0,,,12,0,,0,,1992-12-07,2007-08-01,76802


In [13]:
df11231.cur_fv_l.max()
df11231.cur_fv_t.max()


3534000.0

In [14]:
# df11231[df11231.owner == "IRVING SEALEY"]
df11231[df11231.str_name == "3 STREET"]

some_cols = ("cur_fv_t", "new_fv_t", "hnum_lo", "owner", "str_name", "tot_unit", 
             "res_unit", "lfrt_dec", "ldep_dec", "bfrt_dec", "bdep_dec", "story", 
             "bldgs", "lnd_area", "gr_sqft", "yrb", )

low_val = df11231[df11231.str_name == "3 STREET"].ix[:, some_cols].sort("cur_fv_t")[:5]
top_val = df11231[df11231.str_name == "3 STREET"].ix[:, some_cols].sort("cur_fv_t")[-5:]
pd.concat([low_val, top_val])




Unnamed: 0,cur_fv_t,new_fv_t,hnum_lo,owner,str_name,tot_unit,res_unit,lfrt_dec,ldep_dec,bfrt_dec,bdep_dec,story,bldgs,lnd_area,gr_sqft,yrb
2208,161000,156000,71,BEIRNE CHRISTOPHER,3 STREET,1,1,20.0,90.0,20.0,40,3,1,483,2400,1900
2234,644480,627000,48,BROWN NORMAN E,3 STREET,2,2,20.0,100.0,20.0,36,2,1,2000,2016,1899
2238,645810,364000,56,GONZALEZ ANA A,3 STREET,2,2,20.0,81.33,20.0,36,2,1,1627,2016,1899
2201,679000,780000,97,CONCETTA RUSSO,3 STREET,2,2,15.0,90.0,15.0,45,3,1,1350,2025,1899
2205,686000,944000,89,"MAGGIORE, DONNA",3 STREET,2,2,20.0,90.0,20.0,40,2,1,1800,2400,1899
2163,1350000,1272000,9,"MAZZEO, MICHAEL A",3 STREET,3,3,20.75,80.0,20.75,45,3,1,1660,4240,1901
2161,1387000,1261000,15,JEFFREY MONTE,3 STREET,3,3,20.0,80.0,20.0,38,3,1,1600,3040,1901
2160,1399000,1270000,17,L AHEARNE,3 STREET,3,3,20.0,80.0,20.0,38,3,1,1600,3104,1901
2228,1403000,1444000,14,"LUMB, JEAN MARIE",3 STREET,3,3,19.0,100.0,19.0,36,3,1,1900,3240,1901
2164,1466000,1769000,7,YVONNE BUSUTTIL,3 STREET,3,3,19.92,95.0,19.92,45,3,1,1790,3740,1910


# python client for enigma.io API - for now it just provides "export" support

In [None]:
#!/usr/bin/env python

import os
import sys
import time
import argparse
import requests
from requests import RequestException

ARGPARSER = argparse.ArgumentParser()
ARGPARSER.add_argument("datapath")
ARGPARSER.add_argument("-o", "--outfname", dest="outfname", required=False)


class EnigmaClient:
    ENIG_RETCODE_SUCCESS = 0
    ENIG_RETCODE_TIMEOUT = 1
    ENIG_RETCODE_ERROR   = 2

    _QUERY_TEMPLATE = "https://api.enigma.io/v2/{op}/{key}/{datapath}/{params}"
    _EXPORTAPI_MAX_SECS_TIMEOUT = 120
    _EXPORTAPI_POLL_SECS_INTERVAL = 1
    _EXPORTAPI_CHUNK_SIZE = 1024 * 64

    def __init__(self, apikey):
        self.apikey = apikey


    # Tables are exported as Gzipped CSV files. 
    # 
    # Exports of large tables may take some time, so exports are processed
    # asynchronously.
    # 
    # When the export API is called, an export is queued and the API immediately
    # returns a URL pointing to the future location of the exported file. Users
    # should poll the URL until the file becomes available.
    # 
    # Note: this is a synchronous call. Caller waits (up to max_secs_timeout)
    #       until data is made available by Enigma backend, and then downloaded.
    #       Caller should check err_code if the export fails.
    # 
    # Returns one of: ENIG_RETCODE_SUCCESS, ENIG_RETCODE_TIMEOUT, ENIG_RETCODE_ERROR
    # 
    def export(self, datapath, parameters=None,
               max_secs_timeout=None, outfname=None):

        query_str = self._QUERY_TEMPLATE.format(op="export", key=self.apikey,
                                                datapath=datapath, params="")
        # make a query, which sends back the URL that we'll have to poll
        # until it eventually responds back with the gzipped csv file.
        resp = requests.get(query_str)
        resp_json = resp.json()
        export_url = resp_json["export_url"]

        if not outfname:
            # derive it from the URL
            outfname = export_url[export_url.strip("/").rfind("/")+1:]
            if "?" in outfname:
                outfname = outfname[:outfname.rfind("?")]

        return self._download_file(export_url, outfname)
        
    def _download_file(self, export_url, outfname):
        timeout_secs_remaining = self._EXPORTAPI_MAX_SECS_TIMEOUT
        poll_time_interval = self._EXPORTAPI_POLL_SECS_INTERVAL

        completed = False
        print("About to attempt exporting export URL: {}".format(export_url))
        while (not completed) and (timeout_secs_remaining > 0):
            print("Retries timeout remaining: {} secs".format(timeout_secs_remaining))
            try:
                resp = requests.get(export_url, stream=True)
                if resp.ok is True:
                    with open(outfname, 'wb') as fout:
                        for chunk in resp.iter_content(
                                chunk_size=self._EXPORTAPI_CHUNK_SIZE): 
                            if chunk: # filter out keep-alive new chunks
                                fout.write(chunk)
                                fout.flush()
                    completed = True
                else:
                    print("Still polling. Response status_code: {} - Reason: {}".format(
                        resp.status_code, resp.reason))
            except Exception as exc:
                print("exc: {}".format(exc))
            finally:
                if not completed:
                    time.sleep(poll_time_interval)
                    timeout_secs_remaining -= poll_time_interval

        if completed is True:
            return self.ENIG_RETCODE_SUCCESS
        elif timeout_secs_remaining < 0:
            return self.ENIG_RETCODE_TIMEOUT
        else:
            return self.ENIG_RETCODE_ERROR

        
def main(apikey, args):
    # just getting started. For now there's only functionality to "export".
    client = EnigmaClient(apikey)
    client.export(args.datapath, outfname=args.outfname)


if __name__ == "__main__":
    apikey = os.getenv("ENV_ENIGMA_APIKEY")
    if not apikey:
        print('Please set your env var "ENV_ENIGMA_APIKEY"')
        sys.exit(1)
    
    main(apikey, ARGPARSER.parse_args())
