In [1]:
from io import BytesIO
from zipfile import ZipFile
import requests

In [2]:
REAL_PROPERTY_SALES_URL = "https://aqua.kingcounty.gov/extranet/assessor/Real%20Property%20Sales.zip"
RESIDENTIAL_BUILDING_URL = "https://aqua.kingcounty.gov/extranet/assessor/Residential%20Building.zip"
PARCEL_URL = "https://aqua.kingcounty.gov/extranet/assessor/Parcel.zip"

In [5]:
# parcels has low to medium number of rows according to the doc, so let's try it first
parcel_response = requests.get(PARCEL_URL)

In [6]:
# okay, that took about 10 seconds, so far so good
parcel_response.content[:100]

b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xab\x89vO\xa6\xa6\xb0\xc2\xa7Z\xd8\x01\xd0\xfak\x0e\x0f\x00\x00\x00EXTR_Parcel.csv\xec]\xdds\xe28\xb6\x7f\xbfU\xf7\x7fp\xf9\xa1\xefn\xd50+\xc9\xdf\x8f\x0e8\xc1\x1dcgl\x92l\xe6\x8dI\x9cnv\x08d\x81\xccl\xef_\x7fu\x8e,\xc96\x98@OO'

In [7]:
# looks like bytes to me!
parcel_file = BytesIO(parcel_response.content)

In [8]:
parcel_zipfile = ZipFile(parcel_file)

In [9]:
# what files are in this zip?
parcel_zipfile.filelist

[<ZipInfo filename='EXTR_Parcel.csv' compress_type=deflate external_attr=0x20 file_size=241957584 compress_size=30956199>]

In [10]:
parcel_csv = parcel_zipfile.open('EXTR_Parcel.csv')

In [11]:
parcel_csv.peek()

b'"Major","Minor","PropName","PlatName","PlatLot","PlatBlock","Range","Township","Section","QuarterSection","PropType","Area","SubArea","SpecArea","SpecSubArea","DistrictName","LevyCode","CurrentZoning","HBUAsIfVacant","HBUAsImproved","PresentUse","SqFtLot","WaterSystem","SewerSystem","Access","Topography","StreetSurface","RestrictiveSzShape","InadequateParking","PcntUnusable","Unbuildable","MtRainier","Olympics","Cascades","Territorial","SeattleSkyline","PugetSound","LakeWashington","LakeSammamish","SmallLak'

In [12]:
# :happydance: that looks like the first row of a CSV that matches the doc!

In [14]:
from csv import DictReader

In [15]:
parcel_reader = DictReader(parcel_csv)

In [16]:
parcel_reader.fieldnames

Error: iterator should return strings, not bytes (did you open the file in text mode?)

In [17]:
# ok, it's not happy about the format
parcel_csv_decoded = parcel_zipfile.open('EXTR_Parcel.csv', "rt")

ValueError: open() requires mode "r" or "w"

In [21]:
# huh, zipfile "open" doesn't accept a flag to tell it to open as text.
# maybe I can back up and make this a TextIO instead of BytesIO?
from io import TextIOWrapper
parcel_text_file = TextIOWrapper(parcel_response.content)

AttributeError: 'bytes' object has no attribute 'readable'

In [23]:
# no, trying one other technique of opening
parcel_text_file = TextIOWrapper(parcel_file)

In [24]:
parcel_text_zipfile = ZipFile(parcel_text_file)

BadZipFile: File is not a zip file

In [25]:
parcel_text_file?

[0;31mType:[0m        TextIOWrapper
[0;31mString form:[0m <_io.TextIOWrapper encoding='UTF-8'>
[0;31mDocstring:[0m  
Character and line based layer over a BufferedIOBase object, buffer.

encoding gives the name of the encoding that the stream will be
decoded or encoded with. It defaults to locale.getpreferredencoding(False).

errors determines the strictness of encoding and decoding (see
help(codecs.Codec) or the documentation for codecs.register) and
defaults to "strict".

newline controls how line endings are handled. It can be None, '',
'\n', '\r', and '\r\n'.  It works as follows:

* On input, if newline is None, universal newlines mode is
  enabled. Lines in the input can end in '\n', '\r', or '\r\n', and
  these are translated into '\n' before being returned to the
  caller. If it is '', universal newline mode is enabled, but line
  endings are returned to the caller untranslated. If it has any of
  the other legal values, input lines are only terminated by the given
  stri

In [26]:
parcel_csv_text_file = TextIOWrapper(parcel_csv)

In [27]:
parcel_csv_text_file?

[0;31mType:[0m        TextIOWrapper
[0;31mString form:[0m <_io.TextIOWrapper name='EXTR_Parcel.csv' encoding='UTF-8'>
[0;31mDocstring:[0m  
Character and line based layer over a BufferedIOBase object, buffer.

encoding gives the name of the encoding that the stream will be
decoded or encoded with. It defaults to locale.getpreferredencoding(False).

errors determines the strictness of encoding and decoding (see
help(codecs.Codec) or the documentation for codecs.register) and
defaults to "strict".

newline controls how line endings are handled. It can be None, '',
'\n', '\r', and '\r\n'.  It works as follows:

* On input, if newline is None, universal newlines mode is
  enabled. Lines in the input can end in '\n', '\r', or '\r\n', and
  these are translated into '\n' before being returned to the
  caller. If it is '', universal newline mode is enabled, but line
  endings are returned to the caller untranslated. If it has any of
  the other legal values, input lines are only termina

In [28]:
parcel_text_reader = DictReader(parcel_csv_text_file)

In [29]:
parcel_text_reader.fieldnames

['245000',
 '0086',
 ' ',
 'FAILORS ENATAI GARDENS ADD',
 '17            ',
 '1      ',
 '0',
 '0',
 '0',
 '  ',
 'X',
 '0',
 '0',
 '',
 '',
 'BELLEVUE',
 '0330',
 ' ',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 'False',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 'N',
 'N',
 '0',
 '0',
 '0',
 '0',
 'N',
 'N',
 '0',
 '0',
 'N',
 'N',
 'N',
 '0',
 '0',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N']

In [30]:
# uhh, that's weird.  gonna start over now just in case the file buffer got messed up

In [31]:
from io import BytesIO, TextIOWrapper
from zipfile import ZipFile
import requests
from csv import DictReader

In [32]:
PARCEL_URL = "https://aqua.kingcounty.gov/extranet/assessor/Parcel.zip"
parcel_response = requests.get(PARCEL_URL)

In [33]:
parcel_file = BytesIO(parcel_response.content)
parcel_zipfile = ZipFile(parcel_file)

In [34]:
parcel_csv_bytes = parcel_zipfile.open('EXTR_Parcel.csv')

In [35]:
parcel_csv_text = TextIOWrapper(parcel_csv_bytes)

In [36]:
parcel_reader = DictReader(parcel_csv_text)

In [37]:
parcel_reader.fieldnames

['Major',
 'Minor',
 'PropName',
 'PlatName',
 'PlatLot',
 'PlatBlock',
 'Range',
 'Township',
 'Section',
 'QuarterSection',
 'PropType',
 'Area',
 'SubArea',
 'SpecArea',
 'SpecSubArea',
 'DistrictName',
 'LevyCode',
 'CurrentZoning',
 'HBUAsIfVacant',
 'HBUAsImproved',
 'PresentUse',
 'SqFtLot',
 'WaterSystem',
 'SewerSystem',
 'Access',
 'Topography',
 'StreetSurface',
 'RestrictiveSzShape',
 'InadequateParking',
 'PcntUnusable',
 'Unbuildable',
 'MtRainier',
 'Olympics',
 'Cascades',
 'Territorial',
 'SeattleSkyline',
 'PugetSound',
 'LakeWashington',
 'LakeSammamish',
 'SmallLakeRiverCreek',
 'OtherView',
 'WfntLocation',
 'WfntFootage',
 'WfntBank',
 'WfntPoorQuality',
 'WfntRestrictedAccess',
 'WfntAccessRights',
 'WfntProximityInfluence',
 'TidelandShoreland',
 'LotDepthFactor',
 'TrafficNoise',
 'AirportNoise',
 'PowerLines',
 'OtherNuisances',
 'NbrBldgSites',
 'Contamination',
 'DNRLease',
 'AdjacentGolfFairway',
 'AdjacentGreenbelt',
 'HistoricSite',
 'CurrentUseDesignatio

In [38]:
# okay, it was the file buffer!!

In [39]:
# seems like a good time to move this into a .py file

In [40]:
parcel_csv_bytes.close()

In [41]:
parcel_reader.dialect

'excel'

In [42]:
counter = 0
for row in parcel_reader:
    print(row['Major'], row['Topography'])
    counter += 1
    if counter > 10:
        break

ValueError: I/O operation on closed file.

In [43]:
sales_response = requests.get(REAL_PROPERTY_SALES_URL)

In [44]:
sales_file = BytesIO(sales_response.content)

In [45]:
sales_zipfile = ZipFile(sales_file)

In [46]:
sales_zipfile.filelist

[<ZipInfo filename='EXTR_RPSale.csv' compress_type=deflate external_attr=0x20 file_size=545190382 compress_size=126235452>]

In [47]:
buildings_response = requests.get(RESIDENTIAL_BUILDING_URL)

In [48]:
buildings_file = BytesIO(buildings_response.content)

In [51]:
buildings_zipfile = ZipFile(buildings_file)

In [52]:
buildings_zipfile.filelist

[<ZipInfo filename='EXTR_ResBldg.csv' compress_type=deflate external_attr=0x20 file_size=149110806 compress_size=24643484>]

In [53]:
# wrapping up
from data_collection import collect_parcels_data, collect_sales_data, collect_building_data

In [54]:
parcels_zip_file, parcels_csv_file = collect_parcels_data()
sales_zip_file, sales_csv_file = collect_sales_data()
buildings_zip_file, buildings_csv_file = collect_building_data()

In [55]:
parcels_reader = DictReader(parcels_csv_file)
counter = 0
for row in parcels_reader:
    print(row["Major"], row["Topography"])
    if counter > 10:
        break
    else:
        counter += 1

052307 0
245000 0
377920 0
277160 0
766620 0
052613 0
014800 0
291970 0
092613 0
092406 0
142210 0
172613 0


In [56]:
sales_reader = DictReader(sales_csv_file)
counter = 0
for row in sales_reader:
    print(row["Major"], row["Sale Price"])
    if counter > 10:
        break
    else:
        counter += 1

KeyError: 'Sale Price'

In [57]:
sales_reader.fieldnames

['ExciseTaxNbr',
 'Major',
 'Minor',
 'DocumentDate',
 'SalePrice',
 'RecordingNbr',
 'Volume',
 'Page',
 'PlatNbr',
 'PlatType',
 'PlatLot',
 'PlatBlock',
 'SellerName',
 'BuyerName',
 'PropertyType',
 'PrincipalUse',
 'SaleInstrument',
 'AFForestLand',
 'AFCurrentUseLand',
 'AFNonProfitUse',
 'AFHistoricProperty',
 'SaleReason',
 'PropertyClass',

In [58]:
sales_reader = DictReader(sales_csv_file)
counter = 0
for row in sales_reader:
    print(row["Major"], row["SalePrice"])
    if counter > 10:
        break
    else:
        counter += 1

KeyError: 'Major'

In [59]:
sales_zip_file, sales_csv_file = collect_sales_data()

In [60]:
sales_reader = DictReader(sales_csv_file)
counter = 0
for row in sales_reader:
    print(row["Major"], row["SalePrice"])
    if counter > 10:
        break
    else:
        counter += 1

330405 215000
868146 0
258190 0
919715 192000
334330 0
663990 690576
032103 2340000
032103 2340000
032103 2340000
032103 2340000
032103 2340000
032103 2340000


In [61]:
buildings_reader = DictReader(buildings_csv_file)

In [62]:
buildings_reader.fieldnames

['Major',
 'Minor',
 'BldgNbr',
 'NbrLivingUnits',
 'Address',
 'BuildingNumber',
 'Fraction',
 'DirectionPrefix',
 'StreetName',
 'StreetType',
 'DirectionSuffix',
 'ZipCode',
 'Stories',
 'BldgGrade',
 'BldgGradeVar',
 'SqFt1stFloor',
 'SqFtHalfFloor',
 'SqFt2ndFloor',
 'SqFtUpperFloor',
 'SqFtUnfinFull',
 'SqFtUnfinHalf',
 'SqFtTotLiving',
 'SqFtTotBasement',
 'SqFtFinBasement',
 'FinBasementGrade',
 'SqFtGarageBasement',
 'SqFtGarageAttached',
 'DaylightBasement',
 'SqFtOpenPorch',
 'SqFtEnclosedPorch',
 'SqFtDeck',
 'HeatSystem',
 'HeatSource',
 'BrickStone',
 'ViewUtilization',
 'Bedrooms',
 'BathHalfCount',
 'Bath3qtrCount',
 'BathFullCount',
 'FpSingleStory',
 'FpMultiStory',
 'FpFreestanding',
 'FpAdditional',
 'YrBuilt',
 'YrRenovated',
 'PcntComplete',
 'Obsolescence',
 'PcntNetCondition',
 'Condition',
 'AddnlCost']

In [63]:
counter = 0
for row in buildings_reader:
    print(row["Major"], row["ZipCode"])
    if counter > 10:
        break
    else:
        counter += 1

012000 98033
012000 98033
012000 98033
012005 98092
012006 98022
012006 98022
012006 98022
012006 98022
012100 98075
012102 98070
012103 98023
012103 98023
