# Design the database based on the data
* #### This notebook parses metadata associated with some of the datasets, most especially the PLUTO dataset, which contains columns that are also in many other datasets I looked at on NYCOpenData.
* #### In some cases I had to search around to find more complete definitions than were included in the data dictionary associated with the dataset.

In [5]:
import pdfplumber
import pandas as pd
import time
import dill
time.sleep(5) # This seems to be necessary to avoid an error about dill not being loaded in the next cell. Sometimes even that is not enough and this cell needs to be run again.
from bisect import bisect_left
from itertools import tee
from src.models import ColCustomization
from src.helpers import *
from src.pdfutils import *
from src.dbutils import *
from geoalchemy2 import Geometry
from sqlalchemy import String

* Load the objects created in the previous notebook

In [6]:
# Read back the namespace snapshot that was pickled with dill.
with open("environment_data/select.pkl", "rb") as snapshot_file:
    env = dill.load(snapshot_file)

# Inject every entry of the snapshot into this notebook's global namespace,
# so names stored in it become ordinary variables for the cells below.
globals().update(env)

In [7]:
# Display the dataset registry (DatasetInfo objects keyed by short name) as it
# stands before col_types is reassigned further down.
dataset_info_dict

{'mapPLUTO': DatasetInfo(name='Primary Land Use Tax Lot Output - Map (MapPLUTO)', short_name='mapPLUTO', format='zip', id='f888-ni5f', main_url='https://data.cityofnewyork.us/City-Government/Primary-Land-Use-Tax-Lot-Output-Map-MapPLUTO-/f888-ni5f/about_data', metadata_url='https://data.cityofnewyork.us/api/views/f888-ni5f.json', data_url='https://s-media.nyc.gov/agencies/dcp/assets/files/zip/data-tools/bytes/mappluto/nyc_mappluto_25v1_1_fgdb.zip', dataset_path='/mnt/Datasets/PROJECTDATA/nyc_real_estate_data/files_to_use/MapPLUTO25v1_1.gdb', data_dict_path='/mnt/Datasets/PROJECTDATA/nyc_real_estate_data/dictionaries/mapPLUTO_data_dictionary.pdf', standard=False, geodata=True, metadata={}, column_metadata={}, cardinality_ratios={}, data_dict_url='https://data.cityofnewyork.us/api/views/f888-ni5f/files/a5f455ae-002e-4e78-ae17-f3dcc59c236d?download=true&filename=PLUTODD22v3.pdf', other_files=[('97ca6e86-32cc-46f1-b85c-8d01e17e1602', 'PlutoReadme22v3.pdf')], attribution='Department of City 

In [8]:
# Peek at the column -> type-label mapping recorded for the 'assessments'
# dataset, followed by the number of columns it lists.
column_types = dataset_info_dict['assessments'].col_types
print(column_types, len(column_types), sep="\n")

{'sid': 'meta_data', 'created_at': 'meta_data', 'updated_at': 'meta_data', 'bble': 'text', 'borough_code': 'number', 'block_number': 'number', 'lot_number': 'number', 'easement': 'text', 'owner_name': 'text', 'building_class': 'text', 'tax_class_code': 'text', 'lot_front': 'number', 'lot_depth': 'number', 'ext': 'text', 'number_of_floors': 'number', 'fullval': 'number', 'avland': 'number', 'avtot': 'number', 'exland': 'number', 'exempttot': 'number', 'excd1': 'number', 'address': 'text', 'zip_code': 'number', 'exmptcl': 'text', 'bldfront': 'number', 'blddepth': 'number', 'avland2': 'number', 'avtot2': 'number', 'exland2': 'number', 'extot2': 'number', 'excd2': 'number', 'period': 'text', 'year': 'text', 'valtype': 'text', 'borough': 'text', 'latitude': 'number', 'longitude': 'number', 'community_board': 'number', 'council_district': 'number', 'census_tract': 'number', 'bin': 'number', 'nta': 'text', 'new_georeferenced_column': 'point'}
43


In [9]:
# Replace each dataset's col_types with whatever set_dtypes returns for its
# data file.
# NOTE(review): col_types is both the input and the target of the assignment,
# so re-running this cell feeds the already-converted mapping back into
# set_dtypes -- confirm the helper tolerates that.
for ds in dataset_info_dict.values():
    ds.col_types = set_dtypes(
        filename=ds.dataset_path,
        column_types=ds.col_types,
    )

Starting /mnt/Datasets/PROJECTDATA/nyc_real_estate_data/files_to_use/MapPLUTO25v1_1.gdb
/mnt/Datasets/PROJECTDATA/nyc_real_estate_data/files_to_use/MapPLUTO25v1_1.gdb is not a json file, skipping for now...
Starting /mnt/Datasets/PROJECTDATA/nyc_real_estate_data/files_to_use/assessments_rows.json
Initial new_column_types: {'sid': <class 'sqlalchemy.sql.sqltypes.String'>, 'created_at': <class 'sqlalchemy.sql.sqltypes.Integer'>, 'updated_at': <class 'sqlalchemy.sql.sqltypes.Integer'>, 'bble': <class 'sqlalchemy.sql.sqltypes.String'>, 'borough_code': <class 'sqlalchemy.sql.sqltypes.Integer'>, 'block_number': <class 'sqlalchemy.sql.sqltypes.Integer'>, 'lot_number': <class 'sqlalchemy.sql.sqltypes.Integer'>, 'easement': <class 'sqlalchemy.sql.sqltypes.String'>, 'owner_name': <class 'sqlalchemy.sql.sqltypes.String'>, 'building_class': <class 'sqlalchemy.sql.sqltypes.String'>, 'tax_class_code': <class 'sqlalchemy.sql.sqltypes.String'>, 'lot_front': <class 'sqlalchemy.sql.sqltypes.Integer'>, '

In [10]:
# Display the registry again, now that col_types has been reassigned from the
# set_dtypes results.
dataset_info_dict

{'mapPLUTO': DatasetInfo(name='Primary Land Use Tax Lot Output - Map (MapPLUTO)', short_name='mapPLUTO', format='zip', id='f888-ni5f', main_url='https://data.cityofnewyork.us/City-Government/Primary-Land-Use-Tax-Lot-Output-Map-MapPLUTO-/f888-ni5f/about_data', metadata_url='https://data.cityofnewyork.us/api/views/f888-ni5f.json', data_url='https://s-media.nyc.gov/agencies/dcp/assets/files/zip/data-tools/bytes/mappluto/nyc_mappluto_25v1_1_fgdb.zip', dataset_path='/mnt/Datasets/PROJECTDATA/nyc_real_estate_data/files_to_use/MapPLUTO25v1_1.gdb', data_dict_path='/mnt/Datasets/PROJECTDATA/nyc_real_estate_data/dictionaries/mapPLUTO_data_dictionary.pdf', standard=False, geodata=True, metadata={}, column_metadata={}, cardinality_ratios={}, data_dict_url='https://data.cityofnewyork.us/api/views/f888-ni5f/files/a5f455ae-002e-4e78-ae17-f3dcc59c236d?download=true&filename=PLUTODD22v3.pdf', other_files=[('97ca6e86-32cc-46f1-b85c-8d01e17e1602', 'PlutoReadme22v3.pdf')], attribution='Department of City 

* #### The MapPLUTO dictionary contains most of the information we need to interpret various codes and categories meaningfully.
* #### Unfortunately, it is in PDF format (as are many of the data dictionaries on NYCOpenData), which made extracting all the relevant data a real pain, and I don't expect most of these functions will be fully reusable for other PDFs I may encounter in the future. My hope is that it will still give me a head start when I need to make custom functions for future PDFs

In [11]:
# Path to the MapPLUTO data dictionary PDF, built from PROJECT_DATA.
# NOTE(review): PROJECT_DATA is not assigned in any visible cell; presumably it
# is one of the names restored from the pickled environment -- confirm.
filename = f"{PROJECT_DATA}/dictionaries/mapPLUTO_data_dictionary.pdf"

* Looking at the PLUTO data dictionary, it seems that most category variables are labeled as "alphanumeric" even if they only contain numbers, such as zip codes.
* There are some exceptions: police precincts and districts are numeric and listed as such. However, as there are a limited number of repeating values, I will treat them as categorical as well.

In [12]:
# Run map_pdf over the data dictionary PDF; the result is iterated section by
# section in the next cell.
# NOTE(review): start_page=3 presumably skips the front matter and
# same_line_tolerance=0.3 presumably controls how words are grouped into one
# text line -- confirm against map_pdf's definition.
pdf_by_section = map_pdf(filename, same_line_tolerance=0.3, start_page=3) 

In [13]:
# Name fragments passed to parse_pluto_dict_sections as category markers.
# The original list repeated 'name' and 'health_area'; each marker now appears
# once, in its original first-seen order.
# NOTE(review): presumably a field whose name contains one of these fragments
# is treated as categorical -- confirm against parse_pluto_dict_sections.
category_markers = [
    'code',
    'category',
    'class',
    'district',
    'precinct',
    'company',
    'name',
    'health_area',
    'type',
    'borough',
    'health_center_district',
    'overlay',
]

# Collect the customizations produced for every section of the parsed PDF.
# The list is rebuilt from scratch here, so re-running the cell is safe.
column_customizations = []
for section in pdf_by_section:
    column_customizations.extend(parse_pluto_dict_sections(section, category_markers))


The description includes a brief explanation of the field and, where pertinent, the valid
values for the field and examples.
Field Name: BOROUGH (Borough)
full_name BOROUGH
new_name borough
col_mods ColCustomization(short_name='Borough', new_name='borough', dtype=None, synonyms=[], definitions=[], drop=False, is_category=True, is_fk=False, orm=None)
Format: Alphanumeric - 2 characters
Data Source: Department of City Planning - based on data from:
Department of Finance - Property Tax System (PTS)
Description: The borough in which the tax lot is located.
This field contains a two-character borough code.
Value Description
Detected table
BX Bronx
Appended line BX Bronx
BK Brooklyn
Appended line BK Brooklyn
MN Manhattan
Appended line MN Manhattan
QN Queens
Appended line QN Queens
SI Staten Island
Appended line SI Staten Island
Two portions of the city, Marble Hill and Rikers Island, are legally located in one
Table is [('Value Description', [{'text': 'Value', 'x0': 172.8, 'x1': 199.177872, 

* Add tables from appendixes

In [14]:
# Run parse_zoning over the same data dictionary PDF (`filename` still points at
# it here). The result is consumed via .items() in the next cell.
# NOTE(review): judging by the heading above, these are the appendix tables --
# confirm what the keys and values hold.
table_dicts = parse_zoning(filename)

table_index is 1
missed: 48


In [15]:

# Re-key the parsed tables with the final character of each key removed
# (intended for singular/plural matching). The character is dropped
# unconditionally, whether or not it is a plural "s".
truncated_keys = {}
for table_name, table in table_dicts.items():
    truncated_keys[table_name[:-1]] = table

# Lookup of customizations by their new_name, plus the same names in sorted
# order for find_matching_keys.
col_customization_dict = {}
for customization in column_customizations:
    col_customization_dict[customization.new_name] = customization
sorted_new_names = sorted(customization.new_name for customization in column_customizations)

# For every truncated key, attach its table as the definitions of each matching
# column and flag that column as categorical.
# NOTE(review): all matches for one key receive the same `definitions` object,
# not a copy.
for prefix, definitions in truncated_keys.items():
    print(prefix)
    matches = find_matching_keys(prefix, sorted_new_names)
    print(matches)
    for column_name in matches:
        target = col_customization_dict[column_name]
        target.definitions = definitions
        target.is_category = True


special_purpose_distric
['special_purpose_district_1', 'special_purpose_district_2', 'special_purpose_district_3']


In [16]:
# Manually override the BBL (borough/tax block/lot) column so it is not treated
# as a category.
# NOTE(review): it was presumably flagged because its name contains the
# 'borough' marker -- confirm.
col_customization_dict['borough_tax_block_and_lot'].is_category = False

# Parse Appendix D:
### Extract the last table, which isn't actually a table, just text arranged in a table-like way.

In [17]:
# Thresholds passed, in this order, to merge_words_into_rows.
# NOTE(review): presumably header vs. body x/y distances in PDF units -- confirm
# against merge_words_into_rows.
header_x_thresh = 10
header_y_thresh = 20
body_x_thresh = 10
body_y_thresh = 10
# NOTE(review): the next two settings are not used anywhere in this cell; they
# are kept in case another cell or a later revision relies on them.
column_gap_thresh = 20  # Adjust based on observed spacing
ncol = 3

with pdfplumber.open(filename) as pdf:
    # pages[-1] is the LAST page of the PDF (not page 0).
    words = pdf.pages[-1].extract_words()

# The extracted words no longer need the open PDF, so the rows are merged after
# the file has been closed.
merged_rows = merge_words_into_rows(words, header_x_thresh, header_y_thresh, body_x_thresh, body_y_thresh)

ROWS ARE [[{'text': 'PLUTO', 'x0': 77.4, 'x1': 115.264992, 'top': 67.58112000000006, 'doctop': 41251.58112, 'bottom': 78.62112000000002, 'upright': True, 'height': 11.039999999999964, 'width': 37.864992, 'direction': 'ltr'}, {'text': 'DATA', 'x0': 118.07577600000002, 'x1': 149.19863999999998, 'top': 67.58112000000006, 'doctop': 41251.58112, 'bottom': 78.62112000000002, 'upright': True, 'height': 11.039999999999964, 'width': 31.122863999999964, 'direction': 'ltr'}, {'text': 'DICTIONARY', 'x0': 151.90344, 'x1': 224.05204799999993, 'top': 67.58112000000006, 'doctop': 41251.58112, 'bottom': 78.62112000000002, 'upright': True, 'height': 11.039999999999964, 'width': 72.14860799999994, 'direction': 'ltr'}, {'text': 'November', 'x0': 470.04, 'x1': 518.981424, 'top': 67.58112000000006, 'doctop': 41251.58112, 'bottom': 78.62112000000002, 'upright': True, 'height': 11.039999999999964, 'width': 48.94142399999993, 'direction': 'ltr'}, {'text': '2022', 'x0': 521.760192, 'x1': 543.840192, 'top': 67.5

In [18]:
# Build the table row by row: skip the first merged row, and for each remaining
# row merge the text of every cell found in the row's first element.
# NOTE(review): the skipped first row is presumably the page header -- confirm.
last_table = [
    [merge_text_in_cell(cell) for cell in row[0]]
    for row in merged_rows[1:]
]

In [19]:
# Use the rows extracted from the last PDF page as the definitions of the
# land_use_category column (this replaces any definitions already set).
col_customization_dict['land_use_category'].definitions = last_table

### Get explanations of zoning codes.
* I could only find this information in pdf form.
* I discovered how hard PDFs can be to parse.
* I had to do a lot of customization for just this specific pdf. I could have just manually cut and pasted the data from the pdf in the amount of time it took me to do that.
* I still think it was good to do for reproducibility reasons, but in the future I will try to avoid working with datasets that have important information only in PDF format.
* The following functions extract the tables from the PDF, detect footnotes, and then substitute the footnote text for the footnote number within the dataframe (so that it will end up as part of the relevant record in the database).

In [20]:
# Where the zoning table PDF lives online, and the name it is saved under.
# Note: `filename` is rebound here -- from this cell on it holds just this file
# name, no longer the path to the MapPLUTO data dictionary.
filename = "zoning_table_all.pdf"  # Path to save the pdf containing the info we need
url = "https://www.nyc.gov/assets/bronxcb8/pdf/zoning_table_all.pdf"

# Fetch the PDF into the dictionaries folder; the call's return value is left
# as the cell's last expression so the notebook displays it.
downloader(url=url, download_path=f"{PROJECT_DATA}/dictionaries/", outfile_name=filename, bigfile=False)

starting download session
About to try https://www.nyc.gov/assets/bronxcb8/pdf/zoning_table_all.pdf, will save to /mnt/Datasets/PROJECTDATA/nyc_real_estate_data/dictionaries/zoning_table_all.pdf
[32mSuccess downloading [36mhttps://www.nyc.gov/assets/bronxcb8/pdf/zoning_table_all.pdf[0m


'/mnt/Datasets/PROJECTDATA/nyc_real_estate_data/dictionaries/zoning_table_all.pdf'

* Run the above functions to extract the data from the pdf.

In [21]:
# Parse the downloaded zoning PDF. Later cells read the result as a mapping of
# table title -> dict holding (at least) a 'df' entry; the displayed output
# also shows a 'footnotes' entry.
tables_and_footnotes = parse_zoning_details(f"{PROJECT_DATA}/dictionaries/{filename}")

Assuming that <re.Match object; span=(1, 3), match='12'> represents two different footnotes separated by whitespaces. Also assuming only two footnotes
Assuming that <re.Match object; span=(0, 5), match='\n1\n5\n'> represents two columns of footnotes


In [22]:
# Display the parsed zoning tables and their footnotes for inspection.
tables_and_footnotes

{'ZONING DATA TABLE 1': {'footnotes': {'1': 'Up to 1.0 FAR by special permit',
   '2': 'Governed by yard requirements',
   '3': 'Front yard must be at least as deep as anadjacent front yard',
   '4': 'Zero lot line buildings require only one side yard, at least 8 feet wide',
   '5': 'Minimum of 8 ft required between buildings on adjacent zoning lots',
   '6': 'Height controlled by sky exposure plane'},
  'df': 1                                                                                          R1–1  \
  R1–R3 Lower-Density Residence Districts,                                                          
  Single-family detached residences                                                          True   
  Two-family detached residences                                                            False   
  Semi-detached residences                                                                  False   
  All residences                                                                   

In [23]:
# Serialise every column of every zoning table to JSON and append it, with the
# column's name, to the definitions of zoning_district_1.
# NOTE: this appends, so re-running the cell adds the same entries again.
for tablename, table in tables_and_footnotes.items():
    print(tablename)
    for series_name, series in table['df'].items():
        # One-column frame with the index turned into an ordinary column, so
        # the row labels are included in the JSON.
        jstring = pd.DataFrame(series).reset_index().to_json()
        col_customization_dict['zoning_district_1'].definitions.append([series_name, jstring])


ZONING DATA TABLE 1
ZONING DATA TABLE 2
ZONING DATA TABLE 3
ZONING DATA TABLE 4
ZONING DATA TABLE 5
ZONING DATA TABLE 6
ZONING DATA TABLE 7


### The PDF parsed above still has some definitions that are in text outside the tables. From `zoning_table_all.pdf`:

>C1-1 through C1-5 and C2-1 through C2-5 are commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations.

* I need to manually create the object to hold this information and put it in the database

In [24]:
# Explanation shared by all ten commercial overlay codes (C1-1..C1-5 and
# C2-1..C2-5), taken from the prose of zoning_table_all.pdf.
info = "Commercial districts which are mapped as overlays within residential districts. When a commercial overlay is mapped within an R1 through R5 district, except an R5D district, the commercial FAR is 1.0; within an R5D district or an R6 through R10 district, the commercial FAR is 2.0. The residential FAR for a commercial overlay district is determined by the residential district regulations."

# Map every overlay code to that explanation, in the order
# C1-1, C2-1, C1-2, C2-2, ...
more_zones = {
    f'C{family}-{level}': info
    for level in range(1, 6)
    for family in (1, 2)
}

In [25]:
# Record each overlay code and its explanation as a [code, text] pair in the
# definitions of commercial_overlay_1.
for zone_code, explanation in more_zones.items():
    col_customization_dict['commercial_overlay_1'].definitions.append([zone_code, explanation])

### Get a few more code meanings 
* From [NYC Department of Tax and Finance Data Dictionary](https://www.nyc.gov/assets/finance/downloads/tar/tarfieldcodes.pdf):
    * LandUse
    * OwnerType
    * Easement code
* Additional information about commercial zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/commercial_zoning_data_tables.pdf).
* Additional information about residential zoning that I have not included can be [found here](https://www.nyc.gov/assets/planning/download/pdf/zoning/districts-tools/residence_zoning_data_tables.pdf)

## Get the meanings of the building classification codes from the City of New York website.

In [26]:
# Page listing the building classification codes.
webpage = "https://www.nyc.gov/assets/finance/jump/hlpbldgcode.html"

trs = get_table_rows(webpage)

# Walk the table rows in order. A row containing a named anchor (<a name=...>)
# starts a new section and its second <th> holds the section title; rows with
# <td> cells carry one (code, name) pair each.
#
# Fixes relative to the earlier version:
#   * the section title is stored on every code entry instead of being
#     discarded when the next code row is parsed;
#   * the previous code entry is no longer appended a second time whenever a
#     new section header is reached (which duplicated each section's last code);
#   * rows with fewer than two <th>/<td> cells are handled instead of raising;
#   * text is stripped before capitalising, so leading whitespace cannot stop
#     the first letter from being capitalised.
class_codes = []
supercategory = None
for tr in trs:
    if tr.find('a', attrs={'name': True}):
        headers = tr.find_all('th')
        supercategory = headers[1].text.strip().capitalize() if len(headers) > 1 else None

    cells = tr.find_all('td')
    if len(cells) >= 2:
        code, name = cells[:2]
        class_codes.append(
            {
                'supercategory': supercategory,
                'code': code.text.strip(),
                'name': name.text.strip().capitalize(),
            }
        )


In [27]:
# Add every scraped building class as a [code, name] pair to the definitions of
# the building_class column.
for entry in class_codes:
    pair = [entry['code'], entry['name']]
    col_customization_dict['building_class'].definitions.append(pair)

In [28]:
# Store the customizations (keyed by each column's new_name) on the MapPLUTO
# DatasetInfo.
dataset_info_dict['mapPLUTO'].col_customizations = col_customization_dict

In [29]:
# String columns whose cardinality ratio exceeds this value are flagged as
# categorical.
# NOTE(review): how the ratio is computed is not visible in this notebook --
# confirm that "greater than" is the intended direction.
CATEGORY_RATIO_CUTOFF = 20

for info in dataset_info_dict.values():
    # Nothing to do for datasets with no column types.
    if not info.col_types:
        continue

    # Datasets without hand-built customizations get a default one per column.
    if not info.col_customizations:
        info.col_customizations = {
            short_name: ColCustomization(short_name=short_name, dtype=dtype)
            for short_name, dtype in info.col_types.items()
        }

    for key, val in info.cardinality_ratios.items():
        if val > CATEGORY_RATIO_CUTOFF and info.col_types[key] == String:
            info.col_customizations[key].is_category = True


In [30]:
# `info` is the loop variable left over from the previous cell, i.e. the last
# DatasetInfo that loop visited; display it as a spot check.
info

DatasetInfo(name='DOB Certificate Of Occupancy', short_name='cert_of_occupancy', format='json', id='bs8b-p36w', main_url='https://data.cityofnewyork.us/Housing-Development/DOB-Certificate-Of-Occupancy/bs8b-p36w/about_data', metadata_url='https://data.cityofnewyork.us/api/views/bs8b-p36w.json', data_url='https://data.cityofnewyork.us/api/views/bs8b-p36w/rows.json?accessType=DOWNLOAD', dataset_path='/mnt/Datasets/PROJECTDATA/nyc_real_estate_data/files_to_use/cert_of_occupancy_rows.json', data_dict_path='/mnt/Datasets/PROJECTDATA/nyc_real_estate_data/dictionaries/cert_of_occupancy_data_dictionary.xlsx', standard=True, geodata=False, metadata={'id': 'bs8b-p36w', 'name': 'DOB Certificate Of Occupancy', 'assetType': 'dataset', 'averageRating': 0, 'category': 'Housing & Development', 'createdAt': 1498850799, 'description': 'A Certificate of Occupancy (CO) states a building’s legal use and/or type of permitted occupancy. New buildings must have a CO, and existing buildings must have a current 

In [31]:
# Persist the dataset registry and the project-level settings with dill.
# ("PROJECT_DATA" was listed twice in the original dict literal; the duplicate
# key had no effect and has been removed.)
with open("environment_data/table_dicts.pkl", "wb") as f:
    dill.dump(
        {
            "dataset_info_dict": dataset_info_dict,
            "PROJECT_PATH": PROJECT_PATH,
            "PROJECT_DATA": PROJECT_DATA,
            "SQLITE_PATH": SQLITE_PATH,
            "DATADIR": DATADIR,
            "PROJECT_NAME": PROJECT_NAME,
        },
        f,
    )