In [1]:
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import shared_functions as sf

In [2]:
# Load the geocoded 2022 New York City property sales data (relative path).
# Plain string literal: the path has no placeholders, so no f-string is needed.
df = pd.read_parquet('../data/raw/property-sales_new-york-city_2022_geocoded.parquet')

In [3]:
# Use most recent pandas data types (e.g. pd.NA):
# convert_dtypes() returns a new frame whose columns use the best available
# dtypes that support pd.NA; the result is bound back to the same name `df`.
df = df.convert_dtypes()

In [4]:
# Generate data overview with the project helper from shared_functions.
# The helper's output is not shown in this notebook; later cells filter it on a
# `column` field via .loc and save it with .to_csv, so it is presumably a
# DataFrame with one row per column of `df` -- TODO confirm against sf.data_overview.
data_overview = sf.data_overview(df)

In [5]:
# Map each variable type to the list of columns treated as that type.
# Every column name appears under exactly one type; the cells below verify this
# against the columns of `df`.
variable_types = {
    'categorical': [
        'borough', 'neighborhood', 'building_class_category',
        'tax_class_at_present', 'block', 'lot', 'easement',
        'building_class_at_present', 'address', 'apartment_number',
        'zip_code', 'tax_class_at_time_of_sale',
        'building_class_at_time_of_sale',
    ],
    'numerical': [
        'residential_units', 'commercial_units', 'total_units',
        'land_square_feet', 'gross_square_feet', 'year_built',
        'sale_price', 'location_lat', 'location_long',
    ],
    'date': ['sale_date'],
}

In [6]:
# Flatten the per-type column lists into a single list (in dictionary order)
# that the sanity checks below compare against the columns of `df`.
auxiliary_list = []
for type_columns in variable_types.values():
    auxiliary_list.extend(type_columns)

In [7]:
# Check that all columns were only assigned one variable type.
# The assert makes a "Run All" stop here on a violation (and names the offending
# columns) instead of merely displaying False and carrying on.
duplicated_columns = sorted(
    {column for column in auxiliary_list if auxiliary_list.count(column) > 1},
    key=str,
)
assert not duplicated_columns, (
    f'Columns assigned to more than one variable type: {duplicated_columns}'
)
# Keep the boolean as the cell's displayed result.
len(auxiliary_list) == len(set(auxiliary_list))

True

In [8]:
# Check that all columns were assigned a variable type.
# The assert makes a "Run All" stop here on a mismatch and reports both
# directions: columns of `df` without a type, and typed names missing from `df`.
unassigned_columns = sorted(set(df.columns) - set(auxiliary_list), key=str)
unknown_columns = sorted(set(auxiliary_list) - set(df.columns), key=str)
assert not unassigned_columns and not unknown_columns, (
    f'Columns without a variable type: {unassigned_columns}; '
    f'typed columns not present in df: {unknown_columns}'
)
# Keep the boolean as the cell's displayed result.
set(df.columns) == set(auxiliary_list)

True

In [9]:
# Add variable type to data overview: for each type, label every overview row
# whose `column` value is one of that type's columns.
for variable_type, type_columns in variable_types.items():
    rows_of_type = data_overview['column'].isin(type_columns)
    data_overview.loc[rows_of_type, 'variable_type'] = variable_type

In [10]:
# Save data overview as CSV next to the raw data file.
# NOTE: to_csv is called with its defaults, so the index is written as the
# first column of the file.
overview_path = '../data/raw/property-sales_new-york-city_2022_geocoded_data-overview.csv'
data_overview.to_csv(overview_path)

In [11]:
# Scratch work, intentionally commented out (TODO: turn into a real cell or remove):
# calculate the image size when an image is cropped or resized
# sf.get_image_size(640, 640, 16, 40)
# 600/640
# 0.9375 * 1171.0832153277483
# location_lat, location_long = df.iloc[1:2].location_lat, df.iloc[1:2].location_long
# width_m, height_m = sf.get_image_size(640, 640, 17, location_lat)
# lat_top, lat_bottom, long_left, long_right = sf.get_image_boundaries(location_lat, location_long, width_m, height_m)