In [217]:
# Load packages
import numpy as np
import pandas as pd

In [218]:
# Shared file stem plus the folders holding raw input and processed output
file_name = 'property-sales_new-york-city_2022'
import_path, export_path = '../data/raw/', '../data/processed/'


In [219]:
# Read the geocoded sales table from the raw-data folder
geocoded_file = f'{import_path}{file_name}_geocoded.parquet'
df = pd.read_parquet(geocoded_file)

In [220]:
# Remove the columns that are mostly missing
sparse_columns = ['easement', 'apartment_number']
df = df.drop(columns=sparse_columns)

In [221]:
# Discard every row that still contains at least one missing value
df = df.dropna(axis=0, how='any')

In [222]:
# Make the target column (sale_price) the first column of the frame.
# pop() removes it from its current position; insert() puts it back at index 0.
target = df.pop('sale_price')
df.insert(loc=0, column=target.name, value=target)

In [223]:
# Transform dates into Unix timestamps (seconds since 1970-01-01).
# Epoch arithmetic is used instead of `.astype(int) / 10**9` because that cast
# (a) silently assumes nanosecond resolution, which pandas does not guarantee
# for every datetime column, and (b) relies on the platform `int`, which can be
# 32-bit and then fails for datetime data.
unix_epoch = pd.Timestamp('1970-01-01')
one_second = pd.Timedelta(seconds=1)

# year_built holds a bare year; parse it explicitly so each value becomes
# January 1st of that year rather than relying on format inference.
year_built_dates = pd.to_datetime(df['year_built'].astype(str), format='%Y')
df['year_built'] = (year_built_dates - unix_epoch) / one_second

# NOTE(review): assumes sale_date is a timezone-naive datetime column — confirm.
df['sale_date'] = (df['sale_date'] - unix_epoch) / one_second

In [224]:
# Treat every column as categorical (object dtype) by default, then let pandas
# re-infer a numeric dtype for the columns listed below.
non_object_cols = [
    'sale_price',
    'residential_units',
    'commercial_units',
    'total_units',
    'land_square_feet',
    'gross_square_feet',
    'year_built',
    'sale_date',
    'location_lat',
    'location_long',
]
df = df.astype(object)
df[non_object_cols] = df[non_object_cols].convert_dtypes()

In [225]:
# These categorical features have too many distinct values to one-hot encode
high_cardinality_columns = ['address', 'block']
df = df.drop(columns=high_cardinality_columns)

In [226]:
# Display the dtype of each remaining column
df.dtypes

sale_price                          Int64
borough                            object
neighborhood                       object
building_class_category            object
tax_class_at_present               object
lot                                object
building_class_at_present          object
zip_code                           object
residential_units                   Int64
commercial_units                    Int64
total_units                         Int64
land_square_feet                    Int64
gross_square_feet                   Int64
year_built                          Int64
tax_class_at_time_of_sale          object
building_class_at_time_of_sale     object
sale_date                           Int64
location_lat                      Float64
location_long                     Float64
dtype: object

In [227]:
# One-hot encode the categorical variables
df = pd.get_dummies(data=df)

  df = pd.get_dummies(df)
  df = pd.get_dummies(df)
  df = pd.get_dummies(df)
  df = pd.get_dummies(df)


In [228]:
# Total size reported by memory_usage(), expressed in units of 10**6 bytes
bytes_per_column = df.memory_usage()
bytes_per_column.sum() / 1_000_000

66.572523

In [229]:
# Keep only the records whose sale price is non-zero
has_nonzero_price = df['sale_price'].ne(0)
df = df.loc[has_nonzero_price]

In [230]:
# Write the preprocessed frame to the processed-data folder as CSV and Parquet
csv_target = f'{export_path}{file_name}_processed.csv'
parquet_target = f'{export_path}{file_name}_processed.parquet'
df.to_csv(csv_target)
df.to_parquet(parquet_target)