## Import dependencies

In [None]:
# HELPERS
import helpers
import importlib
importlib.reload(helpers)

# OPERATING SYSTEM STUFF
import os
import io
import gc

# BASIC DATA SCIENCE
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# MACHINE LEARNING
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# MODEL PACKAGING
import joblib

# API STUFF
import xlrd
import requests
import json

# SQL
import time
from sqlalchemy import create_engine, text, String, Integer, Float, Boolean, MetaData, Table, select
from sqlalchemy.exc import ProgrammingError # ProgrammingError catches SQL write exceptions
from sqlalchemy.sql import and_

# GEOCODING
from geopy.geocoders import GoogleV3

# CONFIGURATION FILES
import config
importlib.reload(config)
pd.set_option('display.float_format', '{:.6f}'.format)

# OTHER
from tqdm.notebook import tqdm

## Create databases / database connections

In [42]:
# Ask SQLAlchemy to stay quiet about its 2.0-migration warnings.
# NOTE(review): `sqlalchemy` has already been imported by the imports cell; if
# the library only reads these variables at import time, setting them here has
# no effect -- confirm, and move this before the import if so.
os.environ.update({
    'SQLALCHEMY_WARN_20': '0',
    'SQLALCHEMY_SILENCE_UBER_WARNING': '1',
})

# Connection settings are read from the local `config` module
username, password = config.DB_USERNAME, config.DB_PASSWORD
hostname, database_name = config.DB_HOSTNAME, config.DB_NAME

# Names of the SQL tables used throughout the notebook
geocodes_sql_table_name, sales_sql_table_name = 'geocodes', 'sales'

In [43]:
# Attempt to establish a connection to the database server
engine = helpers.connect_to_database(username, password, hostname)

# Later cells pass `engine` straight to pandas, so stop here with a clear
# message instead of failing further down with an obscure error on `None`.
if engine is None:
    raise RuntimeError(
        "Could not connect to the database server; "
        "check the DB_* settings in the `config` module."
    )

# See file `helpers.py` for function documentation
engine = helpers.create_database(engine, database_name)
helpers.silence_warnings()

# Use the shared table-name variable (rather than repeating the literal) so
# this cell cannot drift from the cells that read from / append to the table.
helpers.create_table_from_csv(engine, geocodes_sql_table_name, 'geocodes_export_backup_1.csv')
helpers.add_primary_key(engine, geocodes_sql_table_name, 'PRIMARY_KEY')
helpers.set_primary_key(engine, geocodes_sql_table_name, 'PRIMARY_KEY', "`BOROUGH`, '_', `ADDRESS`")

Database connection established successfully.
Table 'geocodes' already exists. Not resetting!
Column PRIMARY_KEY already exists in table geocodes.
PRIMARY_KEY column values set in table geocodes.


## Download new sales data from NYC

In [44]:
# Download every workbook listed in `helpers.dataURLs` into a list of
# DataFrames, skipping the first 4 rows of each sheet.
# A comprehension keeps the per-file frame out of the notebook namespace, so
# no leftover `df` lingers until the modelling cells define their own.
data = [
    pd.read_excel(url, skiprows=4, engine="openpyxl")
    for url in helpers.dataURLs
]

# Fail here, with a clear message, rather than in the concat cell
if not data:
    raise ValueError("helpers.dataURLs is empty; no sales data was downloaded.")

In [45]:
# Stack the per-file frames into one DataFrame with a fresh 0..n-1 index
combined = pd.concat(data, ignore_index=True)

# The raw 'BOROUGH' column is mapped from integer codes below, so keep it
# under a clearer name and free 'BOROUGH' for the readable borough name.
combined = combined.rename(columns={'BOROUGH': 'BOROUGH CODE'})

# Borough code -> borough name
borough_mapping = {1: 'MANHATTAN', 2: 'BRONX', 3: 'BROOKLYN', 4: 'QUEENS', 5: 'STATEN ISLAND'}

# Codes that are missing from the mapping become NaN
borough = combined['BOROUGH CODE'].map(borough_mapping)

# Insert the names immediately after 'BOROUGH CODE', wherever that column
# sits, instead of assuming it is the first column of the sheet.
borough_code_position = combined.columns.get_loc('BOROUGH CODE')
combined.insert(loc=borough_code_position + 1, column='BOROUGH', value=borough)

## Filter new sales data

In [46]:
# Remove rows whose address contains the literal text 'N/A'.
# `regex=False` does a plain substring match. `na=True` counts a missing
# address as a match so it is removed as well; without it, NaN entries in the
# mask would make the `~` negation fail.
has_unusable_address = combined['ADDRESS'].str.contains('N/A', regex=False, na=True)
combined = combined[~has_unusable_address]

# Minimum accepted value per column; anything below counts as "close to zero"
thresholds = {
    'SALE PRICE': 100000,
    'GROSS SQUARE FEET': 100,
    'LAND SQUARE FEET': 100
}

# Remove rows with values "close to zero"
data_clean = combined.copy()
for col, threshold in thresholds.items():
    data_clean = data_clean[data_clean[col] >= threshold]

# The same columns are then trimmed for outliers
cols_to_check = list(thresholds.keys())

# Remove outliers with the 1.5 * IQR rule. Columns are processed one after
# another, so each column's quartiles are computed on the rows that survived
# the filters of the columns before it.
for col in cols_to_check:
    Q1 = data_clean[col].quantile(0.25)
    Q3 = data_clean[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # `between` is inclusive on both ends, i.e. lower <= value <= upper
    data_clean = data_clean[data_clean[col].between(lower_bound, upper_bound)]

# Reset `combined` to the clean data
combined = data_clean

In [47]:
# Optional sanity check: one histogram per filtered column.
# Disabled by default; set the flag to True to draw the plots.
SHOW_SANITY_PLOTS = False

if SHOW_SANITY_PLOTS:
    # One panel per column in `cols_to_check`
    fig, axs = plt.subplots(1, len(cols_to_check), figsize=(15, 5))

    # `np.atleast_1d` keeps the loop valid even when there is a single panel,
    # in which case `plt.subplots` returns a bare Axes instead of an array.
    for ax, col in zip(np.atleast_1d(axs), cols_to_check):
        ax.hist(data_clean[col].dropna(), bins=30, edgecolor='black')
        ax.set_title(f'{col}')

    # Tight layout
    plt.tight_layout()
    plt.show()

In [48]:
# Write the contents of `combined` to the `sales` SQL table, replacing any
# existing table of that name.
# pandas is handed the engine and manages its own connection, so no separate
# `engine.connect()` block is opened. The trailing `;` keeps the notebook from
# echoing the call's return value.
combined.to_sql(sales_sql_table_name, con=engine, index=False, if_exists='replace');

## Update geocodes table

### Set up tables for comparison

In [49]:
# Local geocoding frame: the location columns from the NYC data plus empty
# coordinate placeholders, an error flag and a BOROUGH_ADDRESS lookup key.
geocodes_local = combined[['BOROUGH CODE', 'BOROUGH', 'NEIGHBORHOOD', 'ADDRESS']].copy()
geocodes_local = geocodes_local.assign(**{
    'LATITUDE': None,
    'LONGITUDE': None,
    'GEOCODING ERR': False,
    'PRIMARY_KEY': lambda frame: frame['BOROUGH'] + '_' + frame['ADDRESS'],
})

# Current contents of the geocodes SQL table, for comparison against the above
geocodes_table_response = pd.read_sql_query(
    f"SELECT * FROM {geocodes_sql_table_name}",
    engine,
)

In [50]:
# Find rows in the NYC data whose PRIMARY_KEY is not yet in the geocodes table.
# Duplicates are removed *before* geocoding so that each address is looked up
# only once; keeping the first occurrence matches the de-duplication that is
# applied again after geocoding.
already_geocoded = geocodes_local['PRIMARY_KEY'].isin(geocodes_table_response['PRIMARY_KEY'])
missing_rows = (
    geocodes_local[~already_geocoded]
    .drop_duplicates(subset='PRIMARY_KEY', keep='first')
)

In [51]:
# Register `progress_apply` on pandas objects
tqdm.pandas()

# Geocode the rows missing from the SQL table, one row at a time.
# When there is nothing to geocode, skip the call: on an empty frame pandas
# may probe the function with a dummy empty Series to guess the result type,
# which would hand `helpers.geolocate` a row without any of the expected
# fields and could change the type of the result.
if missing_rows.empty:
    missing_rows = missing_rows.copy()
else:
    missing_rows = missing_rows.progress_apply(helpers.geolocate, axis=1)

0it [00:00, ?it/s]

In [52]:
# Keep one row per PRIMARY_KEY and make that column the index.
# Written without `inplace=True`, and skipped when the index has already been
# set, so re-running the cell does not raise a KeyError on the missing column.
if missing_rows.index.name != 'PRIMARY_KEY':
    missing_rows = (
        missing_rows
        .drop_duplicates(subset='PRIMARY_KEY', keep='first')
        .set_index('PRIMARY_KEY')
    )

In [53]:
# Append the newly geocoded rows to the geocodes SQL table; `index=True`
# writes the frame's index as a column alongside the data.
# pandas is handed the engine and manages its own connection, so no separate
# `engine.connect()` block is opened. The trailing `;` keeps the notebook from
# echoing the call's return value.
missing_rows.to_sql(geocodes_sql_table_name, con=engine, if_exists='append', index=True);

In [54]:
# Reload the geocodes table and repeat the comparison to check the append.
# pandas is handed the engine directly, so no separate connection is opened.
geocodes_table_response = pd.read_sql_query(f"SELECT * FROM {geocodes_sql_table_name}", engine)

missing_rows = geocodes_local[~geocodes_local['PRIMARY_KEY'].isin(geocodes_table_response['PRIMARY_KEY'])]

# An empty `missing_rows` means every local address now has a row in the
# table; say so explicitly when that is not the case.
if not missing_rows.empty:
    print(f"Warning: {len(missing_rows)} rows are still missing from the '{geocodes_sql_table_name}' table.")

In [55]:
# Attach coordinates to the sales rows by matching on 'BOROUGH' and 'ADDRESS'.
merge_keys = ['BOROUGH', 'ADDRESS']

# Keep a single coordinate row per (BOROUGH, ADDRESS): with repeated keys on
# the lookup side, a left merge would emit one output row per match and
# silently multiply the affected sales.
geocode_lookup = (
    geocodes_table_response[merge_keys + ['LATITUDE', 'LONGITUDE']]
    .drop_duplicates(subset=merge_keys, keep='first')
)

combined = combined.merge(geocode_lookup,
                          on=merge_keys,
                          how='left',
                          suffixes=('', '_y'))

# If `combined` already had 'LATITUDE' / 'LONGITUDE' columns (for example when
# this cell is run a second time), the merged-in copies arrive with a '_y'
# suffix. Drop them; `errors='ignore'` makes this a no-op when they are absent.
duplicate_columns = ['LATITUDE_y', 'LONGITUDE_y']
combined = combined.drop(columns=duplicate_columns, errors='ignore')

## Build mapping between NYC and Zillow housing categories

In [56]:
# NOTE(review): assumes `helpers.category_mapping` maps each NYC
# 'BUILDING CLASS CATEGORY' value directly to its grouped category (no
# inversion step is applied here) -- confirm against helpers.py.
category_mapping = helpers.category_mapping

# Create the new column by looking every building class category up in the mapping
combined['GROUPED CATEGORY'] = combined['BUILDING CLASS CATEGORY'].map(category_mapping)

# Categories without an entry in the mapping come back as NaN. Drop those rows
# and report how many were lost and which categories caused it.
unmapped = combined['GROUPED CATEGORY'].isna()
if unmapped.any():
    unmapped_categories = sorted(
        combined.loc[unmapped, 'BUILDING CLASS CATEGORY'].dropna().astype(str).unique()
    )
    combined = combined.dropna(subset=['GROUPED CATEGORY'])
    print(f"Warning: {int(unmapped.sum())} rows could not be mapped to a grouped category "
          f"and were dropped. Unmapped categories: {unmapped_categories}")

# Snapshot of the modelling input
combined.to_csv('for_model.csv', index=False)



## Choosing features

In [62]:
# Columns carried into modelling ('ZIP CODE' is currently commented out)
selected_features = [
    'BOROUGH CODE',
    # 'ZIP CODE',
    'GROSS SQUARE FEET',
    'LAND SQUARE FEET',
    'GROUPED CATEGORY',
    'LATITUDE',
    'LONGITUDE',
    'SALE PRICE',
]

# Narrow `combined` down to just those columns
df = combined.loc[:, selected_features]

# Count of missing values per column
df.isna().sum()

BOROUGH CODE           0
GROSS SQUARE FEET      0
LAND SQUARE FEET       0
GROUPED CATEGORY       0
LATITUDE             814
LONGITUDE            814
SALE PRICE             0
dtype: int64

In [63]:
# Keep only the rows where both coordinates are present
df = df[df[['LATITUDE', 'LONGITUDE']].notna().all(axis=1)]

# Re-count missing values per column to confirm nothing is left
df.isna().sum()

BOROUGH CODE         0
GROSS SQUARE FEET    0
LAND SQUARE FEET     0
GROUPED CATEGORY     0
LATITUDE             0
LONGITUDE            0
SALE PRICE           0
dtype: int64

## Encode using a `scikit-learn` encoder

In [64]:
# Columns to one-hot encode and columns to standardise
cols_to_encode = ['BOROUGH CODE', 'GROUPED CATEGORY']

# NOTE(review): 'SALE PRICE' (the target) is standardised together with the
# features. The errors reported later are therefore in standardised units, and
# the saved preprocessor expects a 'SALE PRICE' column in its input. The
# scaler is also fitted on all rows before the train/test split.
cols_to_scale = ['GROSS SQUARE FEET',
                 'LAND SQUARE FEET',
                 'LATITUDE',
                 'LONGITUDE',
                 'SALE PRICE']

# Initialize the transformers
scaler = StandardScaler()

# Dense (non-sparse) one-hot output. Newer scikit-learn releases name the
# keyword `sparse_output` and no longer accept `sparse`; try the new name
# first and fall back so that both old and new installations work.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Define the preprocessor: scaled columns first, then the one-hot columns
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', scaler, cols_to_scale),
        ('ohe', ohe, cols_to_encode)])

# Fit on `df` and transform it in one step
df_processed = preprocessor.fit_transform(df)

# Names of the one-hot columns. `get_feature_names_out` replaces
# `get_feature_names` in newer scikit-learn; use whichever is available.
fitted_ohe = preprocessor.named_transformers_['ohe']
if hasattr(fitted_ohe, 'get_feature_names_out'):
    ohe_feature_names = list(fitted_ohe.get_feature_names_out(cols_to_encode))
else:
    ohe_feature_names = list(fitted_ohe.get_feature_names(input_features=cols_to_encode))

# Output column order matches the transformer order above
feature_names = cols_to_scale + ohe_feature_names

# Convert the array back into a DataFrame (this gets a fresh 0..n-1 index)
df_processed = pd.DataFrame(df_processed, columns=feature_names)

# Drop rows with NaN values
df_processed = df_processed.dropna()

df_encoded = df_processed


## Split the data into features and target

In [66]:
# Separate the target column from the predictors
y = df_encoded['SALE PRICE']
X = df_encoded.drop(columns=['SALE PRICE'])

# Hold out 20% of the rows for testing; the seed is fixed for repeatable splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# No scaling happens here: the columns were already standardised by the
# preprocessor, and the `*_scaled` names are plain aliases for the split frames.
X_train_scaled, X_test_scaled = X_train, X_test

# Shapes of the two feature matrices
(X_train_scaled.shape, X_test_scaled.shape)

((15792, 13), (3949, 13))

## Define random forest model

In [67]:
# Build and train the random forest in one step (`fit` returns the estimator)
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
).fit(X_train_scaled, y_train)

# Predict on the training rows and on the held-out rows
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

# Mean absolute error for each split. 'SALE PRICE' was standardised earlier,
# so these values are in standardised units rather than dollars.
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

(mae_train, mae_test)

(0.35992026281957035, 0.4265471961128181)

## Package up the model

In [68]:
# Dump the model to a shared docker volume...
# Create the target directory first: `joblib.dump` does not create missing
# directories, so the call would fail when `model/` is absent.
os.makedirs('model', exist_ok=True)
joblib.dump(model, 'model/model.joblib')

['model/model.joblib']

In [69]:
# Make sure the `model/` directory exists before writing into it;
# `joblib.dump` does not create missing directories.
os.makedirs('./model', exist_ok=True)

# Copies in the working directory
joblib.dump(model, 'model.joblib')
joblib.dump(preprocessor, 'preprocessor.joblib')

# Copies inside the `model/` directory; the fitted preprocessor is stored next
# to the model it was fitted alongside.
joblib.dump(model, './model/model.joblib')
joblib.dump(preprocessor, './model/preprocessor.joblib')

['./model/preprocessor.joblib']