## Import dependencies

In [51]:
# HELPERS
import helpers
import importlib
importlib.reload(helpers)

# OPERATING SYSTEM STUFF
import os
import io
import gc

# BASIC STUFF
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# MACHINE LEARNING
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# MODEL PACKAGING
import joblib

# API STUFF
import xlrd
import requests
import json

# SQL STUFF
import time
from sqlalchemy import create_engine, text, String, Integer, Float, Boolean, MetaData, Table, select
from sqlalchemy.exc import ProgrammingError # ProgrammingError catches SQL write exceptions
from sqlalchemy.sql import and_

# GEOCODING STUFF
from geopy.geocoders import GoogleV3

# CONFIGURATION STUFF
import config
importlib.reload(config)
pd.set_option('display.float_format', '{:.6f}'.format)

# OTHER STUFF
from tqdm.notebook import tqdm

## Create databases / database connections

In [52]:
# Ask SQLAlchemy to stay quiet about its 2.0-migration warnings.
# NOTE(review): sqlalchemy is already imported by the import cell; these
# variables are presumably read at import time, so confirm they still take
# effect when set here.
os.environ.update({
    'SQLALCHEMY_WARN_20': '0',
    'SQLALCHEMY_SILENCE_UBER_WARNING': '1',
})

# Database credentials and database name, all read from the `config` module
username, password, hostname, database_name = (
    config.DB_USERNAME,
    config.DB_PASSWORD,
    config.DB_HOSTNAME,
    config.DB_NAME,
)

# Names of the SQL tables used by the cells below
geocodes_sql_table_name, sales_sql_table_name = 'geocodes', 'sales'

In [53]:
# Attempt to establish a connection to the database server
engine = helpers.connect_to_database(username, password, hostname)

# The setup below only runs when a connection object came back.
if engine is not None:
    # See file `helpers.py` for function documentation
    engine = helpers.create_database(engine, database_name)
    helpers.silence_warnings()

    # Use the shared table-name variable rather than repeating the literal,
    # so this cell cannot drift from the cells that read/append the table.
    helpers.create_table_from_csv(
        engine, geocodes_sql_table_name, 'geocodes_export_backup.csv')
    helpers.add_primary_key(
        engine, geocodes_sql_table_name, 'PRIMARY_KEY')
    # The last argument is a SQL expression built from BOROUGH, '_' and ADDRESS
    # (presumably concatenated by the helper — confirm in helpers.py).
    helpers.set_primary_key(
        engine, geocodes_sql_table_name, 'PRIMARY_KEY', "`BOROUGH`, '_', `ADDRESS`")

Database connection established successfully.
Table geocodes already exists
Table 'geocodes' created from csv 'geocodes_export_backup.csv' successfully, or already exists.
Column PRIMARY_KEY already exists in table geocodes.
PRIMARY_KEY column values set in table geocodes.


## Download new sales data from NYC

In [54]:
# Download every NYC sales workbook listed in `helpers.dataURLs` into a list of
# DataFrames, skipping the first 4 rows of each sheet.
# A comprehension is used so no temporary `df` / `url` names are left behind in
# the notebook namespace (`df` is reused further down for the modelling frame).
data = [
    pd.read_excel(url, skiprows=4, engine="openpyxl")
    for url in helpers.dataURLs
]

In [55]:
# Hand the list of downloaded DataFrames to the project helper; the result is
# used as a single DataFrame (`combined`) by every cell that follows.
combined = helpers.combineHousingDataSets(data)

In [56]:
# Lookup from numeric borough code (1-5) to borough name
borough_mapping = dict(enumerate(
    ["MANHATTAN", "BRONX", "BROOKLYN", "QUEENS", "STATEN ISLAND"],
    start=1,
))

# The incoming 'BOROUGH' column holds the numeric codes, so give it an
# explicit name before adding the readable one.
combined = combined.rename(columns={"BOROUGH": "BOROUGH CODE"})

# Translate each code to its name (codes missing from the lookup become NaN)
borough = combined["BOROUGH CODE"].map(borough_mapping)

# Place the readable names in a new 'BOROUGH' column at column position 1
combined.insert(
    loc=1,
    column="BOROUGH",
    value=borough,
)

## Filter new sales data

In [57]:
# Remove rows whose address contains the literal text 'N/A'.
# `regex=False` makes this a plain substring test, and `na=True` marks a
# missing address as a match so those rows are dropped as well (with the
# default, NaN entries stay in the mask and it cannot be negated with `~`).
has_bad_address = combined['ADDRESS'].str.contains('N/A', regex=False, na=True)
combined = combined[~has_bad_address]

# Define thresholds for "close to zero"
thresholds = {
    'SALE PRICE': 100000,
    'GROSS SQUARE FEET': 100,
    'LAND SQUARE FEET': 100
}

# Filter outliers.
# NOTE(review): 0.15 and 0.99 look like lower/upper quantile bounds — confirm
# against helpers.filterOutliers.
combined = helpers.filterOutliers(combined, thresholds, 0.15, 0.99)

In [58]:
# Plot new distributions for sanity check
# NOTE(review): the plotting code below is disabled by wrapping it in a string
# literal, so this cell only evaluates (and echoes) that string. It refers to
# `cols_to_check` and `data_clean`, which are not defined anywhere in the
# visible notebook, and contains a stray `x` line that looks like a typo; both
# would need fixing before the code is re-enabled.
"""
# Create histograms for each column
fig, axs = plt.subplots(1, len(cols_to_check), figsize=(15, 5))
x
# Create histograms for each column
for i, col in enumerate(cols_to_check):
    axs[i].hist(data_clean[col].dropna(), bins=30, edgecolor='black')
    axs[i].set_title(f'{col}')

# Tight layout
plt.tight_layout()
plt.show()
"""

"\n# Create histograms for each column\nfig, axs = plt.subplots(1, len(cols_to_check), figsize=(15, 5))\nx\n# Create histograms for each column\nfor i, col in enumerate(cols_to_check):\n    axs[i].hist(data_clean[col].dropna(), bins=30, edgecolor='black')\n    axs[i].set_title(f'{col}')\n\n# Tight layout\nplt.tight_layout()\nplt.show()\n"

In [59]:
# Write the contents of `combined` to the `sales` SQL table, replacing any
# existing table of that name.
# FIXME: Is this necessary? Can we do away with the sales table SQL stuff
# The engine is handed straight to pandas, which manages its own connection,
# so no separate `engine.connect()` block is opened around the call.
combined.to_sql(sales_sql_table_name, con=engine, index=False, if_exists='replace')

## Update geocodes table

### Set up tables for comparison

In [60]:
# Upper bound on how many not-yet-geocoded rows are sent to the geocoder in a
# single run. Set to None to geocode every missing row.
geocode_row_limit = 5

# The helper receives the engine itself, so no separate connection needs to be
# opened around the call.
missing_rows = helpers.check_missing_rows(combined, geocodes_sql_table_name, engine)

# The helper's result is compared against the sentinel `False` here, so it is
# only sliced once we know it is not that sentinel (calling `.head()` on
# `False` would raise AttributeError).
if missing_rows is not False:
    if geocode_row_limit is not None:
        missing_rows = missing_rows.head(geocode_row_limit)

    # Registers `progress_apply` on pandas objects
    tqdm.pandas()
    try:
        missing_rows = missing_rows.progress_apply(
            lambda row: helpers.geolocate(row, config.GOOGLE_API_KEY), axis=1)
    except ValueError as err:
        print(err)
        print("We'll work with old data for now...")
        # Keep the columns but discard every row, so nothing new is appended
        missing_rows = missing_rows.drop(missing_rows.index)

  0%|          | 0/5 [00:00<?, ?it/s]

In [61]:
# Keep only the first row seen for each PRIMARY_KEY so that duplicate keys are
# not carried into the append below.
missing_rows = missing_rows.drop_duplicates(subset='PRIMARY_KEY', keep='first')

In [62]:
# Append the newly geocoded rows to the SQL geocodes table.
# The engine is passed straight to pandas, which manages its own connection,
# so no separate `engine.connect()` block is opened around the call.
missing_rows.to_sql(geocodes_sql_table_name, con=engine, if_exists='append', index=False)

In [63]:
# Test to see if the append worked.
# A ValueError is raised when the helper reports that the local rows are NOT a
# subset of the SQL geocodes table, i.e. when the append did not work.
with engine.connect() as connection:
    # Moves the current index into a column (drop=False) and installs a fresh
    # default index, modifying `missing_rows` in place.
    missing_rows.reset_index(drop=False, inplace=True)
    if not helpers.is_local_sql_subset(connection, missing_rows, geocodes_sql_table_name):
        # Adjacent literals are joined at compile time; the previous backslash
        # continuation embedded the next line's indentation in the message.
        raise ValueError(
            "Error appending local geocode data to SQL table. "
            "Local geocode table not a subset of SQL geocode table."
        )

In [64]:
# Pull the full geocodes table back down from SQL.
# The query runs through the engine directly, so no separate
# `engine.connect()` block is opened around it.
geocodes_table_response = pd.read_sql_query(
    f"SELECT * FROM {geocodes_sql_table_name}", engine
)

In [65]:
# Build the "<BOROUGH>_<ADDRESS>" key for every sale
borough_part = combined['BOROUGH'].astype(str)
address_part = combined['ADDRESS'].astype(str)
combined['PRIMARY_KEY'] = borough_part + "_" + address_part

# Attach coordinates from the geocodes table; a left join keeps every sale,
# leaving LATITUDE/LONGITUDE empty where no geocode exists for the key.
coordinates = geocodes_table_response[['PRIMARY_KEY', 'LATITUDE', 'LONGITUDE']]
combined = combined.merge(coordinates, how='left', on='PRIMARY_KEY')

## Build mapping between NYC and Zillow housing categories

In [66]:
# Lookup used to translate 'BUILDING CLASS CATEGORY' values into the grouped
# category (defined in helpers.py)
category_mapping = helpers.category_mapping

# Create the new column from the lookup
combined['GROUPED CATEGORY'] = combined['BUILDING CLASS CATEGORY'].map(category_mapping)

# Categories that could not be mapped show up as missing values in the new
# column; drop those rows and say so.
if combined['GROUPED CATEGORY'].isna().any():
    combined = combined.dropna(subset=['GROUPED CATEGORY'])
    print("Warning: some categories could not be mapped, those rows were dropped.")

# Snapshot of the frame that feeds the modelling cells
combined.to_csv('for_model.csv', index=False)



## Choosing features

In [67]:
# Columns carried into the modelling frame ('ZIP CODE' is currently disabled)
selected_features = [
    'BOROUGH CODE',
    # 'ZIP CODE',
    'GROSS SQUARE FEET',
    'LAND SQUARE FEET',
    'GROUPED CATEGORY',
    'LATITUDE',
    'LONGITUDE',
    'SALE PRICE',
]

# Restrict `combined` to exactly those columns, in that order
df = combined.loc[:, selected_features]

# To inspect missing values per column: df.isnull().sum()

In [68]:
# Keep only rows where both LATITUDE and LONGITUDE are present
has_coordinates = df[['LATITUDE', 'LONGITUDE']].notna().all(axis=1)
df = df[has_coordinates]

# To re-inspect missing values per column: df.isnull().sum()

## Encode using a `scikit-learn` encoder

In [69]:
# Columns to one-hot encode
cols_to_encode = ['BOROUGH CODE', 'GROUPED CATEGORY']

# Columns to standardise.
# NOTE(review): the target 'SALE PRICE' is standardised together with the
# features, so the model below is fitted on scaled prices and the fitted
# preprocessor expects a 'SALE PRICE' column as input. Confirm this is intended.
cols_to_scale = ['GROSS SQUARE FEET',
                 'LAND SQUARE FEET',
                 'LATITUDE',
                 'LONGITUDE',
                 'SALE PRICE']

# Initialize the transformers
scaler = StandardScaler()
# The dense-output switch is called `sparse_output` in current scikit-learn and
# `sparse` in older releases; try the current name first and fall back.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Define the preprocessor: scaled columns come first, encoded columns second
preprocessor = ColumnTransformer(
    transformers=[
        ('scale', scaler, cols_to_scale),
        ('ohe', ohe, cols_to_encode)])

# Fit on `df` and transform it in one step
df_processed = preprocessor.fit_transform(df)

# Get the feature names after one-hot encoding. `get_feature_names_out` is the
# current API; `get_feature_names` only exists in older scikit-learn releases.
fitted_ohe = preprocessor.named_transformers_['ohe']
if hasattr(fitted_ohe, 'get_feature_names_out'):
    ohe_feature_names = list(fitted_ohe.get_feature_names_out(cols_to_encode))
else:
    ohe_feature_names = list(fitted_ohe.get_feature_names(input_features=cols_to_encode))

# Combine the feature names in the same order the ColumnTransformer emits them
feature_names = cols_to_scale + ohe_feature_names

# Convert the array back into a DataFrame (this gets a fresh default index)
df_processed = pd.DataFrame(df_processed, columns=feature_names)

# Drop rows with NaN values
df_processed = df_processed.dropna()

# To preview the result: df_processed.head()

df_encoded = df_processed


## Split the data into features and target

In [70]:
# Separate the target column from the predictor columns
y = df_encoded['SALE PRICE']
X = df_encoded.drop(columns='SALE PRICE')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

# No scaling is applied in this cell: the "_scaled" names are plain aliases of
# the split frames.
X_train_scaled, X_test_scaled = X_train, X_test

# To check the shapes: X_train_scaled.shape, X_test_scaled.shape

## Define random forest model

In [71]:
# Random forest hyper-parameters (fixed seed for repeatable fits)
forest_params = dict(n_estimators=200, max_depth=10, random_state=42)

# Build and fit in one expression; `fit` returns the estimator itself
model = RandomForestRegressor(**forest_params).fit(X_train_scaled, y_train)

# Predictions for both partitions
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Mean absolute error on each partition, in the units of `y`
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

# To view the scores: mae_train, mae_test

## Package up the model

In [72]:
# Persist the fitted model and preprocessor in two places: the working
# directory and ./model (described in the original notes as a shared docker
# volume). Each artifact is written once per destination.

# Make sure the target directory exists so the dumps below cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs('model', exist_ok=True)

# Copies in the working directory
joblib.dump(model, 'model.joblib')
joblib.dump(preprocessor, 'preprocessor.joblib')

# Copies in ./model
joblib.dump(model, './model/model.joblib')
joblib.dump(preprocessor, './model/preprocessor.joblib')

['./model/preprocessor.joblib']

In [76]:
# Export the coordinates and sale price of the modelling rows to a CSV file,
# without writing the index as a column.
df[['LATITUDE', 'LONGITUDE', 'SALE PRICE']].to_csv('listings_with_price.csv', index=False)

In [75]:
# Display the modelling frame as the cell output
df

Unnamed: 0,BOROUGH CODE,GROSS SQUARE FEET,LAND SQUARE FEET,GROUPED CATEGORY,LATITUDE,LONGITUDE,SALE PRICE
0,1,4400.000000,2116.000000,Single-family home,40.721665,-73.978312,399000
1,1,2790.000000,1503.000000,Duplex,40.724210,-73.978491,2999999
2,1,8625.000000,2204.000000,Apartment,40.721688,-73.979215,16800000
3,1,8625.000000,2204.000000,Apartment,40.721631,-73.979227,16800000
4,1,9750.000000,2302.000000,Apartment,40.723224,-73.978226,158822
...,...,...,...,...,...,...,...
24405,5,1760.000000,2379.000000,Duplex,40.537036,-74.218932,695000
24406,5,2400.000000,3147.000000,Duplex,40.536586,-74.222383,625000
24407,5,2400.000000,3147.000000,Duplex,40.536586,-74.222383,815000
24408,5,1176.000000,4600.000000,Duplex,40.535577,-74.218552,975000
