In [19]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from geopy.geocoders import Nominatim
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
color = sns.color_palette()
pd.options.mode.chained_assignment = None  # default='warn'

In [20]:
df_experiment = pd.read_csv("experiment.csv") 

In [21]:
df_experiment.head(2)

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AS OF FINAL ROLL 18/19,BLOCK,LOT,BUILDING CLASS AS OF FINAL ROLL 18/19,ADDRESS,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,AREA,FINAL_ADDRESS
0,3,FLATBUSH-LEFFERTS GARDEN,01 ONE FAMILY DWELLINGS,1,5050,2,S0,625 ROGERS AVENUE,11225.0,1.0,2.0,3.0,3700.0,2128.0,1905.0,1,S0,1925000,2019-02-05,brooklyn,625 ROGERS AVENUE NYC
1,4,HOWARD BEACH,01 ONE FAMILY DWELLINGS,1,14069,139,A1,85-11 164TH AVENUE,11414.0,1.0,0.0,1.0,4000.0,2192.0,1965.0,1,A1,730000,2019-07-02,queens,85-11 164TH AVENUE NYC


In [22]:
df_experiment_lat_lng = pd.read_csv("experiment_with_lat_and_lng.csv") 

In [23]:
df_experiment_lat_lng.head(2)

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AS OF FINAL ROLL 18/19,BLOCK,LOT,BUILDING CLASS AS OF FINAL ROLL 18/19,ADDRESS,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,AREA,FINAL_ADDRESS,lat,lng
0,3,FLATBUSH-LEFFERTS GARDEN,01 ONE FAMILY DWELLINGS,1,5050,2,S0,625 ROGERS AVENUE,11225.0,1.0,2.0,3.0,3700.0,2128.0,1905.0,1,S0,1925000,2019-02-05,brooklyn,625 ROGERS AVENUE NYC,40.656321,-73.952859
1,4,HOWARD BEACH,01 ONE FAMILY DWELLINGS,1,14069,139,A1,85-11 164TH AVENUE,11414.0,1.0,0.0,1.0,4000.0,2192.0,1965.0,1,A1,730000,2019-07-02,queens,85-11 164TH AVENUE NYC,40.650046,-73.844749


In [24]:
cat = ['NEIGHBORHOOD', 'BUILDING CLASS CATEGORY', 'BUILDING CLASS AS OF FINAL ROLL 18/19', 'ADDRESS',
           'BUILDING CLASS AT TIME OF SALE', 'SALE DATE', 'AREA', 'FINAL_ADDRESS']

for col in cat:
    lb = LabelEncoder()
    df_experiment[col] = lb.fit_transform(df_experiment[col].values)

df_experiment['TAX CLASS AS OF FINAL ROLL 18/19'] = df_experiment['TAX CLASS AS OF FINAL ROLL 18/19']\
    .map({'2A': 2, '1': 1, '4': 4})

In [25]:
cat = ['NEIGHBORHOOD', 'BUILDING CLASS CATEGORY', 'BUILDING CLASS AS OF FINAL ROLL 18/19', 'ADDRESS',
           'BUILDING CLASS AT TIME OF SALE', 'SALE DATE', 'AREA', 'FINAL_ADDRESS']

for col in cat:
    lb = LabelEncoder()
    df_experiment_lat_lng[col] = lb.fit_transform(df_experiment_lat_lng[col].values)

df_experiment_lat_lng['TAX CLASS AS OF FINAL ROLL 18/19'] = df_experiment_lat_lng['TAX CLASS AS OF FINAL ROLL 18/19']\
    .map({'2A': 2, '1': 1, '4': 4})

In [26]:
df_experiment.head(2)

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AS OF FINAL ROLL 18/19,BLOCK,LOT,BUILDING CLASS AS OF FINAL ROLL 18/19,ADDRESS,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,AREA,FINAL_ADDRESS
0,3,20,0,1,5050,2,11,77,11225.0,1.0,2.0,3.0,3700.0,2128.0,1905.0,1,11,1925000,9,1,77
1,4,29,0,1,14069,139,1,90,11414.0,1.0,0.0,1.0,4000.0,2192.0,1965.0,1,1,730000,37,2,90


In [27]:
df_experiment_lat_lng.head(2)

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AS OF FINAL ROLL 18/19,BLOCK,LOT,BUILDING CLASS AS OF FINAL ROLL 18/19,ADDRESS,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,AREA,FINAL_ADDRESS,lat,lng
0,3,20,0,1,5050,2,11,77,11225.0,1.0,2.0,3.0,3700.0,2128.0,1905.0,1,11,1925000,9,1,77,40.656321,-73.952859
1,4,29,0,1,14069,139,1,90,11414.0,1.0,0.0,1.0,4000.0,2192.0,1965.0,1,1,730000,37,2,90,40.650046,-73.844749


In [28]:
for col in cat:
    minmax = MinMaxScaler()
    df_experiment[col] = minmax.fit_transform(df_experiment[col].values.reshape(-1, 1))

In [29]:
for col in cat:
    minmax = MinMaxScaler()
    df_experiment_lat_lng[col] = minmax.fit_transform(df_experiment_lat_lng[col].values.reshape(-1, 1))

In [34]:
df_train, df_test = train_test_split(df_experiment, test_size=0.2, random_state=42, shuffle=True)
train_target = df_train['SALE PRICE']
train = df_train.drop(['SALE PRICE'], axis=1)
test_target = df_test['SALE PRICE']
test = df_test.drop(['SALE PRICE'], axis=1)

In [35]:
rf = RandomForestRegressor(n_estimators=300, verbose=True, max_depth=20, n_jobs=-1)
rf.fit(train, train_target)
res_pred = rf.predict(test)
rms = np.sqrt(mean_squared_error(test_target, res_pred))
print("RMS: %f" % rms)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.5s finished


RMS: 365917.234244


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed:    0.1s finished


In [36]:
df_train, df_test = train_test_split(df_experiment_lat_lng, test_size=0.2, random_state=42, shuffle=True)
train_target = df_train['SALE PRICE']
train = df_train.drop(['SALE PRICE'], axis=1)
test_target = df_test['SALE PRICE']
test = df_test.drop(['SALE PRICE'], axis=1)

In [37]:
rf = RandomForestRegressor(n_estimators=300, verbose=True, max_depth=20, n_jobs=-1)
rf.fit(train, train_target)
res_pred = rf.predict(test)
rms = np.sqrt(mean_squared_error(test_target, res_pred))
print("RMS: %f" % rms)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.5s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.


RMS: 344383.649461


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed:    0.1s finished
