In [1]:
#This notebook uses a dataset with house sale amounts and dates
#to create a model which can predict any house's sale price using its characteristics

#The final model has an accuracy of ___ and an ROC of ___

In [2]:
#IMPORT LIBRARIES

In [3]:
import os; import pandas as pd; import numpy as np

In [4]:
#LOAD DATA

In [5]:
master = pd.read_csv('nyc-rolling-sales.csv') #dataset with 8 variables, target is called 'SALE PRICE'

backup = master

In [6]:
#START RUNNING CODE FROM HERE

master = backup
master.shape

(84548, 22)

In [7]:
#remove INF and NaNs, replace with special value            #EDIT: no missing values but lots of garbage

master = master.replace([np.inf, -np.inf, ' -  '], np.nan)
master = master.dropna()
master.shape

(48244, 22)

In [8]:
#reduce date to just year                       #split out time, then split out month and year

master['SALE YEAR'] = master['SALE DATE'].apply(lambda x: x.split()[0].split('/')[2])
master['SALE MONTH'] = master['SALE DATE'].apply(lambda x: x.split()[0].split('/')[0])

In [9]:
#check year and month, drop date

master = master.drop(columns='SALE DATE')
master.shape

(48244, 23)

In [10]:
#reduce number of rows           #NOTE: lots of garbage values in SALE PRICE, need to be cleaned out

master = master[pd.to_numeric(master['SALE PRICE'], errors='coerce') > 25000]
master.shape

(36677, 23)

In [11]:
# convert target (SALE PRICE) to int64 column

master['SALE PRICE'] = master['SALE PRICE'].astype('int64')

In [12]:
#reduce number of rows           #just take sample

master = master.sample(frac=0.2)
master.shape

(7335, 23)

In [13]:
#IDENTIFY RELEVANT COLUMNS

data = master.columns.drop(['new_id', 'SALE PRICE'])
dataset = master[data]
dataset.shape

(7335, 21)

In [14]:
#Inspect columns for str values

# for col in dataset.columns:
#     print(dataset[col].unique())

In [15]:
#use dummy encoder to convert categorical variables to indicators

nonNumeric = data.drop(dataset[data].select_dtypes('number').columns)

dataNon = dataset[nonNumeric]

dataDummy = pd.get_dummies(dataNon)

In [16]:
#replace categorical variables with indicator variables 

dataset = dataset.drop(columns=nonNumeric)

dataset[dataDummy.columns] = dataDummy

In [17]:
#update data then proceed to model

data = dataset.columns
dataset.shape

(7335, 11656)

In [18]:
#SPLIT INTO TRAIN AND TEST

from sklearn.model_selection import train_test_split

trainData, testData, trainTarget, testTarget = train_test_split(dataset, master['SALE PRICE'], 
                                                                test_size = 0.4, random_state = 42)


In [19]:
#sanity check

trainTarget.dtypes

dtype('int64')

In [20]:
#   4-STEP MODELLING PROCESS: IMPORT WHICH MODEL, MAKE INSTANCE OF MODEL, TRAIN USING FIT, PREDICT LABELS OF TESTDATA

In [21]:
from xgboost import XGBRegressor

In [22]:
model = XGBRegressor(learning_rate=0.01)

In [23]:
model.fit(trainData, trainTarget)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [24]:
#perform feature reduction based on feature importances

from sklearn.feature_selection import SelectFromModel

thresh = -np.sort(-model.feature_importances_)[69]
selection = SelectFromModel(model, threshold=thresh, prefit=True)
selected = selection.transform(trainData)

In [25]:
#build new model on reduced features

newModel = XGBRegressor(learning_rate=0.01)
newModel.fit(selected, trainTarget)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [26]:
#predict prices

prediction = newModel.predict(selection.transform(testData))
prediction

array([718511.2, 718511.2, 718511.2, ..., 718511.2, 718511.2, 718511.2],
      dtype=float32)

In [27]:
#EVALUATE MODEL ON TRAIN

newModel.score(selection.transform(trainData), trainTarget)

0.6193838494429122

In [28]:
#EVALUATE MODEL ON TEST

newModel.score(selection.transform(testData), testTarget)

-0.6413286125305315

In [29]:
#error percentage

((prediction - testTarget)/testTarget * 100).abs().mean()

110.72121299030194

In [33]:
#side-by-side comparison

from sklearn.metrics import median_absolute_error as scr

scr(testTarget, prediction)

293511.1875

In [31]:
from sklearn.metrics import mean_squared_log_error as scor
scor(testTarget, prediction)            #multioutput not necessary

0.7475189041324342

In [35]:
from sklearn.metrics import explained_variance_score as scorev
scorev(testTarget, prediction)            #multioutput not necessary

-0.6364511330774003