In [2]:
# PREPROCESSING & DATA LOADING
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


# Path of the file to read. Forward slashes are accepted by Python on Windows,
# macOS and Linux alike; a backslash-separated path only resolves on Windows.
file_path = '../data/data-clean.csv'

# Use the ADDRESS column as the row index, then discard every row that has a
# missing value in ANY column (not only the columns selected as features below).
sales_data = pd.read_csv(file_path, index_col='ADDRESS')
sales_data = sales_data.dropna(axis=0)

# Create target object and call it y
y = sales_data.SELL_PRICE
# Create X from the selected feature columns
features = ['BDRMS','SIZE_SQFT', 'AGE', 'TAX_AMT', 'LOT_FRNT', 'LOT_DPTH', 'LAT', 'LON']
X = sales_data[features]

# Split into validation and training data; random_state pins the shuffle so
# the split is the same on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

print("Preprocessing complete.")


Preprocessing complete.


In [3]:
# DEFINING THE MODEL(S)
# Every MAE below is computed as mean_absolute_error(y_true, y_pred) so the
# three models are scored with the same call convention.

# Baseline: decision tree with no limit on the number of leaves
prediction_model = DecisionTreeRegressor(random_state=1)
# Fit Model
prediction_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error
val_predictions = prediction_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE when not specifying max_leaf_nodes: {:,.0f}".format(val_mae))

# Decision tree capped at 100 leaves.
# NOTE(review): 100 is described as the best value for max_leaf_nodes, but the
# search that produced it is not part of this cell - confirm it is still valid
# for the current data.
prediction_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
prediction_model.fit(train_X, train_y)
val_predictions = prediction_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE for best value of max_leaf_nodes: {:,.0f}".format(val_mae))

# Define the random forest model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)
# Fit the forest on the training split
rf_model.fit(train_X, train_y)

preds = rf_model.predict(val_X)
# Calculate the mean absolute error of the Random Forest model on the validation data
rf_val_mae = mean_absolute_error(val_y, preds)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

Validation MAE when not specifying max_leaf_nodes: 89,180
Validation MAE for best value of max_leaf_nodes: 82,840
Validation MAE for Random Forest Model: 71,835


In [4]:
#CALCULATING PERMUTATION IMPORTANCE
import numpy as np
import eli5
from eli5.sklearn import PermutationImportance

# Fit permutation importance for the already-trained random forest on the
# validation split; random_state keeps the result repeatable.
perm = PermutationImportance(rf_model, random_state=1).fit(val_X, val_y)

# Render the weights table, labelling each row with the validation frame's own
# column names.
eli5.show_weights(perm, feature_names = val_X.columns.tolist())


Weight,Feature
1.2160  ± 0.2913,TAX_AMT
0.0800  ± 0.1197,SIZE_SQFT
0.0193  ± 0.0067,LOT_FRNT
0.0086  ± 0.0117,LON
0.0007  ± 0.0049,AGE
-0.0011  ± 0.0027,BDRMS
-0.0050  ± 0.0139,LAT
-0.0058  ± 0.0046,LOT_DPTH


In [5]:
#DEALING WITH MISSING VALUES
from sklearn.impute import SimpleImputer

# Imputation
# NOTE(review): the loading cell already calls dropna() on the whole frame, so
# train_X / val_X should contain no missing values and this step looks like a
# no-op on the current data - confirm whether the dropna or the imputation is
# the intended strategy.
my_imputer = SimpleImputer()

# The imputer returns plain arrays, which lose both the column labels and the
# row index. Rebuild the frames with both, so the imputed features keep the
# same row labels as train_y / val_y.
imputed_train_X = pd.DataFrame(my_imputer.fit_transform(train_X),
                               columns=train_X.columns,
                               index=train_X.index)
# Only transform (never fit) on the validation split.
imputed_val_X = pd.DataFrame(my_imputer.transform(val_X),
                             columns=val_X.columns,
                             index=val_X.index)

# Function for comparing different approaches
def score_dataset(train_X, val_X, train_y, val_y):
    """Return the validation MAE of a random forest trained on the given split.

    A fresh RandomForestRegressor (random_state=1) is fitted on
    ``train_X`` / ``train_y``; its predictions for ``val_X`` are compared
    against ``val_y`` with mean_absolute_error.
    """
    forest = RandomForestRegressor(random_state=1)
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Score the imputed feature frames against the original (untouched) targets.
imputed_MAE = score_dataset(
    train_X=imputed_train_X,
    val_X=imputed_val_X,
    train_y=train_y,
    val_y=val_y,
)
print("Validation MAE from imputation: {:,.0f}".format(imputed_MAE))


Validation MAE from imputation: 71,835


In [6]:
# APPLYING CROSS-VALIDATION USING K-FOLDS
from sklearn.model_selection import cross_val_score

# The 'neg_mean_absolute_error' scorer yields *negative* MAE values, so the
# per-fold results are multiplied by -1 to read as ordinary (positive) errors.
neg_mae_per_fold = cross_val_score(
    rf_model,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
scores = -1 * neg_mae_per_fold

print("MAE scores:\n", scores)

# Mean of the five fold errors
print("Average MAE score (across experiments): {:,.0f}".format(scores.mean()))
 

MAE scores:
 [130237.5        100522.58333333 104428.33333333  62409.90909091
  74701.90909091]
Average MAE score (across experiments): 94,460


In [8]:
#USING XGBOOST
from xgboost import XGBRegressor

# Gradient-boosted model: up to 1000 rounds at a learning rate of 0.05.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)

# NOTE(review): early stopping is monitored on (val_X, val_y), the same split
# the MAE below is reported on, so that figure is not an independent estimate.
# Consider carving out a separate early-stopping set.
# NOTE(review): newer xgboost releases appear to expect early_stopping_rounds
# on the XGBRegressor constructor rather than on fit() - confirm against the
# installed version before upgrading.
my_model.fit(train_X, train_y, 
             early_stopping_rounds=5, 
             eval_set=[(val_X, val_y)],
             verbose=False)

predictions = my_model.predict(val_X)

# Argument order is (y_true, y_pred), matching how the other models are scored.
print("Mean Absolute Error: " + str(mean_absolute_error(val_y, predictions)))


Mean Absolute Error: 89378.76666666666
