In [36]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [37]:
# DEFINE THE DATASET
# Load the cleaned sales CSV (indexed by ADDRESS), keep only numeric columns,
# and drop every row that still has a missing value.
# The path uses forward slashes so it resolves on Linux/macOS as well as on
# Windows (backslash separators only work on Windows).
sales_data = (
    pd.read_csv('../data/sales-data-clean.csv', index_col='ADDRESS')
    .select_dtypes(include=[np.number])
    .dropna(axis=0)
)

# Optional: drop columns by name before modelling
#sales_data = sales_data.drop(['LIST_PRICE', 'DOM'], axis=1)

# define X and y
# Alternative: hand-picked feature subset
#X = sales_data[['LIST_PRICE', 'TAX_AMT', 'ASS_AMT', 'SIZE_SQFT', 'LOT_DPTH', 'BDRMS', 'LON', 'LAT']]
X = sales_data.loc[:, sales_data.columns != 'SELL_PRICE']  # every numeric column except the target

y = sales_data['SELL_PRICE']  # target column: sale price

# Hold out 20% of the rows for validation; fixed seed keeps the split reproducible
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=1)

print("Preprocessing complete.")

Preprocessing complete.


In [38]:
# DEFINING THE MODEL(S)
# Baseline: decision tree with no limit on the number of leaves
prediction_model = DecisionTreeRegressor(random_state=1)
prediction_model.fit(train_X, train_y)

# Validation MAE for the baseline tree.
# mean_absolute_error takes (y_true, y_pred); MAE is symmetric so the value is
# the same either way, but the order now matches the random-forest call below.
val_predictions = prediction_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE when not specifying max_leaf_nodes: {:,.0f}".format(val_mae))

# Decision tree capped at 100 leaves.
# NOTE(review): 100 is labelled the "best" max_leaf_nodes, but the search that
# chose it is not shown here, and the recorded run reports a higher MAE for
# this tree than for the unconstrained one — re-check the value.
prediction_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
prediction_model.fit(train_X, train_y)
val_predictions = prediction_model.predict(val_X)
val_mae = mean_absolute_error(val_y, val_predictions)
print("Validation MAE for best value of max_leaf_nodes: {:,.0f}".format(val_mae))

# Random forest with a fixed seed (random_state=1) for reproducibility
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)

# Validation MAE for the random forest
preds = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, preds)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

Validation MAE when not specifying max_leaf_nodes: 67,875
Validation MAE for best value of max_leaf_nodes: 70,158
Validation MAE for Random Forest Model: 41,930


In [39]:
#CALCULATING PERMUTATION IMPORTANCE
import numpy as np
import eli5
from eli5.sklearn import PermutationImportance

# Names of the int64-typed columns in the cleaned dataset.
# Note: this list is not passed to eli5 below; the weights table is labelled
# from val_X's own column names instead.
feature_names = [
    col
    for col in sales_data.columns
    if sales_data[col].dtype in [np.int64]
]

# Permutation importance of the fitted random forest, computed on the
# validation split with a fixed seed.
perm = PermutationImportance(rf_model, random_state=1)
perm.fit(val_X, val_y)

# Kept as the final expression so the weights table renders as the cell output
eli5.show_weights(perm, feature_names=val_X.columns.tolist())


Weight,Feature
0.5200  ± 0.3564,LIST_PRICE
0.0864  ± 0.0623,TAX_AMT
0.0649  ± 0.0315,SIZE_SQFT
0.0354  ± 0.0386,ASS_AMT
0.0141  ± 0.0177,LOT_FRNT
0.0052  ± 0.0125,LON
0.0033  ± 0.0035,AGE
0.0017  ± 0.0035,BDRMS
0.0002  ± 0.0009,BDRMS+
-0.0002  ± 0.0004,ASS_YR


In [40]:
#DEALING WITH MISSING VALUES
from sklearn.impute import SimpleImputer

# Imputation
# Fit the imputer on the training features only, then apply the same fitted
# transform to the validation features.
# NOTE(review): sales_data already had incomplete rows removed with dropna()
# before the split, so there should be nothing left to fill here — confirm
# whether imputation is meant to replace that dropna().
my_imputer = SimpleImputer()

# The imputer returns plain arrays, which lose both the column names and the
# row index; restore both so the frames stay aligned with train_y / val_y.
imputed_train_X = pd.DataFrame(
    my_imputer.fit_transform(train_X),
    columns=train_X.columns,
    index=train_X.index,
)
imputed_val_X = pd.DataFrame(
    my_imputer.transform(val_X),
    columns=val_X.columns,
    index=val_X.index,
)

# Helper for comparing different preprocessing approaches
def score_dataset(train_X, val_X, train_y, val_y):
    """Fit a random forest on the training data and score it on the validation data.

    Args:
        train_X: Training features.
        val_X: Validation features.
        train_y: Training targets.
        val_y: Validation targets.

    Returns:
        Mean absolute error of the forest's predictions for ``val_X``
        against ``val_y``.
    """
    # Fixed seed so repeated calls on the same data give the same score
    forest = RandomForestRegressor(random_state=1).fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Score the imputed feature frames against the original (unchanged) targets
imputed_MAE = score_dataset(
    train_X=imputed_train_X,
    val_X=imputed_val_X,
    train_y=train_y,
    val_y=val_y,
)
print("Validation MAE from imputation: {:,.0f}".format(imputed_MAE))

Validation MAE from imputation: 41,930


In [41]:
# APPLYING CROSS-VALIDATION USING K-FOLDS
from sklearn.model_selection import cross_val_score

# The 'neg_mean_absolute_error' scorer yields *negative* MAE values, so negate
# the result to get ordinary (positive) errors — one per fold, 5 folds.
scores = -cross_val_score(
    rf_model, X, y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

print("MAE scores:\n", scores)

# Mean of the per-fold errors
print("Average MAE score (across experiments): {:,.0f}".format(scores.mean()))
 

MAE scores:
 [115488.5         51898.75        88618.25        48779.27272727
  45468.45454545]
Average MAE score (across experiments): 70,051


In [42]:
#USING XGBOOST
from xgboost import XGBRegressor

# Gradient-boosted regressor: up to 1000 boosting rounds at learning rate 0.05
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)

# Stop adding rounds once the eval-set score has not improved for 5 rounds.
# NOTE(review): newer xgboost releases expect early_stopping_rounds on the
# XGBRegressor constructor rather than in fit() — confirm against the
# installed version before upgrading.
# NOTE(review): the same validation split drives early stopping and the MAE
# reported below, so that score is likely optimistic.
my_model.fit(train_X, train_y,
             early_stopping_rounds=5,
             eval_set=[(val_X, val_y)],
             verbose=False)

predictions = my_model.predict(val_X)

# mean_absolute_error takes (y_true, y_pred); same value, consistent order
print("Mean Absolute Error: " + str(mean_absolute_error(val_y, predictions)))


Mean Absolute Error: 39474.833333333336
