In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pathlib
import seaborn as sns

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree

### Load Data

In [5]:
#url of the dataset
kc_dt_url = "https://raw.githubusercontent.com/junjiewu-mtl/glis630_final_project/master/kc_house_data.csv"

In [6]:
#read the .csv from the website
url = kc_dt_url
hp_data = pd.read_csv(url)
#hp_data.count()

In [9]:
#hp_data.duplicated().describe()

In [10]:
#converted week of year as a factor features
hp_data["date"] = hp_data["date"].str[0:8]
hp_data["formatted_date"] = pd.to_datetime(hp_data["date"])

### Data Prep

In [12]:
#drop unuse features
df1 = hp_data.drop(["id", "date", "lat", "long","formatted_date"], axis=1)

In [13]:
#covert features from numerical to factor
df1[['waterfront','zipcode','yr_built','yr_renovated']] = df1[['waterfront','zipcode','yr_built','yr_renovated']].astype('category')

In [14]:
#one-hot encoding
df1 = pd.get_dummies(df1)
# Display the first 5 rows of the last 12 columns
df1.head(5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,grade,sqft_above,...,zipcode_98146,zipcode_98148,zipcode_98155,zipcode_98166,zipcode_98168,zipcode_98177,zipcode_98178,zipcode_98188,zipcode_98198,zipcode_98199
0,221900.0,3,1.0,1180,5650,1.0,0,3,7,1180,...,0,0,0,0,0,0,1,0,0,0
1,538000.0,3,2.25,2570,7242,2.0,0,3,7,2170,...,0,0,0,0,0,0,0,0,0,0
2,180000.0,2,1.0,770,10000,1.0,0,3,6,770,...,0,0,0,0,0,0,0,0,0,0
3,604000.0,4,3.0,1960,5000,1.0,0,5,7,1050,...,0,0,0,0,0,0,0,0,0,0
4,510000.0,3,2.0,1680,8080,1.0,0,3,8,1680,...,0,0,0,0,0,0,0,0,0,0


In [15]:
df1[['price']].describe().round(0)

Unnamed: 0,price
count,21613.0
mean,540088.0
std,367127.0
min,75000.0
25%,321950.0
50%,450000.0
75%,645000.0
max,7700000.0


In [16]:
#cut dataset into lower and higher price partition
cut_labels_2 = ['lower_price', 'higher_price']
cut_bins = [0, 645000, 7700000]
df1['cut_ex1'] = pd.cut(df1['price'], bins=cut_bins, labels=cut_labels_2)
df1['cut_ex1'].value_counts()

lower_price     16240
higher_price     5373
Name: cut_ex1, dtype: int64

In [17]:
#save lower and higher price dataframe
df1_lower = df1[df1['cut_ex1'] == "lower_price"]
df1_higher = df1[df1['cut_ex1'] == "higher_price"]

#drop the price label
df1_lower = df1_lower.drop(["cut_ex1"], axis=1)
df1_higher = df1_higher.drop(["cut_ex1"], axis=1)

In [18]:
#saved the lower and higher datasets
# df1_lower.to_csv('~/Desktop/kc_lower.csv') 
# df1_higher.to_csv('~/Desktop/kc_higher.csv') 

### Random Forest

In [31]:
#fit the model with lower or higher pirce house data
rf_df = df1_higher

In [32]:
# Labels are the values we want to predict
rf_labels = np.array(rf_df['price'])
# Remove the labels from the features
# axis 1 refers to the columns
rf_features= rf_df.drop('price', axis = 1)
# Saving feature names for later use
rf_feature_list = list(rf_features.columns)
# Convert to numpy array
rf_features = np.array(rf_features)
#normalize features
rf_features = rf_features / np.linalg.norm(rf_features)

In [33]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
rf_train_features, rf_test_features, rf_train_labels, rf_test_labels = train_test_split(rf_features, rf_labels, test_size = 0.1, random_state = 42)

In [34]:
# print('Training Features Shape:', rf_train_features.shape)
# print('Training Labels Shape:', rf_train_labels.shape)
# print('Testing Features Shape:', rf_test_features.shape)
# print('Testing Labels Shape:', rf_test_labels.shape)

In [35]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42, n_jobs=-1)

#record model train time
import timeit
start = timeit.default_timer()
# Train the model on training data
rf.fit(rf_train_features, rf_train_labels);
stop = timeit.default_timer()
print('Time: ', stop - start)  

#lower price
#Time:  201.8072895410005

#higher price
#Time:  57.776589996006805

Time:  50.916686346999995


In [36]:
from math import sqrt
from sklearn import metrics

# Use the forest's predict method on the test data
rf_predictions = rf.predict(rf_test_features)
# Calculate the absolute errors
rf_errors = abs(rf_predictions - rf_test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(rf_errors), 2), 'USD')

# Calculate the root mean square error
rf_rms = sqrt(metrics.mean_squared_error(rf_test_labels, rf_predictions))

print('RMSE:', round(np.mean(rf_rms), 2))

# R squared
rf_accuracy = metrics.r2_score(rf_test_labels, rf_predictions)
print('R2:', round(np.mean(rf_accuracy), 2))

# Calculate mean absolute percentage error (MAPE)
rf_mape = 100 * (rf_errors / rf_test_labels)
# Calculate and display accuracy
rf_accuracy = 100 - np.mean(rf_mape)
print('Accuracy:', round(rf_accuracy, 2), '%.')

#lower price
# Mean Absolute Error: 49938.71 USD
# RMSE: 67526.53
# R2: 0.71
# Accuracy: 85.35 %.

#higher price
# Mean Absolute Error: 137265.62 USD
# RMSE: 243635.59
# R2: 0.67
# Accuracy: 87.04 %.

Mean Absolute Error: 137265.62 USD
RMSE: 243635.59
R2: 0.67
Accuracy: 87.04 %.


In [37]:
import plotly
from plotly import version
print (version)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

<module 'plotly.version' from '/Users/junjiewu/opt/anaconda3/lib/python3.7/site-packages/plotly/version.py'>


In [40]:
import plotly
from plotly import version
print (version)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# x and y given as DataFrame columns
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(x=rf_test_labels, y=rf_predictions,
                         mode='markers',
                         name='price'))

fig.add_trace(go.Scatter(x=[0,5000000], y = [0,5000000],
                    mode='lines',
                    name='True Value'))
fig.update_layout(
    title_text="Real and Predictive Price Comparison - Higher Price Dataset")

fig.show()

<module 'plotly.version' from '/Users/junjiewu/opt/anaconda3/lib/python3.7/site-packages/plotly/version.py'>


### K-ford cross validation

In [487]:
from sklearn.model_selection import cross_val_score, cross_val_predict

#record model train time
import timeit
start = timeit.default_timer()
# Perform 5-fold cross validation
cv_predictions = cross_val_predict(rf, rf_features, rf_labels, cv=10, n_jobs=-1)
stop = timeit.default_timer()
print('Time: ', stop - start) 

Time:  2260.181539370009


In [488]:
from math import sqrt
from sklearn import metrics

# Calculate the absolute errors
cv_errors = abs(cv_predictions - rf_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(cv_errors), 2), 'USD')

# Calculate the root mean square error
cv_rms = sqrt(metrics.mean_squared_error(rf_labels, cv_predictions))

print('RMSE:', round(np.mean(cv_rms), 2))

# R squared
cv_accuracy = metrics.r2_score(rf_labels, cv_predictions)
print('R2:', round(np.mean(cv_accuracy), 2))

# Calculate mean absolute percentage error (MAPE)
cv_mape = 100 * (cv_errors / rf_labels)
# Calculate and display accuracy
cv_accuracy = 100 - np.mean(cv_mape)
print('Accuracy:', round(cv_accuracy, 2), '%.')

#lower
# Mean Absolute Error: 48963.77 USD
# RMSE: 65485.18
# R2: 0.73
# Accuracy: 85.87 %.

#higher
# Mean Absolute Error: 138611.48 USD
# RMSE: 245119.37
# R2: 0.74
# Accuracy: 87.47 %.

Mean Absolute Error: 48963.77 USD
RMSE: 65485.18
R2: 0.73
Accuracy: 85.87 %.


In [489]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=rf_labels, y=cv_predictions, mode='markers'))

fig.add_trace(go.Scatter(x=[1,7000000], y = [1,7000000],
                    mode='lines',
                    name='True Value'))
fig.show()

### Variable Importances

In [465]:
# Get numerical feature importances
rf_importances = list(rf.feature_importances_)
# List of tuples with variable and importance
rf_feature_importances = [(rf_feature, round(rf_importance, 4)) for rf_feature, rf_importance in zip(rf_feature_list, rf_importances)]
# Sort the feature importances by most important first
rf_feature_importances = sorted(rf_feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in rf_feature_importances];


Variable: grade                Importance: 0.2231
Variable: sqft_lot15           Importance: 0.0845
Variable: sqft_living          Importance: 0.0798
Variable: sqft_living15        Importance: 0.0721
Variable: sqft_lot             Importance: 0.0441
Variable: sqft_above           Importance: 0.0305
Variable: sqft_basement        Importance: 0.0275
Variable: zipcode_98023        Importance: 0.021
Variable: condition            Importance: 0.0161
Variable: zipcode_98042        Importance: 0.0161
Variable: zipcode_98092        Importance: 0.0136
Variable: zipcode_98001        Importance: 0.0117
Variable: zipcode_98038        Importance: 0.0113
Variable: zipcode_98033        Importance: 0.0111
Variable: zipcode_98115        Importance: 0.0108
Variable: zipcode_98008        Importance: 0.0104
Variable: zipcode_98003        Importance: 0.0095
Variable: zipcode_98052        Importance: 0.0093
Variable: zipcode_98030        Importance: 0.0092
Variable: zipcode_98031        Importance: 0.009
Va

In [232]:
x_values = list(range(len(rf_importances)))
# List of features sorted from most to least important
rf_sorted_importances = [rf_importance[1] for rf_importance in rf_feature_importances]
rf_sorted_features = [rf_importance[0] for rf_importance in rf_feature_importances]
# Cumulative importances
rf_cumulative_importances = np.cumsum(rf_sorted_importances)
# Make a line graph
# plt.plot(x_values, rf_cumulative_importances, 'g-')
# # Draw line at 95% of importance retained
# plt.hlines(y = 0.95, xmin=0, xmax=len(rf_sorted_importances), color = 'r', linestyles = 'dashed')
# # Format x ticks and labels
# plt.xticks(x_values, rf_sorted_features, rotation = 'vertical')
# # Axis labels and title
# plt.xlabel('Variable'); plt.ylabel('Cumulative Importance'); plt.title('Cumulative Importances');plt.rcParams['figure.figsize'] = [500, 50]

In [233]:
# Find number of features for cumulative importance of 95%
# Add 1 because Python is zero-indexed
print('Number of features for 95% importance:', np.where(rf_cumulative_importances > 0.95)[0][0] + 1)

Number of features for 95% importance: 36


In [78]:
# Extract the names of the most important features
rf_important_feature_names = [rf_feature[0] for rf_feature in rf_feature_importances[0:46]]
# Find the columns of the most important features
rf_important_indices = [rf_feature_list.index(rf_feature) for rf_feature in rf_important_feature_names]
# Create training and testing sets with only the important features
rf_important_train_features = rf_train_features[:, rf_important_indices]
rf_important_test_features = rf_test_features[:, rf_important_indices]
# Sanity check on operations
print('Important train features shape:', rf_important_train_features.shape)
print('Important test features shape:', rf_important_test_features.shape)

Important train features shape: (19134, 46)
Important test features shape: (2126, 46)


In [80]:
#counting the runtime
start = timeit.default_timer()  

# Train the expanded model on only the important features
rf.fit(rf_important_train_features, rf_train_labels);

stop = timeit.default_timer()
print('Time: ', stop - start)

# Make predictions on test data
rf_import_predictions = rf.predict(rf_important_test_features)
# Performance metrics
rf_import_errors = abs(rf_import_predictions - rf_test_labels)
print('Average absolute error:', round(np.mean(rf_import_errors), 2), 'USD.')
# Calculate mean absolute percentage error (MAPE)
rf_import_mape = 100 * (rf_import_errors / rf_test_labels)
# Calculate and display accuracy
rf_import_accuracy = 100 - np.mean(rf_import_mape)
print('Accuracy:', round(rf_import_accuracy, 2), '%.')

from math import sqrt

# Calculate the root mean square error
rf_import_rms = sqrt(mean_squared_error(rf_test_labels, rf_import_predictions))
print('RMSE:', round(np.mean(rf_import_rms), 2), 'USD')


# Time:  207.77607357499073
# Average absolute error: 91.31 thousands USD
# Accuracy: 82.59 %


Time:  190.48574922999978
Average absolute error: 93473.31 USD.
Accuracy: 82.08 %.
RMSE: 172112.63 USD


In [84]:
# Calculate the root mean square error
rf_import_rms = sqrt(mean_squared_error(rf_test_labels, rf_import_predictions))

print('RMSE:', round(np.mean(rf_import_rms), 2))

RMSE: 172112.63


In [None]:
plt.scatter(rf_test_labels, predictions)
plt.xlabel('True Values [price]')
plt.ylabel('Predictions [pirce]')
plt.rcParams['figure.figsize'] = [15, 8]
lims = [0, 5000]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)
