<a href="https://colab.research.google.com/github/jeonghojo00/HousingPricePrediction/blob/main/HousingPricePrediction_AccuracyCheck.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import pandas as pd
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
import math
import json

from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly
from prophet.serialize import model_to_json, model_from_json

## Step 0. Connect to Google Drive

In [4]:
# Mount Google Drive at /content/drive; the data and model helpers below
# read and write under "/content/drive/My Drive/Colab Notebooks/".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Step 1. Define Functions

### Load Data

In [5]:
def load_data():
    """Read the Zillow all-homes zip-level price CSV from the mounted Drive.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for
        ``Zillow/AllHomesZipPrices.csv`` inside the Colab Notebooks folder.
    """
    csv_path = "/content/drive/My Drive/Colab Notebooks/" + "Zillow/AllHomesZipPrices.csv"
    return pd.read_csv(csv_path)

### Get City Dataframe

In [6]:
def get_city_df(df, state, city):
    """Return the rows of ``df`` that belong to one city in one state.

    Args:
        df: DataFrame with ``state`` and ``city`` columns.
        state: Value to match against the ``state`` column (e.g. ``'TX'``).
        city: Value to match against the ``city`` column (e.g. ``'Dallas'``).

    Returns:
        An independent copy of the matching rows, with the original index
        labels preserved. Returning a copy lets callers add or overwrite
        columns on the result without pandas emitting
        ``SettingWithCopyWarning``; ``df`` itself is never modified.
    """
    in_city = (df['state'] == state) & (df['city'] == city)
    # Explicit copy: callers in this notebook rewrite the `ds` column on the
    # returned frame, which must not look like a write through a slice of `df`.
    return df[in_city].copy()

### Get House Price Prediction

In [51]:
def get_HousingPricePrediction(city_df, zip, period = 24):
    """Fit a Prophet model to one zip code's price series and forecast ahead.

    Args:
        city_df: DataFrame with ``zip``, ``ds`` and ``y`` columns.
        zip: Zip code whose rows are used for fitting. The name shadows the
            builtin inside this function; it is kept because it is part of
            the signature callers see.
        period: Number of future steps to append, generated with
            ``freq='M'``.

    Returns:
        ``(model, forecast)``: the Prophet model that was fitted, and the
        DataFrame returned by ``predict`` on the frame built by
        ``make_future_dataframe``.
    """
    history = city_df.loc[city_df['zip'] == zip, ['ds', 'y']]
    forecaster = Prophet(
        interval_width=0.95,
        weekly_seasonality=True,
        daily_seasonality=True,
    )
    fitted = forecaster.fit(history)
    # freq='M' overrides the default daily step of make_future_dataframe.
    horizon = fitted.make_future_dataframe(periods=period, freq='M')
    forecast = fitted.predict(horizon)

    return forecaster, forecast

### Save Model

In [8]:
def save_prophet_model(model, filename):
  """Write a Prophet model as JSON into the Drive ``models/`` folder.

  Args:
    model: Prophet model to serialize with ``model_to_json``.
    filename: File name, relative to the models folder, to write to.
  """
  target = "/content/drive/My Drive/Colab Notebooks/models/" + filename
  with open(target, 'w') as fout:
    # The serialized string is itself passed through json.dump, matching the
    # json.load + model_from_json pair used by load_prophet_model.
    serialized = model_to_json(model)
    json.dump(serialized, fout)

### Load Model

In [9]:
def load_prophet_model(filename):
  """Read a Prophet model back from JSON in the Drive ``models/`` folder.

  Args:
    filename: File name, relative to the models folder, to read from.

  Returns:
    The object returned by ``model_from_json`` for the file's JSON content.
  """
  source = "/content/drive/My Drive/Colab Notebooks/models/" + filename
  with open(source, 'r') as fin:
    payload = json.load(fin)
    return model_from_json(payload)

## Step 2. Get MAE and RMSE

In [52]:
# Error store keyed by city + state; each city section below adds
# {'mae': [...], 'rmse': [...]} with one entry per scored zip code.
err_dict = {}
# Full price table; each city section filters it with get_city_df.
df = load_data()

### Dallas, Texas

In [53]:
##### Get Dataset for a city to run simulations
state = 'TX'
city = 'Dallas'
city_df = get_city_df(df, state, city)
# Normalise `ds` to 'YYYY-MM' strings on a NEW frame. assign() replaces the
# previous write through .loc / [] on the filtered slice, which triggers
# SettingWithCopyWarning and, where .loc assignment is in-place and keeps the
# object dtype, leaves `ds` non-datetime so the .dt accessor fails.
city_df = city_df.assign(
    ds=pd.to_datetime(city_df['ds'], format='%Y-%m-%d').dt.strftime('%Y-%m')
)
# Zero-padded 'YYYY-MM' strings compare chronologically:
# train = everything before 2016-01, test = everything after 2015-12.
train_df = city_df[city_df['ds']<'2016-01']
test_df = city_df[city_df['ds']>'2015-12']

#### Prepare dictionary for errors
err_dict[city+state] = {'mae': [], 'rmse': []}

#### Prepare zip code dictionary
zipCodes = city_df['zip'].unique()
# Forecast horizon = number of test rows of the first zip code, plus one.
# NOTE(review): the +1 presumably compensates for the first freq='M' step
# landing in the last training month - confirm against the data's dates.
# Guarded so a city with no rows does not raise IndexError on zipCodes[0].
period = (len(test_df[test_df['zip']==zipCodes[0]])+1) if len(zipCodes) else 0

In [54]:
# Fit one Prophet model per zip code on the training window and score its
# forecast against the actual prices from 2016-01 onwards.
# The loop variable is `zip_code` (not `zip`) so the builtin zip() is not
# rebound for the rest of the notebook.
for zip_code in zipCodes:
    zip_df = train_df[train_df['zip']==zip_code]
    # Skip zip codes whose training rows are mostly missing: the total number
    # of null cells (all columns) must not exceed 80% of the row count.
    if (zip_df.isnull().values.ravel().sum()) > (len(zip_df)*0.8):
        continue
    md, pred_y = get_HousingPricePrediction(train_df , zip_code, period = period)
    # Reduce forecast dates to 'YYYY-MM' so they join with city_df's `ds` strings.
    pred_y['ds'] = pred_y['ds'].dt.strftime('%Y-%m')
    pred_y = pred_y[['ds', 'yhat']]
    # Same key as the training filter above (previously int(zip) here only).
    real_y = city_df[city_df['zip']==zip_code][['ds','y']]
    merged_y = real_y.merge(pred_y, on="ds", how = 'outer')

    # Score only months that have both an actual and a predicted value: the
    # outer merge leaves NaN where one side is missing, and the sklearn
    # metrics raise on NaN input.
    for_accuracy = merged_y[merged_y['ds']>='2016-01'].dropna(subset=['y', 'yhat'])
    if for_accuracy.empty:
        continue
    zip_mae = mae(for_accuracy.y, for_accuracy.yhat)
    zip_rmse = math.sqrt(mse(for_accuracy.y, for_accuracy.yhat))

    err_dict[city+state]['mae'].append(zip_mae)
    err_dict[city+state]['rmse'].append(zip_rmse)

### Houston, Texas

In [55]:
##### Get Dataset for a city to run simulations
state = 'TX'
city = 'Houston'
city_df = get_city_df(df, state, city)
# Normalise `ds` to 'YYYY-MM' strings on a NEW frame. assign() replaces the
# previous write through .loc / [] on the filtered slice, which triggers
# SettingWithCopyWarning and, where .loc assignment is in-place and keeps the
# object dtype, leaves `ds` non-datetime so the .dt accessor fails.
city_df = city_df.assign(
    ds=pd.to_datetime(city_df['ds'], format='%Y-%m-%d').dt.strftime('%Y-%m')
)
# Zero-padded 'YYYY-MM' strings compare chronologically:
# train = everything before 2016-01, test = everything after 2015-12.
train_df = city_df[city_df['ds']<'2016-01']
test_df = city_df[city_df['ds']>'2015-12']

#### Prepare dictionary for errors
err_dict[city+state] = {'mae': [], 'rmse': []}

#### Prepare zip code dictionary
zipCodes = city_df['zip'].unique()
# Forecast horizon = number of test rows of the first zip code, plus one.
# NOTE(review): the +1 presumably compensates for the first freq='M' step
# landing in the last training month - confirm against the data's dates.
# Guarded so a city with no rows does not raise IndexError on zipCodes[0].
period = (len(test_df[test_df['zip']==zipCodes[0]])+1) if len(zipCodes) else 0

In [56]:
# Fit one Prophet model per zip code on the training window and score its
# forecast against the actual prices from 2016-01 onwards.
# The loop variable is `zip_code` (not `zip`) so the builtin zip() is not
# rebound for the rest of the notebook.
for zip_code in zipCodes:
    zip_df = train_df[train_df['zip']==zip_code]
    # Skip zip codes whose training rows are mostly missing: the total number
    # of null cells (all columns) must not exceed 80% of the row count.
    if (zip_df.isnull().values.ravel().sum()) > (len(zip_df)*0.8):
        continue
    md, pred_y = get_HousingPricePrediction(train_df , zip_code, period = period)
    # Reduce forecast dates to 'YYYY-MM' so they join with city_df's `ds` strings.
    pred_y['ds'] = pred_y['ds'].dt.strftime('%Y-%m')
    pred_y = pred_y[['ds', 'yhat']]
    # Same key as the training filter above (previously int(zip) here only).
    real_y = city_df[city_df['zip']==zip_code][['ds','y']]
    merged_y = real_y.merge(pred_y, on="ds", how = 'outer')

    # Score only months that have both an actual and a predicted value: the
    # outer merge leaves NaN where one side is missing, and the sklearn
    # metrics raise on NaN input.
    for_accuracy = merged_y[merged_y['ds']>='2016-01'].dropna(subset=['y', 'yhat'])
    if for_accuracy.empty:
        continue
    zip_mae = mae(for_accuracy.y, for_accuracy.yhat)
    zip_rmse = math.sqrt(mse(for_accuracy.y, for_accuracy.yhat))

    err_dict[city+state]['mae'].append(zip_mae)
    err_dict[city+state]['rmse'].append(zip_rmse)

### Boston, Massachusetts

In [57]:
##### Get Dataset for a city to run simulations
state = 'MA'
city = 'Boston'
city_df = get_city_df(df, state, city)
# Normalise `ds` to 'YYYY-MM' strings on a NEW frame. assign() replaces the
# previous write through .loc / [] on the filtered slice, which triggers
# SettingWithCopyWarning and, where .loc assignment is in-place and keeps the
# object dtype, leaves `ds` non-datetime so the .dt accessor fails.
city_df = city_df.assign(
    ds=pd.to_datetime(city_df['ds'], format='%Y-%m-%d').dt.strftime('%Y-%m')
)
# Zero-padded 'YYYY-MM' strings compare chronologically:
# train = everything before 2016-01, test = everything after 2015-12.
train_df = city_df[city_df['ds']<'2016-01']
test_df = city_df[city_df['ds']>'2015-12']

#### Prepare dictionary for errors
err_dict[city+state] = {'mae': [], 'rmse': []}

#### Prepare zip code dictionary
zipCodes = city_df['zip'].unique()
# Forecast horizon = number of test rows of the first zip code, plus one.
# NOTE(review): the +1 presumably compensates for the first freq='M' step
# landing in the last training month - confirm against the data's dates.
# Guarded so a city with no rows does not raise IndexError on zipCodes[0].
period = (len(test_df[test_df['zip']==zipCodes[0]])+1) if len(zipCodes) else 0

In [58]:
# Fit one Prophet model per zip code on the training window and score its
# forecast against the actual prices from 2016-01 onwards.
# The loop variable is `zip_code` (not `zip`) so the builtin zip() is not
# rebound for the rest of the notebook.
for zip_code in zipCodes:
    zip_df = train_df[train_df['zip']==zip_code]
    # Skip zip codes whose training rows are mostly missing: the total number
    # of null cells (all columns) must not exceed 80% of the row count.
    if (zip_df.isnull().values.ravel().sum()) > (len(zip_df)*0.8):
        continue
    md, pred_y = get_HousingPricePrediction(train_df , zip_code, period = period)
    # Reduce forecast dates to 'YYYY-MM' so they join with city_df's `ds` strings.
    pred_y['ds'] = pred_y['ds'].dt.strftime('%Y-%m')
    pred_y = pred_y[['ds', 'yhat']]
    # Same key as the training filter above (previously int(zip) here only).
    real_y = city_df[city_df['zip']==zip_code][['ds','y']]
    merged_y = real_y.merge(pred_y, on="ds", how = 'outer')

    # Score only months that have both an actual and a predicted value: the
    # outer merge leaves NaN where one side is missing, and the sklearn
    # metrics raise on NaN input.
    for_accuracy = merged_y[merged_y['ds']>='2016-01'].dropna(subset=['y', 'yhat'])
    if for_accuracy.empty:
        continue
    zip_mae = mae(for_accuracy.y, for_accuracy.yhat)
    zip_rmse = math.sqrt(mse(for_accuracy.y, for_accuracy.yhat))

    err_dict[city+state]['mae'].append(zip_mae)
    err_dict[city+state]['rmse'].append(zip_rmse)

## Step 3. Summarize Errors: Mean MAE and RMSE per City

In [64]:
# Average the per-zip errors into one MAE and one RMSE per city, rounded to
# two decimals. A city whose error lists are empty (every zip code skipped)
# gets NaN instead of raising ZeroDivisionError.
acc_dict = dict()

for key, errors in err_dict.items():
    acc_dict[key] = {
        metric: (
            round(sum(errors[metric]) / len(errors[metric]), 2)
            if errors[metric]
            else float('nan')
        )
        for metric in ('mae', 'rmse')
    }

In [65]:
# Display the per-city mean MAE / RMSE (each rounded to two decimals).
acc_dict

{'BostonMA': {'mae': 53079.07, 'rmse': 69144.2},
 'DallasTX': {'mae': 35216.37, 'rmse': 39723.4},
 'HoustonTX': {'mae': 42608.17, 'rmse': 47569.53}}