In [2]:
import os
import json
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score, SCORERS
from sklearn.model_selection import RandomizedSearchCV
from datetime import date
import matplotlib.pyplot as plt
import pickle
import joblib 
import json

In [3]:
# Move the working directory up one level; the data files below are read via
# relative paths (presumably relative to the project root — confirm).
# NOTE: re-running this cell moves up one more level each time.
os.chdir(os.pardir)

# Define Functions

In [1]:
def preparePredictiveData(df, date1=date(2016, 12, 31)):
    """Split `df` chronologically into train/test features and targets.

    Rows whose `sale_date` is on or before `date1` form the training set;
    rows after it form the test set. Both parts are ordered by `sale_date`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `sale_date` and `sale_price`.
        The frame is not modified.
    date1 : datetime.date, optional
        Last day included in the training period (default 2016-12-31).

    Returns
    -------
    tuple
        `(X_train, X_test, y_train, y_test)` where the X frames have
        `sale_date` and `sale_price` removed and the y objects are the
        `sale_price` Series.
    """
    # Sort a copy instead of sorting in place, so the caller's frame keeps
    # its original row order.
    ordered = df.sort_values(by='sale_date')
    train = ordered.query('sale_date <= @date1').drop(columns=['sale_date'])
    test = ordered.query('sale_date > @date1').drop(columns=['sale_date'])
    return (
        train.drop(columns=['sale_price']),
        test.drop(columns=['sale_price']),
        train['sale_price'],
        test['sale_price'],
    )

In [138]:
def predict(m, df, type_ = "P"):
    """Fit `m`, print its test-set R², and return test targets with predictions.

    Parameters
    ----------
    m : estimator
        Object exposing `fit(X, y)` and `predict(X)`; it is fitted by this call.
    df : pandas.DataFrame
        Must contain a `sale_price` column. A `sale_date` column is required
        when `type_ == "P"`.
    type_ : str, optional
        "P" (default) splits chronologically via `preparePredictiveData`;
        any other value uses a random 80/20 split.

    Returns
    -------
    pandas.DataFrame
        The test rows' `sale_price` plus a `pred` column with the predictions.
    """
    if type_ == "P":
        X_train, X_test, y_train, y_test = preparePredictiveData(df)
    else:
        # train_test_split returns one train/test pair per array passed in,
        # so features and target are passed separately to get four pieces.
        # `sale_date` is dropped (when present) to mirror the chronological path.
        features = df.drop(columns=["sale_price", "sale_date"], errors="ignore")
        X_train, X_test, y_train, y_test = train_test_split(
            features, df["sale_price"], test_size = 0.2
        )
    fitted = m.fit(X_train, y_train)
    y_pred = fitted.predict(X_test)
    print(r2_score(y_test, y_pred))
    result = y_test.to_frame()
    result["pred"] = y_pred
    return result

In [86]:
def get_score(model, df, type_ = "P"):
    """Fit `model` on a train split of `df` and return its test-set R².

    Parameters
    ----------
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`; it is fitted by this call.
    df : pandas.DataFrame
        Must contain a `sale_price` column. A `sale_date` column is required
        when `type_ == "P"`.
    type_ : str, optional
        "P" (default) splits chronologically via `preparePredictiveData`;
        any other value uses a random 80/20 split.

    Returns
    -------
    float
        `r2_score` of the predictions on the test split.
    """
    if type_ == "P":
        # The cutoff date lives inside preparePredictiveData; no notebook-level
        # variable is needed here.
        X_train, X_test, y_train, y_test = preparePredictiveData(df)
    else:
        # train_test_split returns one train/test pair per array passed in,
        # so features and target are passed separately to get four pieces.
        # `sale_date` is dropped (when present) to mirror the chronological path.
        features = df.drop(columns=["sale_price", "sale_date"], errors="ignore")
        X_train, X_test, y_train, y_test = train_test_split(
            features, df["sale_price"], test_size = 0.2
        )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return r2_score(y_test, y_pred)

def get_feature_importance(model, x, y):
    """Return the fitted model's feature importances, largest first.

    Parameters
    ----------
    model : fitted estimator exposing `feature_importances_`.
    x : pandas.DataFrame
        Feature frame the model was fitted on; its columns label the rows.
    y : unused, kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One `importance` column indexed by feature name, sorted descending.
    """
    # Label with the frame that was passed in rather than a notebook-level
    # `X_train`, which is not defined outside the fitting helpers.
    importances = pd.DataFrame(
        model.feature_importances_, index=x.columns, columns=['importance']
    )
    return importances.sort_values('importance', ascending=False)

def get_coef(model, x, y):
    """Return the fitted model's coefficients, largest first.

    Parameters
    ----------
    model : fitted estimator exposing `coef_`.
    x : pandas.DataFrame
        Feature frame the model was fitted on; its columns label the rows.
    y : unused, kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One `coef` column indexed by feature name, sorted descending.
    """
    # Label with the frame that was passed in rather than a notebook-level
    # `X_train`, which is not defined outside the fitting helpers.
    coefficients = pd.DataFrame(model.coef_, index=x.columns, columns=['coef'])
    return coefficients.sort_values('coef', ascending=False)

In [6]:
# Instantiate the two regressors that are compared below.
# `normalize=` was deprecated in scikit-learn 1.0 and removed in 1.2, so it is
# no longer passed. For ordinary least squares with an intercept, rescaling
# the columns should not change the predictions (assuming the features are
# not collinear — confirm if exact reproduction of old numbers matters).
lr = LinearRegression(n_jobs=-1)
rf = RandomForestRegressor(n_jobs = -1, random_state=0)

# Import Data

In [158]:

# Hold-out cutoff: sales after this date form the test period. It has to be
# defined at notebook level because the query below references it; the same
# date is used as the cutoff in preparePredictiveData.
date1 = date(2016, 12, 31)

df_lr = pd.read_csv('ENG_DATA/SELECTED/05-13_lr_all.csv', index_col = [0])
df_rf = pd.read_csv('ENG_DATA/SELECTED/05-13_rf_all.csv', index_col = [0], parse_dates = ["sale_date"])
# The random-forest frame keeps only its non-object columns for modelling.
df_rf_num = df_rf.select_dtypes(exclude = 'object').copy()
dates_neighborhoods = pd.read_csv('ENG_DATA/SELECTED/05-13_sale_neighborhood_train.csv', index_col = [0], parse_dates = ["sale_date"])
# Attach sale_date to the linear-regression frame by index; neighborhood is not kept there.
df_lr = df_lr.merge(dates_neighborhoods, left_index = True, right_index = True).drop("neighborhood", axis=1)
# Test-period rows that the model predictions are joined onto later.
df_output = df_rf[["sale_date", "neighborhood", "sale_price"]].query("sale_date > @date1").copy()

# Models Trained on First 8 Years in Data-Set

In [147]:
# predict's third parameter is the split mode (default "P" = chronological
# split), not a cutoff date, so the default is used instead of passing a date.
y_test_lr = predict(lr, df_lr)
y_test_rf = predict(rf, df_rf_num)

0.6596444597810791
0.7935263276044768


In [159]:
# Attach each model's predictions under the column names the later cells read
# (`predlr`, `predrf`). Renaming explicitly avoids relying on merge suffixes,
# which previously produced `pred_lr` / `pred_rf` instead.
lr_pred = y_test_lr[['pred']].rename(columns={'pred': 'predlr'})
rf_pred = y_test_rf[['pred']].rename(columns={'pred': 'predrf'})
df_output = pd.merge(df_output, lr_pred, how = 'left', left_index = True, right_index = True)
df_output = pd.merge(df_output, rf_pred, how = 'left', left_index = True, right_index = True)

In [160]:
# Display the test-period frame with both models' predictions attached.
df_output

Unnamed: 0,sale_date,sale_month,latitude,longitude,neighborhood,sale_price,predlr,predrf
1,2017-08-17,0.339388,0.771694,-1.988699,Outer Sunset,1525000,2.888072e+06,1809248.04
4,2018-06-21,-0.275583,1.398925,-1.985488,Sutro Heights,3310000,3.602689e+06,2399855.05
6,2018-11-28,1.261844,0.636871,-1.982277,Outer Sunset,1304000,1.546821e+06,1171401.00
8,2018-05-23,-0.583069,0.566966,-1.981185,Outer Sunset,1830000,2.024903e+06,1283810.22
12,2017-07-26,0.031902,1.309072,-1.980993,Outer Richmond,1650000,2.473569e+06,1441475.00
...,...,...,...,...,...,...,...,...
23690,2018-05-30,-0.583069,0.583196,-0.125216,Clarendon Heights,3535000,2.820850e+06,2937392.82
23693,2017-02-03,-1.505525,-0.139631,-1.432012,Parkside,1375000,9.276278e+05,1011374.30
23700,2017-10-16,0.954359,-0.390216,0.392080,Glen Park,968000,1.556506e+06,1560033.60
23712,2018-10-31,0.954359,0.757312,1.373278,Potrero Hill,1100000,2.148331e+06,1935421.02


In [161]:
# R² of the linear-regression predictions on the test period.
r2_score(y_true=df_output["sale_price"], y_pred=df_output["predlr"])

0.6596444597810791

In [162]:
# R² of the random-forest predictions on the test period.
r2_score(y_true=df_output["sale_price"], y_pred=df_output["predrf"])

0.7935263276044768

In [163]:
# Per-sale gap between the two models: random forest minus linear regression.
df_output["pred_diff"] = df_output["predrf"].sub(df_output["predlr"])

# Visualize Difference in Models Performance

In [164]:
import plotly.express as px

In [4]:
# Prediction gap per sale over time, with an OLS trendline.
fig = px.scatter(
    df_output,
    x="sale_date",
    y="pred_diff",
    trendline="ols",
    title="Difference in Prediction Over 2-Year Test Data",
)
fig.show()

NameError: name 'px' is not defined

In [172]:
# Long format: one row per (sale_date, series), so the plots can colour by series.
df_melted = pd.melt(
    df_output[["sale_date", "sale_price", "predlr", "predrf"]],
    id_vars="sale_date",
)

In [178]:
# Actual prices and both models' predictions over the test period.
fig = px.scatter(
    df_melted,
    x="sale_date",
    y="value",
    color="variable",
    title="Predictions Over 2-Year Test Data",
)
fig.show()

In [206]:
# Same scatter, restricted to values below 15e6.
fig = px.scatter(
    df_melted.query("value < 15e6"),
    x="sale_date",
    y="value",
    color="variable",
    title="Predictions Over 2-Year Test Data",
)
fig.show()

In [196]:
# Assign each sale to one of 10 equal-count price bins (integer codes, not interval labels).
df_output['sale_price_bins'] = pd.qcut(df_output['sale_price'], q=10, labels=False)

In [202]:
# Mean of every numeric column within each sale-price bin.
# numeric_only=True keeps text columns such as `neighborhood` out of the
# aggregation; without it pandas >= 2.0 raises instead of silently dropping them.
df_binned = df_output.groupby('sale_price_bins').mean(numeric_only=True).reset_index()
# Long format for a grouped bar chart of actual vs. predicted means.
df_melted_bins = df_binned[["sale_price_bins", "sale_price", "predlr", "predrf"]].melt(id_vars = "sale_price_bins")

In [207]:
# The bins come from a 10-way qcut on sale_price, i.e. deciles rather than
# quintiles, so the title is labelled accordingly.
fig = px.bar(
    df_melted_bins,
    x="sale_price_bins",
    y="value",
    color="variable",
    barmode='group',
    title="Sale-Price Decile Binned Bar Chart of Prediction",
)
fig.show()

In [208]:
# Assign each sale to one of 10 equal-count date bins (integer codes, not interval labels).
df_output['sale_date_bins'] = pd.qcut(df_output['sale_date'], q=10, labels=False)

In [210]:
# Mean of every numeric column within each sale-date bin.
# numeric_only=True keeps text columns such as `neighborhood` out of the
# aggregation; without it pandas >= 2.0 raises instead of silently dropping them.
df_binned_2 = df_output.groupby('sale_date_bins').mean(numeric_only=True).reset_index()
# Long format for a grouped bar chart of actual vs. predicted means.
df_melted_bins_2 = df_binned_2[["sale_date_bins", "sale_price", "predlr", "predrf"]].melt(id_vars = "sale_date_bins")

In [211]:
# The bins come from a 10-way qcut on sale_date, i.e. deciles rather than
# quintiles, so the title is labelled accordingly.
fig = px.bar(
    df_melted_bins_2,
    x="sale_date_bins",
    y="value",
    color="variable",
    barmode='group',
    title="Time-Based Decile Binned Bar Chart of Prediction",
)
fig.show()

In [220]:
def r_squared(data):
    """Return the R² of both models' predictions for one group of rows.

    `data` must provide the columns `sale_price`, `predlr` and `predrf`.
    The result is a Series with entries `r2-lr` and `r2-rf`.
    """
    actual = data['sale_price']
    return pd.Series({
        'r2-lr': r2_score(actual, data['predlr']),
        'r2-rf': r2_score(actual, data['predrf']),
    })

In [222]:
# R² of each model, computed separately within every sale-date bin.
df_r2_binned = (
    df_output
    .groupby('sale_date_bins')
    .apply(r_squared)
)

In [225]:
# Turn the bin index into a regular column so it can serve as the plot's x axis.
df_r2_binned = df_r2_binned.reset_index()

In [226]:
# The date bins come from a 10-way qcut, i.e. deciles rather than quintiles,
# so the title is labelled accordingly.
fig = px.line(
    df_r2_binned.melt(id_vars = 'sale_date_bins'),
    x="sale_date_bins",
    y="value",
    color="variable",
    title="Time-Based Decile Binned R-Squared Comparison",
)
fig.show()