# Model Notebook 

Creating a model to best predict housing prices for a real estate agency.

The modeling function prints train/test R2 and RMSE, and can optionally show a statsmodels summary and a visualization of the test-set predictions.

## Loading packages, libraries, functions and variables from the EDA notebook.

In [1]:
#Loading the needed packages, libraries, functions and variables from the EDA notebook.
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [2]:
#Original DataFrame
%store -r df_original

In [3]:
#Cleaned DataFrame — from the EDA notebook
%store -r df_clean

In [4]:
# For consistent randomness
# train_test_split is called below without a random_state, so it draws from
# NumPy's global RNG seeded here. Splits are therefore reproducible only when
# the notebook is run top to bottom from a fresh kernel.
np.random.seed(42)

## Modeling

In [5]:
# Single shared estimator: train_test_compare refits it on every call and
# prediction reads it, so it always holds the most recently fitted model.
lr = LinearRegression()

In [6]:
# Our model needs to have only numeric variables.
# Using this function, we can drop all columns that do not hold numeric values.
# We will call this function inside our next function.
def only_numeric(data):
    '''Return a DataFrame containing only the numeric columns of ``data``.

    A column is kept when ``is_numeric_dtype`` is true for its dtype.
    The input frame is never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        The numeric columns of ``data``, in their original order.
    '''
    # Decide per column from ``data.dtypes`` and select by position with a
    # boolean mask. Unlike dropping by label one column at a time, this keeps
    # working when column labels are duplicated (e.g. after concatenating
    # dummy columns) and performs a single selection instead of one copy of
    # the frame per dropped column.
    numeric_mask = [is_numeric_dtype(dtype) for dtype in data.dtypes]
    return data.loc[:, numeric_mask]

In [7]:
# This returns our y and X for any data frame. 
# Uses all the numeric columns, need to pass a string as a target variable.
def get_y_X(data, target):
    '''Separate a target column from the numeric predictors of ``data``.

    Non-numeric columns are removed with ``only_numeric`` first; ``target``
    (a column label) is then pulled out of what remains.

    Returns ``(y, X)`` -- target first, predictors second.
    '''
    numeric_data = only_numeric(data)  # keep numeric columns only
    y = numeric_data[target]
    X = numeric_data.drop(columns=target)
    return y, X

In [8]:
# This function will return a train / test split variables for an X and y. 
def my_train_test(ys, Xs, test_size=.2, random_state=None):
    '''Split a target and its predictors into train and test subsets.

    Parameters
    ----------
    ys : array-like
        Target values.
    Xs : array-like
        Predictors, with the same number of rows as ``ys``.
    test_size : float, default 0.2
        Passed to ``train_test_split``; the default reproduces the previous
        hard-coded 80/20 split.
    random_state : int or None, default None
        Passed to ``train_test_split``. With ``None`` the split is drawn from
        NumPy's global random state, so it depends on ``np.random.seed`` and
        on every random draw made since. Pass an integer to make the split
        independent of cell execution order.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` -- predictors first.
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        Xs, ys, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [9]:
# This prediction function is not in effect, work in progress. 
def prediction(ys, Xs):
    '''Predict with the shared ``lr`` model and report the RMSE.

    ``lr`` must already have been fitted (for example by
    ``train_test_compare``) before this is called.

    Parameters
    ----------
    ys : array-like
        True target values for the rows of ``Xs``.
    Xs : array-like
        Predictors to pass to ``lr.predict``.

    Returns
    -------
    tuple
        ``(rmse, y_hat)``: the root-mean-squared error of the predictions
        against ``ys``, and the predictions themselves.
    '''
    # Use the arguments, not the notebook-level ``X`` / ``y``: the previous
    # body ignored its parameters and scored whatever globals happened to be
    # defined at call time.
    y_hat = lr.predict(Xs)
    rmse = np.sqrt(mean_squared_error(ys, y_hat))
    return rmse, y_hat

In [19]:
# Function to compare R2 values and RMSE values of the train and testing models
def train_test_compare(X_tr, X_te, y_tr, y_te, show_summary=None, show_viz=None):
    '''Fit the shared ``lr`` model and compare train vs. test performance.

    Side effect: refits the notebook-level ``lr`` estimator on ``X_tr`` /
    ``y_tr``. Prints R2 and RMSE for both subsets, then optionally prints a
    statsmodels OLS summary and draws an actual-vs-predicted scatter plot for
    the test subset.

    Parameters
    ----------
    X_tr, X_te : predictors for the train and test subsets.
    y_tr, y_te : targets for the train and test subsets.
    show_summary : bool or None, default None
        Whether to print the statsmodels summary. ``None`` keeps the original
        behaviour of asking interactively with ``input()``.
    show_viz : bool or None, default None
        Whether to draw the test-set plot. ``None`` keeps the original
        behaviour of asking interactively with ``input()``.

    Returns
    -------
    None
    '''
    model = lr.fit(X_tr, y_tr) # fit the model (``fit`` returns the estimator)

    #R2 Scores
    train_score = model.score(X_tr, y_tr)
    test_score = model.score(X_te, y_te)

    #RMSE
    y_hat_train = model.predict(X_tr)
    y_hat_test = model.predict(X_te)

    train_rmse = np.sqrt(mean_squared_error(y_tr, y_hat_train))
    test_rmse = np.sqrt(mean_squared_error(y_te, y_hat_test))

    print(f' training data R2: {train_score}\n testing data R2: {test_score} \
                    \n training data rmse: {train_rmse}\n testing data rmse: {test_rmse}')

    #stats model
    if show_summary is None:
        show_summary = input('Do you want a statsmodel summary? (y/n)') == 'y'
    if show_summary:
        stats = sm.OLS(y_tr, sm.add_constant(X_tr)).fit()
        print(stats.summary())

    # visualization
    if show_viz is None:
        show_viz = input('Do you want a viz of the test? (y/n)') == 'y'
    if show_viz:
        fig, ax = plt.subplots()
        # Use the y_te argument (not the notebook-level y_test) so the plot
        # always matches the data this call was given.
        lowest, highest = np.min(y_te), np.max(y_te)
        # Reference line y = x: points on it are predicted exactly. Two end
        # points are enough; it must be drawn against the same x values,
        # otherwise it is shifted away from the scatter.
        ax.plot([lowest, highest], [lowest, highest],
                linestyle="--", color="orange", label='perfect prediction')
        # Reuse the test predictions computed above for the RMSE.
        ax.scatter(y_te, y_hat_test, alpha=0.5)
        ax.set_xlabel("Actual Price")
        ax.set_ylabel("Predicted Price")
        ax.legend()

    return

In [20]:
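# Note on the metrics: RMSE above is computed as np.sqrt(mean_squared_error(y, y_hat)), which is the standard definition (same units as price), so it can be read alongside the R2 from LinearRegression.score.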

## Model with Test Data

In [21]:
# Small demonstration frame: the target (price) plus three predictors.
test_data = df_clean[['price', 'bedrooms', 'condition', 'sqft_living']]

In [22]:
# Separate the target (price) from the numeric predictors of the demo frame.
y, X = get_y_X(test_data, 'price')

# 80/20 train/test split; the pieces are kept as notebook-level variables.
X_train, X_test, y_train, y_test = my_train_test(y, X)

# Fit the shared model and print train/test R2 and RMSE.
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.5231126196574214
 testing data R2: 0.4724396980832275                     
 training data rmse: 257532.38330153355
 testing data rmse: 249942.04629419878
Do you want a statsmodel summary? (y/n)
Do you want a viz of the test? (y/n)


## Model with Entire Clean DF

In [23]:
# Same pipeline on every numeric column of the cleaned frame:
# target/predictor separation, train/test split, then fit and report.
y, X = get_y_X(df_clean, 'price')
X_train, X_test, y_train, y_test = my_train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.6616112498182487
 testing data R2: 0.6663057291310652                     
 training data rmse: 212947.1093118701
 testing data rmse: 215160.1924757349
Do you want a statsmodel summary? (y/n)
Do you want a viz of the test? (y/n)


## Model with Dummy Variables for Categories

In [24]:
# Work on an independent copy so df_clean itself is left untouched.
df_clean_dumm = df_clean.copy(deep=True)

In [25]:
# Get dummies
# Each dummy column is prefixed with its source column ('<column>_<value>'),
# so the new labels are all strings and cannot collide between columns.
# The frame is built from df_clean rather than from df_clean_dumm, so
# re-running this cell does not append a second copy of the dummy columns.
zipcode_dummies = pd.get_dummies(df_clean['zipcode'], prefix='zipcode', drop_first=True)
waterfront_dummies = pd.get_dummies(df_clean['waterfront'], prefix='waterfront', drop_first=True)
view_dummies = pd.get_dummies(df_clean['view'], prefix='view', drop_first=True)
month_dummies = pd.get_dummies(df_clean['month'], prefix='month', drop_first=True)

df_clean_dumm = pd.concat([df_clean, waterfront_dummies,
                           view_dummies, month_dummies, zipcode_dummies], axis=1)

In [26]:
# Same pipeline on the frame that now includes the dummy columns:
# target/predictor separation, train/test split, then fit and report.
y, X = get_y_X(df_clean_dumm, 'price')
X_train, X_test, y_train, y_test = my_train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.8045709270072421
 testing data R2: 0.8128858586897085                     
 training data rmse: 162808.7557475854
 testing data rmse: 157282.84300251346
Do you want a statsmodel summary? (y/n)
Do you want a viz of the test? (y/n)


## Model with non-Luxury houses w/ Dummy Variables

In [27]:
# Start the non-luxury subset from an independent copy of the dummy-encoded frame.
non_lux = df_clean_dumm.copy(deep=True)

In [28]:
# Keep only houses priced strictly below 1,000,000.
non_lux = non_lux.loc[non_lux['price'].lt(1_000_000)]

In [29]:
# Same pipeline on the sub-1,000,000 subset:
# target/predictor separation, train/test split, then fit and report.
y, X = get_y_X(non_lux, 'price')
X_train, X_test, y_train, y_test = my_train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.8324379377688897
 testing data R2: 0.8213755034562888                     
 training data rmse: 79936.96310619239
 testing data rmse: 82982.95208673915
Do you want a statsmodel summary? (y/n)
Do you want a viz of the test? (y/n)


## Model with non-Luxury and non-Cheap houses w/ Dummy Variables

In [30]:
# Start from an independent copy of the non-luxury subset.
no_lux_cheap = non_lux.copy(deep=True)

In [31]:
# Additionally keep only houses priced strictly above 100,000.
no_lux_cheap = no_lux_cheap.loc[no_lux_cheap['price'].gt(100_000)]

In [32]:
# Same pipeline on the subset priced between 100,000 and 1,000,000:
# target/predictor separation, train/test split, then fit and report.
y, X = get_y_X(no_lux_cheap, 'price')
X_train, X_test, y_train, y_test = my_train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.8303860053880922
 testing data R2: 0.8250601018040645                     
 training data rmse: 80300.42116759688
 testing data rmse: 81802.15603649514
Do you want a statsmodel summary? (y/n)
Do you want a viz of the test? (y/n)


## Model with non-Luxury houses w/ Dummy Variables - drop redundant columns

In [33]:
# Build a reduced copy of the non-luxury frame without the listed columns
# (drop returns a new frame, so non_lux itself is left unchanged).
non_lux_drop = non_lux.drop(
    columns=['lat', 'long', 'sqft_lot15', 'month', 'waterfront', 'zipcode', 'view']
)

In [34]:
# Same pipeline on the reduced non-luxury frame:
# target/predictor separation, train/test split, then fit and report.
y, X = get_y_X(non_lux_drop, 'price')
X_train, X_test, y_train, y_test = my_train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.8287404311570663
 testing data R2: 0.8347545522224356                     
 training data rmse: 80643.85597282492
 testing data rmse: 80478.00466188866
Do you want a statsmodel summary? (y/n)
Do you want a viz of the test? (y/n)
