# Model Notebook 

## Business & Data Understanding
#### Revisiting our end goals with some EDA knowledge
- Stakeholder is a real estate agency.
- We want to create a tool for a real estate agency to estimate sales or purchase prices given housing info.
- This can be done with a regression model.

## Loading packages, libraries, functions and variables from the EDA notebook.

In [129]:
#Loading the needed packages, libraries, functions and variables from the EDA notebook.
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
#Original DataFrame
%store -r df_original

In [3]:
#Cleaned DataFrame — from the EDA notebook
%store -r df_clean

In [4]:
# For consistent randomness
np.random.seed(42)

## Modeling

In [43]:
# Shared scikit-learn estimator: train_test_compare() re-fits this same object on
# every call and prediction() reads it, so it always reflects the most recent fit.
lr = LinearRegression()

In [44]:
# Our model needs to have only numeric variables.
# Using this function, we can drop all columns without numeric variables.
# We will call this function inside our next function.
def only_numeric(data):
    '''Return a DataFrame containing only the numeric columns of ``data``.

    A column is kept when ``pandas.api.types.is_numeric_dtype`` accepts its
    dtype; every other column is left out. Column order is preserved and the
    input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter.

    Returns
    -------
    pandas.DataFrame
        New frame holding the numeric columns of ``data``.
    '''
    # Decide per position from the dtypes rather than per label: the frame is
    # filtered in one pass instead of one drop (and copy) per rejected column,
    # and duplicate column labels are handled correctly.
    keep = [is_numeric_dtype(dtype) for dtype in data.dtypes]
    return data.loc[:, keep]

In [45]:
# This returns our y and X for any data frame. 
# Uses all the numeric columns, need to pass a string as a target variable.
def get_y_X(data, target):
    '''Split ``data`` into a target series and a numeric feature frame.

    Non-numeric columns are removed with ``only_numeric`` first, so ``target``
    must name a numeric column.

    Returns
    -------
    (y, X)
        ``y`` is the ``target`` column; ``X`` is every other numeric column.
    '''
    numeric = only_numeric(data)
    # Tuple elements are evaluated left to right: the target lookup comes first.
    return numeric[target], numeric.drop(columns=target)

In [158]:
# This function will return a train / test split variables for an X and y. 
def my_train_test(ys, Xs, test_size=.2, random_state=None):
    '''Split a target and its features into train and test partitions.

    Parameters
    ----------
    ys : array-like
        Target values.
    Xs : array-like
        Features, one row per entry of ``ys``.
    test_size : float or int, default .2
        Forwarded to ``train_test_split``; the default holds out 20% of rows.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. With ``None`` the split draws from
        NumPy's global random state, so the result depends on how many random
        draws happened earlier in the session; pass an int to get the same
        split on every call.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        Xs, ys, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [47]:
# This prediction function is not in effect, work in progress. 
def prediction(ys, Xs):
    '''Predict with the shared ``lr`` model and score the predictions.

    ``lr`` must already be fitted before this is called.

    Parameters
    ----------
    ys : array-like
        True target values for the rows of ``Xs``.
    Xs : array-like
        Features to predict from.

    Returns
    -------
    (rmse, y_hat)
        Root mean squared error of the predictions, and the predictions.
    '''
    # Use the arguments, not the notebook-level X / y, so the result reflects
    # the data actually passed in.
    y_hat = lr.predict(Xs)
    rmse = np.sqrt(mean_squared_error(ys, y_hat))
    return rmse, y_hat

In [139]:
# Function to compare R2 values and RMSE values of the train and testing models
def train_test_compare(X_tr, X_te, y_tr, y_te):
    '''Fit on the training split and print train/test R2 and RMSE.

    The module-level ``lr`` estimator is fitted in place on ``(X_tr, y_tr)``,
    so it stays fitted on this training data after the call. A statsmodels OLS
    (with an added constant column) is fitted on the same training data only to
    print its summary table.

    Parameters
    ----------
    X_tr, X_te : features of the training / testing split.
    y_tr, y_te : targets of the training / testing split.

    Returns
    -------
    None
        The report is printed, not returned.
    '''
    lr.fit(X_tr, y_tr)

    # R2 scores
    train_score = lr.score(X_tr, y_tr)
    test_score = lr.score(X_te, y_te)

    # RMSE = square root of the mean squared error, same units as the target
    train_rmse = np.sqrt(mean_squared_error(y_tr, lr.predict(X_tr)))
    test_rmse = np.sqrt(mean_squared_error(y_te, lr.predict(X_te)))

    # Coefficient table / diagnostics from statsmodels
    ols_results = sm.OLS(y_tr, sm.add_constant(X_tr)).fit()

    # Build the report line by line; a backslash-continued f-string would carry
    # the source indentation into the printed text as stray padding.
    report = '\n'.join([
        f' training data R2: {train_score}',
        f' testing data R2: {test_score}',
        f' training data rmse: {train_rmse}',
        f' testing data rmse: {test_rmse}',
        f' {ols_results.summary()}',
    ])
    print(report)

In [140]:
# I am not sure if I am doing the RMSE correctly, but I am pretty confident with the R2

## Model with Test Data

In [141]:
# Small demo frame: price plus three predictors. Despite the name this is not a
# held-out test set; it is split into train/test below like any other frame.
test_data = df_clean[['price', 'bedrooms', 'condition', 'sqft_living']]

In [163]:
# price is the target; the remaining numeric columns of test_data are the features.
y, X = get_y_X(test_data, 'price')

# Hold out part of the rows for testing (see my_train_test).
X_train, X_test, y_train, y_test = my_train_test(y, X)

# Fit on the training rows and print R2 / RMSE for both splits plus the OLS summary.
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.5231126196574214
 testing data R2: 0.4724396980832275                     
 training data rmse: 257532.38330153355
 testing data rmse: 249942.04629419878                     
                             OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.523
Model:                            OLS   Adj. R-squared:                  0.523
Method:                 Least Squares   F-statistic:                     6316.
Date:                Mon, 03 Jan 2022   Prob (F-statistic):               0.00
Time:                        20:27:32   Log-Likelihood:            -2.3977e+05
No. Observations:               17277   AIC:                         4.795e+05
Df Residuals:                   17273   BIC:                         4.796e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
             

## Model with Entire Clean DF

In [143]:
# Same steps as the demo above, now using every numeric column of df_clean
# (other than price) as a predictor.
y, X = get_y_X(df_clean, 'price')
X_train, X_test, y_train, y_test = my_train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.6663941593831695
 testing data R2: 0.6477657283716689                     
 training data rmse: 210699.34260309458
 testing data rmse: 224014.03695456276                     
                             OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.666
Model:                            OLS   Adj. R-squared:                  0.666
Method:                 Least Squares   F-statistic:                     2299.
Date:                Mon, 03 Jan 2022   Prob (F-statistic):               0.00
Time:                        20:21:49   Log-Likelihood:            -2.3630e+05
No. Observations:               17277   AIC:                         4.726e+05
Df Residuals:                   17261   BIC:                         4.728e+05
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
             

## Model with Dummy Variables for Categories

In [144]:
df_clean_dumm = df_clean.copy()

In [145]:
# One-hot encode the categorical columns. drop_first=True leaves out one level
# per feature as the reference category.
# A prefix is given so every new column gets a unique string label: bare level
# values (e.g. numeric zip codes) would mix str and non-str column names, which
# scikit-learn rejects, and could collide between features.
# dtype=int keeps the indicators numeric on pandas versions whose get_dummies
# defaults to bool.
zipcode_dummies = pd.get_dummies(df_clean['zipcode'], prefix='zipcode', drop_first=True, dtype=int)
waterfront_dummies = pd.get_dummies(df_clean['waterfront'], prefix='waterfront', drop_first=True, dtype=int)
view_dummies = pd.get_dummies(df_clean['view'], prefix='view', drop_first=True, dtype=int)
month_dummies = pd.get_dummies(df_clean['month'], prefix='month', drop_first=True, dtype=int)

# Build from df_clean rather than from df_clean_dumm itself, so re-running this
# cell does not append a second set of dummy columns.
df_clean_dumm = pd.concat([df_clean, waterfront_dummies,
                           view_dummies, month_dummies, zipcode_dummies], axis=1)

In [146]:
# Fit on the frame that now carries the dummy columns.
# NOTE(review): get_y_X keeps every numeric column, so a raw categorical column
# stored as a number stays in X next to its own dummies (zipcode appears as a
# coefficient in the OLS table further down) - consider dropping those raw
# columns before fitting.
y, X = get_y_X(df_clean_dumm, 'price')
X_train, X_test, y_train, y_test = my_train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.8097974653796961
 testing data R2: 0.7912977019595104                     
 training data rmse: 159094.258878814
 testing data rmse: 172434.05580196824                     
                             OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.810
Model:                            OLS   Adj. R-squared:                  0.809
Method:                 Least Squares   F-statistic:                     746.3
Date:                Mon, 03 Jan 2022   Prob (F-statistic):               0.00
Time:                        20:21:51   Log-Likelihood:            -2.3145e+05
No. Observations:               17277   AIC:                         4.631e+05
Df Residuals:                   17178   BIC:                         4.639e+05
Df Model:                          98                                         
Covariance Type:            nonrobust                                         
               

## Model with non-Luxury houses w/ Dummy Variables

In [147]:
non_lux = df_clean_dumm.copy()

In [148]:
# "Non-luxury" cutoff for this section: keep only rows with price below 1,000,000.
non_lux = non_lux[non_lux['price'] < 1000000]

In [149]:
# Refit on the houses priced below the 1,000,000 cap.
y, X = get_y_X(non_lux, 'price')
X_train, X_test, y_train, y_test = my_train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.8295950309548932
 testing data R2: 0.8323878662522083                     
 training data rmse: 80399.3338899496
 testing data rmse: 81212.84690006706                     
                             OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.830
Model:                            OLS   Adj. R-squared:                  0.829
Method:                 Least Squares   F-statistic:                     794.1
Date:                Mon, 03 Jan 2022   Prob (F-statistic):               0.00
Time:                        20:21:52   Log-Likelihood:            -2.0450e+05
No. Observations:               16085   AIC:                         4.092e+05
Df Residuals:                   15986   BIC:                         4.100e+05
Df Model:                          98                                         
Covariance Type:            nonrobust                                         
                

In [150]:
# Refit OLS (with an added constant column) on the current training split so the
# full coefficient table is rendered as the cell output.
model = sm.OLS(y_train, sm.add_constant(X_train)).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.83
Model:,OLS,Adj. R-squared:,0.829
Method:,Least Squares,F-statistic:,794.1
Date:,"Mon, 03 Jan 2022",Prob (F-statistic):,0.0
Time:,20:21:52,Log-Likelihood:,-204500.0
No. Observations:,16085,AIC:,409200.0
Df Residuals:,15986,BIC:,410000.0
Df Model:,98,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
bedrooms,-4096.8721,966.096,-4.241,0.000,-5990.529,-2203.215
bathrooms,8424.6543,1576.265,5.345,0.000,5334.997,1.15e+04
sqft_living,87.2110,1.791,48.681,0.000,83.699,90.723
sqft_lot,0.2678,0.023,11.766,0.000,0.223,0.312
floors,5208.9857,1633.869,3.188,0.001,2006.419,8411.552
condition,2.196e+04,1124.654,19.523,0.000,1.98e+04,2.42e+04
grade,4.556e+04,1087.184,41.906,0.000,4.34e+04,4.77e+04
zipcode,-154.5246,35.723,-4.326,0.000,-224.546,-84.503
lat,2.056e+05,3.67e+04,5.596,0.000,1.34e+05,2.78e+05

0,1,2,3
Omnibus:,1171.213,Durbin-Watson:,1.988
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4227.867
Skew:,0.312,Prob(JB):,0.0
Kurtosis:,5.433,Cond. No.,1.97e+18


## Model with non-Luxury and non-Cheap houses w/ Dummy Variables

In [151]:
no_lux_cheap = non_lux.copy()

In [152]:
# Also remove the cheapest listings: keep prices above 100,000. The frame was
# copied from non_lux, so prices of 1,000,000 and up are already excluded.
no_lux_cheap = no_lux_cheap[no_lux_cheap['price'] > 100000]

In [153]:
# Refit on houses priced strictly between 100,000 and 1,000,000.
y, X = get_y_X(no_lux_cheap, 'price')
X_train, X_test, y_train, y_test = my_train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.8297100760101848
 testing data R2: 0.8285437328637748                     
 training data rmse: 80241.57244477166
 testing data rmse: 81852.77884535793                     
                             OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.830
Model:                            OLS   Adj. R-squared:                  0.829
Method:                 Least Squares   F-statistic:                     793.6
Date:                Mon, 03 Jan 2022   Prob (F-statistic):               0.00
Time:                        20:21:53   Log-Likelihood:            -2.0416e+05
No. Observations:               16061   AIC:                         4.085e+05
Df Residuals:                   15962   BIC:                         4.093e+05
Df Model:                          98                                         
Covariance Type:            nonrobust                                         
               

## Model with non-Luxury houses w/ Dummy Variables - drop recurring columns

In [155]:
# drop() already returns a new frame, so non_lux itself is left untouched.
non_lux_drop = non_lux.drop(
    columns=['lat', 'long', 'sqft_lot15', 'month', 'waterfront', 'zipcode', 'view']
)

In [156]:
# Refit without lat, long, sqft_lot15 and the raw month / waterfront / zipcode /
# view columns; the dummy columns added earlier are still in the frame.
y, X = get_y_X(non_lux_drop, 'price')
X_train, X_test, y_train, y_test = my_train_test(y, X)
train_test_compare(X_train, X_test, y_train, y_test)

 training data R2: 0.8292192492803281
 testing data R2: 0.832744941030437                     
 training data rmse: 80487.93447015196
 testing data rmse: 81126.29433446423                     
                             OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.829
Model:                            OLS   Adj. R-squared:                  0.828
Method:                 Least Squares   F-statistic:                     817.2
Date:                Mon, 03 Jan 2022   Prob (F-statistic):               0.00
Time:                        20:21:54   Log-Likelihood:            -2.0452e+05
No. Observations:               16085   AIC:                         4.092e+05
Df Residuals:                   15989   BIC:                         4.100e+05
Df Model:                          95                                         
Covariance Type:            nonrobust                                         
                

## Visualize Train vs. Test

In [115]:
# I want to make another function to visualize both the training and test, not sure if possible

In [33]:
# plt.scatter(X_test, y_test, color="black")
# plt.plot(X_test, y_pred, color="blue", linewidth=3);

# Other

In [None]:
df_clean.info()

In [None]:
# Columns plotted one at a time against price in the loop cell further down.
high_corr_cols = ['sqft_living', 'sqft_above', 'sqft_living15', 'bathrooms', 'sqft_basement', 'bedrooms']

# Absolute correlation of each numeric/bool column with price, weakest first.
# The dtype selection is explicit so corr() does not fail on text columns in
# pandas versions that no longer skip them silently.
# Kept as the last expression of the cell so the result is actually displayed.
df_clean.select_dtypes(include=['number', 'bool']).corr().abs()['price'].sort_values()

In [None]:
# Single-feature illustration: regress price on sqft_living and draw the fit.
y = df_clean['price']
# One-column 2-D frame: the shape scikit-learn expects, and the feature named on
# the x-axis. Using all of df_clean here would also feed price in as a predictor.
X = df_clean[['sqft_living']]

reg = LinearRegression().fit(X, y)

plt.scatter(X['sqft_living'], y, color='green')
plt.plot(X['sqft_living'], reg.predict(X))
plt.xlabel('sqft_living')
plt.ylabel('Price');

In [None]:
# One scatter plot with a fitted regression line per column in high_corr_cols.
y = df_clean['price']
for x in high_corr_cols:
    # Double brackets give a one-column 2-D frame; LinearRegression.fit rejects
    # the 1-D Series that df_clean[x] would return.
    X = df_clean[[x]]

    reg = LinearRegression().fit(X, y)

    # New figure per feature so the plots are not drawn on top of each other.
    plt.figure()
    plt.scatter(X[x], y, color='green')
    plt.plot(X[x], reg.predict(X))
    plt.xlabel(x)
    plt.ylabel('Price')
    plt.show()