In [1]:
# Notebook for the initial baseline linear regression (LR) model

In [2]:
# Standard Packages
import pandas as pd
import numpy as np

# Viz Packages
import seaborn as sns
import matplotlib.pyplot as plt

# Scipy Stats
import scipy.stats as stats 

# Statsmodel Api
import statsmodels.api as sm
from statsmodels.formula.api import ols

# SKLearn Modules
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

# Suppress future and deprecation warnings
import warnings
warnings.filterwarnings("ignore", category= FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [13]:
# Read the baseline modeling dataset from disk and preview it
baseline_df = pd.read_csv(filepath_or_buffer='../data/baseline_df.csv')
baseline_df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,...,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,address,lat,long,zip
0,7399300360,2022-05-24,675000.0,4,1.0,1180,7140,1.0,0,0,...,1180,0,0,40,1969,0,"2102 Southeast 21st Court, Renton, Washington ...",47.461975,-122.19052,98055
1,8910500230,2021-12-13,920000.0,5,2.5,2770,6703,1.0,0,0,...,1570,1570,0,240,1950,0,"11231 Greenwood Avenue North, Seattle, Washing...",47.711525,-122.35591,98133
2,1180000275,2021-09-29,311000.0,6,2.0,2880,6156,1.0,0,0,...,1580,1580,0,0,1956,0,"8504 South 113th Street, Seattle, Washington 9...",47.502045,-122.22520,98178
3,1604601802,2021-12-14,775000.0,3,3.0,2160,1400,2.0,0,0,...,1090,1070,200,270,2010,0,"4079 Letitia Avenue South, Seattle, Washington...",47.566110,-122.29020,98118
4,8562780790,2021-08-24,592500.0,2,2.0,1120,758,2.0,0,0,...,1120,550,550,30,2012,0,"2193 Northwest Talus Drive, Issaquah, Washingt...",47.532470,-122.07188,98027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29186,7834800180,2021-11-30,1555000.0,5,2.0,1910,4000,1.5,0,0,...,1600,1130,0,210,1921,0,"4673 Eastern Avenue North, Seattle, Washington...",47.664740,-122.32940,98103
29187,194000695,2021-06-16,1313000.0,3,2.0,2020,5800,2.0,0,0,...,2020,0,0,520,2011,0,"4131 44th Avenue Southwest, Seattle, Washingto...",47.565610,-122.38851,98116
29188,7960100080,2022-05-27,800000.0,3,2.0,1620,3600,1.0,0,0,...,940,920,240,110,1995,0,"910 Martin Luther King Jr Way, Seattle, Washin...",47.610395,-122.29585,98122
29189,2781280080,2022-02-24,775000.0,3,2.5,2570,2889,2.0,0,0,...,1830,740,480,100,2006,0,"17127 114th Avenue Southeast, Renton, Washingt...",47.449490,-122.18908,98055


In [14]:
# List the column labels of the loaded frame
baseline_df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'greenbelt', 'nuisance', 'view',
       'condition', 'grade', 'heat_source', 'sewer_system', 'sqft_above',
       'sqft_basement', 'sqft_garage', 'sqft_patio', 'yr_built',
       'yr_renovated', 'address', 'lat', 'long', 'zip'],
      dtype='object')

In [16]:
# Remove the id, address, heat_source and sewer_system columns
# before running the baseline model, then preview the trimmed frame
baseline_df_trimmed = baseline_df.drop(
    ['id', 'address', 'heat_source', 'sewer_system'],
    axis='columns',
)
baseline_df_trimmed

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,...,grade,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,lat,long,zip
0,2022-05-24,675000.0,4,1.0,1180,7140,1.0,0,0,0,...,6,1180,0,0,40,1969,0,47.461975,-122.19052,98055
1,2021-12-13,920000.0,5,2.5,2770,6703,1.0,0,0,1,...,6,1570,1570,0,240,1950,0,47.711525,-122.35591,98133
2,2021-09-29,311000.0,6,2.0,2880,6156,1.0,0,0,0,...,6,1580,1580,0,0,1956,0,47.502045,-122.22520,98178
3,2021-12-14,775000.0,3,3.0,2160,1400,2.0,0,0,0,...,8,1090,1070,200,270,2010,0,47.566110,-122.29020,98118
4,2021-08-24,592500.0,2,2.0,1120,758,2.0,0,0,1,...,6,1120,550,550,30,2012,0,47.532470,-122.07188,98027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29186,2021-11-30,1555000.0,5,2.0,1910,4000,1.5,0,0,0,...,7,1600,1130,0,210,1921,0,47.664740,-122.32940,98103
29187,2021-06-16,1313000.0,3,2.0,2020,5800,2.0,0,0,0,...,6,2020,0,0,520,2011,0,47.565610,-122.38851,98116
29188,2022-05-27,800000.0,3,2.0,1620,3600,1.0,0,0,1,...,6,940,920,240,110,1995,0,47.610395,-122.29585,98122
29189,2022-02-24,775000.0,3,2.5,2570,2889,2.0,0,0,0,...,7,1830,740,480,100,2006,0,47.449490,-122.18908,98055


In [18]:
# Frequency of each construction year, most common first
baseline_df_trimmed['yr_built'].value_counts()

2021    1354
1968     514
1978     493
1977     484
1967     474
        ... 
1901      39
1934      38
1935      38
1902      36
1933      30
Name: yr_built, Length: 123, dtype: int64

In [34]:
# Frequency of each renovation year, most common first
baseline_df_trimmed['yr_renovated'].value_counts()

0       27823
2021       69
2006       49
2018       44
1990       42
        ...  
1942        1
1951        1
1948        1
1972        1
1924        1
Name: yr_renovated, Length: 82, dtype: int64

In [33]:
# Summarize column dtypes and non-null counts of the trimmed frame
baseline_df_trimmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29191 entries, 0 to 29190
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           29191 non-null  object 
 1   price          29191 non-null  float64
 2   bedrooms       29191 non-null  int64  
 3   bathrooms      29191 non-null  float64
 4   sqft_living    29191 non-null  int64  
 5   sqft_lot       29191 non-null  int64  
 6   floors         29191 non-null  float64
 7   waterfront     29191 non-null  int64  
 8   greenbelt      29191 non-null  int64  
 9   nuisance       29191 non-null  int64  
 10  view           29191 non-null  int64  
 11  condition      29191 non-null  int64  
 12  grade          29191 non-null  int64  
 13  sqft_above     29191 non-null  int64  
 14  sqft_basement  29191 non-null  int64  
 15  sqft_garage    29191 non-null  int64  
 16  sqft_patio     29191 non-null  int64  
 17  yr_built       29191 non-null  int64  
 18  yr_ren

## Declare features and target variable
# `date` is stored as strings (object dtype), which LinearRegression cannot
# consume, so only the numeric predictor columns are kept as features.
X = baseline_df_trimmed.drop(columns=['price']).select_dtypes(include='number')
y = baseline_df_trimmed['price']

# Split the data into a training and testing dataset
# (33% held out for testing; fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Create LR Model: fit on the training split, predict on the held-out split
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Baseline Model Results
# Only `sklearn.metrics as metrics` is imported in this notebook, so the scorer
# must be reached through that module; a bare `r2_score(...)` call raises NameError.
# The result keeps the name `r2_score` so any later cell reading it still works.
r2_score = metrics.r2_score(y_test, y_pred)
print("R-squared score:", r2_score)

In [None]:
# NOTE(review): `application_df` and its "IS_SUCCESSFUL" column are not defined in
# the visible part of this notebook, and this cell reassigns X, y and the train/test
# splits created for the baseline model — confirm this cell belongs here.

# Split our preprocessed data into our features and target arrays
y = application_df["IS_SUCCESSFUL"].values
# Use the `columns=` keyword: passing the axis positionally to DataFrame.drop
# is no longer accepted by pandas 2.0+.
X = application_df.drop(columns=["IS_SUCCESSFUL"]).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler on the training split only
X_scaler = scaler.fit(X_train)

# Scale both splits with the statistics learned from the training split
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
def regression_results(y_true, y_pred):
    """Print a summary of common regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are printed, rounded to four decimal places.
    """
    # Compute regression metrics
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    # mean_squared_log_error raises ValueError when either input contains a
    # negative value. Skip it in that case so the remaining metrics are
    # still reported instead of the whole summary failing.
    if np.min(y_true) >= 0 and np.min(y_pred) >= 0:
        mean_squared_log_error = round(
            metrics.mean_squared_log_error(y_true, y_pred), 4
        )
    else:
        mean_squared_log_error = 'n/a (negative values present)'

    # Display formatted metrics
    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', mean_squared_log_error)
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mean_absolute_error, 4))
    print('MedAE: ', round(median_absolute_error, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))