# Notebook for Initial Basic Linear Regression (LR) Model

In [1]:
# Standard Packages
import pandas as pd
import numpy as np

# Viz Packages
import seaborn as sns
import matplotlib.pyplot as plt

# Scipy Stats
import scipy.stats as stats 

# Statsmodel Api
import statsmodels.api as sm
from statsmodels.formula.api import ols

# SKLearn Modules
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

# Suppress future and deprecation warnings
import warnings
warnings.filterwarnings("ignore", category= FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
# Read the baseline modeling data from the CSV in the sibling data folder,
# then display the resulting frame
baseline_df = pd.read_csv(filepath_or_buffer='../data/baseline_df.csv')
baseline_df

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,...,sqft_garage,sqft_patio,yr_built,address,lat,long,renovated,zip,month,year
0,7399300360,675000.0,4,1.0,1180,7140,1.0,0,0,0,...,0,40,1969,"2102 Southeast 21st Court, Renton, Washington ...",47.461975,-122.19052,0,98055,5,2022
1,8910500230,920000.0,5,2.5,2770,6703,1.0,0,0,1,...,0,240,1950,"11231 Greenwood Avenue North, Seattle, Washing...",47.711525,-122.35591,0,98133,12,2021
2,1180000275,311000.0,6,2.0,2880,6156,1.0,0,0,0,...,0,0,1956,"8504 South 113th Street, Seattle, Washington 9...",47.502045,-122.22520,0,98178,9,2021
3,1604601802,775000.0,3,3.0,2160,1400,2.0,0,0,0,...,200,270,2010,"4079 Letitia Avenue South, Seattle, Washington...",47.566110,-122.29020,0,98118,12,2021
4,8562780790,592500.0,2,2.0,1120,758,2.0,0,0,1,...,550,30,2012,"2193 Northwest Talus Drive, Issaquah, Washingt...",47.532470,-122.07188,0,98027,8,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29186,7834800180,1555000.0,5,2.0,1910,4000,1.5,0,0,0,...,0,210,1921,"4673 Eastern Avenue North, Seattle, Washington...",47.664740,-122.32940,0,98103,11,2021
29187,194000695,1313000.0,3,2.0,2020,5800,2.0,0,0,0,...,0,520,2011,"4131 44th Avenue Southwest, Seattle, Washingto...",47.565610,-122.38851,0,98116,6,2021
29188,7960100080,800000.0,3,2.0,1620,3600,1.0,0,0,1,...,240,110,1995,"910 Martin Luther King Jr Way, Seattle, Washin...",47.610395,-122.29585,0,98122,5,2022
29189,2781280080,775000.0,3,2.5,2570,2889,2.0,0,0,0,...,480,100,2006,"17127 114th Avenue Southeast, Renton, Washingt...",47.449490,-122.18908,0,98055,2,2022


In [5]:
# List the column labels of the loaded frame
baseline_df.columns

Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'greenbelt', 'nuisance', 'view', 'condition',
       'grade', 'heat_source', 'sewer_system', 'sqft_above', 'sqft_basement',
       'sqft_garage', 'sqft_patio', 'yr_built', 'address', 'lat', 'long',
       'renovated', 'zip', 'month', 'year'],
      dtype='object')

In [6]:
# Remove the id, address, heat_source and sewer_system columns before
# fitting the baseline model, then display the trimmed frame
baseline_df_trimmed = baseline_df.drop(
    labels=['id', 'address', 'heat_source', 'sewer_system'],
    axis='columns',
)
baseline_df_trimmed

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,view,...,sqft_basement,sqft_garage,sqft_patio,yr_built,lat,long,renovated,zip,month,year
0,675000.0,4,1.0,1180,7140,1.0,0,0,0,0,...,0,0,40,1969,47.461975,-122.19052,0,98055,5,2022
1,920000.0,5,2.5,2770,6703,1.0,0,0,1,2,...,1570,0,240,1950,47.711525,-122.35591,0,98133,12,2021
2,311000.0,6,2.0,2880,6156,1.0,0,0,0,2,...,1580,0,0,1956,47.502045,-122.22520,0,98178,9,2021
3,775000.0,3,3.0,2160,1400,2.0,0,0,0,2,...,1070,200,270,2010,47.566110,-122.29020,0,98118,12,2021
4,592500.0,2,2.0,1120,758,2.0,0,0,1,0,...,550,550,30,2012,47.532470,-122.07188,0,98027,8,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29186,1555000.0,5,2.0,1910,4000,1.5,0,0,0,0,...,1130,0,210,1921,47.664740,-122.32940,0,98103,11,2021
29187,1313000.0,3,2.0,2020,5800,2.0,0,0,0,1,...,0,0,520,2011,47.565610,-122.38851,0,98116,6,2021
29188,800000.0,3,2.0,1620,3600,1.0,0,0,1,0,...,920,240,110,1995,47.610395,-122.29585,0,98122,5,2022
29189,775000.0,3,2.5,2570,2889,2.0,0,0,0,0,...,740,480,100,2006,47.449490,-122.18908,0,98055,2,2022


In [7]:
# Count how many rows carry each distinct yr_built value
baseline_df_trimmed['yr_built'].value_counts()

2021    1354
1968     514
1978     493
1977     484
1967     474
        ... 
1901      39
1934      38
1935      38
1902      36
1933      30
Name: yr_built, Length: 123, dtype: int64

In [14]:
# Count how many rows carry each distinct value of the year column
baseline_df_trimmed['year'].value_counts()

2021    18645
2022    10546
Name: year, dtype: int64

In [9]:
# Count how many rows carry each distinct value of the renovated column
baseline_df_trimmed['renovated'].value_counts()

0    27823
1     1368
Name: renovated, dtype: int64

In [10]:
# Summarise each column's dtype and non-null count before modeling
baseline_df_trimmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29191 entries, 0 to 29190
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          29191 non-null  float64
 1   bedrooms       29191 non-null  int64  
 2   bathrooms      29191 non-null  float64
 3   sqft_living    29191 non-null  int64  
 4   sqft_lot       29191 non-null  int64  
 5   floors         29191 non-null  float64
 6   waterfront     29191 non-null  int64  
 7   greenbelt      29191 non-null  int64  
 8   nuisance       29191 non-null  int64  
 9   view           29191 non-null  int64  
 10  condition      29191 non-null  int64  
 11  grade          29191 non-null  int64  
 12  sqft_above     29191 non-null  int64  
 13  sqft_basement  29191 non-null  int64  
 14  sqft_garage    29191 non-null  int64  
 15  sqft_patio     29191 non-null  int64  
 16  yr_built       29191 non-null  int64  
 17  lat            29191 non-null  float64
 18  long  

In [11]:
## Separate the target (price) from the predictor columns
y = baseline_df_trimmed['price']
X = baseline_df_trimmed.drop(columns=['price'])

# Hold out 33% of the rows for evaluation; the fixed random_state keeps
# the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit the linear regression on the training rows (fit returns the estimator
# itself) and predict the held-out rows
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [13]:
# Score the baseline model on the held-out rows and report R-squared
r2_score = metrics.r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared score:", r2_score)

R-squared score: 0.6020785917606606


In [None]:
def regression_results(y_true, y_pred):
    """Print a summary of common regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Notes
    -----
    ``metrics.mean_squared_log_error`` raises ``ValueError`` when the inputs
    contain values it cannot take the log of (e.g. negative predictions,
    which an unconstrained linear model can produce). In that case the MSLE
    line reports ``n/a`` with sklearn's message instead of aborting the
    whole report.
    """
    # Metrics that are defined for any real-valued targets
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    # MSLE is computed last: malformed inputs (e.g. mismatched lengths) are
    # expected to have been rejected by the calls above, so a ValueError here
    # comes from the log-domain restriction rather than from bad arguments.
    try:
        mean_squared_log_error = round(
            metrics.mean_squared_log_error(y_true, y_pred), 4
        )
    except ValueError as err:
        mean_squared_log_error = f'n/a ({err})'

    # Display formatted metrics
    print('explained_variance: ', round(explained_variance, 4))
    print('mean_squared_log_error: ', mean_squared_log_error)
    print('r2: ', round(r2, 4))
    print('MAE: ', round(mean_absolute_error, 4))
    # Previously computed but never shown
    print('MedAE: ', round(median_absolute_error, 4))
    print('MSE: ', round(mse, 4))
    print('RMSE: ', round(np.sqrt(mse), 4))

In [None]:
# # Split our preprocessed data into our features and target arrays
# y = application_df["IS_SUCCESSFUL"].values
# X = application_df.drop(["IS_SUCCESSFUL"],1).values

# # Split the preprocessed data into a training and testing dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# # Create a StandardScaler instance
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)