# House Sales in King County
#### Build linear regression model using scikit-learn.
1. Check Variance inflation factor (VIF) again after Standardize features.
2. Build linear regression model with and without polynomial feature expansion.

### Libraries

In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import *

### Pandas and Seaborn Setting

In [5]:
# changing the max_columns value
pd.set_option("display.max_columns", None)

# set seaborn theme
sb.set_theme()

In [9]:
df = pd.read_csv('df_feature.csv')
df.head()

Unnamed: 0,price,bathrooms,grade,sqft_above,sqft_living15,zipcode,zip_encoder,grade2,zip_encoder2
0,221900.0,1.0,7,1180,1340,98178,13,49,169
1,538000.0,2.25,7,2170,1690,98125,33,49,1089
2,180000.0,1.0,6,770,2720,98028,31,36,961
3,604000.0,3.0,7,1050,1360,98136,39,49,1521
4,510000.0,2.0,8,1680,1800,98074,57,64,3249


## X and Y data

In [11]:
x_data = df.drop(['price'],axis=1)
y_data = df['price']

## StandardScaler to reduce Variance inflation factor (VIF)

In [57]:
# transform x_data using StandardScaler()
scaler = StandardScaler()
x_scaled = pd.DataFrame(scaler.fit_transform(x_data), columns=x_data.columns)

In [58]:
# import VIF function from statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [59]:
# define variables
features = ['bathrooms','grade','sqft_above', 'sqft_living15','zip_encoder']
X_scaled = x_scaled[features] 
X_data = x_data[features]

In [60]:
# Construct VIF dataframe
df_VIF = pd.DataFrame()
df_VIF['Feature'] = features

# Calculate VIF for each variables
df_VIF['VIF_Standardized'] = [variance_inflation_factor(X_scaled.values, i) for i in range(len(X_scaled.columns))]
df_VIF['VIF_Original'] = [variance_inflation_factor(X_data.values, i) for i in range(len(X_data.columns))]
df_VIF

Unnamed: 0,Feature,VIF_Standardized,VIF_Original
0,bathrooms,2.085351,17.71575
1,grade,3.09373,26.542651
2,sqft_above,3.245791,15.398978
3,sqft_living15,2.52251,23.557256
4,zip_encoder,1.189749,5.260702


### Summery
* After transfrom the features with StandardScaler(), it is confirmed that the VIF is reduced to the range between 1 to 5 compared to the original values.

## Build linear regression model

### Features

In [80]:
features = ['bathrooms','grade','sqft_above', 'sqft_living15','zip_encoder']
x_data = df[features]
y_data = df['price']

### Split train and test set

In [81]:
#  spilt
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

### Build pipeline

In [82]:
Input = [('scale',StandardScaler()), ('model',LinearRegression())]
pipe = Pipeline(Input)

### Fit

In [83]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('scale', StandardScaler()), ('model', LinearRegression())])

### Evaluation

In [84]:
prediction = pipe.predict(x_test)
score = r2_score(y_test, prediction)
rmse = mean_squared_error(y_test, prediction, squared=False)
print('R-Squared:',score)
print('RMSE:',rmse)

R-Squared: 0.5986966538145935
RMSE: 246308.18194689535


## Build linear regression model with polynomial feature expansio

### Features

In [85]:
features = ['bathrooms','grade','sqft_above', 'sqft_living15','zip_encoder', 'grade2', 'zip_encoder2']
x_data = df[features]
y_data = df['price']

### Split train and test set

In [86]:
#  spilt
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

### Build pipeline

In [87]:
Input = [('scale',StandardScaler()), ('model',LinearRegression())]
pipe = Pipeline(Input)

### Fit

In [88]:
pipe.fit(x_train, y_train)

Pipeline(steps=[('scale', StandardScaler()), ('model', LinearRegression())])

### Evaluation

In [89]:
prediction = pipe.predict(x_test)
score = r2_score(y_test, prediction)
rmse = mean_squared_error(y_test, prediction, squared=False)
print('R-Squared:',score)
print('RMSE:',rmse)

R-Squared: 0.6682729237417213
RMSE: 223940.63667701336
