In [240]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy import stats
from yellowbrick.regressor import ResidualsPlot, PredictionError

import math

## Read in Pickle

In [241]:
# Load the cleaned_KC frame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
cleaned_pickle_path = './data/cleaned_KC.pkl'
cleaned_KC = pd.read_pickle(cleaned_pickle_path)

# Transform the price column with a natural logarithm

In [242]:
# Replace price with its natural logarithm. np.log is applied to the whole column
# at once rather than element by element through a Python lambda.
# NOTE: this cell is not idempotent — re-running it takes the log of the log.
cleaned_KC['price'] = np.log(cleaned_KC['price'])

In [243]:
# Add a column holding the living area (square feet) per bedroom, rounded to 2 decimals.
# NOTE(review): assumes bedrooms is never 0 (the division would yield inf) — TODO confirm.
cleaned_KC['sqft_living/bedroom'] = (cleaned_KC['sqft_living'] / cleaned_KC['bedrooms']).round(2)

# Round the number of bathrooms to the nearest integer (halves round up) to reduce
# noise when creating dummy variables. Vectorized form of the rule
# "ceil when the fractional part is >= .5, otherwise floor".
bathrooms_raw = cleaned_KC['bathrooms']
bathrooms_whole = np.floor(bathrooms_raw)
round_up = (bathrooms_raw - bathrooms_whole) >= .5
cleaned_KC['bathrooms'] = (bathrooms_whole + round_up).astype(int)

# Drop all columns not relevant to our further analysis.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
cleaned_KC = cleaned_KC.drop(columns=['id','date','zipcode','yr_built','grade','condition','view','floors','sqft_lot','sqft_living15','sqft_lot15'])
cleaned_KC.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_living/bedroom
0,12.309982,3,1,1180,393.33
1,13.195614,3,2,2570,856.67
2,12.100712,2,1,770,385.0
3,13.311329,4,3,1960,490.0
4,13.142166,3,2,1680,560.0


In [244]:
# Correlation of every remaining column with price, sorted from highest to lowest.
price_correlations = cleaned_KC.corr()['price']
price_correlations.sort_values(ascending=False)

price                  1.000000
sqft_living            0.681139
sqft_living/bedroom    0.568913
bathrooms              0.488029
bedrooms               0.336206
Name: price, dtype: float64

In [245]:
# Columns to encode as dummy variables vs. columns kept as they are.
# (The "categorial" spelling is kept because other cells may reference these names.)
categorial_columns = ['bathrooms','bedrooms']
non_categorial_columns = ['price','sqft_living', 'sqft_living/bedroom']

# Replace each categorical column with one indicator column per level, then drop
# the target (price) so that only predictor columns remain.
# NOTE(review): every level is kept (no drop_first), so each dummy group sums to 1
# and is perfectly collinear with a regression intercept — consider drop_first=True.
dummies_df = pd.get_dummies(cleaned_KC, columns=categorial_columns).drop(columns=['price'])

# Show only the first rows instead of rendering the whole frame.
dummies_df.head(10)

Unnamed: 0,sqft_living,sqft_living/bedroom,bathrooms_1,bathrooms_2,bathrooms_3,bathrooms_4,bedrooms_1,bedrooms_2,bedrooms_3,bedrooms_4,bedrooms_5,bedrooms_6
0,1180,393.33,1,0,0,0,0,0,1,0,0,0
1,2570,856.67,0,1,0,0,0,0,1,0,0,0
2,770,385.00,1,0,0,0,0,1,0,0,0,0
3,1960,490.00,0,0,1,0,0,0,0,1,0,0
4,1680,560.00,0,1,0,0,0,0,1,0,0,0
5,1715,571.67,0,1,0,0,0,0,1,0,0,0
6,1780,593.33,1,0,0,0,0,0,1,0,0,0
7,1890,630.00,0,0,1,0,0,0,1,0,0,0
8,3560,1186.67,0,0,1,0,0,0,1,0,0,0
9,1160,580.00,1,0,0,0,0,1,0,0,0,0


We loop over each feature, fit a simple (single-feature) linear regression of price on it, and compare the resulting R-squared values.

In [246]:
# The target is the same for every feature, so build it once outside the loop.
# reshape(-1, 1) turns the price column into a single-column 2-D array.
y = cleaned_KC['price'].values.reshape(-1, 1)

for column in dummies_df.columns:

    # Single-feature design matrix: one column of dummies_df as a 2-D array.
    X = dummies_df[column].values.reshape(-1, 1)

    # Fit a simple linear regression of price on this one feature.
    lin_reg = LinearRegression()
    lin_reg.fit(X, y)

    # Predict on the same rows that were used for fitting (there is no train/test
    # split in this cell), so the score below is an in-sample r-squared.
    y_pred = lin_reg.predict(X)

    # Built-in round() works whether r2_score returns a NumPy scalar or a plain float.
    r_squared = round(r2_score(y, y_pred), 3)
    print('r-squared value for {}: {}'.format(column,r_squared))

r-squared value for sqft_living: 0.464
r-squared value for sqft_living/bedroom: 0.324
r-squared value for bathrooms_1: 0.121
r-squared value for bathrooms_2: 0.016
r-squared value for bathrooms_3: 0.07
r-squared value for bathrooms_4: 0.1
r-squared value for bedrooms_1: 0.008
r-squared value for bedrooms_2: 0.032
r-squared value for bedrooms_3: 0.033
r-squared value for bedrooms_4: 0.055
r-squared value for bedrooms_5: 0.027
r-squared value for bedrooms_6: 0.004


Among the bathroom dummy variables, having 2 bathrooms is the least predictive on its own (R-squared of 0.016).

In [247]:
# Predictors are all columns of the dummy-encoded frame; the target is the price column.
X, y = dummies_df, cleaned_KC['price']

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=40
)

### Calculate OLS

# Add a constant column so the model can estimate a y-intercept.
X2 = sm.add_constant(X_train)

# Fit Ordinary Least Squares on the training rows only and print the summary table.
# NOTE(review): dummies_df keeps every dummy level, so together with the constant the
# design matrix is rank-deficient; individual dummy coefficients are not uniquely identified.
est = sm.OLS(endog=y_train, exog=X2).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.476
Model:                            OLS   Adj. R-squared:                  0.475
Method:                 Least Squares   F-statistic:                     1545.
Date:                Tue, 01 Oct 2019   Prob (F-statistic):               0.00
Time:                        17:50:53   Log-Likelihood:                -7296.4
No. Observations:               17040   AIC:                         1.461e+04
Df Residuals:                   17029   BIC:                         1.470e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   8.5876    

  return ptp(axis=axis, out=out, **kwargs)
