In [25]:
# Pandas is used for data manipulation and analysis
# NumPy is used to create multidimensional arrays and to perform mathematical functions on them

import pandas as pd
import numpy as np

# Matplotlib is used to create visualisations.  
# Seaborn sits on top of Matplotlib, allowing the creation of more visually appealing visualisations

import matplotlib.pyplot as plt
import seaborn as sns

In [26]:
# Read the house-sales CSV (looked up relative to the notebook's working directory) into a DataFrame.
csv_path = "kc_house_data.csv"
kc_data = pd.read_csv(csv_path)

In [27]:
# Show the rows that report more than 20 bedrooms so implausible entries can be inspected.
many_bedrooms = kc_data['bedrooms'].gt(20)
kc_data[many_bedrooms]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
15856,2402100895,6/25/2014,640000.0,33,1.75,1620,6000,1.0,0.0,0.0,...,7,1040,580.0,1947,0.0,98103,47.6878,-122.331,1330,4700


In [3]:
# Clean the raw frame.
#
# Every step assigns its result back to `kc_data` instead of calling an inplace
# method on a column selection (e.g. `kc_data[col].fillna(..., inplace=True)`).
# That chained form operates on an intermediate Series and is not guaranteed to
# write through to `kc_data` under pandas' copy-on-write behaviour.

# 'yr_renovated' is not referenced again below, so it is dropped up front.
kc_data = kc_data.drop(columns='yr_renovated')

# Fill missing 'waterfront' / 'view' entries with the median of the respective column.
for col in ('waterfront', 'view'):
    kc_data[col] = kc_data[col].fillna(kc_data[col].median())

# Parsing still validates the date strings even though the column is dropped further down.
kc_data['date'] = pd.to_datetime(kc_data['date'])

# 'sqft_basement' contains '?' placeholders. On exactly those rows substitute
# (living area - above-ground area), then convert the whole column to float.
# `Series.mask` aligns the replacement values row by row; passing a Series as
# the `value` of `Series.replace` is not supported by recent pandas versions.
basement_unknown = kc_data['sqft_basement'] == '?'
basement_estimate = kc_data['sqft_living'] - kc_data['sqft_above']
kc_data['sqft_basement'] = (
    kc_data['sqft_basement']
    .mask(basement_unknown, basement_estimate)
    .astype('float')
)

# These three columns are not used as model inputs.
kc_data = kc_data.drop(columns=['zipcode', 'id', 'date'])
kc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 17 columns):
price            21597 non-null float64
bedrooms         21597 non-null int64
bathrooms        21597 non-null float64
sqft_living      21597 non-null int64
sqft_lot         21597 non-null int64
floors           21597 non-null float64
waterfront       21597 non-null float64
view             21597 non-null float64
condition        21597 non-null int64
grade            21597 non-null int64
sqft_above       21597 non-null int64
sqft_basement    21597 non-null float64
yr_built         21597 non-null int64
lat              21597 non-null float64
long             21597 non-null float64
sqft_living15    21597 non-null int64
sqft_lot15       21597 non-null int64
dtypes: float64(8), int64(9)
memory usage: 2.8 MB


### Transform Data

In [5]:
# Min-max scale every column (the target 'price' included) onto the [0, 1] range.
# NOTE(review): the minima and maxima are taken over the full dataset, i.e. before
# the train/test split performed further down.
cols = kc_data.columns.tolist()

col_min = kc_data[cols].min()
col_range = kc_data[cols].max() - col_min
kc_data = (kc_data[cols] - col_min) / col_range

In [6]:
# Apply log(x + 1) to the price, the square-footage columns and the build year.
log_columns = [
    'price',
    'sqft_above',
    'sqft_basement',
    'sqft_living',
    'sqft_living15',
    'sqft_lot',
    'sqft_lot15',
    'yr_built',
]

for name in log_columns:
    kc_data[name] = np.log(kc_data[name] + 1)

In [8]:
# Remove the two features that are excluded from the model, then separate the
# remaining feature columns from the target ('price').
kc_data = kc_data.drop(columns=['sqft_above', 'sqft_lot15'])
predictors = kc_data.drop(columns='price')

### Building the Model

In [9]:
## The dataset is split into the dependent variable (y) and the independent variables (X)

X = predictors
y = kc_data['price']

## Split the data into a training set (80%) and a test set (20%).
## A fixed random_state makes the shuffle, and therefore the fitted coefficients and
## every metric computed from this split, repeatable across kernel restarts.
from sklearn.model_selection import train_test_split

RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

In [10]:
import statsmodels.api as sm

# Add the 'const' column to the training features so the OLS fit includes an intercept term.
X_train = sm.add_constant(X_train)

# Ordinary least squares of the (transformed) price on the training features.
ols = sm.OLS(endog=y_train, exog=X_train)
model = ols.fit()

model.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,price,R-squared:,0.714
Model:,OLS,Adj. R-squared:,0.714
Method:,Least Squares,F-statistic:,3081.0
Date:,"Sat, 03 Aug 2019",Prob (F-statistic):,0.0
Time:,15:07:59,Log-Likelihood:,41040.0
No. Observations:,17277,AIC:,-82050.0
Df Residuals:,17262,BIC:,-81930.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0514,0.001,-37.347,0.000,-0.054,-0.049
bedrooms,-0.0997,0.008,-13.150,0.000,-0.115,-0.085
bathrooms,0.0446,0.003,14.722,0.000,0.039,0.050
sqft_living,0.2703,0.007,37.422,0.000,0.256,0.284
sqft_lot,0.0099,0.008,1.220,0.222,-0.006,0.026
floors,-0.0002,0.001,-0.147,0.883,-0.002,0.002
waterfront,0.0636,0.002,28.546,0.000,0.059,0.068
view,0.0243,0.001,23.072,0.000,0.022,0.026
condition,0.0145,0.001,12.655,0.000,0.012,0.017

0,1,2,3
Omnibus:,10858.068,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,345184.047
Skew:,2.517,Prob(JB):,0.0
Kurtosis:,24.311,Cond. No.,82.8


### Evaluating the Model

In [17]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Strip the 'const' column that was added for statsmodels before fitting with scikit-learn.
X_train_no_const = X_train.drop(columns='const')

regressor = LinearRegression().fit(X_train_no_const, y_train)

## Create a dataframe listing the coefficient for each predictor. This is used to confirm that the model is
## identical regardless of whether it is fitted with scikit-learn or statsmodels.

coefficients = dict(Label=X_train_no_const.columns, Coefficients=regressor.coef_)
coeff_df = pd.DataFrame(coefficients)
coeff_df

Unnamed: 0,Label,Coefficients
0,bedrooms,-0.099663
1,bathrooms,0.044561
2,sqft_living,0.270258
3,sqft_lot,0.009875
4,floors,-0.000164
5,waterfront,0.063598
6,view,0.024274
7,condition,0.014544
8,grade,0.126656
9,sqft_basement,-0.019223


In [18]:
## Generate price predictions for the test and training features; they are compared
## against the observed values to evaluate the model.

y_pred_test, y_pred_train = (
    regressor.predict(features) for features in (X_test, X_train_no_const)
)

In [19]:
## Compare actual and predicted values via the root mean squared error of each split.
## mean_squared_error is documented as (y_true, y_pred), so the observed values are passed first.
## Note: 'price' was min-max scaled and log-transformed earlier, so these errors are expressed
## on that transformed scale, not in the original price units.

rmse_train = np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))

print('Root Mean Squared Error (TRAIN):', rmse_train)
print('Root Mean Squared Error (TEST):', rmse_test)

Root Mean Squared Error (TRAIN): 0.02249814289098724
Root Mean Squared Error (TEST): 0.023057764581084585


In [20]:
# Gap between training and test RMSE; a negative value means the test error is the larger one.
train_rmse = np.sqrt(metrics.mean_squared_error(y_pred_train, y_train))
test_rmse = np.sqrt(metrics.mean_squared_error(y_pred_test, y_test))

Difference = train_rmse - test_rmse
Difference

-0.0005596216900973436

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_basement,yr_built,lat,long,sqft_living15
