In [1]:
# Pandas is used for data manipulation and analysis
# Numpy is used to create multidimensional arrays, and to perform mathematical function

import pandas as pd
import numpy as np

# Matplotlib is used to create visualisations.  
# Seaborn sits on top of Matplotlib, allowing the creation of more visually appealing visualisations

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
kc_data = pd.read_csv("kc_house_data.csv")

In [3]:
kc_data.drop('yr_renovated', axis=1, inplace=True)
kc_data['waterfront'].fillna(kc_data['waterfront'].median(), inplace=True)
kc_data['view'].fillna(kc_data['view'].median(), inplace=True)
kc_data['date'] = pd.to_datetime(kc_data['date'])
kc_data['sqft_basement'].replace('?', kc_data['sqft_living']-kc_data['sqft_above'], inplace=True)
kc_data['sqft_basement'] = kc_data['sqft_basement'].astype('float')
kc_data.drop(['zipcode', 'id', 'date'], axis=1, inplace=True)
kc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 17 columns):
price            21597 non-null float64
bedrooms         21597 non-null int64
bathrooms        21597 non-null float64
sqft_living      21597 non-null int64
sqft_lot         21597 non-null int64
floors           21597 non-null float64
waterfront       21597 non-null float64
view             21597 non-null float64
condition        21597 non-null int64
grade            21597 non-null int64
sqft_above       21597 non-null int64
sqft_basement    21597 non-null float64
yr_built         21597 non-null int64
lat              21597 non-null float64
long             21597 non-null float64
sqft_living15    21597 non-null int64
sqft_lot15       21597 non-null int64
dtypes: float64(8), int64(9)
memory usage: 2.8 MB


### Remove Outliers

In [4]:
cols = ['bathrooms', 'bedrooms', 'condition', 'price', 'sqft_living', 'sqft_basement', 'sqft_lot', 'sqft_lot15', 'lat', 'long']

In [5]:
for col in cols:
        Q1 = np.quantile(kc_data[col], 0.25)
        Q3 = np.quantile(kc_data[col], 0.75)
        IQR = Q3 - Q1
        drop_col = kc_data.loc[(kc_data[col] < Q1 - 1.5*IQR) | (kc_data[col] > Q3 + 1.5*IQR)].index
        kc_data.drop(drop_col, inplace=True) 

In [6]:
kc_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16408 entries, 0 to 21596
Data columns (total 17 columns):
price            16408 non-null float64
bedrooms         16408 non-null int64
bathrooms        16408 non-null float64
sqft_living      16408 non-null int64
sqft_lot         16408 non-null int64
floors           16408 non-null float64
waterfront       16408 non-null float64
view             16408 non-null float64
condition        16408 non-null int64
grade            16408 non-null int64
sqft_above       16408 non-null int64
sqft_basement    16408 non-null float64
yr_built         16408 non-null int64
lat              16408 non-null float64
long             16408 non-null float64
sqft_living15    16408 non-null int64
sqft_lot15       16408 non-null int64
dtypes: float64(8), int64(9)
memory usage: 2.3 MB


### Transform Data

In [7]:
cols = list(kc_data.columns.values)

for col in cols:
    kc_data[col] = (kc_data[col] - np.min(kc_data[col]))/(np.max(kc_data[col]) - np.min(kc_data[col]))

In [8]:
kc_data['price'] = kc_data['price'].map(lambda x: np.log(x+1))
kc_data['sqft_above'] = kc_data['sqft_above'].map(lambda x: np.log(x+1))
kc_data['sqft_basement'] = kc_data['sqft_basement'].map(lambda x: np.log(x+1))
kc_data['sqft_living'] = kc_data['sqft_living'].map(lambda x: np.log(x+1))
kc_data['sqft_living15'] = kc_data['sqft_living15'].map(lambda x: np.log(x+1))
kc_data['sqft_lot'] = kc_data['sqft_lot'].map(lambda x: np.log(x+1))
kc_data['sqft_lot15'] = kc_data['sqft_lot15'].map(lambda x: np.log(x+1))
kc_data['yr_built'] = kc_data['yr_built'].map(lambda x: np.log(x+1))

In [9]:
predictors = kc_data.drop(['price', 'sqft_above', 'sqft_lot15'], axis=1)
kc_data.drop(['sqft_above', 'sqft_lot15'], axis=1, inplace=True)

### Building the Model

In [10]:
## The dataset is split into the dependent variable (y), and the independent variables (X)

X = predictors
y = kc_data['price']

## Split the data into a training set (80%) and a test set (20%)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
import statsmodels.api as sm

X_train = sm.add_constant(X_train)

model = sm.OLS(y_train, X_train).fit()

model.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,price,R-squared:,0.707
Model:,OLS,Adj. R-squared:,0.707
Method:,Least Squares,F-statistic:,2261.0
Date:,"Sat, 03 Aug 2019",Prob (F-statistic):,0.0
Time:,15:21:04,Log-Likelihood:,15583.0
No. Observations:,13126,AIC:,-31140.0
Df Residuals:,13111,BIC:,-31020.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0766,0.004,-17.825,0.000,-0.085,-0.068
bedrooms,-0.0150,0.003,-4.582,0.000,-0.021,-0.009
bathrooms,0.0384,0.005,8.313,0.000,0.029,0.047
sqft_living,0.2947,0.011,27.302,0.000,0.274,0.316
sqft_lot,-0.1164,0.006,-18.503,0.000,-0.129,-0.104
floors,0.0121,0.005,2.469,0.014,0.002,0.022
waterfront,0.1588,0.023,6.952,0.000,0.114,0.204
view,0.0803,0.005,15.557,0.000,0.070,0.090
condition,0.0565,0.003,17.048,0.000,0.050,0.063

0,1,2,3
Omnibus:,572.855,Durbin-Watson:,1.975
Prob(Omnibus):,0.0,Jarque-Bera (JB):,848.887
Skew:,0.409,Prob(JB):,4.64e-185
Kurtosis:,3.94,Cond. No.,61.3


### Evaluating the Model

In [12]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn import metrics

X_train_no_const = X_train.drop('const', axis=1)

regressor = LinearRegression()
regressor.fit(X_train_no_const, y_train)

## Create an dataframe, listing the coefficient for each predictor.  This will be used to confirm that the model is 
## identifcal, regardless of whether it is in SKLearn or Statsmodels.

coefficients = {'Label': X_train_no_const.columns, 'Coefficients':regressor.coef_ }
coeff_df = pd.DataFrame(coefficients)
coeff_df

Unnamed: 0,Label,Coefficients
0,bedrooms,-0.014988
1,bathrooms,0.038431
2,sqft_living,0.294687
3,sqft_lot,-0.116408
4,floors,0.012065
5,waterfront,0.158757
6,view,0.080324
7,condition,0.056546
8,grade,0.375786
9,sqft_basement,-0.007387


In [13]:
y_pred_test = regressor.predict(X_test)

y_pred_train = regressor.predict(X_train_no_const)

In [14]:
print('Root Mean Squared Error (TRAIN):', np.sqrt(metrics.mean_squared_error(y_pred_train, y_train)))
print('Root Mean Squared Error (TEST):', np.sqrt(metrics.mean_squared_error(y_pred_test, y_test)))

Root Mean Squared Error (TRAIN): 0.07381834040531864
Root Mean Squared Error (TEST): 0.07487455450077328


In [15]:
Difference =  np.sqrt(metrics.mean_squared_error(y_pred_train, y_train)) - np.sqrt(metrics.mean_squared_error(y_pred_test, y_test))
Difference

-0.001056214095454644

In [None]:
-0.0005596216900973436

In [18]:
data = np.array([5, 4, 6, 3, 7, 2, 6, 2, 5, 7, 3, 999, 2, 3])
np.mean(data)


75.28571428571429

In [19]:
data_no_outlier = np.array([5, 4, 6, 3, 7, 2, 6, 2, 5, 7, 3, 2, 3])
np.mean(data_no_outlier)

4.230769230769231

In [20]:
l = [5, 4, 6, 3, 7, 2, 6, 2, 5, 7, 3, 2, 3]

In [23]:
sum(l)

55