In [71]:
import pprint

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Shared pretty-printer for nested structures; each nesting level is
# indented by PRETTY_PRINT_INDENT spaces.
PRETTY_PRINT_INDENT = 4
pp = pprint.PrettyPrinter(indent=PRETTY_PRINT_INDENT)

In [72]:
# Load the house-sales table; the path is resolved relative to the
# kernel's working directory.
sales = pd.read_csv('home_data.csv')

# Preview the first three rows as a sanity check on the columns.
sales.head(n=3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062


### Most expensive district

In [73]:
# Average sale price per zipcode (result is indexed by zipcode).
mean_price_by_zipcode = sales[['zipcode', 'price']].groupby('zipcode').mean()

# idxmax() returns the index *label* (the zipcode) of the highest mean price.
# Series.argmax() returns the integer *position* in current pandas, which
# would silently report a row number instead of a zipcode.
most_expensive_zipcode = mean_price_by_zipcode['price'].idxmax()

# Label-based scalar lookup; the .ix indexer was removed in pandas 1.0.
mean_price_in_most_expensive_zipcode = mean_price_by_zipcode.loc[most_expensive_zipcode, 'price']

print("The most expensive district is {zipcode}. The mean sales price there is {price}".format(
        zipcode=most_expensive_zipcode,
        price=mean_price_in_most_expensive_zipcode
    )
)

The most expensive district is 98039. The mean sales price there is 2160606


### Fraction of houses between 2000 and 4000 sqft

In [74]:
# Keep sales whose living area is strictly above 2000 sqft and at most 4000 sqft.
living_area = sales['sqft_living']
is_above_2000 = living_area > 2000
is_at_most_4000 = living_area <= 4000

sales_between_2000_and_4000 = sales[is_above_2000 & is_at_most_4000]
sales_between_2000_and_4000_count = len(sales_between_2000_and_4000)

# Share of all sales that fall in the range, rounded to two decimals.
fraction_between_2000_and_4000 = round(sales_between_2000_and_4000_count / len(sales), 2)

summary_template = "There is {count} houses between 2000 and 4000 sqft. It represents a fraction of {fraction}"
print(summary_template.format(
    count=sales_between_2000_and_4000_count,
    fraction=fraction_between_2000_and_4000,
))

There is 9118 houses between 2000 and 4000 sqft. It represents a fraction of 0.42


## Build the regression model

### Select features

In [75]:
# Baseline feature set.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]

# The advanced set is the baseline (same order) followed by extra columns.
advanced_features = my_features + [
    'condition',      # condition of house
    'grade',          # measure of quality of construction
    'waterfront',     # waterfront property
    'view',           # type of view
    'sqft_above',     # square feet above ground
    'sqft_basement',  # square feet in basement
    'yr_built',       # the year built
    'yr_renovated',   # the year renovated
    'lat',            # latitude of the parcel
    'long',           # longitude of the parcel
    'sqft_living15',  # average sq.ft. of 15 nearest neighbors
    'sqft_lot15',     # average lot size of 15 nearest neighbors
]

# Regression target, then one design matrix per feature set.
prices = sales['price']
my_features_sales = sales[my_features]
advanced_features_sales = sales[advanced_features]

### Build train and test datasets

In [76]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so the old import fails on current installs.
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

# Both splits use the same random_state and test_size on inputs with the same
# number of rows, so the two models are trained and evaluated on the same rows.
(my_features_sales_train,
 my_features_sales_test,
 my_features_prices_train,
 my_features_prices_test) = train_test_split(
    my_features_sales, prices, random_state=0, test_size=0.2)

(advanced_features_sales_train,
 advanced_features_sales_test,
 advanced_features_prices_train,
 advanced_features_prices_test) = train_test_split(
    advanced_features_sales, prices, random_state=0, test_size=0.2)

### Linear regression

In [83]:
from sklearn.linear_model import LinearRegression


def _fit_and_evaluate(features_train, prices_train, features_test, prices_test):
    """Fit a LinearRegression on the training data and score it on the test data.

    Returns a ``(model, predictions, rmse)`` tuple: the fitted estimator, its
    predictions for ``features_test``, and the root mean squared error of
    those predictions against ``prices_test``.
    """
    model = LinearRegression()
    model.fit(features_train, prices_train)
    predictions = model.predict(features_test)
    rmse = np.sqrt(metrics.mean_squared_error(prices_test, predictions))
    return model, predictions, rmse


# Baseline model on the "my features" columns.
my_features_linreg, my_features_prices_pred, my_features_rmse = _fit_and_evaluate(
    my_features_sales_train, my_features_prices_train,
    my_features_sales_test, my_features_prices_test,
)

# Model on the advanced feature columns.
advanced_features_linreg, advanced_features_prices_pred, advanced_features_rmse = _fit_and_evaluate(
    advanced_features_sales_train, advanced_features_prices_train,
    advanced_features_sales_test, advanced_features_prices_test,
)

print("The Root Mean Squared Error of our my features model is {}".format(my_features_rmse))
print("The Root Mean Squared Error of our advanced features model is {}".format(advanced_features_rmse))

The Root Mean Squared Error of our my features model is 244004.77443104377
The Root Mean Squared Error of our advanced features model is 190473.3757096679


In [82]:
# How much larger the baseline model's RMSE is than the advanced model's,
# truncated to an integer for display.
rmse_difference = int(my_features_rmse - advanced_features_rmse)
print("The difference of RMSE between the 2 models is {}".format(rmse_difference))

The difference of RMSE between the 2 models is 53531
