# Fire up GraphLab Create

In [1]:
import graphlab

# Load some house sales data

The dataset contains house sales from King County, the region where the city of Seattle, WA is located.

In [2]:
# Load the house-sales SFrame from the path 'home_data.gl/'.
# Columns include: id, date, price, bedrooms, bathrooms, sqft_living,
# sqft_lot, floors, waterfront (other columns are referenced in later cells).
sales = graphlab.SFrame(
    'home_data.gl/'
)

This non-commercial license of GraphLab Create for academic use is assigned to j.v.makin@gmail.com and will expire on October 11, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1508373282.log


# Exploring the data for housing sales 

In [3]:
# Send GraphLab Canvas output to the 'ipynb' target.
graphlab.canvas.set_target('ipynb')

# Scatter plot of living area (x) against sale price (y).
# The plot suggests a correlation between sqft_living and price.
sales.show(x="sqft_living", y="price", view="Scatter Plot")

# Create a simple regression model of sqft_living to price

In [3]:
# Step 1: split the sales data into a training set and a test set.
# The fraction .8 is intended to send roughly 80% of the rows to training
# and the remaining 20% to testing.
# A fixed seed (0) keeps the split the same from one run to the next.
(train_data, test_data) = sales.random_split(
    .8,
    seed=0
)

In [4]:
# Step 2: fit a linear regression that predicts price from living area only.
#   train_data          - the dataset the model is fitted on
#   target              - the column to predict (the observation)
#   features            - the input columns; per the original note, all columns
#                         are used when this is not specified (TODO confirm)
#   validation_set=None - no validation set is supplied
sqft_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=['sqft_living'],
    validation_set=None
)

# Evaluate the simple model

In [5]:
# Score the single-feature model on the held-out test data.
# The printed result reports the maximum error and the RMSE.
print(sqft_model.evaluate(test_data))

{'max_error': 4143550.8825285914, 'rmse': 255191.02870527367}


# Plot predictions/Look at coefficients

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Plotting multiple plots
# model.predict(test_data) - this predicts using model!
plt.plot(test_data['sqft_living'],test_data['price'],'.',
        test_data['sqft_living'],sqft_model.predict(test_data),'-')

In [6]:
# Display the fitted coefficients of the single-feature model
# (the intercept and the sqft_living weight).
sqft_model.get("coefficients")

name,index,value,stderr
(intercept),,-47114.0206702,4923.34437753
sqft_living,,281.957850166,2.16405465323


# Explore other features in the data

To build a more elaborate model, we will explore using more features.

In [None]:
# Columns used as inputs for the more elaborate model.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]

# Show only those columns of the sales data.
sales[my_features].show()

In [None]:
# Box-and-whisker view of price grouped by zipcode.
# Drag the bar at the bottom of the view to see more of the data.
sales.show(x='zipcode', y='price', view='BoxWhisker Plot')

# Build a regression model with more features

In [None]:
# Fit a second linear regression on the same training data, this time using
# every column listed in my_features; no validation set is supplied.
my_features_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=my_features,
    validation_set=None
)

In [None]:
# Echo the feature list used by my_features_model.
print(my_features)

# Comparing the results of the simple model with those of the model with more features

In [None]:
# Evaluate both models on the same test data so the results can be compared.
# First the single-feature model, then the multi-feature model.
print(sqft_model.evaluate(test_data))
print(my_features_model.evaluate(test_data))

In [None]:
# Select the row(s) of sales whose id equals '5309101200'.
# NOTE(review): the id is compared as a string; assumes the id column holds strings.
house1 = sales[sales['id'] == '5309101200']

# Predictions from the simple model and the multi-feature model,
# followed by the recorded price for comparison.
print(sqft_model.predict(house1))
print(my_features_model.predict(house1))
print(house1['price'])

In [None]:
# Select the row(s) of sales whose id equals '1925069082'.
# NOTE(review): the id is compared as a string; assumes the id column holds strings.
house2 = sales[sales['id'] == '1925069082']

# Predictions from the simple model and the multi-feature model,
# followed by the recorded price for comparison.
print(sqft_model.predict(house2))
print(my_features_model.predict(house2))
print(house2['price'])

In [None]:
# Hand-built, single-row description of a very large house; each key is a
# column name mapped to a one-element list.
# The dict carries more columns than the six listed in my_features.
bill_gates = {
    'bedrooms': [8],
    'bathrooms': [25],
    'sqft_living': [50000],
    'sqft_lot': [225000],
    'floors': [4],
    'zipcode': ['98039'],
    'condition': [10],
    'grade': [10],
    'waterfront': [1],
    'view': [4],
    'sqft_above': [37500],
    'sqft_basement': [12500],
    'yr_built': [1994],
    'yr_renovated': [2010],
    'lat': [47.627606],
    'long': [-122.242054],
    'sqft_living15': [5000],
    'sqft_lot15': [40000],
}

# Wrap the dict in an SFrame and print the multi-feature model's prediction for it.
print(my_features_model.predict(graphlab.SFrame(bill_gates)))

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/d/d9/Bill_gates%27_house.jpg/2560px-Bill_gates%27_house.jpg">