In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Column name -> type handed to pandas.read_csv(dtype=...) when loading the CSVs.
# Entries are kept in their original order, one per line, so a single column's
# type can be located and changed without scanning a long one-line literal.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [6]:
# Read the training and test CSV files with identical column types, so the two
# frames always share the same schema. A generator expression is used so that
# no loop variable is left behind in the notebook namespace.
train_data, test_data = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ("kc_house_train_data.csv", "kc_house_test_data.csv")
)

In [7]:
# Preview the first five rows of the freshly loaded training frame.
train_data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [8]:
# Add the same four derived columns to both the training and the test frame.
# Looping over the frames keeps the two sets of transformations from drifting apart.
for frame in (train_data, test_data):
    # Square of the bedroom count.
    frame["bedrooms_squared"] = frame["bedrooms"] ** 2
    # Interaction term: bedrooms multiplied by bathrooms.
    frame["bed_bath_rooms"] = frame["bedrooms"] * frame["bathrooms"]
    # Natural logarithm of the living area.
    frame["log_sqft_living"] = np.log(frame["sqft_living"])
    # Plain sum of the latitude and longitude columns.
    frame["lat_plus_long"] = frame["lat"] + frame["long"]

In [9]:
# Preview the first five rows again; the frame now carries the derived columns
# bedrooms_squared, bed_bath_rooms, log_sqft_living and lat_plus_long.
train_data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_squared,bed_bath_rooms,log_sqft_living,lat_plus_long
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,0,98178,47.5112,-122.257,1340,5650,9,3.0,7.07327,-74.7458
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,1991,98125,47.721,-122.319,1690,7639,9,6.75,7.851661,-74.598
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,0,98028,47.7379,-122.233,2720,8062,4,2.0,6.646391,-74.4951
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,0,98136,47.5208,-122.393,1360,5000,16,12.0,7.5807,-74.8722
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,0,98074,47.6168,-122.045,1800,7503,9,6.0,7.426549,-74.4282


In [53]:
# Report the mean of each derived column on the test set, to two decimals.
# print() is called as a function with a single string argument, so the cell
# produces the same text on Python 2 and Python 3 (the bare `print "..."`
# statement is a SyntaxError on Python 3).
for feature_name in ("bedrooms_squared", "bed_bath_rooms", "log_sqft_living", "lat_plus_long"):
    print("Mean of {0}: {1:.2f}".format(feature_name, np.mean(test_data[feature_name])))

Mean of bedrooms_squared: 12.45
Mean of bed_bath_rooms: 7.50
Mean of log_sqft_living: 7.55
Mean of lat_plus_long: -74.65


In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
# Nested feature sets: every model uses all columns of the previous one plus
# additional derived columns. `+` builds a new list each time, so the three
# lists never share storage.
model_1_features = [
    "sqft_living",
    "bedrooms",
    "bathrooms",
    "lat",
    "long",
]
model_2_features = model_1_features + [
    "bed_bath_rooms",
]
model_3_features = model_2_features + [
    "bedrooms_squared",
    "log_sqft_living",
    "lat_plus_long",
]

In [18]:
# Build three independent, unfitted LinearRegression estimators, one per
# feature set. Each call creates a separate object, so fitting one model
# never affects the others.
model_1, model_2, model_3 = (LinearRegression() for _unused in range(3))

In [36]:
# Pair every estimator with its report title and its input columns.
# All models are trained against the same target column, "price".
fit_plan = (
    ("Model 1", model_1, model_1_features),
    ("Model 2", model_2, model_2_features),
    ("Model 3", model_3, model_3_features),
)

# Fit everything first, then report, so nothing is printed if any fit fails.
for title, estimator, columns in fit_plan:
    estimator.fit(train_data[columns], train_data["price"])

# The label text (including its spelling) is kept exactly as in the recorded output.
for title, estimator, columns in fit_plan:
    print(title + ":\n" + "Cooeficients: \n" + str(estimator.coef_) + "\n Intercept:\n" + str(estimator.intercept_))

Model 1:
Cooeficients: 
[  3.12258646e+02  -5.95865332e+04   1.57067421e+04   6.58619264e+05
  -3.09374351e+05]
 Intercept:
-69075726.7926
Model 2:
Cooeficients: 
[  3.06610053e+02  -1.13446368e+05  -7.14613083e+04   6.54844630e+05
  -2.94298969e+05   2.55796520e+04]
 Intercept:
-66867968.8711
Model 3:
Cooeficients: 
[  5.29422820e+02   3.45142296e+04   6.70607813e+04   5.34085611e+05
  -4.06750711e+05  -8.57050439e+03  -6.78858667e+03  -5.61831484e+05
   1.27334900e+05]
 Intercept:
-62036084.9861


In [51]:
# Residual sum of squares (RSS) of each fitted model on the training data.
#
# print() is called as a function with a single string argument, so the cell
# runs on both Python 2 and Python 3. The squared residuals are summed with
# np.sum over a NumPy array instead of the builtin sum(), which would iterate
# the pandas Series one element at a time in Python. Working on `.values`
# keeps NaN propagation the same as the original element-wise sum.
for model_name, fitted_model, feature_columns in (
    ("model_1", model_1, model_1_features),
    ("model_2", model_2, model_2_features),
    ("model_3", model_3, model_3_features),
):
    residuals = fitted_model.predict(train_data[feature_columns]) - train_data["price"].values
    print("RSS for " + model_name + " on train_data: " + str(np.sum(residuals ** 2)))

RSS for model_1 on train_data: 9.6787996305e+14
RSS for model_2 on train_data: 9.58419635074e+14
RSS for model_3 on train_data: 9.0343645505e+14


In [52]:
# Residual sum of squares (RSS) of each fitted model on the held-out test data.
#
# print() is called as a function with a single string argument, so the cell
# runs on both Python 2 and Python 3. The squared residuals are summed with
# np.sum over a NumPy array instead of the builtin sum(), which would iterate
# the pandas Series one element at a time in Python. Working on `.values`
# keeps NaN propagation the same as the original element-wise sum.
for model_name, fitted_model, feature_columns in (
    ("model_1", model_1, model_1_features),
    ("model_2", model_2, model_2_features),
    ("model_3", model_3, model_3_features),
):
    residuals = fitted_model.predict(test_data[feature_columns]) - test_data["price"].values
    print("RSS for " + model_name + " on test_data: " + str(np.sum(residuals ** 2)))

RSS for model_1 on test_data: 2.25500469795e+14
RSS for model_2 on test_data: 2.23377462976e+14
RSS for model_3 on test_data: 2.59236319207e+14
