## King County House Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import utils
import models as ml

from sklearn.model_selection import train_test_split

%matplotlib inline

#### 1) Prepare data

In [2]:
# Load the house-sales table from the CSV in the working directory.
df = pd.read_csv('./kc_house_data.csv')

# Hold out 20% of the rows for testing. Pinning random_state makes the split,
# and every number derived from it in later cells, reproducible when the
# notebook is restarted and run from the top.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [3]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
19917,2597490140,20150326T000000,825000.0,4,3.25,3040,4155,2.0,0,0,...,8,2350,690,2013,0,98029,47.5429,-122.012,2680,4000
5339,1440500020,20141226T000000,350000.0,3,1.75,1470,8645,1.0,0,0,...,6,1470,0,1949,0,98155,47.7524,-122.323,1470,7680
16409,7231600098,20141014T000000,225000.0,2,1.0,700,6000,1.0,0,0,...,6,700,0,1943,0,98055,47.4671,-122.212,1320,6000
19340,5492200090,20141007T000000,770126.0,4,2.75,2390,9300,1.0,0,0,...,8,1430,960,1979,0,98004,47.6035,-122.206,1910,9348
17426,2597650660,20141013T000000,775000.0,4,2.5,3180,15358,2.0,0,0,...,9,3180,0,1988,0,98027,47.5172,-122.053,3020,15522


In [4]:
# Skip the first two columns of the frame (id and date in the preview above);
# the slice that remains starts with price, followed by the input features.
train_data = df_train.to_numpy()[:, 2:]

# Column 0 of the slice is the regression target, stored as an (n, 1)
# float32 column vector.
train_y = train_data[:, 0].astype(np.float32).reshape(-1, 1)

# Every remaining column is an input feature, also cast to float32.
train_X = train_data[:, 1:].astype(np.float32)

In [5]:
# Sanity check: the feature matrix should be (samples, features).
train_X.shape

(17290, 18)

In [6]:
# Sanity check: the target should be a single column with one row per sample.
train_y.shape

(17290, 1)

In [7]:
# Look at the first feature row as it stands before the normalization cells run.
train_X[0]

array([  4.00000000e+00,   3.25000000e+00,   3.04000000e+03,
         4.15500000e+03,   2.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   3.00000000e+00,   8.00000000e+00,
         2.35000000e+03,   6.90000000e+02,   2.01300000e+03,
         0.00000000e+00,   9.80290000e+04,   4.75429001e+01,
        -1.22012001e+02,   2.68000000e+03,   4.00000000e+03], dtype=float32)

In [8]:
# Standardize the features and keep the two statistics the helper returns
# (bound here as standard deviations and means).
# NOTE(review): the call does not return a new train_X, so std_normalize
# presumably rescales its argument in place -- confirm in the models module.
stds_X, means_X = ml.std_normalize(train_X)

In [9]:
# Standardize the target with the same helper used for the features.
# train_y is already an (n, 1) column vector, so it is passed directly; the
# extra reshape to (n, 1) was a no-op.
# NOTE(review): like the feature call, this presumably rescales train_y in
# place -- confirm in the models module.
std_y, mean_y = ml.std_normalize(train_y)

#### 2) Train linear model

In [10]:
# train_X is two-dimensional with rows as samples and columns as features,
# so both counts come from a single unpack of its shape.
sample_cnt, feature_cnt = train_X.shape

In [16]:
# Gradient-descent settings, named so they can be tuned in one place.
N_EPOCHS = 10000
LEARNING_RATE = 0.01
LOG_EVERY = 1000  # print the training cost once per this many epochs

W, b = ml.create_parameters(feature_cnt)

# Batch learning: each step is given the complete training arrays.
for epoch in range(N_EPOCHS):
    h = ml.linear_model(train_X, W, b)
    W, b = ml.gd_update(train_X, train_y, h, W, b, ml.mse_cost_dev, lr=LEARNING_RATE)
    if (epoch + 1) % LOG_EVERY == 0:
        # h was computed before this epoch's update, so the reported cost
        # belongs to the parameters that went into the step.
        cur_cost = ml.mse_cost(h, train_y)
        print('epoch: {0}, cost:{1}'.format(epoch + 1, cur_cost))

# Finished: score the fitted parameters on the training data.
predictions = ml.linear_model(train_X, W, b)
final_cost = ml.mse_cost(predictions, train_y)
print('training finished!')
# Only the cost is reported; the format call previously also received W and b,
# which the single-placeholder template silently ignored.
print('final cost: {0}'.format(final_cost))

epoch: 1000, cost:2694.8987165547073
epoch: 2000, cost:2591.300468687815
epoch: 3000, cost:2589.567121240845
epoch: 4000, cost:2589.531462432317
epoch: 5000, cost:2589.530707513511
epoch: 6000, cost:2589.530691427876
epoch: 7000, cost:2589.5306910842987
epoch: 8000, cost:2589.530691076952
epoch: 9000, cost:2589.530691076795
epoch: 10000, cost:2589.5306910767918
training finished!
final cost: 2589.5306910767913
