In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
# Column dtypes passed to pd.read_csv, grouped by target type.
dtype_dict = {
    # identifiers and raw text fields
    'id': str,
    'date': str,
    'zipcode': str,
    'floors': str,
    # continuous quantities
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # counts, flags, grades and years
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

### Loading Data

In [3]:
# Read the training and test CSVs with the explicit column dtypes.
train_data, test_data = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

In [4]:
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180.0,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340.0,5650.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570.0,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690.0,7639.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770.0,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720.0,8062.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960.0,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360.0,5000.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680.0,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800.0,7503.0


### Adding new variables into data 
- Squaring bedrooms will increase the separation between not many bedrooms (e.g. 1) and lots of bedrooms (e.g. 4) since 1^2 = 1 but 4^2 = 16. Consequently this variable will mostly affect houses with many bedrooms.
- Bedrooms times bathrooms is what's called an "interaction" variable. It is large when both of them are large.
- Taking the log of square feet has the effect of bringing large values closer together and spreading out small values.
- Adding latitude to longitude is non-sensical but we will do it anyway (you'll see why)

In [5]:
# Add the four engineered features to the training frame (in place).
train_data['bedrooms_squared'] = train_data['bedrooms'].mul(train_data['bedrooms'])
train_data['bed_bath_rooms'] = train_data['bedrooms'].mul(train_data['bathrooms'])
train_data['log_sqft_living'] = np.log(train_data['sqft_living'])  # natural log
train_data['lat_plus_long'] = train_data['lat'].add(train_data['long'])

In [6]:
# Add the same four engineered features to the test frame (in place).
test_data['bedrooms_squared'] = test_data['bedrooms'].mul(test_data['bedrooms'])
test_data['bed_bath_rooms'] = test_data['bedrooms'].mul(test_data['bathrooms'])
test_data['log_sqft_living'] = np.log(test_data['sqft_living'])  # natural log
test_data['lat_plus_long'] = test_data['lat'].add(test_data['long'])

### What are the mean (arithmetic average) values of the 4 new variables on the TEST data?

In [7]:
# Report the test-set mean of each engineered feature, to two decimals.
for new_feature in ('bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living', 'lat_plus_long'):
    print('The mean value of %s: %.2f' % (new_feature, np.mean(test_data[new_feature])))

The mean value of bedrooms_squared: 12.45
The mean value of bed_bath_rooms: 7.50
The mean value of log_sqft_living: 7.55
The mean value of lat_plus_long: -74.65


### Calculate the coefficients/weights for predicting house price for the following three models

- Model 1: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’, and ‘long’

- Model 2: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, and ‘bed_bath_rooms’

- Model 3: ‘sqft_living’, ‘bedrooms’, ‘bathrooms’, ‘lat’,‘long’, ‘bed_bath_rooms’, ‘bedrooms_squared’, ‘log_sqft_living’, and ‘lat_plus_long’

In [8]:
# Model 1 uses the five base features; models 2 and 3 extend that list
# with the engineered features (each model gets its own list object).
feature_model_1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
feature_model_2 = feature_model_1 + ['bed_bath_rooms']
feature_model_3 = feature_model_1 + [
    'bedrooms_squared',
    'bed_bath_rooms',
    'log_sqft_living',
    'lat_plus_long',
]

### Model 1

In [9]:
# Model 1: fit a linear regression of price on the five base features,
# then tabulate the fitted coefficient for each feature.
y = train_data['price']
X_1 = train_data[feature_model_1]
reg_1 = LinearRegression()
reg_1.fit(X_1, y)
pd.DataFrame({'Feature': feature_model_1, 'Coefficient': reg_1.coef_})

Unnamed: 0,Feature,Coefficient
0,sqft_living,312.258646
1,bedrooms,-59586.533154
2,bathrooms,15706.742083
3,lat,658619.263931
4,long,-309374.351268


### Model 2

In [10]:
# Model 2: base features plus the bedrooms x bathrooms interaction term;
# tabulate the fitted coefficient for each feature.
y = train_data['price']
X_2 = train_data[feature_model_2]
reg_2 = LinearRegression()
reg_2.fit(X_2, y)
pd.DataFrame({'Feature': feature_model_2, 'Coefficient': reg_2.coef_})

Unnamed: 0,Feature,Coefficient
0,sqft_living,306.610053
1,bedrooms,-113446.36807
2,bathrooms,-71461.308293
3,lat,654844.629503
4,long,-294298.969138
5,bed_bath_rooms,25579.652001


### Model 3

In [11]:
# Model 3: base features plus all four engineered features;
# tabulate the fitted coefficient for each feature.
y = train_data['price']
X_3 = train_data[feature_model_3]
reg_3 = LinearRegression()
reg_3.fit(X_3, y)
pd.DataFrame({'Feature': feature_model_3, 'Coefficient': reg_3.coef_})

Unnamed: 0,Feature,Coefficient
0,sqft_living,529.42282
1,bedrooms,34514.229578
2,bathrooms,67060.781319
3,lat,534085.610867
4,long,-406750.710861
5,bedrooms_squared,-6788.58667
6,bed_bath_rooms,-8570.504395
7,log_sqft_living,-561831.484076
8,lat_plus_long,127334.900006


### Comparing residual sum of squares (RSS) on the training data

### Model 1

In [12]:
# Training RSS for model 1: sum of squared (actual - predicted) prices.
np.square(y.to_numpy() - reg_1.predict(train_data[feature_model_1])).sum()

967879963049545.8

### Model 2

In [13]:
# Training RSS for model 2: sum of squared (actual - predicted) prices.
np.square(y.to_numpy() - reg_2.predict(train_data[feature_model_2])).sum()

958419635074070.0

### Model 3

In [14]:
# Training RSS for model 3: sum of squared (actual - predicted) prices.
np.square(y.to_numpy() - reg_3.predict(train_data[feature_model_3])).sum()

903436455050479.8

In conclusion, Model 3 has the lowest RSS on the training data.

### Comparing residual sum of squares (RSS) on the testing data

### Model 1

In [15]:
# Test RSS for model 1: sum of squared (actual - predicted) prices.
np.square(test_data['price'].to_numpy() - reg_1.predict(test_data[feature_model_1])).sum()

225500469795490.34

### Model 2

In [16]:
# Test RSS for model 2: sum of squared (actual - predicted) prices.
np.square(test_data['price'].to_numpy() - reg_2.predict(test_data[feature_model_2])).sum()

223377462976467.5

### Model 3

In [17]:
# Test RSS for model 3: sum of squared (actual - predicted) prices.
np.square(test_data['price'].to_numpy() - reg_3.predict(test_data[feature_model_3])).sum()

259236319207179.38

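In conclusion, Model 2 has the lowest RSS on the testing data (about 2.234e14, versus 2.255e14 for Model 1 and 2.592e14 for Model 3). Model 3, which had the lowest training RSS, does worst on the unseen test data.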

In [18]:
# Re-read both CSVs so the gradient-descent section starts from freshly
# loaded frames.
train_data, test_data = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)

In [19]:
# Add the intercept column: a constant 1 for every training row.
train_data['constant'] = 1

In [20]:
# Column order for the feature matrix: intercept first, then the input feature.
features = ['constant', 'sqft_living']
features

['constant', 'sqft_living']

In [21]:
# Pull the selected columns out of the DataFrame as a 2D NumPy array.
feature_matrix = train_data[features].to_numpy()
feature_matrix

array([[1.00e+00, 1.18e+03],
       [1.00e+00, 2.57e+03],
       [1.00e+00, 7.70e+02],
       ...,
       [1.00e+00, 1.53e+03],
       [1.00e+00, 1.60e+03],
       [1.00e+00, 1.02e+03]])

Write a function that returns a feature matrix (a 2D array whose first column is all ones, followed by one column per input feature in the same order as the input list) together with the output values as a 1D array.

In [22]:
def get_numpy_data(train_data, features, output):
    """Build the feature matrix and output array for a regression.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Source data. It is NOT modified: the intercept column is added to a
        copy, so calling this on a frame has no side effects on it.
    features : sequence of str
        Names of the feature columns, in the order they should appear.
    output : str
        Name of the target column.

    Returns
    -------
    feature_matrix : numpy.ndarray
        2D array whose first column is the constant 1 (intercept) followed by
        the requested feature columns in the given order.
    output_array : numpy.ndarray
        1D array holding the values of the ``output`` column.
    """
    # list(...) lets callers pass a tuple or other sequence, and builds a new
    # list instead of touching the caller's.
    columns = ['constant'] + list(features)

    # assign() returns a new DataFrame carrying the extra column; the
    # caller's frame keeps exactly the columns it had.
    feature_matrix = train_data.assign(constant=1)[columns].values

    output_array = np.array(train_data[output])
    return feature_matrix, output_array

In [23]:
def predict_outcome(feature_matrix, weights):
    """Return the predictions: the dot product of ``feature_matrix`` and ``weights``."""
    return np.dot(feature_matrix, weights)

In [24]:
def feature_derivative(errors, feature):
    """Return twice the dot product of ``feature`` and ``errors``.

    With ``errors = predictions - output`` this is the partial derivative of
    the residual sum of squares with respect to that feature's weight.
    """
    feature_dot_errors = np.dot(feature, errors)
    return 2 * feature_dot_errors

### Gradient Descent function

In [25]:
def regression_gradient_descent(feature_amtrix, output, initial_weights, step_size, tolerance):
    """Fit linear-regression weights by batch gradient descent on the RSS.

    Each iteration computes ``errors = predictions - output`` with the current
    weights, forms the RSS gradient ``2 * X^T errors``, and moves every weight
    by ``-step_size`` times its gradient component. The loop stops once the
    Euclidean norm of that gradient (computed before the step) is below
    ``tolerance``; the weights returned include the final step.

    Parameters
    ----------
    feature_amtrix : array-like, shape (n_rows, n_weights)
        Feature matrix. The misspelt parameter name is kept so that existing
        keyword callers keep working.
    output : array-like, shape (n_rows,)
        Observed target values.
    initial_weights : array-like, shape (n_weights,)
        Starting weights. Copied and converted to float; never modified.
    step_size : float
        Multiplier applied to the gradient at each step.
    tolerance : float
        Convergence threshold on the gradient magnitude.

    Returns
    -------
    numpy.ndarray
        The fitted weights as a float array.

    Raises
    ------
    ValueError
        If the gradient becomes non-finite (the iteration has diverged),
        rather than looping forever.
    """
    # Bind the ARGUMENT to the name used below, so the function no longer
    # depends on a module-level ``feature_matrix`` being defined.
    feature_matrix = np.asarray(feature_amtrix, dtype=float)
    output = np.asarray(output, dtype=float)

    # Force a float copy: integer initial weights would otherwise make the
    # in-place update below fail, and the caller's array stays untouched.
    weights = np.array(initial_weights, dtype=float)

    while True:
        errors = np.dot(feature_matrix, weights) - output

        # All partial derivatives at once: component i is 2 * <column i, errors>.
        gradient = 2 * np.dot(feature_matrix.T, errors)
        gradient_magnitude = np.sqrt(np.sum(np.square(gradient)))

        if not np.isfinite(gradient_magnitude):
            raise ValueError(
                'gradient descent diverged (non-finite gradient); try a smaller step_size'
            )

        weights -= step_size * gradient

        if gradient_magnitude < tolerance:
            return weights

### Running the regression_gradient_descent function on actual data

- features: ‘sqft_living’
- output: ‘price’
- initial weights: -47000, 1 (intercept, sqft_living respectively)
- step_size = 7e-12
- tolerance = 2.5e7

### Training data

In [26]:
# Simple model: price explained by an intercept and sqft_living.
my_output = 'price'
simple_features = ['sqft_living']
simple_feature_matrix, output = get_numpy_data(train_data, simple_features, my_output)

# Gradient-descent settings for the simple model.
initial_weights = np.array([-47000.0, 1.0])  # [intercept, sqft_living]
step_size, tolerance = 7e-12, 2.5e7

In [27]:
# Run gradient descent for the simple (intercept + sqft_living) model.
simple_weights = regression_gradient_descent(simple_feature_matrix, output, initial_weights, step_size, tolerance)

In [28]:
# Display the fitted weights, ordered [intercept, sqft_living].
simple_weights

array([-46999.88716555,    281.91211918])

In [29]:
# Report the sqft_living weight (index 1; index 0 is the intercept), to one decimal.
print('Whats the value of the weight for sqft_lving? %.1f' % simple_weights[1])

Whats the value of the weight for sqft_lving? 281.9


### Testing data for prediction

In [30]:
# Build the test-set feature matrix and output array for the simple model.
test_simple_feature_matrix, test_output = get_numpy_data(test_data, simple_features, my_output)

In [49]:
# Predict test-set prices with the simple model's fitted weights.
test_prediction_simple = predict_outcome(test_simple_feature_matrix, simple_weights)

In [50]:
# Simple-model prediction for the first house in the test set.
test_prediction_simple[0]

356134.4432550024

### Calculate RSS on Testing data

In [55]:
# Test RSS of the simple model: sum of squared (actual - predicted) prices.
RSS_test_simple = np.square(test_data['price'] - test_prediction_simple).sum()
RSS_test_simple

275400044902128.3

### Running the regression_gradient_descent function on actual data

- model features = ‘sqft_living’, ‘sqft_living15’
- output = ‘price’
- initial weights = [-100000, 1, 1] (intercept, sqft_living, and sqft_living15 respectively)
- step size = 4e-12
- tolerance = 1e9


In [37]:
# Two-feature model: price explained by an intercept, sqft_living and sqft_living15.
my_output = 'price'
model_features = ['sqft_living', 'sqft_living15']
feature_matrix, output = get_numpy_data(train_data, model_features, my_output)

# Gradient-descent settings for the two-feature model.
initial_weights = np.array([-100000.0, 1.0, 1.0])  # [intercept, sqft_living, sqft_living15]
step_size, tolerance = 4e-12, 1e9

In [38]:
# Run gradient descent for the two-feature (sqft_living, sqft_living15) model.
multiple_weights = regression_gradient_descent(feature_matrix, output, initial_weights, step_size, tolerance)

In [39]:
# Display the fitted weights, ordered [intercept, sqft_living, sqft_living15].
multiple_weights

array([-9.99999688e+04,  2.45072603e+02,  6.52795267e+01])

### Prediction for testing data

In [41]:
# Build the test-set feature matrix and output array for the two-feature model.
test_multiple_feature_matrix, test_multiple_output = get_numpy_data(test_data, model_features, my_output)

In [42]:
# Predict test-set prices with the two-feature model's fitted weights.
test_prediction_multiple = predict_outcome(test_multiple_feature_matrix, multiple_weights)

In [43]:
# Display the two-feature model's test-set predictions.
test_prediction_multiple

array([366651.41162949, 762662.39850726, 386312.09557541, ...,
       682087.39916306, 585579.27901327, 216559.20391786])

In [44]:
# Two-feature model's prediction for the first test house, printed with no decimals.
print('What is the predicted price for the 1st house in the Test data set? %.f' % test_prediction_multiple[0])

What is the predicted price for the 1st house in the Test data set? 366651


In [47]:
# Actual price of the first test house. .iloc[0] selects the first row by
# position; ['price'][0] would instead look up the index LABEL 0, which is
# only the first row while the frame keeps its default 0..n-1 index.
print('What is the actual price for the 1st house in the Test data set? %.f' % test_data['price'].iloc[0])

What is the actual price for the 1st house in the Test data set? 310000


In [52]:
# Simple-model prediction for the first test house, shown again for comparison.
test_prediction_simple[0]

356134.4432550024

In [54]:
# Two-feature model's prediction for the first test house, for comparison.
test_prediction_multiple[0]

366651.4116294939

### Calculate RSS on testing data for multiple regression

In [56]:
# Test RSS of the two-feature model: sum of squared (actual - predicted) prices.
RSS_test_multiple = np.square(test_data['price'] - test_prediction_multiple).sum()
RSS_test_multiple

270263443629803.56

In [57]:
# Test RSS of the simple model, shown again for comparison.
RSS_test_simple

275400044902128.3

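## Conclusions

- For the first house in the test set, the simple regression model gives the better prediction: 356,134 versus 366,651 from the multiple regression model, against an actual price of 310,000.
- Over the whole test set, the multiple regression model has the lower RSS (about 2.703e14 versus 2.754e14 for the simple model).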