# Programming Assignment 1: Predicting House Prices


In [15]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
torch.__version__
pd.__version__

'1.4.1'

#### Read the CSV into a pandas DataFrame

Using random_state, with a given DataFrame, the sample will always fetch same rows. If random_state is None or np.random, then a randomly-initialized RandomState object is returned.

In [16]:
df = pd.read_csv("data/home_data.csv")
# print(df)
# print(df['price'][0])
df.sample(n=3, random_state=0)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
17384,1453602313,20141029T000000,297000,2,1.5,1430,1650,3.0,0,0,...,7,1430,0,1999,0,98125,47.7222,-122.29,1430,1650
722,2225059214,20140808T000000,1578000,4,3.25,4670,51836,2.0,0,0,...,12,4670,0,1988,0,98005,47.635,-122.164,4230,41075
2680,2768000270,20140625T000000,562100,2,0.75,1440,3700,1.0,0,0,...,7,1200,240,1914,0,98107,47.6707,-122.364,1440,4300


#### Splitting the data into training and test sets

Use Scikit Learn's <code>train_test_split</code> to split dataframes.

When comparing machine learning algorithms, it's desirable that they are fit and evaluated on the same subsets of the dataset. This can be achieved by setting the <code>random_state</code> to an integer value. For more information, see [Repeatable Train-Test Splits](https://machinelearningmastery.com/train-test-split-for-evaluating-machine-learning-algorithms/).

In [17]:
train, test = train_test_split(df, test_size=0.2, random_state=1)
type(train)

pandas.core.frame.DataFrame

The mean price of our training set.

In [18]:
train['price'].mean()

537880.6155002891

### Prepare the data
We need the data as PyTorch tensors for usage in our model. We will need to convert pandas DataFrame to Numpy Array and them Pytorch Tensor. 

To convert a Pandas DataFrame to a Numpy Array, use <code>to_numpy()</code>. To convert a Numpy Array to a PyTorch Tensor, use <code>torch.from_numpy()</code>. For more information, see [this page](https://medium.com/@thackerhelik/linear-regression-in-pytorch-3793d89ff3f).

In [20]:
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

def df_to_array(dataframe, features, output=['price']):
    """
    Parameters
    ----------
    dataframe: pandas DataFrame
        train or test set
    features: list of strings
        features used to predict sales price of house
    output: list of strings
        the observation variable
    """
    df1 = dataframe.copy(deep=True)
    inputs_array = df1[features].to_numpy()
    targets_array = df1[output].to_numpy()
    return inputs_array, targets_array

inputs_array, targets_array = df_to_array(train, ['sqft_living'], ['price'])
targets_array
#inputs.dtype

array([[ 353000],
       [ 300523],
       [ 435000],
       ...,
       [ 650000],
       [ 437000],
       [1025000]], dtype=int64)

In [21]:
inputs_train = torch.from_numpy(inputs_array).float()
targets_train = torch.from_numpy(targets_array).float()
inputs_train, targets_train

(tensor([[2190.],
         [2370.],
         [1230.],
         ...,
         [1970.],
         [1980.],
         [3760.]]),
 tensor([[ 353000.],
         [ 300523.],
         [ 435000.],
         ...,
         [ 650000.],
         [ 437000.],
         [1025000.]]))

In [None]:
dataset = TensorDataset(inputs, targets)
train_loader = DataLoader(dataset, batch_size=50, shuffle=False)
for xb, yb in train_loader:
    print("inputs:", xb)
    print("targets:", yb)
    break

In [None]:
#inputs = torch.from_numpy(train[['sqft_living']].to_numpy()).float() # Must be df[[feature]] to ensure 2D array
#inputs
#targets = torch.from_numpy(train[['price']].to_numpy()).float()
#targets
#targets.dtype
train[['price']].to_numpy()

### Create a linear regression model
#### Define the regression class

In [7]:
class LinearRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LinearRegression, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim, bias=True) # bias default True
        
    def forward(self, x):
        out = self.linear(x)
        return out

#### Instantiate model

In [32]:
input_dim = 1
output_dim = 1
model1 = LinearRegression(input_dim, output_dim)

#### Compute RMSE Loss
Compute RMSE instead of MSE for this task.

In [33]:
def RMSELoss(prediction, target):
    loss_function = nn.MSELoss()
    RMSE_loss = torch.sqrt(loss_function(prediction, target))
    return RMSE_loss

#### Optimizer

In [38]:
learningRate = 0.2 # Any small positive value between 0 and 1, try different learning rates
optimizer = torch.optim.SGD(model1.parameters(), lr=learningRate)

#### Training the model

In [39]:
def train(model, x, y, epochs):
    for epoch in range(epochs):
        # Clear gradient buffers so that gradient from previous epoch is not carried to current one
        optimizer.zero_grad()
        
        # Output by model
        y_pred = model(x)
        
        RMSE_loss = RMSELoss(y_pred, y)
        print(RMSE_loss)
        RMSE_loss.backward()
        
        # Updates the parameters
        optimizer.step()

        print('epoch {}, loss {}'.format(epoch, RMSE_loss.item()))
        
train(model1, inputs_train, targets_train, 50)

tensor(1288190.8750, grad_fn=<SqrtBackward0>)
epoch 0, loss 1288190.875
tensor(362256.4688, grad_fn=<SqrtBackward0>)
epoch 1, loss 362256.46875
tensor(536620.6250, grad_fn=<SqrtBackward0>)
epoch 2, loss 536620.625
tensor(498879.8438, grad_fn=<SqrtBackward0>)
epoch 3, loss 498879.84375
tensor(518578.1562, grad_fn=<SqrtBackward0>)
epoch 4, loss 518578.15625
tensor(508694.5312, grad_fn=<SqrtBackward0>)
epoch 5, loss 508694.53125
tensor(513773.3438, grad_fn=<SqrtBackward0>)
epoch 6, loss 513773.34375
tensor(511192.1562, grad_fn=<SqrtBackward0>)
epoch 7, loss 511192.15625
tensor(512511.7812, grad_fn=<SqrtBackward0>)
epoch 8, loss 512511.78125
tensor(511839.0938, grad_fn=<SqrtBackward0>)
epoch 9, loss 511839.09375
tensor(512182.5000, grad_fn=<SqrtBackward0>)
epoch 10, loss 512182.5
tensor(512007.2500, grad_fn=<SqrtBackward0>)
epoch 11, loss 512007.25
tensor(512096.6562, grad_fn=<SqrtBackward0>)
epoch 12, loss 512096.65625
tensor(512051., grad_fn=<SqrtBackward0>)
epoch 13, loss 512051.0
tenso

#### Model Parameters

In [42]:
for name, param in model1.named_parameters():
    if param.requires_grad:
        print(name, param.data)

linear.weight tensor([[64.9477]])
linear.bias tensor([-0.3015])


## Putting it all together

#### TODO: Write a function that prepares the data based on features we want to include

### Task 1: Selection and Summary Statistics
One neighborhood of Seattle has the highest average house sale price. Note down the ZIP code of this neighborhood and compute the average price. *Save this result to answer the quiz at the end.*

### Task 2: Filtering Data
- Use logical filters to select rows of an SFrame.
- Using such filters, first select the houses that have 'sqft_living' higher than 2000 sqft but no larger than 4000 sqft.
- What fraction of the all houses have 'sqft_living' in this range? *Save this result to answer the quiz at the end.*


### Task 3: Building the Model
Use the original dataset and build a model using the following features: bedrooms, bathrooms, sqft_living, sqft_lot, floors, zipcode, condition, grade, waterfront, view, sqft_above, sqft_basement, yr_built, yr_renovated, lat, long, sqft_living15, sqft_lot15.

Compute the RMSE (root mean squared error) on the test data for the model using basic features and advanced features.

Note:
- When doing the train-test split, make sure to use seed=0.
- RMSE is the square root of the mean RSS.

In [23]:
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

In [24]:
advanced_features = [
'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
'condition', # condition of house
'grade', # measure of quality of construction
'waterfront', # waterfront property
'view', # type of view
'sqft_above', # square feet above ground
'sqft_basement', # square feet in basement
'yr_built', # the year built
'yr_renovated', # the year renovated
'lat', 'long', # the lat-long of the parcel
'sqft_living15', # average sq.ft. of 15 nearest neighbors
'sqft_lot15', # average lot size of 15 nearest neighbors 
]

## Sources
[Introductory Guide to PyTorch Using a Linear Regression Problem](https://analyticsindiamag.com/introductory-guide-to-pytorch-using-a-linear-regression-problem/)

[Linear Regression and Gradient Descent in PyTorch](https://www.analyticsvidhya.com/blog/2021/08/linear-regression-and-gradient-descent-in-pytorch/)

[Interesting Ways to Select Pandas DataFrame Columns](https://towardsdatascience.com/interesting-ways-to-select-pandas-dataframe-columns-b29b82bbfb33#:~:text=This%20is%20the%20most%20basic,Returns%20a%20pandas%20series.&text=Passing%20a%20list%20in%20the,columns%20at%20the%20same%20time.)