House price prediction is a problem in the real estate industry to make informed decisions. By using machine learning algorithms we can predict the price of a house based on various features such as location, size, number of bedrooms and other relevant factors. To tackle this issue we will build a machine learning model trained on the House Price Prediction Dataset.

In [5]:
import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

In [6]:
# Load the dataset
from google.colab import drive
drive.mount('HousePricePrediction')

Drive already mounted at HousePricePrediction; to attempt to forcibly remount, call drive.mount("HousePricePrediction", force_remount=True).


In [7]:
df = pd.read_csv('HousePricePrediction/MyDrive/pytorch practice notebooks/HousePricePrediction.xlsx.csv')

In [8]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice
0,0,60,RL,8450,Inside,1Fam,5,2003,2003,VinylSd,0.0,856.0,208500.0
1,1,20,RL,9600,FR2,1Fam,8,1976,1976,MetalSd,0.0,1262.0,181500.0
2,2,60,RL,11250,Inside,1Fam,5,2001,2002,VinylSd,0.0,920.0,223500.0
3,3,70,RL,9550,Corner,1Fam,5,1915,1970,Wd Sdng,0.0,756.0,140000.0
4,4,60,RL,14260,FR2,1Fam,5,2000,2000,VinylSd,0.0,1145.0,250000.0


In [9]:
df.shape

(2919, 13)

In [10]:
df.describe()

Unnamed: 0,Id,MSSubClass,LotArea,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,SalePrice
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2918.0,2918.0,1460.0
mean,1459.0,57.137718,10168.11408,5.564577,1971.312778,1984.264474,49.582248,1051.777587,180921.19589
std,842.787043,42.517628,7886.996359,1.113131,30.291442,20.894344,169.205611,440.766258,79442.502883
min,0.0,20.0,1300.0,1.0,1872.0,1950.0,0.0,0.0,34900.0
25%,729.5,20.0,7478.0,5.0,1953.5,1965.0,0.0,793.0,129975.0
50%,1459.0,50.0,9453.0,5.0,1973.0,1993.0,0.0,989.5,163000.0
75%,2188.5,70.0,11570.0,6.0,2001.0,2004.0,0.0,1302.0,214000.0
max,2918.0,190.0,215245.0,9.0,2010.0,2010.0,1526.0,6110.0,755000.0


In [11]:
# checking there are any null values in dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            2919 non-null   int64  
 1   MSSubClass    2919 non-null   int64  
 2   MSZoning      2915 non-null   object 
 3   LotArea       2919 non-null   int64  
 4   LotConfig     2919 non-null   object 
 5   BldgType      2919 non-null   object 
 6   OverallCond   2919 non-null   int64  
 7   YearBuilt     2919 non-null   int64  
 8   YearRemodAdd  2919 non-null   int64  
 9   Exterior1st   2918 non-null   object 
 10  BsmtFinSF2    2918 non-null   float64
 11  TotalBsmtSF   2918 non-null   float64
 12  SalePrice     1460 non-null   float64
dtypes: float64(3), int64(6), object(4)
memory usage: 296.6+ KB


In [12]:
# Fill missing values in 'SalePrice' with the mean of the column
df['SalePrice'] = df['SalePrice'].fillna(df['SalePrice'].mean())

In [13]:
# Check info again after handling missing values in 'SalePrice'
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2919 entries, 0 to 2918
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            2919 non-null   int64  
 1   MSSubClass    2919 non-null   int64  
 2   MSZoning      2915 non-null   object 
 3   LotArea       2919 non-null   int64  
 4   LotConfig     2919 non-null   object 
 5   BldgType      2919 non-null   object 
 6   OverallCond   2919 non-null   int64  
 7   YearBuilt     2919 non-null   int64  
 8   YearRemodAdd  2919 non-null   int64  
 9   Exterior1st   2918 non-null   object 
 10  BsmtFinSF2    2918 non-null   float64
 11  TotalBsmtSF   2918 non-null   float64
 12  SalePrice     2919 non-null   float64
dtypes: float64(3), int64(6), object(4)
memory usage: 296.6+ KB


In [14]:
# Drop rows with any remaining missing values
df1 = df.dropna()

In [15]:
# Check for remaining null values
df1.isnull().sum()

Unnamed: 0,0
Id,0
MSSubClass,0
MSZoning,0
LotArea,0
LotConfig,0
BldgType,0
OverallCond,0
YearBuilt,0
YearRemodAdd,0
Exterior1st,0


In [16]:
# Drop the 'Id' column as it's not needed for prediction
df1.drop(['Id'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1.drop(['Id'], axis=1, inplace=True)


In [17]:
df1.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,count
MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF,SalePrice,Unnamed: 12_level_1
160,RM,1680,Inside,Twnhs,5,1973,1973,HdBoard,0.0,483.0,180921.19589,2
120,RM,3843,Inside,TwnhsE,5,2007,2008,CemntBd,0.0,1596.0,180921.19589,2
90,RL,7018,Inside,Duplex,5,1979,1979,HdBoard,0.0,0.0,180921.19589,2
120,RM,4435,Inside,TwnhsE,5,2003,2003,VinylSd,0.0,848.0,180921.19589,2
180,RM,3675,Inside,TwnhsE,5,2005,2006,VinylSd,0.0,547.0,180921.19589,2
...,...,...,...,...,...,...,...,...,...,...,...,...
20,RL,16300,CulDSac,1Fam,4,1977,1977,HdBoard,417.0,876.0,180921.19589,1
20,RL,16321,CulDSac,1Fam,6,1957,1997,MetalSd,0.0,1484.0,207500.00000,1
20,RL,16381,Inside,1Fam,5,1969,1969,Plywood,0.0,1844.0,223000.00000,1
20,RL,16492,Corner,1Fam,6,1966,2002,BrkFace,713.0,1517.0,190000.00000,1


In [18]:
# Perform one-hot encoding on specified categorical columns
one_hot_encoded_data = pd.get_dummies(df1, columns = ['MSZoning', 'LotConfig','BldgType','Exterior1st'])

In [19]:
one_hot_encoded_data.head()

Unnamed: 0,MSSubClass,LotArea,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,SalePrice,MSZoning_C (all),MSZoning_FV,...,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing
0,60,8450,5,2003,2003,0.0,856.0,208500.0,False,False,...,False,False,False,False,False,False,False,True,False,False
1,20,9600,8,1976,1976,0.0,1262.0,181500.0,False,False,...,False,False,False,True,False,False,False,False,False,False
2,60,11250,5,2001,2002,0.0,920.0,223500.0,False,False,...,False,False,False,False,False,False,False,True,False,False
3,70,9550,5,1915,1970,0.0,756.0,140000.0,False,False,...,False,False,False,False,False,False,False,False,True,False
4,60,14260,5,2000,2000,0.0,1145.0,250000.0,False,False,...,False,False,False,False,False,False,False,True,False,False


In [20]:
one_hot_encoded_data.shape

(2913, 38)

In [21]:
# Replace boolean values (True/False) with numerical values (1/0)
df3 = one_hot_encoded_data.replace([False, True],[0,1])

  df3 = one_hot_encoded_data.replace([False, True],[0,1])


In [22]:
# Normalization
for i in df3.columns:
  df3[i] = ((df3[i]-df3[i].min())/(df3[i].max()-df3[i].min()))

In [23]:
df3.head()

Unnamed: 0,MSSubClass,LotArea,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,SalePrice,MSZoning_C (all),MSZoning_FV,...,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing
0,0.235294,0.03342,0.5,0.949275,0.883333,0.0,0.140098,0.241078,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.038795,0.875,0.753623,0.433333,0.0,0.206547,0.203583,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.235294,0.046507,0.5,0.934783,0.866667,0.0,0.150573,0.261908,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.294118,0.038561,0.5,0.311594,0.333333,0.0,0.123732,0.145952,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.235294,0.060576,0.5,0.927536,0.833333,0.0,0.187398,0.298709,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [24]:
df3.columns

Index(['MSSubClass', 'LotArea', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'BsmtFinSF2', 'TotalBsmtSF', 'SalePrice', 'MSZoning_C (all)',
       'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM',
       'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2',
       'LotConfig_FR3', 'LotConfig_Inside', 'BldgType_1Fam', 'BldgType_2fmCon',
       'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE',
       'Exterior1st_AsbShng', 'Exterior1st_AsphShn', 'Exterior1st_BrkComm',
       'Exterior1st_BrkFace', 'Exterior1st_CBlock', 'Exterior1st_CemntBd',
       'Exterior1st_HdBoard', 'Exterior1st_ImStucc', 'Exterior1st_MetalSd',
       'Exterior1st_Plywood', 'Exterior1st_Stone', 'Exterior1st_Stucco',
       'Exterior1st_VinylSd', 'Exterior1st_Wd Sdng', 'Exterior1st_WdShing'],
      dtype='object')

In [25]:
# Define the input columns for the model
input_cols = ['MSSubClass', 'LotArea', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'BsmtFinSF2', 'TotalBsmtSF', 'MSZoning_C (all)',
       'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM',
       'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2',
       'LotConfig_FR3', 'LotConfig_Inside', 'BldgType_1Fam', 'BldgType_2fmCon',
       'BldgType_Duplex', 'BldgType_Twnhs', 'BldgType_TwnhsE',
       'Exterior1st_AsbShng', 'Exterior1st_AsphShn', 'Exterior1st_BrkComm',
       'Exterior1st_BrkFace', 'Exterior1st_CBlock', 'Exterior1st_CemntBd',
       'Exterior1st_HdBoard', 'Exterior1st_ImStucc', 'Exterior1st_MetalSd',
       'Exterior1st_Plywood', 'Exterior1st_Stone', 'Exterior1st_Stucco',
       'Exterior1st_VinylSd', 'Exterior1st_Wd Sdng', 'Exterior1st_WdShing']

In [26]:
len(input_cols)

37

In [27]:
# Define the output column (target variable)
output_cols = ['SalePrice']

In [28]:
# Create a deep copy of the dataframe
df4 = df3.copy(deep = True)
# Convert input and output columns to numpy arrays with float32 data type
input_array = df4[input_cols].to_numpy(dtype='float32')
output_array = df4[output_cols].to_numpy(dtype='float32')

In [29]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

In [30]:
# Convert numpy arrays to PyTorch tensors
inputs = torch.from_numpy(input_array)
targets = torch.from_numpy(output_array)

In [31]:
targets.shape

torch.Size([2913, 1])

In [32]:
# Create a TensorDataset from input and target tensors
dataset  = TensorDataset(inputs, targets)

In [33]:
# Calculate the size of the training and validation sets
train_size = int(len(dataset)*0.9)
valid_size = len(dataset) - (train_size)

In [34]:
# Print the size of the validation set and test set
valid_size, train_size

(292, 2621)

In [35]:
# Split the dataset into training and validation sets
train_data, valid_data = random_split(dataset, [train_size, valid_size])

In [36]:
# Define the batch size for data loaders
batch_size = 5
train_loader  = DataLoader(train_data, batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size)

In [37]:
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(21)

<torch._C.Generator at 0x7df459b59c30>

In [38]:
# Define the neural network model for house price prediction
class HousePricePrediction(nn.Module):

  def __init__(self):
    super().__init__()
    self.linear1 = nn.Linear(37, 100)
    self.linear2 = nn.Linear(100, 1)


  def forward(self, xb):
    out = self.linear1(xb)
    out = F.relu(out)
    out = self.linear2(out)
    return out

  def training_step(self, batch):
    # print(batch[0])
    input, target = batch
    # target = target.type(torch.LongTensor)
    out = self(input)
    loss = nn.MSELoss()(out, target)
    return loss

  def validation_step(self, batch):
    input, target = batch
    out = self(input)
    loss = nn.MSELoss()(out, target)
    return{'valid_loss' : loss}

  def validation_epoch_end(self, outputs):
    batch_losses = [x['valid_loss'] for x in outputs]
    epoch_loss = torch.stack(batch_losses).mean()
    return{'valid_loss': epoch_loss.item()}


  def epoch_end(self, epoch, result, epochs):
    if ((epoch+1) % 2 == 0 or epoch == epochs-1):
      print("Epoch [{}], val_loss: {:.4f}, train_loss: {:.4f}".format(epoch, result['valid_loss'], result['train_loss']))


In [39]:
# Function to evaluate the model on the validation set
def evaluate(model, valid_loader):
  outputs = [model.validation_step(batch) for batch in valid_loader]
  return model.validation_epoch_end(outputs)

In [40]:
# Function to train the model
from tqdm import tqdm
def fit(epochs, lr, model, train_loader, valid_loader, opt_func=torch.optim.Adam):
  history = []
  optimizer = opt_func(model.parameters(), lr)
  for epoch in range(epochs):
    train_losses = []
    for batch in train_loader:
      loss = model.training_step(batch)
      train_losses.append(loss)
      loss.backward() #calculate gradients
      optimizer.step()
      optimizer.zero_grad()
    # print(train_losses)
    result = evaluate(model, valid_loader)
    result['train_loss'] = torch.stack(train_losses).mean().item()
    model.epoch_end(epoch, result, epochs)
    history.append(result)
    # print(model.parameters())
  return history

In [41]:
# Create an instance of the HousePricePrediction model
model = HousePricePrediction()

In [42]:
epochs = 30
lr = 0.0001
history = fit(epochs, lr, model, train_loader, valid_loader)

Epoch [1], val_loss: 0.0041, train_loss: 0.0053
Epoch [3], val_loss: 0.0038, train_loss: 0.0048
Epoch [5], val_loss: 0.0038, train_loss: 0.0046
Epoch [7], val_loss: 0.0038, train_loss: 0.0044
Epoch [9], val_loss: 0.0038, train_loss: 0.0043
Epoch [11], val_loss: 0.0038, train_loss: 0.0042
Epoch [13], val_loss: 0.0038, train_loss: 0.0042
Epoch [15], val_loss: 0.0039, train_loss: 0.0041
Epoch [17], val_loss: 0.0039, train_loss: 0.0041
Epoch [19], val_loss: 0.0040, train_loss: 0.0040
Epoch [21], val_loss: 0.0040, train_loss: 0.0040
Epoch [23], val_loss: 0.0041, train_loss: 0.0040
Epoch [25], val_loss: 0.0041, train_loss: 0.0039
Epoch [27], val_loss: 0.0042, train_loss: 0.0039
Epoch [29], val_loss: 0.0041, train_loss: 0.0039


In [44]:
# Make prediction on the same sample
for i in range(10):
  input, target = valid_data[i]
  pred = model(input)
  print(pred, target)

tensor([0.2029], grad_fn=<ViewBackward0>) tensor([0.2028])
tensor([0.1840], grad_fn=<ViewBackward0>) tensor([0.2041])
tensor([0.1794], grad_fn=<ViewBackward0>) tensor([0.2028])
tensor([0.2133], grad_fn=<ViewBackward0>) tensor([0.1640])
tensor([0.2139], grad_fn=<ViewBackward0>) tensor([0.1668])
tensor([0.2490], grad_fn=<ViewBackward0>) tensor([0.2028])
tensor([0.2526], grad_fn=<ViewBackward0>) tensor([0.2028])
tensor([0.1646], grad_fn=<ViewBackward0>) tensor([0.2028])
tensor([0.1686], grad_fn=<ViewBackward0>) tensor([0.1737])
tensor([0.2377], grad_fn=<ViewBackward0>) tensor([0.2028])


In [None]:
input, targets = valid_data[200]
a = model(input)
# _ , pred = torch.max(a, dim=0)
# print(targets, pred)
print(targets, a)