# Kaggle - House Price Prediction Using Pytorch (Tabular Dataset)

1. Theoretical Knowledge of Deep Learning
2. ANN(Artificial Neural Network with Pytorch)
3. Feature Engineering {Categorical - Embedding Layer, Continuous Variable}
4. Pythonic Class to create feed forward neural networks

Dataset -> features{categorical, continuous}
Pytorch -> Tabular Dataset

Categorical Features - embedding layers
continuous Features

1. Categorical Features - embedding layers
a) Label Encoding
b) take all categorical featuers {numpy, torch -> tensors}
c) Lets take all the continuous values
d) Continuouse -> Numpy -> Torch -> tensors
e) Embedding Layers -> Categorical Features

### 1. Category Embedding

In [1]:
import pandas as pd
import numpy as np
from patsy.test_highlevel import test_categorical

In [2]:
df = pd.read_csv("houseprice.csv", usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()

In [3]:
df.shape

(1201, 10)

In [4]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MSSubClass   1201 non-null   int64  
 1   MSZoning     1201 non-null   object 
 2   LotFrontage  1201 non-null   float64
 3   LotArea      1201 non-null   int64  
 4   Street       1201 non-null   object 
 5   LotShape     1201 non-null   object 
 6   YearBuilt    1201 non-null   int64  
 7   1stFlrSF     1201 non-null   int64  
 8   2ndFlrSF     1201 non-null   int64  
 9   SalePrice    1201 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 103.2+ KB


In [6]:
for i in df.columns:
    print("Column name {} and unique values are {}".format(i, len(df[i].unique())))

Column name MSSubClass and unique values are 15
Column name MSZoning and unique values are 5
Column name LotFrontage and unique values are 110
Column name LotArea and unique values are 869
Column name Street and unique values are 2
Column name LotShape and unique values are 4
Column name YearBuilt and unique values are 112
Column name 1stFlrSF and unique values are 678
Column name 2ndFlrSF and unique values are 368
Column name SalePrice and unique values are 597


In [7]:
import datetime
datetime.datetime.now().year

2024

In [8]:
df['Total Years']=datetime.datetime.now().year-df['YearBuilt']

In [9]:
df.drop("YearBuilt", axis=1, inplace=True)
df.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', '1stFlrSF', '2ndFlrSF', 'SalePrice', 'Total Years'],
      dtype='object')

In [10]:
cat_features = ["MSSubClass", "MSZoning", "Street", "LotShape"]
out_feature = "SalePrice"

In [11]:
from sklearn.preprocessing import LabelEncoder
lbl_encoders = {}
lbl_encoders["MSSubClass"] = LabelEncoder()
lbl_encoders["MSSubClass"].fit_transform(df["MSSubClass"])

array([5, 0, 5, ..., 6, 0, 0])

In [12]:
lbl_encoders

{'MSSubClass': LabelEncoder()}

In [13]:
from sklearn.preprocessing import LabelEncoder
lbl_encoders={}
for feature in cat_features:
    lbl_encoders[feature] = LabelEncoder()
    df[feature] = lbl_encoders[feature].fit_transform(df[feature])
    
df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,1stFlrSF,2ndFlrSF,SalePrice,Total Years
0,5,3,65.0,8450,1,3,856,854,208500,21
1,0,3,80.0,9600,1,3,1262,0,181500,48
2,5,3,68.0,11250,1,0,920,866,223500,23
3,6,3,60.0,9550,1,0,961,756,140000,109
4,5,3,84.0,14260,1,0,1145,1053,250000,24
...,...,...,...,...,...,...,...,...,...,...
1455,5,3,62.0,7917,1,3,953,694,175000,25
1456,0,3,85.0,13175,1,3,2073,0,210000,46
1457,6,3,66.0,9042,1,3,1188,1152,266500,83
1458,0,3,68.0,9717,1,3,1078,0,142125,74


## Stacking and Coverting into Tensors

In [14]:
cat_features = np.stack([df['MSSubClass'], df['MSZoning'], df['Street'], df['LotShape']], 1)
cat_features

array([[5, 3, 1, 3],
       [0, 3, 1, 3],
       [5, 3, 1, 0],
       ...,
       [6, 3, 1, 3],
       [0, 3, 1, 3],
       [0, 3, 1, 3]])

In [15]:
!pip3 install torch torchvision torchaudio




In [16]:
import torch
cat_features = torch.tensor(cat_features, dtype=torch.int64)
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

## Create Continuous Variable

In [17]:
cont_features = []
for i in df.columns:
    if i in ["MSSubClass", "MSZoning", "Street", "LotShape", "SalePrice"]:
        pass
    else:
        cont_features.append(i)
        
cont_features

['LotFrontage', 'LotArea', '1stFlrSF', '2ndFlrSF', 'Total Years']

## Stacking continuous variable to a tensor

In [20]:
cont_values = np.stack([df[i].values for i in cont_features], axis=1)
cont_values = torch.tensor(cont_values, dtype=torch.float)
cont_values

tensor([[   65.,  8450.,   856.,   854.,    21.],
        [   80.,  9600.,  1262.,     0.,    48.],
        [   68., 11250.,   920.,   866.,    23.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    83.],
        [   68.,  9717.,  1078.,     0.,    74.],
        [   75.,  9937.,  1256.,     0.,    59.]])

In [21]:
cont_values.dtype

torch.float32

In [22]:
y = torch.tensor(df['SalePrice'].values, dtype=torch.float).reshape(-1, 1)
y

tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]])

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MSSubClass   1201 non-null   int64  
 1   MSZoning     1201 non-null   int64  
 2   LotFrontage  1201 non-null   float64
 3   LotArea      1201 non-null   int64  
 4   Street       1201 non-null   int64  
 5   LotShape     1201 non-null   int64  
 6   1stFlrSF     1201 non-null   int64  
 7   2ndFlrSF     1201 non-null   int64  
 8   SalePrice    1201 non-null   int64  
 9   Total Years  1201 non-null   int64  
dtypes: float64(1), int64(9)
memory usage: 103.2 KB


In [24]:
cat_features.shape, cont_values.shape, y.shape

(torch.Size([1201, 4]), torch.Size([1201, 5]), torch.Size([1201, 1]))

In [25]:
len(df['MSSubClass'].unique())

15

## Embedding Size for categorical columns

In [26]:
cat_dims = [len(df[col].unique()) for col in ["MSSubClass", "MSZoning", "Street", "LotShape"]]
cat_dims

[15, 5, 2, 4]

In [27]:
# Output dimension should be set based on the input dimension(min(50, feature dimension / 2))
embedding_dim = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
# Preprocessing step
embedding_dim

[(15, 8), (5, 3), (2, 1), (4, 2)]

In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
embed_representation = nn.ModuleList([nn.Embedding(inp, out) for inp, out in embedding_dim])
embed_representation

ModuleList(
  (0): Embedding(15, 8)
  (1): Embedding(5, 3)
  (2): Embedding(2, 1)
  (3): Embedding(4, 2)
)

In [29]:
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [30]:
cat_features = cat_features[:4]
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        [6, 3, 1, 0]])

In [31]:
pd.set_option('display.max_rows', 500)
embedding_val = []
for i,e in enumerate(embed_representation):
    embedding_val.append(e(cat_features[:,i]))

embedding_val

[tensor([[-0.5066,  0.5028,  0.4615, -0.9150,  0.7739,  2.2989,  0.0451, -0.2518],
         [-0.3281, -0.1493,  0.5125,  1.4617,  0.8649,  1.4360,  0.1237,  1.5567],
         [-0.5066,  0.5028,  0.4615, -0.9150,  0.7739,  2.2989,  0.0451, -0.2518],
         [-0.4618,  0.0544, -0.5698, -0.1233,  0.9962, -0.4440, -0.0993,  1.1003]],
        grad_fn=<EmbeddingBackward0>),
 tensor([[0.1214, 1.0048, 1.0286],
         [0.1214, 1.0048, 1.0286],
         [0.1214, 1.0048, 1.0286],
         [0.1214, 1.0048, 1.0286]], grad_fn=<EmbeddingBackward0>),
 tensor([[0.1769],
         [0.1769],
         [0.1769],
         [0.1769]], grad_fn=<EmbeddingBackward0>),
 tensor([[-1.7559, -0.5334],
         [-1.7559, -0.5334],
         [ 0.6837,  0.4833],
         [ 0.6837,  0.4833]], grad_fn=<EmbeddingBackward0>)]

In [32]:
z = torch.cat(embedding_val, 1)
z

tensor([[-0.5066,  0.5028,  0.4615, -0.9150,  0.7739,  2.2989,  0.0451, -0.2518,
          0.1214,  1.0048,  1.0286,  0.1769, -1.7559, -0.5334],
        [-0.3281, -0.1493,  0.5125,  1.4617,  0.8649,  1.4360,  0.1237,  1.5567,
          0.1214,  1.0048,  1.0286,  0.1769, -1.7559, -0.5334],
        [-0.5066,  0.5028,  0.4615, -0.9150,  0.7739,  2.2989,  0.0451, -0.2518,
          0.1214,  1.0048,  1.0286,  0.1769,  0.6837,  0.4833],
        [-0.4618,  0.0544, -0.5698, -0.1233,  0.9962, -0.4440, -0.0993,  1.1003,
          0.1214,  1.0048,  1.0286,  0.1769,  0.6837,  0.4833]],
       grad_fn=<CatBackward0>)

## Implement dropupout

In [33]:
dropout = nn.Dropout(0.4)
final_embed = dropout(z)
final_embed

tensor([[-0.0000,  0.8379,  0.0000, -0.0000,  0.0000,  0.0000,  0.0000, -0.4196,
          0.0000,  0.0000,  1.7143,  0.2949, -2.9265, -0.0000],
        [-0.5468, -0.2489,  0.0000,  2.4361,  0.0000,  0.0000,  0.2061,  2.5945,
          0.0000,  0.0000,  1.7143,  0.0000, -2.9265, -0.8890],
        [-0.0000,  0.8379,  0.7692, -0.0000,  1.2898,  0.0000,  0.0752, -0.4196,
          0.0000,  1.6747,  1.7143,  0.2949,  1.1395,  0.0000],
        [-0.0000,  0.0000, -0.9496, -0.0000,  0.0000, -0.7400, -0.1655,  1.8339,
          0.2023,  1.6747,  0.0000,  0.0000,  0.0000,  0.8055]],
       grad_fn=<MulBackward0>)

## Create a Feed Forward Neural Network

In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class FeedForwardNN(nn.Module):
    def __init__(self, embedding_dim, n_cont, out_sz, layers, p = 0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(inp, out) for inp, out in embedding_dim])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        layerList = []
        n_emb = sum((out for inp, out in embedding_dim))
        n_in = n_emb + n_cont
        
        for i in layers:
            layerList.append(nn.Linear(n_in, i))
            layerList.append(nn.ReLU(inplace=True))
            layerList.append(nn.BatchNorm1d(i))
            layerList.append(nn.Dropout(p))
            n_in = i
        layerList.append(nn.Linear(layers[-1], out_sz))
        self.layers = nn.Sequential(*layerList)
    def forward(self, x_cat, x_cont):
        embeddings = []
        for i,e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:, i]))
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return 1
    

In [35]:
len(cont_features)

5

In [36]:
torch.manual_seed(100)
model = FeedForwardNN(embedding_dim, len(cont_features), 1, [100, 50], p = 0.4)

In [37]:
model

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)

## Define Loss and Optimizer

In [39]:
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [40]:
df.shape

(1201, 10)

In [41]:
cont_values

tensor([[   65.,  8450.,   856.,   854.,    21.],
        [   80.,  9600.,  1262.,     0.,    48.],
        [   68., 11250.,   920.,   866.,    23.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    83.],
        [   68.,  9717.,  1078.,     0.,    74.],
        [   75.,  9937.,  1256.,     0.,    59.]])

In [42]:
cont_values.shape

torch.Size([1201, 5])

In [46]:
batch_size = 1200
test_size = int(batch_size * 0.15)
train_categorical = cat_features[:batch_size - test_size]
test_categorical = cat_features[batch_size - test_size:batch_size]
train_cont = cont_values[:batch_size - test_size]
test_cont = cont_values[batch_size - test_size:batch_size]
y_train = y[:batch_size - test_size]
y_test = y[batch_size - test_size:batch_size]

In [47]:
len(train_categorical), len(test_categorical), len(train_cont), len(test_cont), len(y_train), len(y_test)

(4, 0, 1020, 180, 1020, 180)

In [None]:
epochs = 5000
final_losses = []
for i in range(epochs):
    i = i + 1
    y_pred = model(train_categorical, train_cont)
    loss = torch.sqrt(loss_function(y_pred, y_train)) # RMSE
    final_losses.append(loss)
    if i % 10 == 1:
        print("Epoch number: {} and the loss {}".format(i, loss.item()))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(range(epochs), final_losses)
plt.ylabel("RMSE Loss")
plt.xlabel("Epochs")

## Validate the Test Data

In [None]:
y_pred = ""
with torch.no_grad():
    y_pred = model(test_categorical, test_cont)
    loss = torch.sqrt(loss_function(y_pred, y_test))
print("RMSE: {}".format(loss))

In [None]:
data_verify = pd.DataFrame(y_test.tolist(), columns=["Test"])

In [None]:
data_predicted = pd.DataFrame(y_pred.tolist(), columns=["Prediction"])

In [None]:
data_predicted

In [None]:
final_output = pd.concat([data_verify, data_predicted], axis=1)
final_output['Difference'] = final_output['Test'] - final_output['Prediction']
final_output.head()

In [None]:
## Saving the Model
torch.save(model, 'HousePrice.pt')

In [None]:
torch.save(model.state_dict(), 'HouseWeights.pt')

In [None]:
## Loading the saved Model
embs_size = [(15, 8), (5, 3), (2, 1), (4, 2)]
model1 = FeedForwardNN(embs_size, 5, 1, [100, 50], p=0.4)

In [None]:
model1.load_state_dict(torch.load('HouseWeights.pt'))

In [None]:
model1.eval()