In [1]:
import pandas as pd

In [2]:
# Read only the columns this notebook works with, then discard every row
# that has a missing value in any of them.
selected_columns = [
    "SalePrice",
    "MSSubClass",
    "MSZoning",
    "LotFrontage",
    "LotArea",
    "Street",
    "LotShape",
    "YearBuilt",
    "1stFlrSF",
    "2ndFlrSF",
]
data = pd.read_csv("houseprice.csv", usecols=selected_columns).dropna()

In [3]:
# Preview the first five rows to sanity-check the load.
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000


In [4]:
# (rows, columns) remaining after the incomplete rows were dropped.
data.shape

(1201, 10)

In [5]:
# Per-column dtype and non-null count; confirms no missing values remain.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1201 entries, 0 to 1459
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MSSubClass   1201 non-null   int64  
 1   MSZoning     1201 non-null   object 
 2   LotFrontage  1201 non-null   float64
 3   LotArea      1201 non-null   int64  
 4   Street       1201 non-null   object 
 5   LotShape     1201 non-null   object 
 6   YearBuilt    1201 non-null   int64  
 7   1stFlrSF     1201 non-null   int64  
 8   2ndFlrSF     1201 non-null   int64  
 9   SalePrice    1201 non-null   int64  
dtypes: float64(1), int64(6), object(3)
memory usage: 103.2+ KB


In [6]:
# Print how many distinct values each column holds, to help tell
# low-cardinality (categorical) columns from continuous ones.
for column in data.columns:
    distinct_count = len(data[column].unique())
    print(f"Column name : {column}, Unique Value : {distinct_count}")

Column name : MSSubClass, Unique Value : 15
Column name : MSZoning, Unique Value : 5
Column name : LotFrontage, Unique Value : 110
Column name : LotArea, Unique Value : 869
Column name : Street, Unique Value : 2
Column name : LotShape, Unique Value : 4
Column name : YearBuilt, Unique Value : 112
Column name : 1stFlrSF, Unique Value : 678
Column name : 2ndFlrSF, Unique Value : 368
Column name : SalePrice, Unique Value : 597


In [7]:
import datetime
# Show the current calendar year (evaluated at run time, so this output
# changes from one year to the next).
datetime.datetime.now().year

2023

In [8]:
# Age of each house in years, measured against a FIXED reference year.
# Using datetime.datetime.now().year here would silently shift every value
# (and every model trained on it) each calendar year, so the notebook could
# not reproduce its own outputs. 2023 is the year the recorded outputs of
# this notebook were produced with.
REFERENCE_YEAR = 2023
data["House_Old"] = REFERENCE_YEAR - data["YearBuilt"]

In [9]:
# Confirm the new House_Old column was added as the last column.
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,YearBuilt,1stFlrSF,2ndFlrSF,SalePrice,House_Old
0,60,RL,65.0,8450,Pave,Reg,2003,856,854,208500,20
1,20,RL,80.0,9600,Pave,Reg,1976,1262,0,181500,47
2,60,RL,68.0,11250,Pave,IR1,2001,920,866,223500,22
3,70,RL,60.0,9550,Pave,IR1,1915,961,756,140000,108
4,60,RL,84.0,14260,Pave,IR1,2000,1145,1053,250000,23


In [10]:
# YearBuilt is superseded by House_Old. Rebinding `data` (instead of
# inplace=True) together with errors="ignore" keeps this cell re-runnable:
# a second execution is a no-op rather than a KeyError.
data = data.drop(columns="YearBuilt", errors="ignore")

In [11]:
# Columns treated as categorical; they are integer-encoded in a later cell.
cat_features = ["MSSubClass", "MSZoning", "Street", "LotShape"]
# Name of the column used as the prediction target.
out_features = "SalePrice"

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# Integer-encode every categorical column.
# Each column gets its OWN LabelEncoder, stored in `encoders`, so the
# code -> original label mapping of any column can be recovered later via
# encoders[col].inverse_transform(...) or encoders[col].classes_.
# Refitting a single shared encoder would keep only the last column's classes.
encoders = {}
for feature in cat_features:
    encoder = LabelEncoder()
    data[feature] = encoder.fit_transform(data[feature])
    encoders[feature] = encoder
# After the loop `encoder` is still bound to the encoder of the last column,
# exactly as before, for any later cell that refers to it.

data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,1stFlrSF,2ndFlrSF,SalePrice,House_Old
0,5,3,65.0,8450,1,3,856,854,208500,20
1,0,3,80.0,9600,1,3,1262,0,181500,47
2,5,3,68.0,11250,1,0,920,866,223500,22
3,6,3,60.0,9550,1,0,961,756,140000,108
4,5,3,84.0,14260,1,0,1145,1053,250000,23


In [14]:
import numpy as np
# Place the encoded categorical columns side by side in one 2-D array
# (one row per house, one column per categorical feature).
# NOTE: this rebinds `cat_features` from the list of column names to an array.
cat_features = np.stack(
    [data[name] for name in ("MSSubClass", "MSZoning", "Street", "LotShape")],
    axis=1,
)
cat_features

array([[5, 3, 1, 3],
       [0, 3, 1, 3],
       [5, 3, 1, 0],
       ...,
       [6, 3, 1, 3],
       [0, 3, 1, 3],
       [0, 3, 1, 3]])

In [15]:
import torch
# Convert the categorical codes to an int64 tensor; the codes are later used
# to index nn.Embedding tables, which require integer indices.
cat_features = torch.tensor(cat_features, dtype = torch.int64)
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [16]:
# Every column that is neither categorical nor the target is continuous.
non_continuous = ("MSSubClass", "MSZoning", "Street", "LotShape", "SalePrice")
cont_features = [column for column in data.columns if column not in non_continuous]

cont_features

['LotFrontage', 'LotArea', '1stFlrSF', '2ndFlrSF', 'House_Old']

In [17]:
# Pack the continuous columns into a single float tensor,
# one column per entry of cont_features.
cont_values = torch.tensor(
    np.column_stack([data[name].values for name in cont_features]),
    dtype=torch.float,
)
cont_values

tensor([[   65.,  8450.,   856.,   854.,    20.],
        [   80.,  9600.,  1262.,     0.,    47.],
        [   68., 11250.,   920.,   866.,    22.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    82.],
        [   68.,  9717.,  1078.,     0.,    73.],
        [   75.,  9937.,  1256.,     0.,    58.]])

In [51]:
# Number of rows in the continuous-feature tensor (should equal the row count of data).
len(cont_values)

1201

In [19]:
# Target as a float column vector: one row per house, a single column.
sale_prices = data["SalePrice"].values
y = torch.tensor(sale_prices, dtype=torch.float).reshape(-1, 1)
y

tensor([[208500.],
        [181500.],
        [223500.],
        ...,
        [266500.],
        [142125.],
        [147500.]])

In [20]:
# All three tensors must agree on the number of rows.
for tensor in (cat_features, cont_values, y):
    print(tensor.shape)

torch.Size([1201, 4])
torch.Size([1201, 5])
torch.Size([1201, 1])


In [21]:
# Number of distinct codes in each categorical column; this becomes the
# number of rows of that column's embedding table.
categorical_columns = ("MSSubClass", "MSZoning", "Street", "LotShape")
cat_dims = [len(data[name].unique()) for name in categorical_columns]
cat_dims

[15, 5, 2, 4]

In [22]:
# (number of categories, embedding width) per categorical column.
# The width is half the category count rounded up, capped at 50.
embedding_dim = [
    (n_categories, min(50, (n_categories + 1) // 2))
    for n_categories in cat_dims
]
embedding_dim

[(15, 8), (5, 3), (2, 1), (4, 2)]

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [24]:
# One embedding table per categorical column, created in column order.
embed_representation = nn.ModuleList()
for n_categories, width in embedding_dim:
    embed_representation.append(nn.Embedding(n_categories, width))
embed_representation

ModuleList(
  (0): Embedding(15, 8)
  (1): Embedding(5, 3)
  (2): Embedding(2, 1)
  (3): Embedding(4, 2)
)

In [25]:
# Re-display the categorical index tensor that will be fed to the embeddings.
cat_features

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        ...,
        [6, 3, 1, 3],
        [0, 3, 1, 3],
        [0, 3, 1, 3]])

In [26]:
# First four rows of the categorical tensor, kept as a small sample for inspection.
cat_featuresz = cat_features[:4]
cat_featuresz

tensor([[5, 3, 1, 3],
        [0, 3, 1, 3],
        [5, 3, 1, 0],
        [6, 3, 1, 0]])

In [27]:
pd.set_option("display.max_rows", 500)
# Look up each categorical column in its own embedding table:
# column k of cat_features goes through the k-th embedding.
embedding_val = [
    table(cat_features[:, column])
    for column, table in enumerate(embed_representation)
]

In [28]:
# Inspect the embedding outputs: a list with one tensor per categorical column.
embedding_val

[tensor([[-0.5685, -1.1374, -0.6512,  ...,  0.8755,  0.9425, -0.6368],
         [-1.2457, -0.8277, -0.5427,  ...,  1.9971,  0.1442,  0.3059],
         [-0.5685, -1.1374, -0.6512,  ...,  0.8755,  0.9425, -0.6368],
         ...,
         [ 3.5337,  0.3586, -0.1212,  ...,  0.0297, -1.3739,  1.0510],
         [-1.2457, -0.8277, -0.5427,  ...,  1.9971,  0.1442,  0.3059],
         [-1.2457, -0.8277, -0.5427,  ...,  1.9971,  0.1442,  0.3059]],
        grad_fn=<EmbeddingBackward0>),
 tensor([[-1.4118, -0.5453, -0.3487],
         [-1.4118, -0.5453, -0.3487],
         [-1.4118, -0.5453, -0.3487],
         ...,
         [-1.4118, -0.5453, -0.3487],
         [-1.4118, -0.5453, -0.3487],
         [-1.4118, -0.5453, -0.3487]], grad_fn=<EmbeddingBackward0>),
 tensor([[1.2303],
         [1.2303],
         [1.2303],
         ...,
         [1.2303],
         [1.2303],
         [1.2303]], grad_fn=<EmbeddingBackward0>),
 tensor([[0.2762, 0.2792],
         [0.2762, 0.2792],
         [0.4538, 0.2937],
     

In [29]:
# Join the per-column embeddings along the feature axis so each row
# becomes one concatenated embedding vector.
z = torch.cat(embedding_val, dim=1)
z

tensor([[-0.5685, -1.1374, -0.6512,  ...,  1.2303,  0.2762,  0.2792],
        [-1.2457, -0.8277, -0.5427,  ...,  1.2303,  0.2762,  0.2792],
        [-0.5685, -1.1374, -0.6512,  ...,  1.2303,  0.4538,  0.2937],
        ...,
        [ 3.5337,  0.3586, -0.1212,  ...,  1.2303,  0.2762,  0.2792],
        [-1.2457, -0.8277, -0.5427,  ...,  1.2303,  0.2762,  0.2792],
        [-1.2457, -0.8277, -0.5427,  ...,  1.2303,  0.2762,  0.2792]],
       grad_fn=<CatBackward0>)

In [30]:
# Dropout layer that zeroes each element with probability 0.4.
dropout = nn.Dropout(p=0.4)

In [31]:
# Apply dropout to the concatenated embeddings; the zeroed entries are chosen
# at random, so the displayed values differ between runs unless a seed is set.
final_embed = dropout(z)
final_embed

tensor([[-0.0000, -1.8957, -1.0853,  ...,  2.0505,  0.4604,  0.4653],
        [-0.0000, -0.0000, -0.9046,  ...,  0.0000,  0.4604,  0.0000],
        [-0.9475, -0.0000, -1.0853,  ...,  2.0505,  0.0000,  0.4895],
        ...,
        [ 0.0000,  0.0000, -0.2021,  ...,  0.0000,  0.4604,  0.0000],
        [-2.0762, -0.0000, -0.9046,  ...,  0.0000,  0.4604,  0.0000],
        [-2.0762, -1.3795, -0.0000,  ...,  0.0000,  0.4604,  0.4653]],
       grad_fn=<MulBackward0>)

In [32]:
import torch
import torch.nn as nn
# `F` must be torch.nn.functional (relu, mse_loss, ...). `torch.functional`
# is a different module and would rebind `F` to something without those ops.
import torch.nn.functional as F

In [153]:
class FeedForwardNN(nn.Module):
    """Feed-forward network over embedded categorical and continuous inputs.

    Each categorical column is passed through its own ``nn.Embedding``; the
    embeddings are concatenated and regularised with dropout. The continuous
    columns are batch-normalised, appended to the embeddings, and the result
    goes through a stack of ``Linear -> ReLU -> BatchNorm1d -> Dropout``
    blocks followed by a final ``Linear`` output layer.
    """

    def __init__(self, embedding_dim, n_cont, out_sz, layers, p=0.5):
        """Build the network.

        Args:
            embedding_dim: iterable of ``(num_categories, embedding_width)``
                pairs, one per categorical column, in column order.
            n_cont: number of continuous input columns.
            out_sz: number of output units.
            layers: sequence of hidden-layer widths. May be empty, in which
                case the output layer is applied directly to the inputs.
            p: dropout probability used after the embeddings and after every
                hidden block.
        """
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(inp, out) for inp, out in embedding_dim])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the first Linear layer: all embedding widths plus the
        # continuous columns.
        n_emb = sum(out for _, out in embedding_dim)
        n_in = n_emb + n_cont

        layerlist = []
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width
        # n_in now holds the width feeding the output layer: the last hidden
        # width, or the raw input width when `layers` is empty. Using n_in
        # (rather than layers[-1]) avoids an IndexError for an empty list.
        layerlist.append(nn.Linear(n_in, out_sz))
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Run a forward pass.

        Args:
            x_cat: 2-D integer tensor; column ``i`` holds the category indices
                for the ``i``-th embedding table.
            x_cont: 2-D float tensor with ``n_cont`` columns.

        Returns:
            The output of the final linear layer, one row per input row.
        """
        embeddings = [embed(x_cat[:, i]) for i, embed in enumerate(self.embeds)]
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)

        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        return self.layers(x)

In [154]:
# Re-check the (num_categories, embedding_width) pairs passed to the model.
embedding_dim

[(15, 8), (5, 3), (2, 1), (4, 2)]

In [155]:
# Number of continuous input columns (the model's n_cont argument).
len(cont_features)

5

In [156]:
# Seed the RNG first so the weight initialisation is repeatable.
torch.manual_seed(100)
model = FeedForwardNN(
    embedding_dim=embedding_dim,
    n_cont=len(cont_features),
    out_sz=1,
    layers=[100, 50],
    p=0.4,
)

In [157]:
# Print the module tree to verify layer sizes.
model

FeedForwardNN(
  (embeds): ModuleList(
    (0): Embedding(15, 8)
    (1): Embedding(5, 3)
    (2): Embedding(2, 1)
    (3): Embedding(4, 2)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=19, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)

In [158]:
# Mean-squared-error criterion and an Adam optimiser over all model parameters.
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

In [159]:
# (rows, columns) of the prepared frame.
data.shape

(1201, 10)

In [160]:
# Continuous inputs, still on their raw scale; the model batch-normalises
# them internally (bn_cont) before use.
cont_values

tensor([[   65.,  8450.,   856.,   854.,    20.],
        [   80.,  9600.,  1262.,     0.,    47.],
        [   68., 11250.,   920.,   866.,    22.],
        ...,
        [   66.,  9042.,  1188.,  1152.,    82.],
        [   68.,  9717.,  1078.,     0.,    73.],
        [   75.,  9937.,  1256.,     0.,    58.]])

In [161]:
# (rows, number of continuous features).
cont_values.shape

torch.Size([1201, 5])

In [162]:
# :::: TRAIN TEST SPLIT :::::
# Sequential (unshuffled) split over the first `batch_size` rows: the leading
# 85% are used for training and the trailing 15% for testing. Rows at index
# `batch_size` and beyond are not used.
batch_size = 1200
test_size = int(batch_size * 0.15)
split_at = batch_size - test_size

train_cat, test_cat = cat_features[:split_at], cat_features[split_at:batch_size]
train_cont, test_cont = cont_values[:split_at], cont_values[split_at:batch_size]
y_train, y_test = y[:split_at], y[split_at:batch_size]

In [163]:
# Row counts of each train/test pair, printed train first then test.
split_pairs = ((train_cat, test_cat), (train_cont, test_cont), (y_train, y_test))
for train_part, test_part in split_pairs:
    print(len(train_part))
    print(len(test_part))

1020
180
1020
180
1020
180


In [164]:
# Full shapes of every split tensor, in train/test order per input kind.
for split_tensor in (train_cat, test_cat, train_cont, test_cont, y_train, y_test):
    print(split_tensor.shape)

torch.Size([1020, 4])
torch.Size([180, 4])
torch.Size([1020, 5])
torch.Size([180, 5])
torch.Size([1020, 1])
torch.Size([180, 1])


In [165]:
# Full-batch training: every epoch runs one forward/backward pass over the
# whole training split and takes one optimiser step.
epochs = 5000
final_losses = []

# Make sure dropout and batch-norm run in training mode even if an earlier
# cell switched the model to eval mode.
model.train()

for i in range(1, epochs + 1):
    y_pred = model(train_cat, train_cont)
    loss = torch.sqrt(loss_function(y_pred, y_train))  # RMSE

    # Record a detached copy. Appending `loss` itself would store a tensor
    # that still carries its autograd history (grad_fn) for every epoch and
    # that cannot be converted to NumPy for plotting.
    final_losses.append(loss.detach())

    if i % 10 == 1:
        print("Epoch number: {} and the loss : {}".format(i, loss.item()))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch number: 1 and the loss : 200496.765625
Epoch number: 11 and the loss : 200493.578125
Epoch number: 21 and the loss : 200489.125
Epoch number: 31 and the loss : 200482.546875
Epoch number: 41 and the loss : 200473.21875
Epoch number: 51 and the loss : 200462.140625
Epoch number: 61 and the loss : 200447.328125
Epoch number: 71 and the loss : 200429.125
Epoch number: 81 and the loss : 200409.15625
Epoch number: 91 and the loss : 200382.0625
Epoch number: 101 and the loss : 200354.703125
Epoch number: 111 and the loss : 200322.890625
Epoch number: 121 and the loss : 200290.21875
Epoch number: 131 and the loss : 200254.078125
Epoch number: 141 and the loss : 200212.8125
Epoch number: 151 and the loss : 200162.34375
Epoch number: 161 and the loss : 200115.9375
Epoch number: 171 and the loss : 200057.453125
Epoch number: 181 and the loss : 200002.046875
Epoch number: 191 and the loss : 199942.40625
Epoch number: 201 and the loss : 199877.28125
Epoch number: 211 and the loss : 199809.39