In [358]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Wczytanie i preprocessing danych

In [359]:
house_data = pd.read_csv("train_data.csv")

print(house_data.N_elevators.describe())

count    4124.000000
mean       11.055771
std         7.717030
min         0.000000
25%         5.000000
50%        11.000000
75%        16.000000
max        27.000000
Name: N_elevators, dtype: float64


In [None]:
house_data = house_data.drop(columns=[])
house_data.HeatingType = (house_data.HeatingType == "individual_heating").astype(int)
house_data.AptManageType = (house_data.AptManageType == "management_in_trust").astype(int)

categorical_columns = house_data.select_dtypes(include=['object']).columns
house_data = pd.get_dummies(house_data, columns=categorical_columns)

# We have 3 classes - 1. SalePrice < 100000, 2. 1000000 <= SalePrice < 350000, 3. SalePrice >= 350000
house_data['SalePrice'] = pd.cut(house_data['SalePrice'], bins=[0, 100000, 350000, 1000000], labels=[0, 1, 2])
house_data = pd.get_dummies(house_data, columns=['SalePrice'])

# -> int
house_data = house_data.astype(int)

house_data.head()

Unnamed: 0,YearBuilt,Size(sqf),Floor,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),N_manager,N_elevators,N_FacilitiesInApt,...,SubwayStation_Banwoldang,SubwayStation_Chil-sung-market,SubwayStation_Daegu,SubwayStation_Kyungbuk_uni_hospital,SubwayStation_Myung-duk,SubwayStation_Sin-nam,SubwayStation_no_subway_nearby,SalePrice_0,SalePrice_1,SalePrice_2
0,2006,814,3,1,1,111,184,3,0,5,...,0,0,0,1,0,0,0,0,1,0
1,1985,587,8,1,0,80,76,2,2,3,...,0,0,1,0,0,0,0,1,0,0
2,1985,587,6,1,0,80,76,2,2,3,...,0,0,1,0,0,0,0,1,0,0
3,2006,2056,8,1,1,249,536,5,11,5,...,0,0,0,0,0,1,0,0,0,1
4,1992,644,2,1,0,142,79,4,8,3,...,0,0,0,0,1,0,0,1,0,0


## Niezbalansowanie danych

In [361]:
# Print number of classes
print(house_data['SalePrice_0'].sum())
print(house_data['SalePrice_1'].sum())
print(house_data['SalePrice_2'].sum())

562
2992
570


## Przygotowanie zbiorów

In [362]:
train = house_data.sample(frac=0.8, random_state=200)  # random state is a seed value
test = house_data.drop(train.index)

# Shape
print(train.dtypes)

YearBuilt                              int64
Size(sqf)                              int64
Floor                                  int64
HeatingType                            int64
AptManageType                          int64
N_Parkinglot(Ground)                   int64
N_Parkinglot(Basement)                 int64
N_manager                              int64
N_elevators                            int64
N_FacilitiesInApt                      int64
N_FacilitiesNearBy(Total)              int64
N_SchoolNearBy(Total)                  int64
HallwayType_corridor                   int64
HallwayType_mixed                      int64
HallwayType_terraced                   int64
TimeToBusStop_0~5min                   int64
TimeToBusStop_10min~15min              int64
TimeToBusStop_5min~10min               int64
TimeToSubway_0-5min                    int64
TimeToSubway_10min~15min               int64
TimeToSubway_15min~20min               int64
TimeToSubway_5min~10min                int64
TimeToSubw

In [None]:
train_y = train[['SalePrice_0', 'SalePrice_1', 'SalePrice_2']]
train_x = train.drop(columns=['SalePrice_0', 'SalePrice_1', 'SalePrice_2'])
test_y = test[['SalePrice_0', 'SalePrice_1', 'SalePrice_2']]
test_x = test.drop(columns=['SalePrice_0', 'SalePrice_1', 'SalePrice_2'])

# Normalize data
train_x = (train_x - train_x.mean()) / train_x.std()
test_x = (test_x - test_x.mean()) / test_x.std()

In [364]:
import torch
import numpy as np
import torch.utils.data as data
import torch.nn as nn

train_dataset = data.TensorDataset(
    torch.from_numpy(train_x.values), torch.from_numpy(train_y.values)
)
test_dataset = data.TensorDataset(
    torch.from_numpy(test_x.values), torch.from_numpy(test_y.values)
)

next(iter(train_dataset))

(tensor([ 1.2634, -0.1147,  0.4035,  0.2167,  0.2580, -0.4882,  1.4904,  0.2257,
          1.1610,  1.3865,  1.1980,  1.3756, -0.3510, -0.6462,  0.8238,  0.5536,
         -0.0958, -0.5395,  1.0589, -0.3819, -0.4482, -0.4924, -0.2009, -0.3819,
         -0.3876, -0.1395, -0.1202, -0.6091,  1.6798, -0.3521, -0.2660],
        dtype=torch.float64),
 tensor([0, 0, 1]))

## Sieć neuronowa

In [365]:
import torch.nn as nn
import torch.nn.functional as F

# GPU operations have a separate seed we also want to set
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)

# Additionally, some operations on a GPU are implemented stochastic for efficiency
# We want to ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


class HouseNet(nn.Module):
    def __init__(self):
        super(HouseNet, self).__init__()
        self.lin1 =nn.Linear(31, 48)
        self.bn1 = nn.BatchNorm1d(48)
        self.act1 =nn.ReLU()
        self.lin2 =nn.Linear(48, 48)
        self.bn2 = nn.BatchNorm1d(48)
        self.act2 =nn.ReLU()
        self.lin3 =nn.Linear(48, 3)

    def forward(self, x):
        x = self.lin1(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.lin2(x)
        x = self.bn2(x)
        x = self.act2(x)
        x = self.lin3(x)
        return x

In [366]:
# Prepare model
device = torch.device("cpu")
model = HouseNet()
model.to(device)

HouseNet(
  (lin1): Linear(in_features=31, out_features=48, bias=True)
  (bn1): BatchNorm1d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU()
  (lin2): Linear(in_features=48, out_features=48, bias=True)
  (bn2): BatchNorm1d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act2): ReLU()
  (lin3): Linear(in_features=48, out_features=3, bias=True)
)

In [367]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_module = nn.MSELoss()

train_data_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=True)
test_data_loader = data.DataLoader(
    test_dataset, batch_size=len(test_dataset), shuffle=True, drop_last=False
)

## Trenowanie modelu

In [368]:
# Train model
model.train()

# Training loop
for epoch in range(200):
    for data_inputs, data_labels in train_data_loader:
        ## Step 1: Move input data to device (only strictly necessary if we use GPU)
        data_inputs = data_inputs.to(device)
        data_labels = data_labels.to(device)

        ## Step 2: Run the model on the input data
        preds = model(data_inputs.float())
        preds = preds.squeeze(
            dim=1
        )  # Output is [Batch size, 1], but we want [Batch size]

        ## Step 3: Calculate the loss
        loss = loss_module(preds, data_labels.float())

        ## Step 4: Perform backpropagation
        # Before calculating the gradients, we need to ensure that they are all zero.
        # The gradients would not be overwritten, but actually added to the existing ones.
        optimizer.zero_grad()
        # Perform backpropagation
        loss.backward()

        ## Step 5: Update the parameters
        optimizer.step()
    print(f"Epoch: {epoch+1}, loss: {loss.item():.3}")

Epoch: 1, loss: 0.115
Epoch: 2, loss: 0.0766
Epoch: 3, loss: 0.0919
Epoch: 4, loss: 0.0795
Epoch: 5, loss: 0.0875
Epoch: 6, loss: 0.0697
Epoch: 7, loss: 0.0798
Epoch: 8, loss: 0.0472
Epoch: 9, loss: 0.0848
Epoch: 10, loss: 0.0712
Epoch: 11, loss: 0.0779
Epoch: 12, loss: 0.094
Epoch: 13, loss: 0.0623
Epoch: 14, loss: 0.0934
Epoch: 15, loss: 0.0794
Epoch: 16, loss: 0.0655
Epoch: 17, loss: 0.0655
Epoch: 18, loss: 0.0903
Epoch: 19, loss: 0.0544
Epoch: 20, loss: 0.0655
Epoch: 21, loss: 0.0683
Epoch: 22, loss: 0.07
Epoch: 23, loss: 0.0692
Epoch: 24, loss: 0.0798
Epoch: 25, loss: 0.0673
Epoch: 26, loss: 0.0567
Epoch: 27, loss: 0.0494
Epoch: 28, loss: 0.0531
Epoch: 29, loss: 0.0638
Epoch: 30, loss: 0.0808
Epoch: 31, loss: 0.0826
Epoch: 32, loss: 0.0653
Epoch: 33, loss: 0.0653
Epoch: 34, loss: 0.0762
Epoch: 35, loss: 0.0659
Epoch: 36, loss: 0.0655
Epoch: 37, loss: 0.0678
Epoch: 38, loss: 0.0576
Epoch: 39, loss: 0.063
Epoch: 40, loss: 0.053
Epoch: 41, loss: 0.0595
Epoch: 42, loss: 0.0611
Epoch: 

## Testowanie modelu

In [None]:
model.eval()

for data_inputs, data_labels in test_data_loader:
    data_inputs = data_inputs.to(device)
    data_labels = data_labels.to(device)

    preds = model(data_inputs.float())
    preds = preds.squeeze(dim=1)

    # round to int
    preds = torch.round(preds)

    predicted = 0
    for i in range(len(data_labels)):
        if data_labels[i][0] == preds[i][0] and data_labels[i][1] == preds[i][1] and data_labels[i][2] == preds[i][2]:
            predicted += 1
    
    print(f"Accuracy: {predicted/len(data_labels)}")

Accuracy: 0.8703030303030304
