In [1]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Choose the compute backend in order of preference: CUDA, then Apple MPS,
# and finally the CPU when no accelerator is reported as available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [2]:
class HousingPriceDataset(Dataset):
    """Housing price regression dataset loaded from a CSV file.

    The ``Price`` column is used as the target and every other column as a
    feature. Rows are divided 90/10 into train and test partitions with a
    fixed ``random_state``. The feature preprocessor and the target scaler
    are fitted on the training partition only and then applied to the test
    partition. Both partitions are kept on the instance; ``train`` selects
    which one ``len()`` and indexing expose.
    """

    def __init__(self, csv_file, train=True):
        """
        Arguments:
            csv_file (string): Path to the csv file with housing price data.
            train (bool): When True the instance serves the training
                partition, otherwise the held-out test partition.
        """
        self.train = train
        # Honour the caller-supplied path instead of a hard-coded file name.
        self.df = pd.read_csv(csv_file)
        self.ydf = self.df[['Price']]
        self.Xdf = self.df.drop(columns=['Price'])
        self.Xdf_tr, self.Xdf_te, self.ydf_tr, self.ydf_te = train_test_split(
            self.Xdf, self.ydf, test_size=0.1, random_state=42
        )
        self.preprocessor = ColumnTransformer(
            [("Categorical", OneHotEncoder(), ['Bedrooms', 'Bathrooms', 'Neighborhood']),
             ("Numerical", StandardScaler(), ['SquareFeet', 'YearBuilt'])],
            # OneHotEncoder emits sparse matrices; a threshold of 0 makes the
            # combined output always dense so it can be turned into a tensor.
            sparse_threshold=0,
        )
        self.target_scaler = StandardScaler()

        # Fit on the training partition only, then reuse the fitted
        # transformers on the test partition.
        self.Xtr = torch.tensor(self.preprocessor.fit_transform(self.Xdf_tr), dtype=torch.float32)
        self.Xte = torch.tensor(self.preprocessor.transform(self.Xdf_te), dtype=torch.float32)
        self.ytr = torch.tensor(self.target_scaler.fit_transform(self.ydf_tr), dtype=torch.float32)
        self.yte = torch.tensor(self.target_scaler.transform(self.ydf_te), dtype=torch.float32)

    def _active_split(self):
        """Return the ``(features, targets)`` tensors of the selected partition."""
        if self.train:
            return self.Xtr, self.ytr
        return self.Xte, self.yte

    def __len__(self):
        """Return the number of rows in the selected partition."""
        return len(self._active_split()[0])

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at ``idx`` in the selected partition."""
        features, targets = self._active_split()
        return (features[idx], targets[idx])

In [3]:
# Single source of truth for the data file and the mini-batch size.
DATA_CSV = 'housing_price_dataset.csv'
BATCH_SIZE = 8

# One dataset object per partition, both built from the same CSV.
training_data = HousingPriceDataset(csv_file=DATA_CSV, train=True)
test_data = HousingPriceDataset(csv_file=DATA_CSV, train=False)

# Only the training batches are shuffled; evaluation does not benefit from a
# random order, so the test loader iterates in a fixed order.
train_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [4]:
# Define model
class NeuralNetwork(nn.Module):
    """Fully connected regressor with two ReLU hidden layers and one output.

    Arguments:
        in_features (int): Width of each input row. The default of 12 keeps
            the originally hard-coded input size.
        hidden_features (int): Width of both hidden layers. The default of 64
            keeps the originally hard-coded hidden size.
    """

    def __init__(self, in_features=12, hidden_features=64):
        super().__init__()
        # The attribute name and layer order are kept so that the parameter
        # names in ``state_dict()`` stay the same.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, 1)
        )

    def forward(self, x):
        """Return one predicted value per input row (last dimension of size 1)."""
        # The output is an unbounded regression value, not class logits.
        prediction = self.linear_relu_stack(x)
        return prediction

# Build the network and move it to the selected device, then pair it with a
# Huber loss and a plain SGD optimiser over all of its parameters.
model = NeuralNetwork()
model = model.to(device)
loss_fn = nn.HuberLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

In [5]:
def train(dataloader, model, loss_fn, optimizer, device=None):
    """Run one optimisation pass over ``dataloader``.

    Arguments:
        dataloader: Iterable yielding ``(X, y)`` tensor batches.
        model (nn.Module): Model to optimise; it is put into training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimiser that updates ``model``'s parameters.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU if the model has no
            parameters), so no global variable is required.

    Returns:
        float: Mean of the per-batch losses, or NaN if ``dataloader`` yielded
        no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    loss_sum = None
    num_batches = 0
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Accumulate as a tensor so that the value is only copied to the host
        # once, after the loop.
        batch_loss = loss.detach()
        loss_sum = batch_loss if loss_sum is None else loss_sum + batch_loss
        num_batches += 1

    if num_batches == 0:
        return float("nan")
    return loss_sum.item() / num_batches

In [6]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
    test_loss /= num_batches
    return test_loss

In [7]:
# Each epoch runs one training pass followed by one evaluation pass on the
# test loader; the value printed per epoch is the evaluation loss.
epochs = 100
for epoch_number in range(1, epochs + 1):
    train(train_dataloader, model, loss_fn, optimizer)
    test_loss = test(test_dataloader, model, loss_fn)
    print(f"Epoch {epoch_number} avg loss: {test_loss:>8f}")
print("Done!")

Epoch 1 avg loss: 0.210851
Epoch 2 avg loss: 0.205750
Epoch 3 avg loss: 0.204906
Epoch 4 avg loss: 0.204557
Epoch 5 avg loss: 0.204444
Epoch 6 avg loss: 0.204215
Epoch 7 avg loss: 0.204116
Epoch 8 avg loss: 0.204286
Epoch 9 avg loss: 0.203951
Epoch 10 avg loss: 0.203993
Epoch 11 avg loss: 0.203856
Epoch 12 avg loss: 0.203883
Epoch 13 avg loss: 0.203881
Epoch 14 avg loss: 0.203792
Epoch 15 avg loss: 0.204088
Epoch 16 avg loss: 0.203905
Epoch 17 avg loss: 0.203795
Epoch 18 avg loss: 0.203828
Epoch 19 avg loss: 0.203808
Epoch 20 avg loss: 0.203741
Epoch 21 avg loss: 0.203858
Epoch 22 avg loss: 0.203714
Epoch 23 avg loss: 0.203791
Epoch 24 avg loss: 0.203821
Epoch 25 avg loss: 0.203724
Epoch 26 avg loss: 0.203822
Epoch 27 avg loss: 0.203639
Epoch 28 avg loss: 0.203907
Epoch 29 avg loss: 0.204171
Epoch 30 avg loss: 0.203773
Epoch 31 avg loss: 0.203688
Epoch 32 avg loss: 0.203685
Epoch 33 avg loss: 0.203694


KeyboardInterrupt: 