In [1]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Choose the compute backend in priority order: CUDA, then Apple MPS,
# and finally the CPU when no accelerator is reported as available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [2]:
class HousingPriceDataset(Dataset):
    """Housing price dataset exposing either the training or the test split.

    The CSV is loaded, separated into features and the ``Price`` target, and
    split 90/10 with a fixed seed. Features are one-hot encoded / standardized
    and the target is standardized, with every transformer fitted on the
    training portion only and then applied to the test portion.

    Both splits are materialized as float32 tensors on construction; the
    ``train`` flag only selects which of them ``len()`` and indexing use.
    """

    def __init__(self, csv_file, train=True):
        """
        Arguments:
            csv_file (string): Path to the csv file with housing price data.
            train (bool): If True, serve the training split; otherwise serve
                the held-out test split.
        """
        self.train = train
        # Read from the path the caller supplied (previously this argument was
        # ignored in favor of a hard-coded file name).
        self.df = pd.read_csv(csv_file)
        self.ydf = self.df[['Price']]
        self.Xdf = self.df.drop(columns=['Price'])
        # A fixed random_state makes separately constructed train/test
        # instances agree on the same partition of the rows.
        self.Xdf_tr, self.Xdf_te, self.ydf_tr, self.ydf_te = train_test_split(
            self.Xdf, self.ydf, test_size=0.1, random_state=42
        )
        self.preprocessor = ColumnTransformer(
            [("Categorical", OneHotEncoder(), ['Bedrooms', 'Bathrooms', 'Neighborhood']),
             ("Numerical", StandardScaler(), ['SquareFeet', 'YearBuilt'])],
            # Always return a dense array: a sparse result cannot be turned
            # into a torch tensor by the conversions below.
            sparse_threshold=0,
        )
        self.target_scaler = StandardScaler()

        # Fit on the training rows only, then reuse the fitted statistics for
        # the test rows so no test information leaks into preprocessing.
        self.Xtr = torch.tensor(self.preprocessor.fit_transform(self.Xdf_tr), dtype=torch.float32)
        self.Xte = torch.tensor(self.preprocessor.transform(self.Xdf_te), dtype=torch.float32)
        self.ytr = torch.tensor(self.target_scaler.fit_transform(self.ydf_tr), dtype=torch.float32)
        self.yte = torch.tensor(self.target_scaler.transform(self.ydf_te), dtype=torch.float32)

    def _active_split(self):
        """Return the ``(features, targets)`` tensors selected by ``self.train``."""
        if self.train:
            return self.Xtr, self.ytr
        return self.Xte, self.yte

    def __len__(self):
        """Number of rows in the active split."""
        features, _ = self._active_split()
        return len(features)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair at ``idx`` from the active split."""
        features, targets = self._active_split()
        return (features[idx], targets[idx])

In [3]:
# Both dataset objects read the same CSV; the `train` flag selects which
# split each one serves.
training_data = HousingPriceDataset(csv_file='housing_price_dataset.csv', train=True)
test_data = HousingPriceDataset(csv_file='housing_price_dataset.csv', train=False)
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(training_data, batch_size=8, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps the batch
# composition identical from one evaluation pass to the next.
test_dataloader = DataLoader(test_data, batch_size=8, shuffle=False)

In [4]:
# Define model
class NeuralNetwork(nn.Module):
    """Fully connected regressor with four hidden ReLU layers and dropout.

    Hidden widths are 128, 256, 256 and 128; the final linear layer emits a
    single value per input row.

    Arguments:
        in_features (int): Width of each input feature vector. Defaults to
            12, the value that used to be hard-coded in the first layer.
        dropout (float): Drop probability applied after every hidden
            activation. Defaults to 0.2.
    """

    def __init__(self, in_features=12, dropout=0.2):
        super().__init__()
        hidden_widths = (128, 256, 256, 128)

        layers = []
        fan_in = in_features
        for width in hidden_widths:
            layers += [nn.Linear(fan_in, width), nn.ReLU(), nn.Dropout(dropout)]
            fan_in = width
        layers.append(nn.Linear(fan_in, 1))

        # The attribute name and layer order match the original stack, so
        # state_dict keys ("linear_relu_stack.<i>.weight", ...) are unchanged.
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of feature rows to one predicted value per row."""
        return self.linear_relu_stack(x)

# Instantiate the network, move it to the selected device, and pair it with
# a Huber objective optimized by plain SGD.
model = NeuralNetwork()
model = model.to(device)
loss_fn = nn.HuberLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=3e-4)

In [5]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimization pass over ``dataloader`` and return the mean batch loss.

    Arguments:
        dataloader: Sized iterable yielding ``(X, y)`` tensor batches.
        model: Module to optimize; it is switched to training mode.
        loss_fn: Callable taking ``(pred, y)`` and returning a scalar tensor.
        optimizer: Optimizer stepped and zeroed once per batch.

    Returns:
        float: Sum of the per-batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: If ``dataloader`` contains no batches.
    """
    num_batches = len(dataloader)
    model.train()
    train_loss = 0
    for X, y in dataloader:
        # `device` is the notebook-level global selected in the first cell.
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)
        train_loss += loss.item()

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Divide by the batch count. The previous code divided by the last
    # zero-based batch index, i.e. one less than the number of batches, which
    # inflated the average and failed outright for a single-batch loader.
    train_loss /= num_batches
    return train_loss

In [6]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
    test_loss /= num_batches
    return test_loss

In [7]:
# Alternate one training pass with one evaluation pass per epoch and report
# the average loss of each.
epochs = 10
for t in range(epochs):
    # Tuple elements are evaluated left to right, so the training pass still
    # completes before the evaluation pass starts.
    train_loss, test_loss = (
        train(train_dataloader, model, loss_fn, optimizer),
        test(test_dataloader, model, loss_fn),
    )
    print(f"Epoch {t+1}: avg train loss: {train_loss:>8f}, avg test loss: {test_loss:>8f}")
print("Done!")

Epoch 1: avg train loss: 0.429303, avg test loss: 0.428832
Epoch 2: avg train loss: 0.414921, avg test loss: 0.402865
Epoch 3: avg train loss: 0.376632, avg test loss: 0.330065
Epoch 4: avg train loss: 0.287327, avg test loss: 0.222410
Epoch 5: avg train loss: 0.234537, avg test loss: 0.208514
Epoch 6: avg train loss: 0.230088, avg test loss: 0.208316
Epoch 7: avg train loss: 0.229532, avg test loss: 0.208184
Epoch 8: avg train loss: 0.229876, avg test loss: 0.208233
Epoch 9: avg train loss: 0.227550, avg test loss: 0.208027
Epoch 10: avg train loss: 0.227117, avg test loss: 0.208013
Done!
