In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path so that
# project packages imported further down (e.g. `src`) can be found.
root = Path.cwd().parents[0]
sys.path.append(str(root))


In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model 
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks import ModelCheckpoint
import torch
import pytorch_lightning as pl
from src.model import Regressor


In [3]:
# Load the pre-split feature (X_*) and target (y_*) tables from ../data.
# Files are read in the order train, val, test -- features first, then targets.

X_train, X_val, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/X_train.csv', '../data/X_val.csv', '../data/X_test.csv')
)
y_train, y_val, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/y_train.csv', '../data/y_val.csv', '../data/y_test.csv')
)


In [4]:
# Baseline: ordinary least-squares regression fitted on the training split
# and scored by RMSE on the validation and test splits.

regr = linear_model.LinearRegression().fit(X_train, y_train)

# Predict both held-out splits first, then score them.
y_pred_val = regr.predict(X_val)
y_pred_test = regr.predict(X_test)

rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f'RMSE val: {rmse_val:.2f}')
print(f'RMSE test: {rmse_test:.2f}')


RMSE val: 7.93
RMSE test: 12.03


In [5]:
# Deep-learning (DL) model built with PyTorch Lightning
# References:
#   https://pytorch-lightning.readthedocs.io/en/latest/starter/introduction_guide.html
#   https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/01-mnist-hello-world.ipynb#scrollTo=PA151FkLtprO


In [6]:
# Wrap the train / validation frames as tensor datasets and batch them.

def _as_float_tensor(frame):
    """Return the values of a pandas object as a float32 torch tensor."""
    return torch.tensor(frame.values).float()


train_dataset = TensorDataset(_as_float_tensor(X_train), _as_float_tensor(y_train))
val_dataset = TensorDataset(_as_float_tensor(X_val), _as_float_tensor(y_val))

# Only the training loader shuffles; the validation loader keeps row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, num_workers=4, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, num_workers=4, shuffle=False)


In [7]:
# prepare training

# Seed the Python, NumPy and torch RNGs before the model is created so that
# weight initialisation and the shuffled batch order repeat from run to run.
# Without this the metrics (and the best-checkpoint file name) change on
# every execution of the notebook.
pl.seed_everything(42)

logger = pl.loggers.TensorBoardLogger('../tb_logs', name='regressor')
# inspect with: tensorboard --logdir ./tb_logs
# (run from the parent of the notebook's working directory, where ../tb_logs lives)

# Keep only the single checkpoint with the lowest 'avg_rmse_val'. The metric
# name must match what the model logs during validation (Regressor is defined
# in src.model, not shown here -- the training log confirms it is reported).
checkpoint_callback = ModelCheckpoint(
    monitor='avg_rmse_val',
    mode='min',
    dirpath='../model_checkpoints',
    filename='{epoch}-{avg_rmse_val:.2f}',
    save_top_k=1,
    verbose=True,
)

# NOTE(review): `progress_bar_refresh_rate` is deprecated in newer Lightning
# releases -- confirm against the installed version before upgrading. It is
# left as-is because the replacement flag does not exist in older releases.
trainer = pl.Trainer(
    fast_dev_run=False,
    max_epochs=100,
    logger=logger,
    callbacks=[checkpoint_callback],
    progress_bar_refresh_rate=0
)

model = Regressor()
print(model)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Regressor(
  (fc1): Linear(in_features=14, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=256, bias=True)
  (fc4): Linear(in_features=256, out_features=256, bias=True)
  (fc5): Linear(in_features=256, out_features=1, bias=True)
)


In [8]:
# train

# The loaders are passed positionally: the keyword for the training loader is
# version-specific (`train_dataloader` in older Lightning releases,
# `train_dataloaders` in newer ones), while the positional order
# (model, train loader, validation loader) is accepted on both sides of the
# rename.
trainer.fit(model, train_loader, val_loader)



  | Name | Type   | Params
--------------------------------
0 | fc1  | Linear | 1.9 K 
1 | fc2  | Linear | 33.0 K
2 | fc3  | Linear | 65.8 K
3 | fc4  | Linear | 65.8 K
4 | fc5  | Linear | 257   
--------------------------------
166 K     Trainable params
0         Non-trainable params
166 K     Total params
0.667     Total estimated model params size (MB)
Epoch 0, global step 7: avg_rmse_val reached 14.76496 (best 14.76496), saving model to "/Users/sebastian/Repos/housing_price_prediction/model_checkpoints/epoch=0-avg_rmse_val=14.76.ckpt" as top 1
Epoch 1, global step 15: avg_rmse_val reached 8.12523 (best 8.12523), saving model to "/Users/sebastian/Repos/housing_price_prediction/model_checkpoints/epoch=1-avg_rmse_val=8.13.ckpt" as top 1
Epoch 2, step 23: avg_rmse_val was not in top 1
Epoch 3, step 31: avg_rmse_val was not in top 1
Epoch 4, global step 39: avg_rmse_val reached 7.86453 (best 7.86453), saving model to "/Users/sebastian/Repos/housing_price_prediction/model_checkpoints/ep

1

In [9]:
# Score the in-memory model (in the state trainer.fit left it) on the
# validation and test splits.
model.eval()

with torch.no_grad():
    pred_val = model(torch.tensor(X_val.values).float())
    pred_test = model(torch.tensor(X_test.values).float())

# Attach the predictions to *copies* of the target frames. Writing a new
# column into y_val / y_test in place would leave them with two columns, so
# re-running the baseline or data-loader cells afterwards would silently use
# the predictions as an extra target.
# ravel() assumes the model emits one value per row (fc5 has out_features=1).
val_results = y_val.assign(price_gain_pred=pred_val.numpy().ravel())
test_results = y_test.assign(price_gain_pred=pred_test.numpy().ravel())

rmse_val = np.sqrt(mean_squared_error(val_results.price_gain, val_results.price_gain_pred))
rmse_test = np.sqrt(mean_squared_error(test_results.price_gain, test_results.price_gain_pred))

print(f'RMSE val: {rmse_val:.2f}')
print(f'RMSE test: {rmse_test:.2f}')


RMSE val: 8.25
RMSE test: 12.50


In [10]:
# Score the best checkpoint (lowest validation RMSE) rather than the
# last-epoch weights.
# Prefer the checkpoint recorded by this session's ModelCheckpoint callback,
# so the cell works after Restart & Run All. If nothing has been trained in
# this session, best_model_path is an empty string and the checkpoint file
# from the earlier run is used instead.
best_ckpt_path = (
    checkpoint_callback.best_model_path
    or '../model_checkpoints/epoch=35-avg_rmse_val=7.43.ckpt'
)

# load_from_checkpoint is a classmethod: call it on the class instead of
# building a throwaway Regressor() instance first.
model_loaded = Regressor.load_from_checkpoint(best_ckpt_path)
model_loaded.eval()

with torch.no_grad():
    pred_val = model_loaded(torch.tensor(X_val.values).float())
    pred_test = model_loaded(torch.tensor(X_test.values).float())

# Attach the predictions to *copies* of the target frames so y_val / y_test
# keep their original columns and earlier cells stay re-runnable.
# ravel() assumes the model emits one value per row (fc5 has out_features=1).
val_results_best = y_val.assign(price_gain_pred=pred_val.numpy().ravel())
test_results_best = y_test.assign(price_gain_pred=pred_test.numpy().ravel())

rmse_val = np.sqrt(mean_squared_error(val_results_best.price_gain, val_results_best.price_gain_pred))
rmse_test = np.sqrt(mean_squared_error(test_results_best.price_gain, test_results_best.price_gain_pred))

print(f'RMSE val: {rmse_val:.2f}')
print(f'RMSE test: {rmse_test:.2f}')


RMSE val: 7.43
RMSE test: 11.04
