In [1]:
%load_ext autoreload
%autoreload 2

import os; import sys; from pathlib import Path
# Make the parent of the notebook's working directory importable so that
# project packages such as `src` can be imported from this notebook.
root = Path.cwd().parent
if str(root) not in sys.path:  # keep the cell idempotent: no duplicate entries on re-run
    sys.path.append(str(root))


In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model 
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks import ModelCheckpoint
import torch
import pytorch_lightning as pl
from src.model import Regressor


In [3]:
# Load the pre-split feature (X_*) and target (y_*) tables from CSV.
# Paths are relative to the notebook's working directory.

X_train, X_val, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/X_train.csv',
        '../data/X_val.csv',
        '../data/X_test.csv',
    )
)
y_train, y_val, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/y_train.csv',
        '../data/y_val.csv',
        '../data/y_test.csv',
    )
)


In [4]:
# Baseline: ordinary linear regression fitted on the training split,
# scored by RMSE on the validation and test splits.

regr = linear_model.LinearRegression().fit(X_train, y_train)

# Predict both held-out splits with the fitted baseline.
y_pred_val, y_pred_test = (regr.predict(features) for features in (X_val, X_test))

# RMSE is the square root of scikit-learn's mean squared error.
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f'RMSE val: {rmse_val:.2f}')
print(f'RMSE test: {rmse_test:.2f}')


RMSE val: 8.30
RMSE test: 8.15


In [5]:
# deep-learning model (PyTorch Lightning)
# references:
    # https://pytorch-lightning.readthedocs.io/en/latest/starter/introduction_guide.html
    # https://colab.research.google.com/github/PytorchLightning/pytorch-lightning/blob/master/notebooks/01-mnist-hello-world.ipynb#scrollTo=PA151FkLtprO


In [6]:
# prepare data loaders

BATCH_SIZE = 128


def make_loader(features, targets, shuffle, batch_size=BATCH_SIZE):
    """Wrap a feature/target DataFrame pair in a TensorDataset and DataLoader.

    Args:
        features: DataFrame of model inputs; converted to a float32 tensor.
        targets: DataFrame of regression targets; converted to a float32 tensor.
        shuffle: Whether the loader reshuffles the rows every epoch.
        batch_size: Number of rows per batch.

    Returns:
        Tuple ``(dataset, loader)``.
    """
    dataset = TensorDataset(
        # torch.tensor copies, so the tensors never alias the DataFrame buffers.
        torch.tensor(features.values, dtype=torch.float32),
        torch.tensor(targets.values, dtype=torch.float32),
    )
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=0,  # load batches in the main process
        shuffle=shuffle,
    )
    return dataset, loader


# Training batches are reshuffled each epoch; validation order stays fixed.
train_dataset, train_loader = make_loader(X_train, y_train, shuffle=True)
val_dataset, val_loader = make_loader(X_val, y_val, shuffle=False)


In [7]:
# prepare training

# Fix the global RNG seeds (Python, NumPy, PyTorch) before the model is built so
# that weight initialisation and batch shuffling repeat on a fresh kernel run.
SEED = 42
pl.seed_everything(SEED)

# TensorBoard logging. `save_dir` is relative to the notebook's working directory;
# inspect with `tensorboard --logdir ./tb_logs` from a terminal opened one level up.
logger = pl.loggers.TensorBoardLogger(
    save_dir='../tb_logs',
    name='regressor',
    default_hp_metric=False,
)

# Keep only the single checkpoint with the lowest monitored validation metric.
# NOTE(review): 'avg_rmse_val' is presumably logged by Regressor — confirm in src/model.py.
checkpoint_callback = ModelCheckpoint(
    monitor='avg_rmse_val',
    mode='min',
    dirpath='../model_checkpoints',
    filename='{epoch}-{avg_rmse_val:.2f}',
    save_top_k=1,
    verbose=True,
)

trainer = pl.Trainer(
    fast_dev_run=False,
    max_epochs=100,
    logger=logger,
    callbacks=[checkpoint_callback],
    progress_bar_refresh_rate=0,  # silence the progress bar in notebook output
)

model = Regressor()
print(model)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Regressor(
  (fc1): Linear(in_features=14, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=1, bias=True)
)


In [8]:
# train

# The loaders are passed positionally: the keyword for the training loader was
# renamed from `train_dataloader` to `train_dataloaders` in later PyTorch
# Lightning releases, whereas the positional order (model, train, val) is stable.
trainer.fit(model, train_loader, val_loader)


Missing logger folder: ../tb_logs/regressor

  | Name | Type   | Params
--------------------------------
0 | fc1  | Linear | 480   
1 | fc2  | Linear | 2.1 K 
2 | fc3  | Linear | 2.1 K 
3 | fc4  | Linear | 33    
--------------------------------
4.7 K     Trainable params
0         Non-trainable params
4.7 K     Total params
0.019     Total estimated model params size (MB)
Epoch 0, global step 42: avg_rmse_val reached 13.60997 (best 13.60997), saving model to "/Users/sebastian/Repos/housing_price_prediction/model_checkpoints/epoch=0-avg_rmse_val=13.61.ckpt" as top 1
Epoch 1, global step 85: avg_rmse_val reached 8.45131 (best 8.45131), saving model to "/Users/sebastian/Repos/housing_price_prediction/model_checkpoints/epoch=1-avg_rmse_val=8.45.ckpt" as top 1
Epoch 2, global step 128: avg_rmse_val reached 8.43949 (best 8.43949), saving model to "/Users/sebastian/Repos/housing_price_prediction/model_checkpoints/epoch=2-avg_rmse_val=8.44.ckpt" as top 1
Epoch 3, global step 171: avg_rmse_val

1

In [9]:
# Score the trained network on the validation and test splits.
# NOTE(review): this evaluates the in-memory weights left after the final epoch,
# not the best checkpoint tracked by `checkpoint_callback` — confirm that is intended.

model.eval()  # put the module in evaluation mode

with torch.no_grad():  # gradients are not needed for scoring
    pred_val = model(torch.tensor(X_val.values).float())
    pred_test = model(torch.tensor(X_test.values).float())

# Attach predictions to *copies* of the target frames. Writing the column into
# y_val / y_test in place would give them a second column, which breaks the
# baseline and data-loader cells if they are re-run afterwards.
# ravel() flattens the network output to one prediction per row.
val_scored = y_val.assign(price_gain_pred=pred_val.numpy().ravel())
test_scored = y_test.assign(price_gain_pred=pred_test.numpy().ravel())

rmse_val = np.sqrt(mean_squared_error(val_scored.price_gain, val_scored.price_gain_pred))
rmse_test = np.sqrt(mean_squared_error(test_scored.price_gain, test_scored.price_gain_pred))

print(f'RMSE val: {rmse_val:.2f}')
print(f'RMSE test: {rmse_test:.2f}')


RMSE val: 7.88
RMSE test: 8.25
