In [1]:
# Load packages
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from sklearn.preprocessing import StandardScaler
from pathlib import Path
import shared_functions as sf
from matplotlib import pyplot as plt

In [2]:
# Define file path and file name
import_path = '../data/processed/'
file_name = 'property-sales_new-york-city_2022_processed'

In [3]:
# Defining model name
model_name = 'basicModel'

# Creating output directory for exports
Path(f'../models/{model_name}').mkdir(parents=True, exist_ok=True)

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type
print(f'Device type: {device.upper()}')

Device type: CPU


In [5]:
# Set random seed
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [6]:
# Load data
df = pd.read_parquet(f'{import_path}{file_name}.parquet').iloc[:, :9]

In [7]:
# Create input and output arrays from dataset
y = df.sale_price
X = df.drop(columns=y.name)

In [8]:
dataset_labels = ['train', 'val', 'test']

In [9]:
# Split data into training, validation and testing sets
# make this more agnostic
X_split, y_split = sf.train_test_val_dict(X, y, val_size=0.1, test_size=0.2, shuffle=True, random_state=42, verbose=True)

Array       Length (n)  Length (%)
----------------------------------
training         18589      70.00%
validation        2655      10.00%
testing           5310      20.00%
----------------------------------
total            26554     100.00%


In [10]:
# Standardize input arrays (this also ensures all values are numeric)
scaler = StandardScaler()
scaler.fit(X_split['train'])
X_scaled = {key:scaler.transform(X_array) for key, X_array in zip(dataset_labels, [X_split[key] for key in X_split])}

In [11]:
# Create X tensors
X_tensors = {key:torch.Tensor(X_array) for key, X_array in zip(dataset_labels, [X_scaled[key] for key in X_scaled])}

In [12]:
# Create tensor datasets
tensor_datasets = {key:torch.utils.data.TensorDataset(X_array, torch.Tensor(y_array.values)) for key, X_array, y_array in zip(dataset_labels, [X_tensors[key] for key in X_tensors], [y_split[key] for key in y_split])}

In [13]:
# Define model architecture
class basicModel(nn.Module):
    # Define model components
    def __init__(self):
        # Inherit from parent class
        super(basicModel, self).__init__()

        # Define linear layers
        # self.linear1 = nn.Linear(8, 200)
        # self.linear2 = nn.Linear(200, 100)
        self.linear3 = nn.Linear(8, 1)

        # Define acitvation function
        self.relu = nn.ReLU(inplace=True)
    
    # Define forward pass
    def forward(self, X):
        # X = self.relu(self.linear1(X))
        # X = self.relu(self.linear2(X))
        y = self.relu(self.linear3(X))
        return y

In [14]:
# Instantiate model
model = basicModel()

In [15]:
# Calculate number of model parameters
n_params = sum(parameter.numel() for parameter in model.parameters())
print(f'# model paramters: {n_params}')

# model paramters: 9


In [16]:
# Pass model to GPU
model.to(device)

basicModel(
  (linear3): Linear(in_features=8, out_features=1, bias=True)
  (relu): ReLU(inplace=True)
)

In [17]:
# Define loss function
loss_function = nn.MSELoss().to(device)

In [18]:
# Define optimization alogrithm
optimizer = optim.Adam(params=model.parameters(), lr=.01)

In [19]:
# Define batch size
batch_size = 64

In [20]:
# Instantiate data loaders
# maybe replace X_split with tensor_dataset or sth
dataloaders = {key:torch.utils.data.DataLoader(tensor_dataset, batch_size=batch_size if key=='train' else len(X_split[key]), shuffle=False) for key, tensor_dataset in zip(dataset_labels, [tensor_datasets[key] for key in tensor_datasets])}

In [21]:
# Define number of epochs
epochs = 10

In [22]:
history = pd.DataFrame({
    'loss_train': np.nan,
    'loss_val': np.nan
    }, index=pd.Index(np.arange(epochs) + 1, name='epoch'))

# Training loop
for epoch in range(epochs + 1):
    # Train model
    model.train()
    epoch_loss_train = []

    for i, (batch_X, batch_y) in enumerate(dataloaders['train']):
        # Pass batch to GPU
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        # Forward pass
        output_train = model(batch_X)
        model.zero_grad()
        batch_loss_train = loss_function(output_train, batch_y)

        # Backward pass
        batch_loss_train.backward()
        optimizer.step()

        epoch_loss_train.append(batch_loss_train.data.item())
    
    # Calculate training loss
    epoch_loss_train = np.mean(epoch_loss_train)
    history.loss_train.loc[epoch] = epoch_loss_train.item()

    # Calculate validation loss
    model.eval() # deactivates potential Dropout and BatchNorm
    with torch.no_grad():
        output_val = model(batch_X) # this cannot also be batch_X needs to be dataloader['val'] or sth
        epoch_loss_val = loss_function(output_val, batch_y)
        history.loss_val.loc[epoch] = epoch_loss_val.item()
    
    print(f'Epoch {epoch}/{epochs}, Training loss: {epoch_loss_train.item()}, Validation loss: {epoch_loss_val.item()}')

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 0/10, Training loss: 299154075465692.8, Validation loss: 4926047518720.0
Epoch 1/10, Training loss: 299154062219538.5, Validation loss: 4926042800128.0
Epoch 2/10, Training loss: 299154050623269.8, Validation loss: 4926038081536.0
Epoch 3/10, Training loss: 299154039235545.3, Validation loss: 4926033362944.0
Epoch 4/10, Training loss: 299154029332177.4, Validation loss: 4926028120064.0
Epoch 5/10, Training loss: 299154015889415.06, Validation loss: 4926022877184.0
Epoch 6/10, Training loss: 299154005436085.25, Validation loss: 4926018158592.0
Epoch 7/10, Training loss: 299153997106932.56, Validation loss: 4926012915712.0
Epoch 8/10, Training loss: 299153985989684.8, Validation loss: 4926008197120.0
Epoch 9/10, Training loss: 299153977014630.94, Validation loss: 4926002954240.0
Epoch 10/10, Training loss: 299153964991590.06, Validation loss: 4925998235648.0


In [23]:
history.to_csv(f'../models/{model_name}/history.csv')

In [24]:
# Generate model predictions
predictions = sf.get_predictions(model, X_tensors, y_split, dataset_labels, save_as=f'../models/{model_name}/predictions.csv')

In [25]:
# Compute performance metrics
metrics = sf.calc_metrics(predictions, save_as=f'../models/{model_name}/metrics.csv')

Dataset      CPE          RMSE          MAE      R2
---------------------------------------------------
train      1.000  17312063.083  2159515.885  -0.016
val        1.000  10087510.242  1999090.908  -0.041
test       1.000  19788940.085  2226796.547  -0.013
