In [1]:
# Load packages
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from sklearn.preprocessing import StandardScaler
from pathlib import Path
import shared_functions as sf
from matplotlib import pyplot as plt

In [2]:
# Define file path and file name
import_path = '../data/processed/'
file_name = 'property-sales_new-york-city_2022_processed'

In [3]:
# Defining model name
model_path = '../models/'
model_name = 'basicModel'

# Creating output directory for exports
Path(f'{model_path}{model_name}').mkdir(parents=True, exist_ok=True)

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu').type
print(f'Device type: {device.upper()}')

Device type: CPU


In [5]:
# Set random seed
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [6]:
# Load data
df = pd.read_parquet(f'{import_path}{file_name}.parquet').iloc[:, :9]

In [7]:
# Create input and output arrays from dataset
y = df.sale_price
X = df.drop(columns=y.name)

In [8]:
# Split data into training, validation and testing sets
X_split, y_split, subset_keys = sf.train_test_val_dict(X, y, val_size=0.1, test_size=0.2, shuffle=True, random_state=42, save_subset_sizes_as='../data/processed/subset_sizes.csv')

Subset  Size (n)  Size (%)
--------------------------
train      18589    70.00%
val         2655    10.00%
test        5310    20.00%
--------------------------
total      26554   100.00%


In [9]:
# Standardize input arrays (this also ensures all values are numeric)
scaler = StandardScaler()
scaler.fit(X_split['train'])
X_scaled = {subset_key:scaler.transform(X_array) for subset_key, X_array in zip(subset_keys, [X_split[subset_key] for subset_key in subset_keys])}

In [10]:
# Standardize input arrays (this also ensures all values are numeric)
scaler = StandardScaler()
scaler.fit(pd.DataFrame(y_split['train']))
y_scaled = {subset_key:scaler.transform(y_array) for subset_key, y_array in zip(subset_keys, [pd.DataFrame(y_split[subset_key]) for subset_key in subset_keys])}

In [11]:
# Create X tensors
X_tensors = {subset_key:torch.Tensor(X_array) for subset_key, X_array in zip(subset_keys, [X_scaled[subset_key] for subset_key in subset_keys])}

In [12]:
# Create tensor datasets
tensor_datasets = {subset_key:torch.utils.data.TensorDataset(X_array, torch.Tensor(y_array)) for subset_key, X_array, y_array in zip(subset_keys, [X_tensors[subset_key] for subset_key in subset_keys], [y_scaled[subset_key] for subset_key in subset_keys])}

In [13]:
# Define model architecture
class basicModel(nn.Module):
    # Define model components
    def __init__(self):
        # Inherit from parent class
        super(basicModel, self).__init__()

        # Define linear layers
        # self.linear1 = nn.Linear(8, 200)
        # self.linear2 = nn.Linear(200, 100)
        self.linear3 = nn.Linear(8, 1)

        # Define acitvation function
        self.relu = nn.ReLU(inplace=True)
    
    # Define forward pass
    def forward(self, X):
        # X = self.relu(self.linear1(X))
        # X = self.relu(self.linear2(X))
        y = self.relu(self.linear3(X))
        return y

In [14]:
# Instantiate model
model = basicModel()

In [15]:
# Calculate number of model parameters
n_params = sum(parameter.numel() for parameter in model.parameters())
print(f'# model paramters: {n_params}')

# model paramters: 9


In [16]:
# Pass model to GPU
model.to(device)

basicModel(
  (linear3): Linear(in_features=8, out_features=1, bias=True)
  (relu): ReLU(inplace=True)
)

In [17]:
# Define loss function
loss_function = nn.MSELoss().to(device)

In [18]:
# Define optimization alogrithm
optimizer = optim.Adam(params=model.parameters(), lr=.01)

In [19]:
# Define batch size
batch_size = 64

In [20]:
# Instantiate data loaders
dataloaders = {subset_key:torch.utils.data.DataLoader(tensor_dataset, batch_size=batch_size if subset_key=='train' else len(tensor_datasets[subset_key]), shuffle=False) for subset_key, tensor_dataset in zip(subset_keys, [tensor_datasets[subset_key] for subset_key in subset_keys])}

In [21]:
# Define number of epochs
epochs = 3

In [22]:
history = pd.DataFrame({
    'loss_train': np.nan,
    'loss_val': np.nan
    }, index=pd.Index(np.arange(epochs) + 1, name='epoch'))

# Training loop
for epoch in np.arange(epochs) + 1:
    # Train model
    model.train()
    epoch_loss_train = []

    for i, (batch_X, batch_y) in enumerate(dataloaders['train']):
        # Pass batch to GPU
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        # Forward pass
        output_train = model(batch_X)
        model.zero_grad()
        batch_loss_train = loss_function(output_train, batch_y)

        # Backward pass
        batch_loss_train.backward()
        optimizer.step()

        epoch_loss_train.append(batch_loss_train.data.item())
    
    # Calculate training loss
    epoch_loss_train = np.mean(epoch_loss_train)
    history.loss_train.loc[epoch] = epoch_loss_train.item()

    # Calculate validation loss
    model.eval() # deactivates potential Dropout and BatchNorm
    with torch.no_grad():
        output_val = model(batch_X) # this cannot also be batch_X needs to be dataloader['val'] or sth
        epoch_loss_val = loss_function(output_val, batch_y)
        history.loss_val.loc[epoch] = epoch_loss_val.item()
    
    print(f'Epoch {epoch}/{epochs}, Training loss: {epoch_loss_train.item()}, Validation loss: {epoch_loss_val.item()}')

Epoch 1/3, Training loss: 0.8952365433397188, Validation loss: 0.00945769902318716
Epoch 2/3, Training loss: 0.7363498403748363, Validation loss: 0.007746731862425804
Epoch 3/3, Training loss: 0.7032218395389899, Validation loss: 0.0067726243287324905


In [23]:
# Save training history
history.to_csv(f'{model_path}{model_name}/history.csv')

In [24]:
# Generate model predictions
predictions = sf.get_predictions(model, X_tensors, y_split, subset_keys, save_as=f'{model_path}{model_name}/predictions.csv')

In [25]:
# Compute performance metrics
metrics = sf.get_metrics(predictions, subset_keys, save_as=f'{model_path}{model_name}/perf_metrics.csv')

Subset    CPE          RMSE          MAE      R2
------------------------------------------------
train   1.000  17312078.364  2159542.609  -0.016
val     1.000  10087543.662  1999118.037  -0.041
test    1.000  19788955.801  2226823.996  -0.013
------------------------------------------------
total   1.000  17271139.553  2156956.819  -0.016
