In [65]:
import pandas as pd 
import numpy as np 

import torch 
import torch.nn as nn 
import torch.optim as optim 
import mlflow 
import mlflow.pytorch 
import torchinfo
from torchinfo import summary 

from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error 

In [66]:
df = pd.read_parquet('data.parquet')
df['total_floor'] = df['total_floor'].fillna('-999') 
df['building_type'] = df['building_type'].fillna('missing')

In [67]:
def train_test_split_features(X,y): 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=45) 
    return X_train, X_test, y_train, y_test


In [68]:
X = df.drop(columns=['request_day_within_7d', '7d_class', 'request_day_within_3d', '3d_class'])
y_set = df[['request_day_within_7d', '7d_class', 'request_day_within_3d', '3d_class']]

y_regression_7d = df[['request_day_within_7d']]
y_regression_3d = df[['request_day_within_3d']]

y_regression_3d_log = np.log1p(y_regression_3d) 
y_regression_7d_log = np.log1p(y_regression_7d) 



encoder_7d = LabelEncoder()
y_classification_7d = encoder_7d.fit_transform(df['7d_class'])


encoder_3d = LabelEncoder()
y_classification_3d = encoder_3d.fit_transform(df['3d_class'])

In [69]:
numerical_features = [
    "bathroom",
    "floor",
    "total_floor",
    "gym",
    "latitude",
    "longitude",
    "lift",
    "property_age",
    "property_size",
    "swimming_pool",
    "rent",
    "deposit",
    "photo_count",
]

categorical_features = ["type", "furnishing", "lease_type", "parking", "building_type"]

In [70]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessing_pipeline = Pipeline([
    ('preprocessor',preprocessor)
])

In [71]:
def return_torch_splits(X,y):
    X_train, X_test, y_train, y_test = train_test_split_features(X,y)


    X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train) 
    X_test_preprocessed = preprocessing_pipeline.transform(X_test) 


    X_train_preprocessed_torch = torch.tensor(X_train_preprocessed, dtype=torch.float32)
    X_test_preprocessed_torch = torch.tensor(X_test_preprocessed, dtype=torch.float32)
    y_train_torch = torch.tensor(y_train.values, dtype=torch.float32)
    y_test_torch = torch.tensor(y_test.values, dtype=torch.float32)

    return X_train_preprocessed_torch, X_test_preprocessed_torch, y_train_torch, y_test_torch

In [72]:
def log_model_structure(model,input_size: int):
    model_summary = torchinfo.summary(model,input_size=[input_size])
    
    mlflow.log_text(str(model_summary),'model_structure.txt')

In [73]:
def compute_mse(y_true, y_pred):
    return ((y_true - y_pred) ** 2).mean()

def compute_rmse(y_true, y_pred):
    return torch.sqrt(compute_mse(y_true, y_pred))

def compute_mae(y_true, y_pred):
    return torch.abs(y_true - y_pred).mean()

## Regression

In [74]:
class Model(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size,32) 
        self.fc2 = nn.Linear(32,32) 
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p = 0.4)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x) 
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x) 
        x = self.dropout(x)
        x = self.output_layer(x)
        return x

In [75]:
EPOCHS = 80 
BATCH_SIZE = 62
OPTIMIZER = "ADAM"
LEARNING_RATE = 0.001
CRITERION = "HuberLoss"
INPUT_SIZE = 34
WEIGHT_DECAY = 1e-2
HUBER_LOSS_DELTA = 1.0 
RUN_NAME = "dropout-relu-mse-loss-complex-model"
SMOOTHING_Y = 'NA'

In [76]:
model = Model(input_size=INPUT_SIZE)

criterion = nn.HuberLoss(delta=HUBER_LOSS_DELTA) 

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

## Regression 3d

In [77]:

X_train_preprocessed_torch, X_test_preprocessed_torch, y_train_torch, y_test_torch = return_torch_splits(X, y_regression_3d)

train_dataset = torch.utils.data.TensorDataset(
    X_train_preprocessed_torch, y_train_torch
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True
)

val_dataset = torch.utils.data.TensorDataset(X_test_preprocessed_torch, y_test_torch)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=True
)

In [78]:
mlflow.set_tracking_uri('http://127.0.0.1:5000/')
mlflow.set_experiment("regression-3d-predictions")


<Experiment: artifact_location='mlflow-artifacts:/246515668344709527', creation_time=1742966815476, experiment_id='246515668344709527', last_update_time=1742966815476, lifecycle_stage='active', name='regression-3d-predictions', tags={}>

In [79]:
with mlflow.start_run():
    mlflow.log_params(
        {
            "epochs": EPOCHS,
            "batch_size": BATCH_SIZE,
            "optimizer": OPTIMIZER,
            "learning_rate": LEARNING_RATE,
            "criterion": CRITERION,
            "input_size": INPUT_SIZE,
            "weight_decay": WEIGHT_DECAY,
            "huber_loss_delta": HUBER_LOSS_DELTA,
            "smoothing_y": SMOOTHING_Y,
        }
    )

    log_model_structure(model, INPUT_SIZE)

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        total_train_rmse = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            train_rmse = criterion(predictions, y_batch)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            total_train_rmse += train_rmse.item()

        model.eval()
        with torch.no_grad():
            val_loss = 0
            total_mse = 0
            total_rmse = 0
            total_mae = 0
            for Xv, yv in val_loader:
                outputs = model(Xv)

                val_loss += criterion(outputs, yv).item()

                # Compute metrics for a batch
                mse = compute_mse(yv, outputs)
                rmse = compute_rmse(yv, outputs)
                mae = compute_mae(yv, outputs)

                total_mse += mse.item()
                total_rmse = rmse.item()
                total_mae += mae.item()

            # Compute Averages
            avg_val_loss = val_loss / len(val_loader)
            avg_mse = total_mse / len(val_loader)
            avg_rmse = total_rmse / len(val_loader)
            avg_mae = total_mae / len(val_loader)

        if (epoch + 1) % 10 == 0 or epoch == 0:
            avg_train_loss = total_loss / len(train_loader)
            avg_train_rmse = total_train_rmse / len(train_loader)
            print(
                f"Epoch [{epoch + 1}/{EPOCHS}] | Train Loss: {avg_train_loss:.4f} | "
                f"Val Loss: {avg_val_loss:.4f} | Val MSE: {avg_mse:.4f} | "
                f"Val RMSE: {avg_rmse:.4f} | Val MAE: {avg_mae:.4f}"
            )

            mlflow.log_metric("train_loss", avg_train_loss, step=epoch + 1)
            mlflow.log_metric("validation_loss", avg_val_loss, step=epoch + 1)
            mlflow.log_metric("val_mse", avg_mse, step=epoch + 1)
            mlflow.log_metric('val_rmse', avg_rmse, step=epoch + 1) 
            mlflow.log_metric('train_rmse', avg_train_rmse, step=epoch + 1)
            mlflow.log_metric("val_mae", avg_mae, step=epoch + 1)
            
            

Epoch [1/80] | Train Loss: 0.9223 | Val Loss: 0.8619 | Val MSE: 6.6823 | Val RMSE: 0.0115 | Val MAE: 1.2468
Epoch [10/80] | Train Loss: 0.8868 | Val Loss: 0.8545 | Val MSE: 6.4688 | Val RMSE: 0.0124 | Val MAE: 1.2458
Epoch [20/80] | Train Loss: 0.8874 | Val Loss: 0.8567 | Val MSE: 6.4759 | Val RMSE: 0.0216 | Val MAE: 1.2487
Epoch [30/80] | Train Loss: 0.8873 | Val Loss: 0.8513 | Val MSE: 6.3949 | Val RMSE: 0.0187 | Val MAE: 1.2433
Epoch [40/80] | Train Loss: 0.8850 | Val Loss: 0.8462 | Val MSE: 6.4155 | Val RMSE: 0.0056 | Val MAE: 1.2367
Epoch [50/80] | Train Loss: 0.8854 | Val Loss: 0.8529 | Val MSE: 6.4661 | Val RMSE: 0.0279 | Val MAE: 1.2445
Epoch [60/80] | Train Loss: 0.8859 | Val Loss: 0.8678 | Val MSE: 6.8578 | Val RMSE: 0.0742 | Val MAE: 1.2582
Epoch [70/80] | Train Loss: 0.8843 | Val Loss: 0.8495 | Val MSE: 6.3323 | Val RMSE: 0.0114 | Val MAE: 1.2444
Epoch [80/80] | Train Loss: 0.8865 | Val Loss: 0.8484 | Val MSE: 6.4740 | Val RMSE: 0.0113 | Val MAE: 1.2376
🏃 View run inquisiti

In [80]:
with torch.no_grad(): 
    predictions = model(X_test_preprocessed_torch) 
    rmse = compute_rmse(y_test_torch, predictions) 
    print(mse)

tensor(1.1183)
