In [28]:
import pandas as pd 
import numpy as np 
import torch 
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error 

In [29]:
df = pd.read_parquet('data.parquet')
df['total_floor'] = df['total_floor'].fillna('-999') 
df['building_type'] = df['building_type'].fillna('missing')

In [30]:
def train_test_split_features(X,y): 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2) 
    return X_train, X_test, y_train, y_test


In [31]:
X = df.drop(columns=['request_day_within_7d', '7d_class', 'request_day_within_3d', '3d_class'])
y_set = df[['request_day_within_7d', '7d_class', 'request_day_within_3d', '3d_class']]
y_regression_7d = df[['request_day_within_7d']]
y_regression_3d = df[['request_day_within_3d']]


encoder_7d = LabelEncoder()
y_classification_7d = encoder_7d.fit_transform(df['7d_class'])


encoder_3d = LabelEncoder()
y_classification_3d = encoder_3d.fit_transform(df['3d_class'])

In [32]:
numerical_features = [
    "bathroom",
    "floor",
    "total_floor",
    "gym",
    "latitude",
    "longitude",
    "lift",
    "property_age",
    "property_size",
    "swimming_pool",
    "rent",
    "deposit",
    "photo_count",
]

categorical_features = ['type','furnishing','lease_type','parking','building_type']

In [33]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessing_pipeline = Pipeline([
    ('preprocessor',preprocessor)
])

In [34]:
# cat_features = list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))

## Baseline Model 

In [36]:
X_train, X_test, y_train, y_test = train_test_split_features(X,y_regression_3d)


X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train) 
X_test_preprocessed = preprocessing_pipeline.transform(X_test) 

In [39]:
from sklearn.linear_model import LinearRegression 
baseline_linear_regression = LinearRegression() 
baseline_linear_regression.fit(X_train_preprocessed, y_train) 
print(f"Baseline MSE {mean_squared_error(y_test, baseline_linear_regression.predict(X_test_preprocessed))}")

Baseline MSE 6.04243435113111


### Torch Experiment 

In [19]:
X_train, X_test, y_train, y_test = train_test_split_features(X,y_regression_3d)


X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train) 
X_test_preprocessed = preprocessing_pipeline.transform(X_test) 


X_train_preprocessed_torch = torch.tensor(X_train_preprocessed, dtype=torch.float32)
X_test_preprocessed_torch = torch.tensor(X_test_preprocessed, dtype=torch.float32)
y_train_torch = torch.tensor(y_train.values, dtype=torch.float32)
y_test_torch = torch.tensor(y_test.values, dtype=torch.float32)

print(X_train_preprocessed_torch.shape, y_train_torch.shape)

torch.Size([25999, 34]) torch.Size([25999, 1])


In [20]:
import torch 
import torch.nn as nn 
import torch.optim as optim 


In [21]:
class SimpleRegressionModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 1)

    def forward(self, x):
        x = self.fc1(x)
        return x


simple_regression_model = SimpleRegressionModel(input_size=34)

criterion = nn.MSELoss() 

optimizer = optim.Adam(simple_regression_model.parameters(), lr = 0.001)

In [22]:
batch_size = 264
train_dataset = torch.utils.data.TensorDataset(
    X_train_preprocessed_torch, y_train_torch
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)

val_dataset = torch.utils.data.TensorDataset(X_test_preprocessed_torch, y_test_torch)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=True
)


epochs = 100

for epoch in range(epochs):
    simple_regression_model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        preds = simple_regression_model(X_batch)
        loss = criterion(preds, y_batch)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    simple_regression_model.eval()
    with torch.no_grad():
        val_loss = sum(
            criterion(simple_regression_model(Xv), yv) for Xv, yv in val_loader
        )

    if (epoch + 1) % 10 == 0 or epoch == 0: 
        avg_train_loss = total_loss / len(train_loader) 
        avg_val_loss = val_loss / len(val_loader) 
        print(f"Epoch [{epoch + 1}/{epochs}] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


Epoch [1/100] | Train Loss: 8.2532 | Val Loss: 8.3356
Epoch [10/100] | Train Loss: 6.1646 | Val Loss: 6.7724
Epoch [20/100] | Train Loss: 6.0857 | Val Loss: 6.7353
Epoch [30/100] | Train Loss: 6.1871 | Val Loss: 6.6953
Epoch [40/100] | Train Loss: 6.0886 | Val Loss: 6.6864
Epoch [50/100] | Train Loss: 6.0787 | Val Loss: 6.6808
Epoch [60/100] | Train Loss: 6.0846 | Val Loss: 6.6846
Epoch [70/100] | Train Loss: 6.0741 | Val Loss: 6.6867
Epoch [80/100] | Train Loss: 6.0797 | Val Loss: 6.6793
Epoch [90/100] | Train Loss: 6.0778 | Val Loss: 6.6764
Epoch [100/100] | Train Loss: 6.0763 | Val Loss: 6.6746


In [25]:
# Final Evaluation 

with torch.no_grad(): 
    predictions = simple_regression_model(X_test_preprocessed_torch) 
    mse = criterion(predictions, y_test_torch).item() 
    print(f"\nFinal Validation MSE: {mse:.4f}")


Final Validation MSE: 6.6880


In [None]:
import mlflow 
import mlflow.pytorch 




np.float64(2.5861220740516044)

In [57]:
def log_model_structure(model):
    layers = {}
    for name, module in model.named_modules():
        if name == "":
            continue
    layers[name] = {
        "type": module.__class__.__name__,
        "parameters": sum(p.numel() for p in module.parameters()),
        "trainable_parameters": sum(
            p.numel() for p in module.parameters() if p.requires_grad
        ),
    }

    mlflow.log_dict(layers,'model_structure.json')

In [None]:
mlflow.set_experiment("pytorch-3d-prediction")



class SimpleRegressionModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 1)

    def forward(self, x):
        x = self.fc1(x)
        return x


simple_regression_model = SimpleRegressionModel(input_size=34)

criterion = nn.MSELoss() 

optimizer = optim.Adam(simple_regression_model.parameters(), lr = 0.001)



EPOCHS = 100
BATCH_SIZE = 32
OPTIMIZER = "ADAM"
LEARNING_RATE = 0.001
CRITERION = "MSELoss"


train_dataset = torch.utils.data.TensorDataset(
    X_train_preprocessed_torch, y_train_torch
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True
)

val_dataset = torch.utils.data.TensorDataset(X_test_preprocessed_torch, y_test_torch)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=True
)


criterion = nn.MSELoss()

optimizer = optim.Adam(simple_regression_model.parameters(), lr=0.001)

with mlflow.start_run():
    mlflow.log_params(
        {
            "epochs": EPOCHS,
            "batch_size": BATCH_SIZE,
            "optimizer": OPTIMIZER,
            "learning_rate": LEARNING_RATE,
            "criterion": CRITERION,
        }
    )
    log_model_structure(simple_regression_model) 

    for epoch in range(EPOCHS):
        simple_regression_model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            preds = simple_regression_model(X_batch)
            loss = criterion(preds, y_batch)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        simple_regression_model.eval()
        with torch.no_grad():
            val_loss = sum(
                criterion(simple_regression_model(Xv), yv) for Xv, yv in val_loader
            )

        if (epoch + 1) % 10 == 0 or epoch == 0: 
            avg_train_loss = total_loss / len(train_loader) 
            avg_val_loss = val_loss / len(val_loader) 
            print(f"Epoch [{epoch + 1}/{epochs}] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
            mlflow.log_metric("train_loss", avg_train_loss, step=epoch + 1) 
            mlflow.log_metric("validation_loss", avg_val_loss, step= epoch + 1 )
    
    mlflow.log_metric("final_train_loss", total_loss / len(train_loader))
    mlflow.log_metric("final_val_loss", val_loss / len(val_loader))
            

