In [100]:
import pandas as pd 
import numpy as np 
import torch 
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error 

In [101]:
df = pd.read_parquet('data.parquet')
df['total_floor'] = df['total_floor'].fillna('-999') 
df['building_type'] = df['building_type'].fillna('missing')

In [102]:
def train_test_split_features(X,y): 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,) 
    return X_train, X_test, y_train, y_test


In [116]:
X = df.drop(columns=['request_day_within_7d', '7d_class', 'request_day_within_3d', '3d_class'])
y_set = df[['request_day_within_7d', '7d_class', 'request_day_within_3d', '3d_class']]
y_regression_7d = df[['request_day_within_7d']]
y_regression_3d = df[['request_day_within_3d']]

y_regression_3d_log = np.log1p(y_regression_3d) 

y_regression_7d_log = np.log1p(y_regression_7d) 



encoder_7d = LabelEncoder()
y_classification_7d = encoder_7d.fit_transform(df['7d_class'])


encoder_3d = LabelEncoder()
y_classification_3d = encoder_3d.fit_transform(df['3d_class'])

In [117]:
numerical_features = [
    "bathroom",
    "floor",
    "total_floor",
    "gym",
    "latitude",
    "longitude",
    "lift",
    "property_age",
    "property_size",
    "swimming_pool",
    "rent",
    "deposit",
    "photo_count",
]

categorical_features = ['type','furnishing','lease_type','parking','building_type']

In [118]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessing_pipeline = Pipeline([
    ('preprocessor',preprocessor)
])

## Baseline Model 

In [131]:
X_train, X_test, y_train, y_test = train_test_split_features(X,y_regression_3d_log)


X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train) 
X_test_preprocessed = preprocessing_pipeline.transform(X_test) 

In [133]:
from sklearn.linear_model import LinearRegression

baseline_linear_regression = LinearRegression()
baseline_linear_regression.fit(X_train_preprocessed, y_train)
y_pred = baseline_linear_regression.predict(X_test_preprocessed)

y_test_original = np.expm1(y_test)

y_pred_original = np.expm1(y_pred)

print(
    f"Baseline MSE {mean_squared_error(y_test_original, y_pred_original)} \nBaseline RMSE {root_mean_squared_error(y_test_original, y_pred_original)} \nBaseline MAE {mean_absolute_error(y_test_original, y_pred_original)}"
)

Baseline MSE 6.07527381637751 
Baseline RMSE 2.4648070545942353 
Baseline MAE 1.2537514003414993


In [134]:
from sklearn.ensemble import RandomForestRegressor 

rf = RandomForestRegressor()
rf.fit(X_train_preprocessed, y_train)
y_pred = rf.predict(X_test_preprocessed)

y_test_original = np.expm1(y_test)

y_pred_original = np.expm1(y_pred)

print(
    f"Baseline MSE {mean_squared_error(y_test_original, y_pred_original)} \nBaseline RMSE {root_mean_squared_error(y_test_original, y_pred_original)} \nBaseline MAE {mean_absolute_error(y_test_original, y_pred_original)}"
)

  return fit_method(estimator, *args, **kwargs)


Baseline MSE 4.513767493737246 
Baseline RMSE 2.124562894747352 
Baseline MAE 1.075170404160392


In [137]:
from xgboost import XGBRegressor 

xgb = XGBRegressor(
    objective="reg:squarederror",  # Use squared error loss for regression
    n_estimators=100,              # Number of boosting rounds
    learning_rate=0.1,             # Step size shrinkage
    max_depth=6,                   # Maximum depth of each tree
    subsample=0.8,                 # Fraction of samples used for each tree
    colsample_bytree=0.8,          # Fraction of features used for each tree           # For reproducibility
)

xgb.fit(X_train_preprocessed, y_train) 

y_pred = xgb.predict(X_test_preprocessed) 


y_test_original = np.expm1(y_test)

y_pred_original = np.expm1(y_pred)

print(
    f"Baseline MSE {mean_squared_error(y_test_original, y_pred_original)} \nBaseline RMSE {root_mean_squared_error(y_test_original, y_pred_original)} \nBaseline MAE {mean_absolute_error(y_test_original, y_pred_original)}"
)

Baseline MSE 4.7305121421813965 
Baseline RMSE 2.174973964691162 
Baseline MAE 1.0829192399978638


In [115]:
y_regression_3d.mean()

request_day_within_3d    1.281674
dtype: float64

### Torch Experiment 

In [79]:
X_train, X_test, y_train, y_test = train_test_split_features(X,y_regression_3d)


X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train) 
X_test_preprocessed = preprocessing_pipeline.transform(X_test) 


X_train_preprocessed_torch = torch.tensor(X_train_preprocessed, dtype=torch.float32)
X_test_preprocessed_torch = torch.tensor(X_test_preprocessed, dtype=torch.float32)
y_train_torch = torch.tensor(y_train.values, dtype=torch.float32)
y_test_torch = torch.tensor(y_test.values, dtype=torch.float32)

print(X_train_preprocessed_torch.shape, y_train_torch.shape)

torch.Size([23110, 34]) torch.Size([23110, 1])


In [80]:
import torch 
import torch.nn as nn 
import torch.optim as optim 
from torchinfo import summary 


In [81]:
# class SimpleRegressionModel(nn.Module):
#     def __init__(self, input_size):
#         super().__init__()
#         self.fc1 = nn.Linear(input_size, 1)

#     def forward(self, x):
#         x = self.fc1(x)
#         return x


# simple_regression_model = SimpleRegressionModel(input_size=34)

# criterion = nn.MSELoss() 

# optimizer = optim.Adam(simple_regression_model.parameters(), lr = 0.001)


# batch_size = 264
# train_dataset = torch.utils.data.TensorDataset(
#     X_train_preprocessed_torch, y_train_torch
# )
# train_loader = torch.utils.data.DataLoader(
#     train_dataset, batch_size=batch_size, shuffle=True
# )

# val_dataset = torch.utils.data.TensorDataset(X_test_preprocessed_torch, y_test_torch)
# val_loader = torch.utils.data.DataLoader(
#     val_dataset, batch_size=batch_size, shuffle=True
# )


# epochs = 100

# for epoch in range(epochs):
#     simple_regression_model.train()
#     total_loss = 0
#     for X_batch, y_batch in train_loader:
#         optimizer.zero_grad()
#         preds = simple_regression_model(X_batch)
#         loss = criterion(preds, y_batch)

#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()

#     simple_regression_model.eval()
#     with torch.no_grad():
#         val_loss = sum(
#             criterion(simple_regression_model(Xv), yv) for Xv, yv in val_loader
#         )

#     if (epoch + 1) % 10 == 0 or epoch == 0: 
#         avg_train_loss = total_loss / len(train_loader) 
#         avg_val_loss = val_loss / len(val_loader) 
#         print(f"Epoch [{epoch + 1}/{epochs}] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


In [82]:
# # Final Evaluation 

# with torch.no_grad(): 
#     predictions = simple_regression_model(X_test_preprocessed_torch) 
#     mse = criterion(predictions, y_test_torch).item() 
#     print(f"\nFinal Validation MSE: {mse:.4f}")

In [83]:
import mlflow 
import mlflow.pytorch 
import torchinfo



In [84]:
def log_model_structure(model,input_size: int):
    model_summary = torchinfo.summary(model,input_size=[input_size])
    
    mlflow.log_text(str(model_summary),'model_structure.txt')

In [None]:
class Model(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size,64) 
        self.fc2 = nn.Linear(64,128) 
        self.fc3 = nn.Linear(128,64)
        self.output_layer = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x) 
        x = self.fc2(x)
        x = self.relu(x) 
        x = self.fc3(x)
        x = self.relu(x) 
        x = self.output_layer(x)
        return x

In [None]:
class Model(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size,32) 
        self.fc2 = nn.Linear(32,32) 
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x) 
        x = self.fc2(x)
        x = self.relu(x) 
        x = self.output_layer(x)
        return x

In [139]:
import mlflow
mlflow.set_experiment("pytorch-3d-prediction")




class Model(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size,32) 
        self.fc2 = nn.Linear(32,32) 
        self.output_layer = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p = 0.3)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x) 
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x) 
        x = self.dropout(x)
        x = self.output_layer(x)
        return x





EPOCHS = 120
BATCH_SIZE = 32
OPTIMIZER = "ADAM"
LEARNING_RATE = 0.001
CRITERION = "MSELoss"
INPUT_SIZE = 34


model = Model(input_size=INPUT_SIZE)

criterion = nn.HuberLoss(delta=1.0) 

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)


train_dataset = torch.utils.data.TensorDataset(
    X_train_preprocessed_torch, y_train_torch
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True
)

val_dataset = torch.utils.data.TensorDataset(X_test_preprocessed_torch, y_test_torch)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=True
)


criterion = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

with mlflow.start_run(run_name='Linear Model with 3 FC Layers'):
    mlflow.log_params(
        {
            "epochs": EPOCHS,
            "batch_size": BATCH_SIZE,
            "optimizer": OPTIMIZER,
            "learning_rate": LEARNING_RATE,
            "criterion": CRITERION,
        }
    )
    log_model_structure(model, input_size=INPUT_SIZE) 

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, y_batch)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        with torch.no_grad():
            val_loss = sum(
                criterion(model(Xv), yv) for Xv, yv in val_loader
            )

        if (epoch + 1) % 10 == 0 or epoch == 0: 
            avg_train_loss = total_loss / len(train_loader) 
            avg_val_loss = val_loss / len(val_loader) 
            print(f"Epoch [{epoch + 1}/{EPOCHS}] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
            mlflow.log_metric("train_loss", avg_train_loss, step=epoch + 1) 
            mlflow.log_metric("validation_loss", avg_val_loss, step= epoch + 1 )
    
    mlflow.log_metric("final_train_loss", total_loss / len(train_loader))
    mlflow.log_metric("final_val_loss", val_loss / len(val_loader))
    mlflow.pytorch.log_model(pytorch_model=model, artifact_path='model')
            



Epoch [1/120] | Train Loss: 6.2725 | Val Loss: 5.8508
Epoch [10/120] | Train Loss: 5.3604 | Val Loss: 5.3355
Epoch [20/120] | Train Loss: 5.0832 | Val Loss: 5.2859
Epoch [30/120] | Train Loss: 4.9638 | Val Loss: 5.2703
Epoch [40/120] | Train Loss: 4.7982 | Val Loss: 5.3595
Epoch [50/120] | Train Loss: 4.7108 | Val Loss: 5.5680
Epoch [60/120] | Train Loss: 4.5169 | Val Loss: 5.4859
Epoch [70/120] | Train Loss: 4.3822 | Val Loss: 5.5994
Epoch [80/120] | Train Loss: 4.2649 | Val Loss: 5.5011
Epoch [90/120] | Train Loss: 4.1846 | Val Loss: 5.6451
Epoch [100/120] | Train Loss: 4.0155 | Val Loss: 5.9305
Epoch [110/120] | Train Loss: 3.9668 | Val Loss: 5.7960
Epoch [120/120] | Train Loss: 3.8646 | Val Loss: 5.7864




In [143]:
df

Unnamed: 0,type,bathroom,floor,total_floor,furnishing,gym,latitude,longitude,lease_type,lift,...,property_size,swimming_pool,rent,deposit,building_type,photo_count,request_day_within_7d,7d_class,request_day_within_3d,3d_class
0,BHK2,1,3,4.0,SEMI_FURNISHED,1,12.876174,77.596571,FAMILY,1,...,850,1,12000,120000,AP,7,4,3_to_5,4,3_to_5
1,BHK2,2,4,11.0,SEMI_FURNISHED,1,13.018444,77.678122,FAMILY,1,...,1233,1,20000,150000,AP,0,3,3_to_5,3,3_to_5
2,BHK2,2,0,4.0,NOT_FURNISHED,1,12.975072,77.665865,ANYONE,1,...,1200,0,15000,75000,AP,12,15,above_5,15,above_5
3,BHK3,2,3,4.0,SEMI_FURNISHED,0,12.888169,77.591282,ANYONE,0,...,1300,0,17000,150000,AP,9,5,3_to_5,5,3_to_5
4,BHK1,1,1,2.0,SEMI_FURNISHED,0,12.990243,77.712962,ANYONE,0,...,450,0,6500,40000,IF,7,25,above_5,25,above_5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28883,BHK2,1,0,2.0,SEMI_FURNISHED,0,12.942613,77.621890,FAMILY,0,...,1200,0,23000,200000,IF,0,5,3_to_5,5,3_to_5
28884,BHK3,2,3,3.0,SEMI_FURNISHED,0,12.906331,77.591790,FAMILY,1,...,1310,0,22000,125000,AP,7,2,1_to_2,2,1_to_2
28885,BHK2,2,6,14.0,SEMI_FURNISHED,1,12.904363,77.526863,FAMILY,1,...,975,1,12500,50000,AP,6,0,equals_0,0,equals_0
28886,BHK2,1,1,2.0,SEMI_FURNISHED,0,12.938007,77.629097,ANYONE,0,...,600,0,14000,80000,IF,0,2,1_to_2,2,1_to_2


In [145]:
def compute_mse(y_true, y_pred): 
    return ((y_true - y_pred) ** 2).mean() 

def compute_rmse(y_true, y_pred): 
    return torch.sqrt(compute_mse(y_true, y_pred)) 

def compute_mae(y_true, y_pred): 
    return torch.abs(y_true - y_pred).mean()

In [None]:
model.eval()
with torch.no_grad():
    val_loss = 0
    total_mse = 0
    total_rmse = 0
    total_mae = 0

    for Xv, yv in val_loader:
        # Forward pass
        outputs = model(Xv)

        # Compute loss
        val_loss += criterion(outputs, yv).item()

        # Compute metrics
        mse = compute_mse(yv, outputs)
        rmse = compute_rmse(yv, outputs)
        mae = compute_mae(yv, outputs)

        # Accumulate metrics
        total_mse += mse.item()
        total_rmse += rmse.item()
        total_mae += mae.item()

    # Compute averages
    avg_val_loss = val_loss / len(val_loader)
    avg_mse = total_mse / len(val_loader)
    avg_rmse = total_rmse / len(val_loader)
    avg_mae = total_mae / len(val_loader)

# Log training and validation metrics every 10 epochs or at the start
if (epoch + 1) % 10 == 0 or epoch == 0:
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{EPOCHS}] | Train Loss: {avg_train_loss:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | Val MSE: {avg_mse:.4f} | "
          f"Val RMSE: {avg_rmse:.4f} | Val MAE: {avg_mae:.4f}")

    # Log metrics to MLflow
    mlflow.log_metric("train_loss", avg_train_loss, step=epoch + 1)
    mlflow.log_metric("validation_loss", avg_val_loss, step=epoch + 1)
    mlflow.log_metric("val_mse", avg_mse, step=epoch + 1)
    mlflow.log_metric("val_rmse", avg_rmse, step=epoch + 1)
    mlflow.log_metric("val_mae", avg_mae, step=epoch + 1)

In [None]:
def compute_mse(y_true, y_pred):
    return ((y_true - y_pred) ** 2).mean()

def compute_rmse(y_true, y_pred):
    return torch.sqrt(compute_mse(y_true, y_pred))

def compute_mae(y_true, y_pred):
    return torch.abs(y_true - y_pred).mean()