In [12]:
import pandas as pd 
import numpy as np 
import torch 
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error 

In [13]:
df = pd.read_parquet('data.parquet')
df['total_floor'] = df['total_floor'].fillna('-999') 
df['building_type'] = df['building_type'].fillna('missing')

In [14]:
def train_test_split_features(X,y): 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1) 
    return X_train, X_test, y_train, y_test


In [15]:
X = df.drop(columns=['request_day_within_7d', '7d_class', 'request_day_within_3d', '3d_class'])
y_set = df[['request_day_within_7d', '7d_class', 'request_day_within_3d', '3d_class']]
y_regression_7d = df[['request_day_within_7d']]
y_regression_3d = df[['request_day_within_3d']]


encoder_7d = LabelEncoder()
y_classification_7d = encoder_7d.fit_transform(df['7d_class'])


encoder_3d = LabelEncoder()
y_classification_3d = encoder_3d.fit_transform(df['3d_class'])

In [16]:
numerical_features = [
    "bathroom",
    "floor",
    "total_floor",
    "gym",
    "latitude",
    "longitude",
    "lift",
    "property_age",
    "property_size",
    "swimming_pool",
    "rent",
    "deposit",
    "photo_count",
]

categorical_features = ['type','furnishing','lease_type','parking','building_type']

In [17]:
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessing_pipeline = Pipeline([
    ('preprocessor',preprocessor)
])

In [18]:
# cat_features = list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))

In [27]:
X

Unnamed: 0,type,bathroom,floor,total_floor,furnishing,gym,latitude,longitude,lease_type,lift,parking,property_age,property_size,swimming_pool,rent,deposit,building_type,photo_count
0,BHK2,1,3,4.0,SEMI_FURNISHED,1,12.876174,77.596571,FAMILY,1,BOTH,2,850,1,12000,120000,AP,7
1,BHK2,2,4,11.0,SEMI_FURNISHED,1,13.018444,77.678122,FAMILY,1,BOTH,1,1233,1,20000,150000,AP,0
2,BHK2,2,0,4.0,NOT_FURNISHED,1,12.975072,77.665865,ANYONE,1,FOUR_WHEELER,0,1200,0,15000,75000,AP,12
3,BHK3,2,3,4.0,SEMI_FURNISHED,0,12.888169,77.591282,ANYONE,0,BOTH,1,1300,0,17000,150000,AP,9
4,BHK1,1,1,2.0,SEMI_FURNISHED,0,12.990243,77.712962,ANYONE,0,BOTH,4,450,0,6500,40000,IF,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28883,BHK2,1,0,2.0,SEMI_FURNISHED,0,12.942613,77.621890,FAMILY,0,BOTH,15,1200,0,23000,200000,IF,0
28884,BHK3,2,3,3.0,SEMI_FURNISHED,0,12.906331,77.591790,FAMILY,1,BOTH,8,1310,0,22000,125000,AP,7
28885,BHK2,2,6,14.0,SEMI_FURNISHED,1,12.904363,77.526863,FAMILY,1,BOTH,0,975,1,12500,50000,AP,6
28886,BHK2,1,1,2.0,SEMI_FURNISHED,0,12.938007,77.629097,ANYONE,0,TWO_WHEELER,5,600,0,14000,80000,IF,0


## Baseline Model 

In [None]:
X_train, X_test, y_train, y_test = train_test_split_features(X,y_regression_3d)


X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train) 
X_test_preprocessed = preprocessing_pipeline.transform(X_test) 

### Torch Experiment 

In [None]:
X_train, X_test, y_train, y_test = train_test_split_features(X,y_regression_3d)


X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train) 
X_test_preprocessed = preprocessing_pipeline.transform(X_test) 


X_train_preprocessed_torch = torch.tensor(X_train_preprocessed, dtype=torch.float32)
X_test_preprocessed_torch = torch.tensor(X_test_preprocessed, dtype=torch.float32)
y_train_torch = torch.tensor(y_train.values, dtype=torch.float32)
y_test_torch = torch.tensor(y_test.values, dtype=torch.float32)

print(X_train_preprocessed_torch.shape, y_train_torch.shape)

torch.Size([25999, 34]) torch.Size([25999, 1])


In [20]:
import torch 
import torch.nn as nn 
import torch.optim as optim 


In [21]:
class SimpleRegressionModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 1)

    def forward(self, x):
        x = self.fc1(x)
        return x


simple_regression_model = SimpleRegressionModel(input_size=34)

criterion = nn.MSELoss() 

optimizer = optim.Adam(simple_regression_model.parameters(), lr = 0.001)

In [22]:
batch_size = 264
train_dataset = torch.utils.data.TensorDataset(
    X_train_preprocessed_torch, y_train_torch
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)

val_dataset = torch.utils.data.TensorDataset(X_test_preprocessed_torch, y_test_torch)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=True
)


epochs = 100

for epoch in range(epochs):
    simple_regression_model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        preds = simple_regression_model(X_batch)
        loss = criterion(preds, y_batch)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    simple_regression_model.eval()
    with torch.no_grad():
        val_loss = sum(
            criterion(simple_regression_model(Xv), yv) for Xv, yv in val_loader
        )

    if (epoch + 1) % 10 == 0 or epoch == 0: 
        avg_train_loss = total_loss / len(train_loader) 
        avg_val_loss = val_loss / len(val_loader) 
        print(f"Epoch [{epoch + 1}/{epochs}] | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


Epoch [1/100] | Train Loss: 8.2532 | Val Loss: 8.3356
Epoch [10/100] | Train Loss: 6.1646 | Val Loss: 6.7724
Epoch [20/100] | Train Loss: 6.0857 | Val Loss: 6.7353
Epoch [30/100] | Train Loss: 6.1871 | Val Loss: 6.6953
Epoch [40/100] | Train Loss: 6.0886 | Val Loss: 6.6864
Epoch [50/100] | Train Loss: 6.0787 | Val Loss: 6.6808
Epoch [60/100] | Train Loss: 6.0846 | Val Loss: 6.6846
Epoch [70/100] | Train Loss: 6.0741 | Val Loss: 6.6867
Epoch [80/100] | Train Loss: 6.0797 | Val Loss: 6.6793
Epoch [90/100] | Train Loss: 6.0778 | Val Loss: 6.6764
Epoch [100/100] | Train Loss: 6.0763 | Val Loss: 6.6746


In [25]:
# Final Evaluation 

with torch.no_grad(): 
    predictions = simple_regression_model(X_test_preprocessed_torch) 
    mse = criterion(predictions, y_test_torch).item() 
    print(f"\nFinal Validation MSE: {mse:.4f}")


Final Validation MSE: 6.6880


np.float64(2.5861220740516044)