In [2]:
import mlflow
import dagshub
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the processed listings table; the path is relative to the notebook's
# working directory.
DATA_PATH = '../../data/processed/gurgaon_properties_post_feature_selection.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Split the frame into the regression target and the remaining feature columns.
TARGET = 'price'
y = df[TARGET]
X = df.drop(columns=[TARGET])

In [5]:
# Model the target on a log1p scale.
# Derive it from the raw column rather than from `y` itself so that re-running
# this cell cannot apply the log transform twice.
y = np.log1p(df['price'])

In [6]:
# Columns that are standardised vs. one-hot encoded; anything not listed is
# passed through unchanged (remainder='passthrough').
columns_to_scale = ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
columns_to_encode = ['sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        # drop='first' removes one dummy column per categorical feature.
        ('cat', OneHotEncoder(drop='first'), columns_to_encode)
    ],
    remainder='passthrough'
)

# Always transform from the raw feature frame instead of overwriting `X` with a
# function of itself: the transformer selects columns by name, so feeding the
# already-transformed matrix back in on a re-run of this cell would fail.
# NOTE(review): the transformer is fitted on every row, before the train/test
# split performed in a later cell, so held-out rows influence the fitted
# scaling/encoding. Consider fitting inside a Pipeline on the training split.
X = preprocessor.fit_transform(df.drop(columns=['price']))

In [7]:
# Experiment-tracking setup. `dagshub` and `mlflow` are already imported in the
# notebook's first cell, so they are not re-imported here.
REPO_OWNER = 'kevalsakhiya'
REPO_NAME = 'property-scout'

# Initialise the DagsHub integration with MLflow enabled, then point MLflow at
# the repository's tracking server. The URI is built from the same owner/repo
# values so the two calls cannot drift apart.
dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)
mlflow.set_tracking_uri(f'https://dagshub.com/{REPO_OWNER}/{REPO_NAME}.mlflow')

In [44]:
# One seed shared by the split and the forest so the whole search is repeatable.
RANDOM_STATE = 42

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Seed the forest as well: without random_state every fit draws different
# bootstrap samples and feature subsets, so CV scores and the selected
# parameters would change from run to run.
random_forest = RandomForestRegressor(random_state=RANDOM_STATE)

# Define the parameter grid
param_grid = {
    'n_estimators': [300],
    'max_depth': [50],
    # Further axes, currently disabled:
    # 'min_samples_split': [2, 5, 10],
    # 'min_samples_leaf': [1, 2, 4, 8],
    # 'max_features': [1,2,3,4,5],
    # 'bootstrap': [True, False],
    'criterion': ['squared_error', 'absolute_error']
}



In [45]:
# Two metrics are recorded for every candidate. The MSE scorer is built with
# greater_is_better=False; the logging cell negates it again before reporting.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)
scoring = {'mse': mse_scorer, 'r2': r2_scorer}

# 5-fold grid search over `param_grid`; the final model is refit using the
# 'r2' metric.
grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid,
    scoring=scoring,
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [46]:
mlflow.set_experiment('random-forest--hyperparameter-tuning')
mlflow.autolog()

# Parent run: fit the grid search, then open one nested run per candidate and
# record its parameters together with its mean cross-validated scores.
with mlflow.start_run():
    grid_search.fit(X_train, y_train)
    results = grid_search.cv_results_

    per_candidate = zip(
        results['params'],
        results['mean_test_r2'],
        results['mean_test_mse'],
    )
    for candidate_params, mean_r2_score, negated_mse in per_candidate:
        with mlflow.start_run(nested=True):
            mlflow.log_params(candidate_params)

            # Mean R^2 across the CV folds.
            mlflow.log_metric('mean_r2_score', mean_r2_score)

            # The stored MSE score is negated, so flip the sign back before
            # logging it.
            mean_mse_score = -negated_mse
            mlflow.log_metric('mean_mse_score', mean_mse_score)

2024/08/02 14:55:36 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


2024/08/02 15:00:42 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.
2024/08/02 15:00:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run welcoming-ray-496 at: https://dagshub.com/kevalsakhiya/property-scout.mlflow/#/experiments/6/runs/ce7b196b46144117a3c60b92d45eefbc.
2024/08/02 15:00:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/kevalsakhiya/property-scout.mlflow/#/experiments/6.
2024/08/02 15:00:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run kindly-doe-319 at: https://dagshub.com/kevalsakhiya/property-scout.mlflow/#/experiments/6/runs/692bf174a38c4f6e8d99f97db8671a94.
2024/08/02 15:00:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/kevalsakhiya/property-scout.mlflow/#/experiments/6.
2024/08/02 15:00:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run bald-snake-731 at: https://dagshub.com/kevalsakhiya/property-scout.mlflow/#/experiment

In [52]:
# Pull the winning candidate out of the finished search: the refit model and
# the parameter combination it was built with.
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

# Report the selected hyperparameters.
print("Best parameters found: ", best_params)

Best parameters found:  {'criterion': 'absolute_error', 'max_depth': 50, 'n_estimators': 300}


In [24]:
# Display the GridSearchCV object's notebook representation.
grid_search

In [51]:
# Score the selected model on the held-out split. `y` was passed through
# np.log1p earlier in the notebook, so both metrics are on the log1p scale
# rather than the original price scale.
y_pred = best_estimator.predict(X_test)
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

(mse, r2)

(0.03806801362488655, 0.870459116634889)