In [48]:
import mlflow
import dagshub
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, make_scorer


In [2]:
# Read the processed property dataset from disk (path is relative to this notebook).
DATA_PATH = '../../data/processed/gurgaon_properties_post_feature_selection.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Separate the target column from the feature columns.
y = df['price']
X = df.drop('price', axis=1)

In [4]:
# Model the target on the log1p scale.
# Derive it from df['price'] instead of from `y` itself so that re-running this
# cell does not apply the transform a second time.
y = np.log1p(df['price'])

In [5]:
# Columns that are standardised vs. one-hot encoded by the transformer below.
columns_to_scale = ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
columns_to_encode = ['sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Standardise the listed numeric columns, one-hot encode the listed categorical
# columns (dropping the first level of each), and pass any other column through
# untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), columns_to_scale),
        ('cat', OneHotEncoder(drop='first'), columns_to_encode)
    ],
    remainder='passthrough'
)

# Transform the feature columns taken straight from `df` rather than from `X`:
# assigning `X = preprocessor.fit_transform(X)` overwrites its own input, so a
# second run of this cell would be handed the already-transformed matrix instead
# of the original DataFrame.
# NOTE(review): the transformer is fitted on every row before the train/test
# split, so the scaler and encoder see the held-out rows. Consider fitting the
# preprocessing on the training rows only (e.g. inside a Pipeline).
X = preprocessor.fit_transform(df.drop(columns=['price']))

In [24]:
import dagshub
import mlflow

# Initialise the DagsHub integration for the repository, then point MLflow at
# that repository's tracking endpoint.
dagshub.init(
    mlflow=True,
    repo_name='property-scout',
    repo_owner='kevalsakhiya',
)
mlflow.set_tracking_uri('https://dagshub.com/kevalsakhiya/property-scout.mlflow')

In [7]:
# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Base estimator handed to the grid search, created with the library defaults.
xgboost_model = XGBRegressor()

# Hyperparameter grid to explore: 1 x 1 x 2 x 3 x 3 = 18 candidate settings.
param_grid = dict(
    n_estimators=[1300],
    max_depth=[3],
    learning_rate=[0.04, 0.05],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[1.0, 0.9, 0.8],
)


In [49]:
# Two metrics are tracked per candidate. The MSE scorer is built with
# greater_is_better=False, so the search stores it sign-flipped (negative MSE).
neg_mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
r2_scorer = make_scorer(r2_score)
scoring = {'mse': neg_mse_scorer, 'r2': r2_scorer}

# 5-fold grid search over `param_grid`; the final model is refit on the 'r2' metric.
grid_search = GridSearchCV(
    estimator=xgboost_model,
    param_grid=param_grid,
    scoring=scoring,
    refit='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# The search is not fitted here; the fit happens inside the MLflow run.

In [53]:
mlflow.set_experiment('XGBoost-hyperparameter-tuning')
mlflow.autolog()

with mlflow.start_run():
    # Parent run: fit the whole grid search once and keep its CV results.
    grid_search.fit(X_train, y_train)
    results = grid_search.cv_results_

    # One nested child run per candidate, holding its parameters and mean CV metrics.
    candidates = zip(
        results['params'],
        results['mean_test_r2'],
        results['mean_test_mse'],
    )
    for candidate_params, cv_r2, cv_neg_mse in candidates:
        with mlflow.start_run(nested=True):
            mlflow.log_params(candidate_params)

            # Mean R^2 across the folds.
            mlflow.log_metric('mean_r2_score', cv_r2)

            # The scorer stores MSE negated; flip the sign back before logging.
            mlflow.log_metric('mean_mse_score', -cv_neg_mse)

2024/08/02 13:01:03 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2024/08/02 13:01:03 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


2024/08/02 13:01:23 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.
2024/08/02 13:01:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run dapper-ape-32 at: https://dagshub.com/kevalsakhiya/property-scout.mlflow/#/experiments/5/runs/0443a8a6c6334107b190d3f523aa4853.
2024/08/02 13:01:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/kevalsakhiya/property-scout.mlflow/#/experiments/5.
2024/08/02 13:01:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run exultant-sheep-751 at: https://dagshub.com/kevalsakhiya/property-scout.mlflow/#/experiments/5/runs/e4f8200c35a64609a3b68940892dacfb.
2024/08/02 13:01:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/kevalsakhiya/property-scout.mlflow/#/experiments/5.
2024/08/02 13:01:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run placid-grouse-469 at: https://dagshub.com/kevalsakhiya/property-scout.mlflow/#/experim

In [51]:
# Pull the refitted model and the winning parameter set out of the fitted search.
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

# Show which parameter combination won.
print("Best parameters found: ", best_params)

Best parameters found:  {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 1300, 'subsample': 0.8}


In [54]:
# Score the refitted model on the held-out split. The target was log1p-transformed
# before splitting, so both metrics are on the log1p(price) scale.
y_pred = best_estimator.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

mse, r2

(0.0320801020886557, 0.8908352612259582)