### imports & data

In [6]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
import dagshub

In [4]:
df = pd.read_excel(r"I:\CampusX_DS\campusx_dsmp2\9. MLOps revisited\tutorial\Used-Car-Price-Predictor\experiment\cars24_v3.xlsx")

In [5]:
import dagshub
dagshub.init(repo_owner='iamprashantjain', repo_name='Used-Car-Price-Predictor', mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow")
mlflow.set_experiment('BaseLine Model')

2025/07/18 14:08:14 INFO mlflow.tracking.fluent: Experiment with name 'BaseLine Model' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/986edfba90e24830a2050d400151c3c4', creation_time=1752827895205, experiment_id='0', last_update_time=1752827895205, lifecycle_stage='active', name='BaseLine Model', tags={}>

### experiment 1 - find best model

In [7]:
# --- Models ---
regression_models = {
    "RandomForestRegressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor()
}

# --- Preprocessing ---
numerical_cols = ['listingPrice', 'odometer', 'fitnessAge', 'cashDownPayment', 'emiStartingValue',
                  'emiEndingValue', 'roiMinDiscounted', 'roiMaxDiscounted', 'roiMinOriginal',
                  'roiMaxOriginal', 'emiOriginalStartingValue', 'emiOriginalEndingValue', 'featureCount', 'avgEmi']

categorical_cols = ['make', 'model', 'variant', 'year', 'transmissionType', 'bodyType', 'fuelType',
                    'ownership', 'color', '360DegreeCamera', 'AlloyWheels', 'AppleCarplayAndroidAuto',
                    'Bluetooth', 'CruiseControl', 'GpsNavigation', 'InfotainmentSystem', 'LeatherSeats',
                    'ParkingAssist', 'PushButtonStart', 'RearAc', 'SpecialRegNo', 'Sunroof/Moonroof',
                    'TopModel', 'Tpms', 'VentilatedSeats']

# Impute missing values
numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

df[numerical_cols] = numerical_imputer.fit_transform(df[numerical_cols])
df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

# Encode categorical
encoder = OneHotEncoder(sparse=False, drop='first')
encoded_categorical = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))

# Combine
df_encoded = pd.concat([df[numerical_cols], encoded_df], axis=1)

# Split X and y
X = df_encoded.drop(columns=["listingPrice"])
y = df_encoded["listingPrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- MLflow Tracking ---
with mlflow.start_run(run_name="RandomForest_vs_XGBoost") as parent_run:
    mlflow.set_tag("experiment_type", "regression_comparison_2models")
    mlflow.log_param("test_size", 0.2)

    for model_name, model in regression_models.items():
        with mlflow.start_run(run_name=model_name, nested=True):
            mlflow.log_param("model", model_name)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            mlflow.log_metric("mae", mae)
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("r2_score", r2)

            mlflow.sklearn.log_model(model, f"{model_name}_model")

            print(f"\nModel: {model_name}")
            print(f"  MAE: {mae:.2f}, MSE: {mse:.2f}, R2: {r2:.2f}")
            print("-" * 40)





Model: RandomForestRegressor
  MAE: 23474.94, MSE: 19281890090.47, R2: 0.94
----------------------------------------


2025/07/18 14:13:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForestRegressor at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/c92e9c112bf245f0a3741dd79ca863f6.
2025/07/18 14:13:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.



Model: XGBRegressor
  MAE: 34178.28, MSE: 36337214232.42, R2: 0.88
----------------------------------------


2025/07/18 14:14:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBRegressor at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/070c72fdff4a4d9b93970f0306b324ef.
2025/07/18 14:14:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.
2025/07/18 14:14:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest_vs_XGBoost at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/aa49f7abea8547b3bd78287c7ac30ceb.
2025/07/18 14:14:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


### experiment 2 - find best params

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
import numpy as np

# --- Define parameter grid ---
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# --- MLflow Parent Run ---
with mlflow.start_run(run_name="RandomForest_Hyperparameter_Tuning") as parent_run:
    mlflow.set_tag("experiment_type", "random_forest_hyperparameter_tuning")

    best_r2 = -np.inf
    best_params = None
    best_model = None

    run_counter = 1

    for n_estimators in param_grid['n_estimators']:
        for max_depth in param_grid['max_depth']:
            for min_samples_split in param_grid['min_samples_split']:
                params = {
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split
                }

                with mlflow.start_run(run_name=f"run_{run_counter}", nested=True):
                    mlflow.log_params(params)

                    model = RandomForestRegressor(**params, random_state=42)
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)

                    mae = mean_absolute_error(y_test, y_pred)
                    mse = mean_squared_error(y_test, y_pred)
                    r2 = r2_score(y_test, y_pred)

                    mlflow.log_metrics({
                        "mae": mae,
                        "mse": mse,
                        "r2_score": r2
                    })

                    mlflow.sklearn.log_model(model, "model")

                    print(f"Run {run_counter}: R2={r2:.4f}, Params={params}")

                    if r2 > best_r2:
                        best_r2 = r2
                        best_params = params
                        best_model = model

                    run_counter += 1

    # Log best model in parent run
    mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})
    mlflow.log_metric("best_r2_score", best_r2)
    mlflow.sklearn.log_model(best_model, "best_random_forest_model")

    print(f"\n✅ Best Model R2: {best_r2:.4f}, Best Params: {best_params}")



Run 1: R2=0.9348, Params={'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2}


2025/07/18 14:18:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_1 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/72be8ecafd884df0943b06cef31c5840.
2025/07/18 14:18:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 2: R2=0.9368, Params={'n_estimators': 100, 'max_depth': None, 'min_samples_split': 5}


2025/07/18 14:19:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_2 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/81f26bc8e86d40eaa2bfcec4b750a914.
2025/07/18 14:19:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 3: R2=0.9407, Params={'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2}


2025/07/18 14:21:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_3 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/1da647e1a4444784876c874667fad092.
2025/07/18 14:21:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 4: R2=0.9367, Params={'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5}


2025/07/18 14:22:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_4 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/35ee82be5dcf4f4b8d171fd6f174d92e.
2025/07/18 14:22:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 5: R2=0.9357, Params={'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 2}


2025/07/18 14:24:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_5 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/38e3e1883fbb4b56894af59d1561d93a.
2025/07/18 14:24:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 6: R2=0.9373, Params={'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 5}


2025/07/18 14:26:32 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_6 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/b98852f7ffbf4e4c963f6be7dcf19e4e.
2025/07/18 14:26:32 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 7: R2=0.9282, Params={'n_estimators': 200, 'max_depth': None, 'min_samples_split': 2}


2025/07/18 14:30:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_7 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/9cb8a26a10c24d3d91ea826002f51ca5.
2025/07/18 14:30:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 8: R2=0.9298, Params={'n_estimators': 200, 'max_depth': None, 'min_samples_split': 5}


2025/07/18 14:33:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_8 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/3b567ac5c88f4da7bd4e511ffe5fa710.
2025/07/18 14:33:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 9: R2=0.9316, Params={'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2}


2025/07/18 14:35:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_9 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/4fd0bdab981a4ad0991a3c4cb047ea90.
2025/07/18 14:35:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 10: R2=0.9296, Params={'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5}


2025/07/18 14:37:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_10 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/d5a57f5af9b748548a5d0f1762dc8a1d.
2025/07/18 14:37:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 11: R2=0.9289, Params={'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 2}


2025/07/18 14:41:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_11 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/b0be6fa658ec443aa7412a7516cd68c0.
2025/07/18 14:41:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 12: R2=0.9304, Params={'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 5}


2025/07/18 14:45:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_12 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/76fe4767d7c349e5a6d18a669406abc7.
2025/07/18 14:45:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 13: R2=0.9258, Params={'n_estimators': 300, 'max_depth': None, 'min_samples_split': 2}


2025/07/18 14:50:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_13 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/fce29a8bc20d4a55a05ee18b998eaa90.
2025/07/18 14:50:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 14: R2=0.9264, Params={'n_estimators': 300, 'max_depth': None, 'min_samples_split': 5}


2025/07/18 14:53:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_14 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/4dfbaf31ec794c84987ea6416cd1596d.
2025/07/18 14:53:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 15: R2=0.9274, Params={'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 2}


2025/07/18 14:55:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_15 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/a09f4e57aab24030b851a621f2e80a42.
2025/07/18 14:55:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 16: R2=0.9266, Params={'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 5}


2025/07/18 14:57:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_16 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/2f6a10fb581d46b2819c407a79ce6b9a.
2025/07/18 14:57:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 17: R2=0.9238, Params={'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 2}


2025/07/18 15:01:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_17 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/b01af7de18a6415cb9d1ba0028bb7082.
2025/07/18 15:01:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.


Run 18: R2=0.9270, Params={'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 5}


2025/07/18 15:04:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run run_18 at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/53a3a093b2114319b380f659e38e5536.
2025/07/18 15:04:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.



✅ Best Model R2: 0.9407, Best Params: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2}


2025/07/18 15:04:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest_Hyperparameter_Tuning at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0/runs/446ada565df54c9bb616a4ebd285d335.
2025/07/18 15:04:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/iamprashantjain/Used-Car-Price-Predictor.mlflow/#/experiments/0.
