### import and data

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
import dagshub
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import yaml
import os

In [2]:
df = pd.read_excel(r"cars24_v3.xlsx")

### mlflow-dagshub setup
- read details from params & os env var

In [5]:
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Now you can access your DAGSUB_PAT
DAGSHUB_PAT = os.getenv("DAGSHUB_PAT")

In [7]:
# Load parameters from params.yaml
with open(r"I:\CampusX_DS\campusx_dsmp2\9. MLOps revisited\tutorial\UsedCarPricePredictor\params.yaml", "r") as f:
    params = yaml.safe_load(f)

repo_owner = params["mlflow"]["repo_owner"]
repo_name = params["mlflow"]["repo_name"]

# Set up DagsHub credentials for MLflow tracking
dagshub_token = os.getenv("DAGSHUB_PAT")
if not dagshub_token:
    raise EnvironmentError("DAGSHUB_PAT environment variable is not set")

os.environ["MLFLOW_TRACKING_USERNAME"] = dagshub_token
os.environ["MLFLOW_TRACKING_PASSWORD"] = dagshub_token

dagshub_url = "https://dagshub.com"
mlflow.set_tracking_uri(f'{dagshub_url}/{repo_owner}/{repo_name}.mlflow')

### Exp 1: find best model

In [10]:
# --- models ---
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor()
}

# --- Preprocessing ---
numerical_cols = ['listingPrice', 'odometer', 'fitnessAge','featureCount']
categorical_cols = ['make', 'model', 'variant', 'year', 'transmissionType', 'bodyType', 'fuelType','ownership', 'color']

# Impute missing values
numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

df[numerical_cols] = numerical_imputer.fit_transform(df[numerical_cols])
df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

# Encode categorical
encoder = OneHotEncoder(sparse=False, drop='first')
encoded_categorical = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_categorical, columns=encoder.get_feature_names_out(categorical_cols))

# Combine
df_encoded = pd.concat([df[numerical_cols], encoded_df], axis=1)

# Split X and y
X = df_encoded.drop(columns=["listingPrice"])
y = df_encoded["listingPrice"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- MLflow Tracking ---
with mlflow.start_run(run_name="Find Best Model") as parent_run:
    mlflow.set_tag("experiment_type", "All ML Models")
    mlflow.log_param("test_size", 0.2)

    for model_name, model in models.items():
        with mlflow.start_run(run_name=model_name, nested=True):
            mlflow.log_param("model", model_name)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            #log metrics
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("r2_score", r2)

            #log model
            mlflow.sklearn.log_model(model, f"{model_name}_model")
            
            #log code file
            mlflow.log_artifact("experiments.ipynb")

            print(f"\nModel: {model_name}")
            print(f"  MAE: {mae:.2f}, MSE: {mse:.2f}, R2: {r2:.2f}")
            print("-" * 40)




Model: Linear Regression
  MAE: 40516553969.24, MSE: 64350810197141720399872.00, R2: -208443009422.29
----------------------------------------

Model: Ridge
  MAE: 110939.74, MSE: 38489205335.46, R2: 0.88
----------------------------------------


  model = cd_fast.enet_coordinate_descent(



Model: Lasso
  MAE: 100750.06, MSE: 57869339670.66, R2: 0.81
----------------------------------------

Model: Decision Tree
  MAE: 143857.90, MSE: 101028675344.09, R2: 0.67
----------------------------------------

Model: Random Forest
  MAE: 116033.47, MSE: 79768242115.56, R2: 0.74
----------------------------------------

Model: XGBoost
  MAE: 112777.99, MSE: 56937918980.86, R2: 0.82
----------------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002450 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 753
[LightGBM] [Info] Number of data points in the train set: 5912, number of used features: 159
[LightGBM] [Info] Start training from score 697057.494756

Model: LightGBM
  MAE: 146777.97, MSE: 113607140582.79, R2: 0.63
----------------------------------------


### Exp2: Hyperparameter Tuning

- Tune hyper parameters of the best model (lowest MAE & highest R2) -- XGBoost

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
import numpy as np
from xgboost import XGBRegressor

# --- Define parameter grid ---
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2]
}

# --- MLflow Parent Run ---
with mlflow.start_run(run_name="XGBoost_Hyperparameter_Tuning") as parent_run:
    mlflow.set_tag("experiment_type", "xgboost_hyperparameter_tuning")

    best_r2 = -np.inf
    best_params = None
    best_model = None

    run_counter = 1

    for n_estimators in param_grid['n_estimators']:
        for max_depth in param_grid['max_depth']:
            for learning_rate in param_grid['learning_rate']:
                params = {
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'learning_rate': learning_rate,
                    'random_state': 42,
                    'verbosity': 0
                }

                with mlflow.start_run(run_name=f"run_{run_counter}", nested=True):
                    mlflow.log_params(params)

                    model = XGBRegressor(**params)
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)

                    mae = mean_absolute_error(y_test, y_pred)
                    mse = mean_squared_error(y_test, y_pred)
                    r2 = r2_score(y_test, y_pred)

                    mlflow.log_metrics({
                        "mae": mae,
                        "mse": mse,
                        "r2_score": r2
                    })

                    mlflow.sklearn.log_model(model, "model")
                    
                    mlflow.log_artifact("experiments.ipynb")

                    print(f"Run {run_counter}: R2={r2:.4f}, Params={params}")

                    if r2 > best_r2:
                        best_r2 = r2
                        best_params = params
                        best_model = model

                    run_counter += 1

    # Log best model in parent run
    mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})
    mlflow.log_metric("best_r2_score", best_r2)
    mlflow.sklearn.log_model(best_model, "best_xgboost_model")

    print(f"\nBest Model R2: {best_r2:.4f}, Best Params: {best_params}")

Run 1: R2=0.3990, Params={'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.01, 'random_state': 42, 'verbosity': 0}
Run 2: R2=0.7294, Params={'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'random_state': 42, 'verbosity': 0}
Run 3: R2=0.7559, Params={'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.2, 'random_state': 42, 'verbosity': 0}
Run 4: R2=0.5162, Params={'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.01, 'random_state': 42, 'verbosity': 0}
Run 5: R2=0.7903, Params={'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1, 'random_state': 42, 'verbosity': 0}
Run 6: R2=0.8377, Params={'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.2, 'random_state': 42, 'verbosity': 0}
Run 7: R2=0.5887, Params={'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.01, 'random_state': 42, 'verbosity': 0}
Run 8: R2=0.8239, Params={'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.1, 'random_state': 42, 'verbosity': 0}
Run 9: R2=0.8290, Params={'

In [12]:
# create dvc pipeline of xgboost model - with Params: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.2, 'random_state': 42, 'verbosity': 0}