In [25]:
import os
import mlflow
import inspect
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Lasso

In [26]:
# Location of the Cars24 workbook. Set the CARS24_DATA_PATH environment variable
# to run the notebook on another machine; when it is unset, the original local
# path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "CARS24_DATA_PATH",
    r"D:\campusx_dsmp2\9. MLOps revisited\cars24_mlops_project\experiment\cars24_v3.xlsx",
)
df = pd.read_excel(DATA_PATH)

In [27]:
# Coerce 'year' to a numeric dtype; entries that cannot be parsed become NaN.
df['year'] = pd.to_numeric(df['year'], errors='coerce')

# Encode 'ownership' as 1 ('Owned') / 0 ('Leased').
# NOTE(review): Series.map leaves every value that is not a key of this dict as
# NaN, and re-running this cell on the already-encoded column therefore turns
# every row into NaN. Confirm the raw column only holds these two labels.
df['ownership'] = df['ownership'].map({'Owned': 1, 'Leased': 0})

# Feature-flag columns that are encoded as 0/1 integers.
bool_columns = [
    '360DegreeCamera', 'AlloyWheels', 'AppleCarplayAndroidAuto', 'Bluetooth',
    'CruiseControl', 'GpsNavigation', 'InfotainmentSystem', 'LeatherSeats',
    'ParkingAssist', 'PushButtonStart', 'RearAc', 'SpecialRegNo', 'Sunroof/Moonroof',
    'TopModel', 'Tpms', 'VentilatedSeats'
]


def _flag_to_int(value):
    """Return 1 for a present, truthy flag and 0 otherwise.

    Missing values (NaN / None / NA) map to 0. A bare truthiness test would map
    NaN to 1, because NaN is truthy in Python, silently marking the feature as
    present for rows where it is unknown.
    """
    return int(pd.notna(value) and bool(value))


# Element-wise conversion through Series.map on each column; DataFrame.applymap
# is deprecated in recent pandas releases, while this form works on old and new
# versions alike.
df[bool_columns] = df[bool_columns].apply(lambda col: col.map(_flag_to_int))

In [28]:
# Split the column names of the full frame by dtype: numeric vs. object/bool.
# Both lists are rebuilt from X_train in the training cell further down.
_numeric_frame = df.select_dtypes(include=['number'])
_categorical_frame = df.select_dtypes(include=['object', 'bool'])
numerical_cols = list(_numeric_frame.columns)
categorical_cols = list(_categorical_frame.columns)

In [29]:
import dagshub

# DagsHub repository whose MLflow endpoint receives the runs of this notebook.
_REPO_OWNER = 'iamprashantjain'
_REPO_NAME = 'MLOps_UsedCarPricePrediction'

dagshub.init(repo_owner=_REPO_OWNER, repo_name=_REPO_NAME, mlflow=True)
mlflow.set_tracking_uri(f"https://dagshub.com/{_REPO_OWNER}/{_REPO_NAME}.mlflow")

# Kept as the last expression so the cell displays the selected Experiment.
mlflow.set_experiment('Hyper parameter Tuning')

<Experiment: artifact_location='mlflow-artifacts:/df404e35cc254c3f97726665f82eab68', creation_time=1746072220517, experiment_id='3', last_update_time=1746072220517, lifecycle_stage='active', name='Hyper parameter Tuning', tags={}>

In [None]:
# train_test_split is already imported in the first cell; only GridSearchCV is new here.
from sklearn.model_selection import GridSearchCV

# Define target and features
target_col = "listingPrice"
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 20% of the rows for the final evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify column types from the training split
numerical_cols = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object', 'bool']).columns.tolist()

# Numeric features: mean-impute, then standardise
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Pick whichever keyword the installed version
# accepts so the dense-output request works on both old and new releases.
_ohe_dense_kwarg = (
    'sparse_output'
    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters
    else 'sparse'
)

# Categorical features: mode-impute, then one-hot encode (first level dropped,
# categories unseen during fit are ignored at transform time)
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore', **{_ohe_dense_kwarg: False}))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# Two independent Lasso instances: one drives feature selection, one is the
# final regressor. The grid below only targets `model__alpha`, so the selector's
# Lasso keeps its constructor-default alpha for every candidate; separate
# objects make that explicit instead of suggesting both share one setting.
selector_lasso = Lasso(max_iter=10000)
lasso = Lasso(max_iter=10000)

# Full pipeline with preprocessing, feature selection, and regression
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', RFECV(estimator=selector_lasso, step=1, cv=3, scoring='r2')),
    ('model', lasso)
])

# Hyperparameter grid for the final Lasso model
param_grid = {
    'model__alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 10.0]
}

# Grid search: 3-fold CV, scored by R2, using all available cores
grid_search = GridSearchCV(
    full_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1
)

# Start MLflow run: fit the grid search, evaluate on the hold-out split, and log
# parameters, metrics, the fitted pipeline and the predictions.
with mlflow.start_run(run_name="Lasso_RFECV_HyperparamTuning") as parent_run:
    mlflow.set_tag("experiment_type", "Lasso_with_RFECV_GridSearch")

    # Log source code file (optional, best effort). Only ordinary errors are
    # caught, so Ctrl-C / kernel interrupts are no longer swallowed.
    try:
        mlflow.log_artifact(
            r"D:\campusx_dsmp2\9. MLOps revisited\cars24_mlops_project\experiment\6_Experiment3_HyperParameterTuning.ipynb",
            artifact_path="source_code"
        )
    except Exception as e:
        print(f"⚠️ Could not log notebook file: {e}")

    try:
        # Fit grid search on raw X_train (preprocessing happens inside the pipeline)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_alpha = grid_search.best_params_['model__alpha']
        y_pred = best_model.predict(X_test)

        # Evaluate on the hold-out split
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Log parameters and metrics
        mlflow.log_param("model", "Lasso")
        mlflow.log_param("scaler", "StandardScaler")
        mlflow.log_param("imputer", "SimpleMean")
        mlflow.log_param("vectorizer", "OHE")
        mlflow.log_param("feature_selector", "RFECV")
        mlflow.log_param("best_alpha", best_alpha)

        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("R2", r2)

        # Log model
        mlflow.sklearn.log_model(best_model, artifact_path="lasso_rfecv_model")

        # Save and log predictions
        results_df = pd.DataFrame({
            "Actual": y_test,
            "Predicted": y_pred
        })
        results_file = "lasso_rfecv_results.csv"
        try:
            results_df.to_csv(results_file, index=False)
            mlflow.log_artifact(results_file, artifact_path="predictions")
        finally:
            # Remove the temporary CSV even when writing or uploading fails.
            if os.path.exists(results_file):
                os.remove(results_file)

        print(f"✅ Lasso + RFECV | Best Alpha: {best_alpha} | MAE: {mae:.2f}, R2: {r2:.2f}")

    except Exception as e:
        print(f"❌ Error in Lasso + RFECV: {e}")
        # Truncated so a long message stays within MLflow's parameter value
        # length limit and the logging call itself does not fail.
        mlflow.log_param("error", str(e)[:250])
        # Re-raise so the run is recorded as FAILED (not FINISHED) and the
        # traceback is shown in the notebook.
        raise