In [0]:
# Install required packages
%pip install pandas numpy mlflow scikit-learn xgboost

# Keras/TensorFlow/scikeras are intentionally not installed; this notebook trains only scikit-learn and XGBoost models.

import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature # <-- NEW IMPORT

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

# ML Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, LinearSVR
import xgboost as xgb

#####################################################################
# 1. MLFLOW SETUP (REQUIRED IN DATABRICKS ENVIRONMENT)
#####################################################################

# Path of the MLflow experiment under which every run started below is recorded.
EXPERIMENT_NAME = "/Shared/Diamond Price Prediction"
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)


#####################################################################
# 2. DATA LOADING AND PREPROCESSING
#####################################################################

def load_and_preprocess_data():
    """Read diamonds.csv, clean it, and produce the train/test split.

    Returns:
        A 5-tuple ``(X_train, X_test, y_train, y_test, preprocessor)``.
        The targets are ``log1p(price)``; the preprocessor is an unfitted
        ``ColumnTransformer``. When diamonds.csv cannot be found, an error
        is printed and all five elements are ``None``.
    """
    print("Loading and cleaning data...")

    try:
        diamonds = pd.read_csv("diamonds.csv")
    except FileNotFoundError:
        print("ERROR: diamonds.csv not found. Ensure the file is in the working directory.")
        return (None,) * 5

    # Keep only rows whose three physical dimensions are all strictly positive.
    has_valid_dimensions = (diamonds[['x', 'y', 'z']] > 0).all(axis=1)
    diamonds = diamonds[has_valid_dimensions]

    # Features are every column except the price; the target is log1p(price).
    X = diamonds.drop(columns='price')
    y = np.log1p(diamonds['price'])

    # Column groups handled by the two preprocessing branches.
    numerical_features = ['carat', 'depth', 'table', 'x', 'y', 'z']
    categorical_features = ['cut', 'color', 'clarity']

    numeric_branch = Pipeline([('scaler', StandardScaler())])
    categorical_branch = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

    # NOTE(review): remainder='passthrough' forwards every column not listed
    # above to the model untouched (for example an exported index column, if
    # diamonds.csv contains one) -- confirm that this is intended.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_branch, numerical_features),
            ('cat', categorical_branch, categorical_features),
        ],
        remainder='passthrough',
    )

    # Fixed seed keeps the 80/20 split reproducible across executions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f"Data split: Train={len(X_train)}, Test={len(X_test)}")
    return X_train, X_test, y_train, y_test, preprocessor

# Load the data once at module level; the model builders and the training loop
# below read these names as globals.
X_train, X_test, y_train, y_test, preprocessor = load_and_preprocess_data()
if X_train is None:
    # exit() is an interactive convenience injected by the site module (or by
    # IPython) and, called without an argument, reports success. Raising
    # SystemExit with a message stops execution here and signals the failure.
    raise SystemExit("Aborting: diamonds.csv could not be loaded.")

#####################################################################
# 3. MODEL DEFINITIONS
#####################################################################

def build_random_forest(n_estimators=100, max_depth=None):
    """Return an unfitted preprocessing + Random Forest regression pipeline.

    Args:
        n_estimators: Number of trees in the forest.
        max_depth: Maximum tree depth, or None for no explicit limit.

    Returns:
        A ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
    """
    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1,
    )
    return Pipeline([
        # Pipeline.fit fits its steps in place, so each pipeline receives its
        # own unfitted copy rather than sharing (and refitting) the single
        # module-level preprocessor instance.
        ('preprocessor', clone(preprocessor)),
        ('regressor', forest),
    ])

def build_svr(C=1.0, epsilon=0.1, kernel='rbf'):
    """Return an unfitted preprocessing + support-vector regression pipeline.

    ``LinearSVR`` is used when ``kernel == 'linear'``; any other kernel is
    handed to ``SVR``. Both estimators get the same iteration cap.

    Args:
        C: Regularization parameter forwarded to the estimator.
        epsilon: Epsilon parameter forwarded to the estimator.
        kernel: ``'linear'`` selects ``LinearSVR``; other values are passed
            to ``SVR`` as its kernel.

    Returns:
        A ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
    """
    # Safety limit on solver iterations, applied to both estimator types.
    max_iter = 1000000

    if kernel == 'linear':
        regressor = LinearSVR(
            C=C,
            epsilon=epsilon,
            max_iter=max_iter,
            random_state=42,
        )
    else:
        regressor = SVR(
            C=C,
            epsilon=epsilon,
            kernel=kernel,
            max_iter=max_iter,
        )

    return Pipeline([
        # Pipeline.fit fits its steps in place, so each pipeline receives its
        # own unfitted copy rather than sharing (and refitting) the single
        # module-level preprocessor instance.
        ('preprocessor', clone(preprocessor)),
        ('regressor', regressor),
    ])

def build_xgboost(n_estimators=100, learning_rate=0.1, max_depth=6):
    """Return an unfitted preprocessing + XGBoost regression pipeline.

    Args:
        n_estimators: Number of boosting rounds.
        learning_rate: Boosting learning rate.
        max_depth: Maximum depth of each tree.

    Returns:
        A ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
    """
    booster = xgb.XGBRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1,
        objective='reg:squarederror',
    )
    return Pipeline([
        # Pipeline.fit fits its steps in place, so each pipeline receives its
        # own unfitted copy rather than sharing (and refitting) the single
        # module-level preprocessor instance.
        ('preprocessor', clone(preprocessor)),
        ('regressor', booster),
    ])

#####################################################################
# 4. TRAINING AND MLFLOW LOGGING
#####################################################################

def evaluate_and_log_run(model, X_test, y_test, model_name, run_params):
    """Score a fitted model on the test split and record the result in MLflow.

    Logs the hyperparameters, the R2/MSE/RMSE metrics and the model artifact
    (with an inferred signature), then prints a short summary.

    Args:
        model: Fitted estimator or pipeline exposing ``predict``.
        X_test: Held-out features; ``.head(5)`` of it is the signature sample.
        y_test: Held-out targets. The caller in this file passes log1p(price),
            so the logged metrics are in that log space.
        model_name: Label used only in the printed summary.
        run_params: Hyperparameter mapping handed to ``mlflow.log_params``.

    Returns:
        The R2 score of the model on the test split.
    """
    # Score the full test split.
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    # Build the model signature from a five-row input sample and the
    # predictions for it, flattened to 1-D when the model returns more dims.
    example_inputs = X_test.head(5)
    example_outputs = model.predict(example_inputs)
    if example_outputs.ndim > 1:
        example_outputs = example_outputs.flatten()
    signature = infer_signature(example_inputs, example_outputs)

    # Record hyperparameters first, then the metrics one by one.
    mlflow.log_params(run_params)
    for metric_name, metric_value in (("R2_score", r2), ("MSE", mse), ("RMSE", rmse)):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted model together with its signature; it is not registered.
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        registered_model_name=None,
    )

    print(f"--- {model_name} Run Complete ---")
    run_id = mlflow.last_active_run().info.run_id
    print(f"R2: {r2:.4f}, RMSE: {rmse:.4f}, Run ID: {run_id}")
    return r2

# Hyperparameter sets to try, keyed by model family. Each inner dict is
# unpacked as keyword arguments into the matching build_* function.
runs_config = {
    "Random Forest": [
        {"n_estimators": 100, "max_depth": 10},
        {"n_estimators": 200, "max_depth": 15},
        {"n_estimators": 50, "max_depth": 8},
    ],
    "SVR": [
        {"C": 10, "epsilon": 0.1, "kernel": "rbf"},
        {"C": 100, "epsilon": 0.01, "kernel": "rbf"},
        {"C": 1, "epsilon": 0.1, "kernel": "linear"},
    ],
    "XGBoost": [
        {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 6},
        {"n_estimators": 200, "learning_rate": 0.05, "max_depth": 8},
        {"n_estimators": 50, "learning_rate": 0.2, "max_depth": 4},
    ],
}

# Best-run bookkeeping: starts below any attainable R2 so the first run wins.
best_r2 = float("-inf")
best_run_id = None

# Name under which the best model would be registered.
MODEL_NAME_FOR_REGISTRY = "DiamondPricePredictor"

# Main training loop: one MLflow run per (model family, hyperparameter set).
pipeline_builders = {
    "Random Forest": build_random_forest,
    "SVR": build_svr,
    "XGBoost": build_xgboost,
}

for model_name, configs in runs_config.items():
    print(f"\n--- Starting {model_name} Runs ---")

    for run_number, params in enumerate(configs, start=1):
        with mlflow.start_run(run_name=f"{model_name}-Run-{run_number}") as run:

            # Look the builder up inside the run context so that an unknown
            # model family still fails within the already-started run.
            build_pipeline = pipeline_builders.get(model_name)
            if build_pipeline is None:
                raise ValueError(f"Unknown model type: {model_name}")
            model_pipeline = build_pipeline(**params)

            # Fit preprocessing and regressor together on the training split.
            model_pipeline.fit(X_train, y_train)

            # Score on the test split and log everything to the active run.
            r2 = evaluate_and_log_run(model_pipeline, X_test, y_test, model_name, params)

            # Remember the run with the highest R2 seen so far.
            if r2 > best_r2:
                best_r2, best_run_id = r2, run.info.run_id
                print(f"-> NEW BEST RUN FOUND: {best_run_id} with R2={best_r2:.4f}")

#####################################################################
# 5. MODEL REGISTRATION
#####################################################################

# Derive the number of runs from the configuration so the summary stays correct
# when hyperparameter sets are added to or removed from runs_config.
total_runs = sum(len(param_sets) for param_sets in runs_config.values())

print("\n=======================================================")
print(f"ALL {total_runs} RUNS COMPLETED. BEST RUN ID: {best_run_id} (R2: {best_r2:.4f})")
print("To register the best model to the Databricks Model Registry, run the following code:")
print("=======================================================")

# --- Code for Model Registration (MUST BE UNCOMMENTED TO RUN) ---
# NOTE(review): the original template printed `model_details.model_uri`; it now
# prints `model_details.source`, on the understanding that the ModelVersion
# returned by mlflow.register_model has no `model_uri` attribute -- confirm
# against the installed MLflow version.

# try:
#     # Get the URI of the best logged model artifact
#     logged_model_uri = f"runs:/{best_run_id}/model"
#
#     # Register the model to the Databricks Model Registry
#     model_details = mlflow.register_model(
#         model_uri=logged_model_uri,
#         name=MODEL_NAME_FOR_REGISTRY
#     )
#
#     print(f"Successfully registered model '{MODEL_NAME_FOR_REGISTRY}'.")
#     print(f"Version: {model_details.version}, Source: {model_details.source}")

# except Exception as e:
#     print(f"Failed to register model. This often requires specific permissions.")
#     print(f"Error: {e}")

print("=======================================================")
print(f"The best run can be inspected in the MLflow UI using Run ID: {best_run_id}")