In [0]:
import mlflow
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import codecarbon

In [0]:
warnings.filterwarnings("ignore")

In [0]:

def _split_features_and_target(final_dataframe, split_name):
    """Return ``(X, y)`` for the rows whose ``data_split`` equals ``split_name``.

    ``X`` holds every column except ``total_lift`` and ``data_split``;
    ``y`` is the ``total_lift`` column as a Series.

    Raises:
        ValueError: if no row belongs to ``split_name``.
    """
    rows = final_dataframe[final_dataframe['data_split'] == split_name]
    if rows.empty:
        raise ValueError(
            f"No rows with data_split == {split_name!r}; cannot train or evaluate."
        )
    return rows.drop(columns=['total_lift', 'data_split']), rows['total_lift']


def _log_figure_and_close(fig, artifact_file):
    """Log ``fig`` to the active MLflow run, closing it even if logging fails."""
    try:
        fig.tight_layout()
        # log_figure uploads the figure directly, so nothing is left behind in
        # the working directory and parallel runs cannot clobber each other.
        mlflow.log_figure(fig, artifact_file)
    finally:
        plt.close(fig)


def train_and_log_model(final_dataframe, feature_set_name, params):
    """Train an XGBoost regressor on one feature set and record the run in MLflow.

    The frame is partitioned by its ``data_split`` column into train / val /
    test subsets. The model is fitted on the train rows (with train and val
    as evaluation sets) and scored on the test rows.

    Logged to the MLflow run:
        * params: ``feature_set`` plus everything in ``params['model_params']``
        * metrics: ``rmse`` and ``r2`` computed on the test split
        * artifacts: ``pred_vs_actual.png``, ``feature_importance.png``,
          ``learning_curve.png`` and the fitted model under ``model``

    Args:
        final_dataframe: pandas DataFrame with a ``total_lift`` target column,
            a ``data_split`` column containing ``'train'``, ``'val'`` and
            ``'test'``, and feature columns for everything else.
        feature_set_name: label used in the run name and the ``feature_set``
            param.
        params: dict with ``'run_id'`` (run-name suffix) and
            ``'model_params'`` (keyword arguments for ``xgb.XGBRegressor``).

    Returns:
        None.

    Raises:
        ValueError: if any of the train / val / test splits has no rows.
    """
    # Split before opening a run so bad input does not create an empty,
    # failed MLflow run.
    X_train, y_train = _split_features_and_target(final_dataframe, 'train')
    X_val, y_val = _split_features_and_target(final_dataframe, 'val')
    X_test, y_test = _split_features_and_target(final_dataframe, 'test')

    with mlflow.start_run(run_name=f"{feature_set_name}_{params['run_id']}"):
        # Initialize and train model
        model = xgb.XGBRegressor(**params['model_params'])
        model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            verbose=False
        )

        # Predict and evaluate on the held-out test split.
        preds = model.predict(X_test)
        # Take the square root explicitly: the ``squared=False`` keyword was
        # deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = float(mean_squared_error(y_test, preds) ** 0.5)
        r2 = float(r2_score(y_test, preds))

        # Log metadata
        mlflow.log_param("feature_set", feature_set_name)
        mlflow.log_params(params['model_params'])
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)

        # Actuals vs Predictions Plot
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.scatter(y_test, preds, alpha=0.5)
        ax.set_xlabel("Actual Total Lift")
        ax.set_ylabel("Predicted Total Lift")
        ax.set_title("Prediction vs Actual")
        # y = x reference line spanning the observed range of actual values.
        lowest, highest = y_test.min(), y_test.max()
        ax.plot([lowest, highest], [lowest, highest], 'r--')
        ax.grid(True)
        _log_figure_and_close(fig, "pred_vs_actual.png")

        # Feature Importance Plot
        features_df = pd.DataFrame({
            'feature': X_train.columns.tolist(),
            'importance': model.feature_importances_,
        }).sort_values(by='importance', ascending=False)

        fig, ax = plt.subplots()
        features_df.plot(kind='bar', x='feature', y='importance', legend=False, ax=ax)
        ax.set_ylabel('Feature Importance')
        ax.set_title('XGBoost Feature Importance')
        _log_figure_and_close(fig, "feature_importance.png")

        # Learning curve plot. 'validation_0' / 'validation_1' follow the order
        # of ``eval_set`` above: train first, then val.
        results = model.evals_result()
        train_history = results['validation_0']
        val_history = results['validation_1']
        # Prefer RMSE; fall back to the first recorded metric when the caller
        # overrides ``eval_metric`` through model_params.
        metric_name = 'rmse' if 'rmse' in train_history else next(iter(train_history))
        metric_label = metric_name.upper()
        rounds = range(len(train_history[metric_name]))

        fig, ax = plt.subplots(figsize=(8, 5))
        ax.plot(rounds, train_history[metric_name], label=f'Train {metric_label}')
        ax.plot(rounds, val_history[metric_name], label=f'Validation {metric_label}')
        ax.set_xlabel('Boosting Round')
        ax.set_ylabel(metric_label)
        ax.set_title('XGBoost Learning Curve')
        ax.legend()
        ax.grid(True)
        _log_figure_and_close(fig, "learning_curve.png")

        # Save model artifact
        mlflow.sklearn.log_model(model, artifact_path="model")

In [0]:
# MLflow workaround needed when running on serverless compute; background:
# https://community.databricks.com/t5/machine-learning/using-datbricks-connect-with-serverless-compute-and-mlflow/td-p/97590
#
# MLflow's private ``_get_registry_uri_from_spark_session`` hook is overridden
# so that it always returns a fixed registry URI.
def _fixed_registry_uri():
    """Return the constant registry URI used by this notebook."""
    return "databricks-uc"


mlflow.tracking._model_registry.utils._get_registry_uri_from_spark_session = _fixed_registry_uri

# Experiment that collects every run started from this notebook.
experiment_path = "/Users/kunalmody@uchicago.edu/total_lift_estimation"
mlflow.set_experiment(experiment_path)

In [0]:
# Read the three source tables, in this order:
#   feature_set1 -> strength features (feature set 1)
#   feature_set2 -> behavior features (feature set 2)
#   labels       -> label set
feature_set1, feature_set2, labels = (
    spark.read.table(table_name)
    for table_name in (
        'workspace.default.strengthFeatures',
        'workspace.default.behaviorFeaturesModel',
        'workspace.default.total_lift_labels',
    )
)

In [0]:
# Hyperparameter grid: each entry pairs a run label with the keyword
# arguments handed to the model.
hyperparams = [
    dict(run_id="hp1", model_params=dict(max_depth=5, learning_rate=0.1, n_estimators=100)),
    dict(run_id="hp2", model_params=dict(max_depth=7, learning_rate=0.05, n_estimators=150)),
]


def _join_with_labels(features):
    """Inner-join ``features`` with ``labels`` on athlete_id, drop the key, return pandas."""
    joined = features.join(labels, on="athlete_id", how="inner")
    return joined.drop("athlete_id").toPandas()


# One modelling frame (features + label columns) per feature set.
final_df1 = _join_with_labels(feature_set1)
final_df2 = _join_with_labels(feature_set2)

# Name -> frame lookup consumed by the training loop.
features_sets = dict(strength=final_df1, behavior=final_df2)

print("Strength Feature Set: ", final_df1.shape)
print("Behavior Feature Set: ", final_df2.shape)


In [0]:
# Enumerate every (hyperparameter set, feature set) pair up front —
# hyperparameters vary slowest — then train and log one MLflow run per pair.
run_grid = [
    (frame, set_name, hp)
    for hp in hyperparams
    for set_name, frame in features_sets.items()
]
for feature_set, feature_set_name, hyperparam in run_grid:
    train_and_log_model(feature_set, feature_set_name, hyperparam)
        