In [9]:
import numpy as np
import pandas as pd
import os
import optuna
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split, KFold
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, f1_score
import joblib
import logging

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

In [29]:
#Import Preprocess data
X_train = pd.read_csv("C:/Users/willi/Python/Spotify_Project/Data/X_train.csv")
y_train = pd.read_csv("C:/Users/willi/Python/Spotify_Project/Data/X_test.csv")
X_test = pd.read_csv("C:/Users/willi/Python/Spotify_Project/Data/y_train.csv")
y_test = pd.read_csv("C:/Users/willi/Python/Spotify_Project/Data/y_test.csv")
train = pd.read_csv("C:/Users/willi/Python/Spotify_Project/Data/preprocess_data.csv")

In [30]:
X = train.drop("mood", axis=1)
y = train["mood"]

In [4]:
os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/inouyewilliam/Master-Thesis.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "inouyewilliam"
os.environ["MLFLOW_TRACKING_PASSWORD"] ="b185d44c9fe85ded477875ff2ba1b4d229006006"

In [32]:
def evaluate_model(model, X, y, X_test, y_test):
    # Evaluate the model using cross-validation
    cv_scores = cross_val_score(model, X, y, cv=5)
    cv_score = np.mean(cv_scores)
    
    # Get the model predictions and probabilities
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    # Calculate the evaluation metrics
    roc_auc = roc_auc_score(y_test, y_proba)
    avg_precision= average_precision_score(y_test, y_proba)
    accuracy= accuracy_score(y_test, y_pred)
    precision= precision_score(y_test, y_pred)
    recall= recall_score(y_test, y_pred)
    f1= f1_score(y_test, y_pred)
    
    # Return a dictionary of evaluation metrics
    return {
        "cv_score": cv_score,
        "roc_auc": roc_auc,
        "average_precision": avg_precision,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [33]:
# Define the objective function to be optimized by Optuna
def objective(trial):
    k = trial.suggest_int("k", 5, X.shape[1])

    # Define the pipelines with different algorithms
    pipelines = {
        "et": Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(f_classif, k=k)),
            ("model", ExtraTreesClassifier(
                n_estimators=trial.suggest_int("et_n_estimators", 50, 200),
                max_depth=trial.suggest_int("et_max_depth", 5, 20),
            )),
        ]),
        "lgbm": Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(f_classif, k=k)),
            ("model", LGBMClassifier(
                #learning_rate=trial.suggest_loguniform("lgbm_learning_rate", 1e-3, 1e-1),
                max_depth=trial.suggest_int("lgbm_max_depth", 3, 10),
                n_estimators=trial.suggest_int("lgbm_n_estimators", 50, 200),
                num_leaves=trial.suggest_int("lgbm_num_leaves", 2, 50),
            )),
        ]),
        "xgb": Pipeline([
            ("scaler", StandardScaler()),
            ("selector", SelectKBest(f_classif, k=k)),
            ("model", XGBClassifier(
                #learning_rate=trial.suggest_loguniform("xgb_learning_rate", 1e-3, 1e-1),
                max_depth=trial.suggest_int("xgb_max_depth", 3, 10),
                n_estimators=trial.suggest_int("xgb_n_estimators", 50, 200),
                num_leaves=trial.suggest_int("xgb_num_leaves", 2, 50),
            )),
        ]),
    }
 # Train and evaluate each pipeline using cross-validation
    scores = {}
    best_pipelines = {}
    for algo, pipeline in pipelines.items():
            score = np.mean(cross_val_score(pipeline, X, y, cv=5))
            scores[algo] = score
        
# Save the best pipeline for each algorithm and their corresponding scores
    if algo not in best_pipelines or score > best_pipelines[algo]["score"]:
        best_pipelines[algo] = {"pipeline": pipeline, "score": score}
    
# Choose the best algorithm based on the cross-validation scores
    best_algo = max(scores, key=scores.get)
    best_pipeline = best_pipelines[best_algo]["pipeline"]
    best_params = {
            "k": k,
            f"{best_algo}_best_params": best_pipeline.named_steps["model"].get_params(),
            f"{best_algo}_mean_cv_score": scores[best_algo],
                    }
    # Save the best pipeline for each algorithm in MLflow
    best_mean_cv_score = None
    with mlflow.start_run(nested=True):
        for algo, pipeline_info in best_pipelines.items():
            pipeline = pipeline_info["pipeline"]
            pipeline_name = f"{algo}_pipeline"
            mlflow.sklearn.log_model(pipeline, pipeline_name)

            # Log the best params and score for this pipeline
            if algo == best_algo:
                mlflow.log_params(best_params)
                mlflow.log_metric("mean_cv_score", scores[best_algo])

    # Return the mean cross-validation score of the best algorithm
    return best_mean_cv_score

# Set up MLflow tracking
mlflow.set_tracking_uri("https://dagshub.com/inouyewilliam/Master-Thesis.mlflow")

# Start a new MLflow run to track the experiment
with mlflow.start_run():
    # Optimize the hyperparameters using Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    # Get the best hyperparameters and score
    best_params = study.best_params
    best_score = study.best_value
    
# Save the best model in MLflow
with mlflow.start_run(nested=True):
    # Train the best pipeline on the full dataset
    best_pipeline = best_pipelines[best_algo]["pipeline"]
    best_pipeline.fit(X_train, y_train)
    
    # Evaluate the model
    cv_score,roc_auc,average_precision,accuracy,precision,recall,f1 = evaluate_model(best_pipeline, X, y, X_test, y_test)

    # Log the pipeline and its parameters
    mlflow.sklearn.log_model(best_pipeline, "best_model")
    mlflow.log_params(best_params)
    mlflow.log_metric("mean_cv_score", cv_score)
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("average_precision", average_precision)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1 score", f1)

    # Save the best model as a joblib file
    joblib.dump(best_pipeline, "best_model.joblib")

[32m[I 2023-05-08 22:11:40,031][0m A new study created in memory with name: no-name-e6fd96be-fc38-4c8e-8ce2-8992ebb29d3e[0m


Parameters: { "num_leaves" } are not used.

Parameters: { "num_leaves" } are not used.

Parameters: { "num_leaves" } are not used.

Parameters: { "num_leaves" } are not used.



[33m[W 2023-05-08 22:11:42,253][0m Trial 0 failed with parameters: {'k': 11, 'et_n_estimators': 121, 'et_max_depth': 10, 'lgbm_max_depth': 5, 'lgbm_n_estimators': 90, 'lgbm_num_leaves': 19, 'xgb_max_depth': 7, 'xgb_n_estimators': 79, 'xgb_num_leaves': 45} because of the following error: KeyError('et').[0m
Traceback (most recent call last):
  File "c:\Users\willi\anaconda3\envs\mlops\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\willi\AppData\Local\Temp\ipykernel_7308\3625654654.py", line 49, in objective
    best_pipeline = best_pipelines[best_algo]["pipeline"]
                    ~~~~~~~~~~~~~~^^^^^^^^^^^
KeyError: 'et'
[33m[W 2023-05-08 22:11:42,256][0m Trial 0 failed with value None.[0m


Parameters: { "num_leaves" } are not used.



KeyError: 'et'