In [23]:
import os

import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [24]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from hyperopt import space_eval
import mlflow
from  mlflow.tracking import MlflowClient
import lightgbm as lgb

In [25]:
# Experiment name and tracking server for this notebook.
# The server address can be overridden through the MLFLOW_TRACKING_URI
# environment variable; the literal below is only the fallback.
PROJECT_NAME = "diabetes_prediction_project"
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://78.47.114.227:7060")
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name=PROJECT_NAME)

<Experiment: artifact_location='s3://mlflowrunss3/mlflow/5', creation_time=1690388694259, experiment_id='5', last_update_time=1690388694259, lifecycle_stage='active', name='diabetes_prediction_project', tags={}>

In [26]:
# Read the diabetes table from the dataset folder one level above the notebook.
data = pd.read_csv(filepath_or_buffer="../dataset/diabetes.csv")

In [27]:
# Two derived columns, written onto `data` in place:
#   new_feature_1 = BMI divided by DiabetesPedigreeFunction (element-wise)
#   new_feature_2 = Glucose plus BloodPressure (element-wise)
data['new_feature_1'] = data['BMI'].div(data['DiabetesPedigreeFunction'])
data['new_feature_2'] = data['Glucose'].add(data['BloodPressure'])

In [28]:
# Separate the target column ('Outcome') from the predictor columns.
y_train = data['Outcome']
X_train = data.drop(columns=['Outcome'])

In [29]:
# Hold out 25% of the rows for validation.
# The split is taken straight from `data` rather than from the existing
# X_train / y_train: those names are overwritten here, so feeding them back in
# would make every re-run of this cell split an already-reduced training set.
X_train, X_val, y_train, y_val = train_test_split(
    data.drop(columns=['Outcome']),
    data['Outcome'],
    test_size=0.25,
    random_state=8,
)

In [30]:
# LightGBM datasets for the two partitions. The validation set names the
# training set as its reference so that it is aligned with the training data.
train = lgb.Dataset(X_train, label=y_train)
valid = lgb.Dataset(X_val, label=y_val, reference=train)

In [32]:
def objective(params):
    """Train one LightGBM booster for a hyperopt trial and report its loss.

    Each call opens its own MLflow run, tags it with ``model=lgbm``, logs the
    sampled parameters, trains for 10 boosting rounds on the module-level
    ``train`` dataset and logs the RMSE measured on ``X_val`` / ``y_val``.

    Args:
        params: dict of LightGBM parameters for this trial.

    Returns:
        dict with ``loss`` (the validation RMSE) and ``status``
        (``STATUS_OK``).
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "lgbm")
        mlflow.log_params(params)
        booster = lgb.train(
            params=params,
            train_set=train,
            num_boost_round=10,
            # Validation data belongs in `valid_sets`; `feval` is meant for
            # custom metric callables, not for (data, name) pairs.
            valid_sets=[valid],
            valid_names=['validation'],
        )
        y_pred = booster.predict(X_val)
        # Take the square root explicitly: the `squared=False` shortcut is
        # deprecated in recent scikit-learn releases.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [33]:
# Ranges explored by the optimiser; `seed` is held constant at 42.
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 4, 100, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    reg_alpha=hp.loguniform('reg_alpha', -5, -1),
    reg_lambda=hp.loguniform('reg_lambda', -6, -1),
    min_child_weight=hp.loguniform('min_child_weight', -1, 3),
    seed=42,
)

# Run 10 evaluations of `objective` with the TPE suggester, recording them in
# a fresh Trials object; the outcome of the search is kept in `best_result`.
trial_log = Trials()
best_result = fmin(objective, search_space, algo=tpe.suggest, max_evals=10, trials=trial_log)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944                      
[LightGBM] [Info] Number of data points in the train set: 576, number of used features: 10
[LightGBM] [Info] Start training from score 0.343750  
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944                                                 
[LightGBM] [Info] Number of data points in the train set: 576, number of used features: 10
[LightGBM] [Info] Start training from score 0.343750                             
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944                                                 
[LightGBM] [Info] Number of data points in the train set: 576, number of used features: 10
[LightGBM] [Info] Start training from score 0.343750                             
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944                         

In [37]:
# Point MLflow at the tracking server (overridable through the
# MLFLOW_TRACKING_URI environment variable) and select the experiment.
PROJECT_NAME = "diabetes_prediction_project"
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://78.47.114.227:7060"))
client = MlflowClient()
mlflow.set_experiment(PROJECT_NAME)

# Filter string selecting every registered version of the project's model.
registered_model_name = f"name='{PROJECT_NAME}'"
model_versions = client.search_model_versions(registered_model_name)
if not model_versions:
    raise LookupError(f"No registered model versions found for {registered_model_name}")

# Pick the highest version number explicitly instead of relying on the order
# in which the search happens to return its results.
latest_version = max(model_versions, key=lambda mv: int(mv.version))
registered_model_uri = latest_version.source
print(registered_model_uri)
loaded_model = mlflow.lightgbm.load_model(model_uri=registered_model_uri)
print(loaded_model)
print(loaded_model.get_params())

s3://mlflowrunss3/mlflow/5/2beec30cffba4e74988e876a44964095/artifacts/artifact
LGBMClassifier()
{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}
