In [1]:
# !pip install mlflow


In [11]:
from ast import arg
import mlflow
import mlflow.sklearn
import pandas as pd
from mlflow.exceptions import MlflowException
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import datasets
import argparse
import random

import lightgbm as lgb


experiment_name = 'Jawhar_notebook_test'
tracking_uri = 'https://mlflow.qa.healthcare.com/'

# Point the client at the remote tracking server first: any experiment or run
# created before this call would be recorded against the local default store.
mlflow.set_tracking_uri(tracking_uri)

# Create the experiment with an explicit S3 artifact location when it is missing.
# A creation failure is reported but not re-raised, so the cell keeps going and
# falls through to set_experiment() below.
if mlflow.get_experiment_by_name(experiment_name) is None:
    print(f'Experiment {experiment_name} Not Found')
    try:
        mlflow.create_experiment(
            experiment_name,
            's3://hc-qa-mlflow-bucket',
        )
    except MlflowException as ex:
        print(f"Error creating experiment {ex}")

# Left as the last bare expression so the notebook renders the active Experiment.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://hc-qa-mlflow-bucket', experiment_id='76', lifecycle_stage='active', name='Jawhar_notebook_test', tags={}>

In [12]:
# Load the raw extract. low_memory=False makes pandas parse the file in one pass
# rather than in chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('ma_postconv_jorn_zcta_tu.csv', low_memory=False)

# Split the column names by inferred dtype: 64-bit numerics vs. object (string-like).
numeric_columns = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_columns = list(df.select_dtypes(include=['object']).columns)

In [13]:

# Convert every object-typed column to pandas' categorical dtype before the
# frame is handed to LightGBM. The conversion is idempotent, so re-running this
# cell is safe.
for feature in categorical_columns:
    df[feature] = df[feature].astype("category")

# Regression target: mod_LTV with missing values treated as zero.
df['all_LTV'] = df['mod_LTV'].fillna(0)
y = df['all_LTV']

# Columns excluded from the feature matrix. Both the target ('all_LTV') and the
# column it is derived from ('mod_LTV') are in this list, so the target cannot
# leak into the features.
# NOTE(review): the remaining entries look like identifiers, contact fields and
# outputs of earlier models ('post_raw_*') — confirm the grouping with the data owner.
columns_to_drop = [
    'owner_phone', 'sk_referral_flag', 'lead_id',
    'post_raw_cancellation_model_prediction',
    'post_raw_probability_of_cancellation',
    'post_raw_duration_model_prediction',
    'post_raw_LTV', 'post_raw_coverage_duration',
    'mod_LTV', 'all_LTV',
    'application_id', 'owner_email', 'application_name', 'policy_id',
    'owner_id', 'pol_zip_code', 'parent_application_id', 'bk_product_type',
    'carrier', 'first_name', 'last_name',
    'post_raw_application_id', 'post_raw_medicare_number', 'post_raw_policy_id',
    'jrn_boberdoo_source', 'jrn_date',
]
X = df.drop(columns=columns_to_drop)

# 70/30 train/test split. The seed is fixed so that "Restart & Run All" yields
# the same split, and therefore comparable metrics across MLflow runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# LightGBM training parameters; this dict is also what gets logged to MLflow.
hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['mae', 'rmse'],
    'learning_rate': 0.005,
    "num_leaves": 128,
    "max_bin": 512,
}
# iris = datasets.load_iris()
# x = iris.data[:, 2:]
# y = iris.target
# X_train, X_test, y_train, y_test = train_test_split(
#     x, y, test_size=0.3, random_state=42)

In [19]:
from sklearn.metrics import mean_absolute_error, r2_score, median_absolute_error
from sklearn.metrics import mean_squared_error as MSE
import numpy as np

# Imported next to its single use so this cell works on its own; it provides the
# MLflow model flavor for native LightGBM boosters.
import mlflow.lightgbm

# Train a LightGBM regressor and record the model, parameters, metrics and a few
# sample artifacts in a single MLflow run. The `with` block closes the run on
# exit (also when an exception is raised), so no explicit end_run() is needed.
with mlflow.start_run() as run:

    mlflow.set_tag("developer", "Jawhar")

    # Train the model. No validation sets are passed, so there is no per-round
    # evaluation output to silence; the `verbose_eval` keyword is omitted because
    # recent LightGBM releases no longer accept it.
    lgb_train = lgb.Dataset(X_train, y_train)
    gbm = lgb.train(hyper_params, lgb_train, num_boost_round=10)
    predictions = gbm.predict(X_test)

    # `gbm` is a native lightgbm Booster rather than a scikit-learn estimator,
    # so it is logged with the LightGBM flavor.
    model_info = mlflow.lightgbm.log_model(gbm, 'LGBM')
    mlflow.log_params(hyper_params)

    # Compute each hold-out metric once and reuse it for printing and logging.
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    median_ae = median_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f'mae: {mae}')
    print(f'median absolute error: {median_ae}')
    print(f'r2: {r2}')
    print("RMSE : % f" % (rmse))

    mlflow.log_metric('mse', mse)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('mae', mae)
    mlflow.log_metric('median absolute error', median_ae)
    mlflow.log_metric('R2', r2)
    print(mse)

    # Sample artifacts demonstrating log_dict(): the file extension selects the
    # serialization format.
    dictionary = {"k": "v"}
    mlflow.log_dict(dictionary, "data.json")
    # Log a dictionary as a YAML file in a subdirectory of the run's root artifact directory
    mlflow.log_dict(dictionary, "dir/data.yml")

    # If the file extension doesn't exist or match any of [".json", ".yaml", ".yml"],
    # JSON format is used.
    mlflow.log_dict(dictionary, "data")
    mlflow.log_dict(dictionary, "data.txt")

    # Print where this run was recorded so it can be located in the MLflow UI.
    run_id = run.info.run_id
    experiment = mlflow.get_experiment_by_name(experiment_name)
    print("Experiment_id: {}".format(experiment.experiment_id))
    print("Artifact Location: {}".format(experiment.artifact_location))
    print("Tags: {}".format(experiment.tags))
    print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))
    print(f'artifact_uri = {mlflow.get_artifact_uri()}')
    print(f'runID: {run_id}')



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 31415
[LightGBM] [Info] Number of data points in the train set: 14730, number of used features: 162
[LightGBM] [Info] Start training from score 501.211307
mae: 350.0647064649992
median absolute error: 317.09425862488786
r2: 0.021889441791020814
RMSE :  372.322997
138624.41419042094
Experiment_id: 76
Artifact Location: s3://hc-qa-mlflow-bucket
Tags: {}
Lifecycle_stage: active
artifact_uri = s3://hc-qa-mlflow-bucket/e0917c710b4f4d678b613795f1a4d670/artifacts
runID: e0917c710b4f4d678b613795f1a4d670
