In [None]:
# imports
import time

import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
import mlflow

# MLflow tracking configuration, defined once so that the experiment that is
# activated and the experiment that is looked up can never drift apart.
MLFLOW_TRACKING_URI = "./mlflowwork"  # relative path: runs are stored under this directory
EXPERIMENT_NAME = "mymodel2"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
# Keep a handle on the experiment; its experiment_id is used when a run is started.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

In [None]:
# Data Preprocessing
def preprocess_data(df, categorical_cols, float_cols):
    """Cast feature dtypes, separate the target, and make a train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Must contain the ``Survived`` and ``PassengerId`` columns
        as well as every column listed in ``categorical_cols`` and
        ``float_cols``. The caller's frame is not modified.
    categorical_cols : list of str
        Columns cast to the pandas ``category`` dtype.
    float_cols : list of str
        Columns cast to ``float``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, enc, categorical_cols)``.
        ``enc`` is the ``LabelEncoder`` fitted on ``Survived``;
        ``categorical_cols`` is handed back unchanged for convenience.
    """
    # Work on a copy: the casts below are assigned back into the frame, and
    # doing that on the caller's object would silently change its dtypes
    # (making the cell non-idempotent when re-run on the same DataFrame).
    df = df.copy()

    df[categorical_cols] = df[categorical_cols].astype('category')
    df[float_cols] = df[float_cols].astype('float')

    # "Survived" is the target; "PassengerId" is dropped from the features.
    X = df.drop(["Survived", "PassengerId"], axis=1)
    y = df["Survived"]

    # Encode the target as integer class indices.
    enc = LabelEncoder()
    y = enc.fit_transform(y)

    # Fixed random_state keeps the 80/20 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    return X_train, X_test, y_train, y_test, enc, categorical_cols


# Model Training
def train_model(params, num_boost_round, X_train, X_test, y_train, y_test, categorical_cols):
    """Train a LightGBM booster and measure how long training took.

    Parameters
    ----------
    params : dict
        LightGBM parameters forwarded to ``lgb.train``.
    num_boost_round : int
        Number of boosting rounds forwarded to ``lgb.train``.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Hold-out features and labels, registered as the validation set
        named ``"test"``.
    categorical_cols : list of str
        Feature columns LightGBM should treat as categorical.

    Returns
    -------
    tuple
        ``(model, elapsed)``: the object returned by ``lgb.train`` and the
        elapsed time in seconds (dataset construction included).
    """
    # perf_counter is a monotonic clock intended for measuring durations;
    # unlike time.time() it is unaffected by system clock adjustments.
    started = time.perf_counter()

    # Categorical features are declared once, on the Dataset objects, rather
    # than being repeated as an argument to lgb.train.
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols)
    # The validation set references the training set so both share the same
    # feature binning / category mapping.
    test_data = lgb.Dataset(
        X_test,
        label=y_test,
        categorical_feature=categorical_cols,
        reference=train_data,
    )

    model = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[test_data],
        valid_names=["test"],
    )

    elapsed = time.perf_counter() - started
    return model, elapsed


# Model Evaluation
def evaluate_model(model, X_test, y_test):
    """Score a trained model on the hold-out split.

    Parameters
    ----------
    model
        Object exposing ``predict(X)``. Assumes the result is a 2-D array
        with one column per class (the arg-max over axis 1 is taken as the
        predicted class) -- TODO confirm for non-multiclass objectives.
    X_test
        Hold-out features passed to ``model.predict``.
    y_test
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(loss, acc)``: ``log_loss`` of the predicted scores and
        ``accuracy_score`` of the arg-max predictions.
    """
    class_scores = model.predict(X_test)
    # Column index of the largest score is used as the predicted label.
    predicted_labels = class_scores.argmax(axis=1)

    return (
        log_loss(y_test, class_scores),
        accuracy_score(y_test, predicted_labels),
    )

In [None]:
with mlflow.start_run(experiment_id=experiment.experiment_id):
    # MLflow Automatic Logging for LightGBM
    mlflow.lightgbm.autolog()

    # LightGBM HyperParameters
    # The number of boosting rounds is set only here. A "num_iterations" entry
    # in params would take precedence over num_boost_round inside lgb.train,
    # so keeping both makes one of them silently ineffective.
    num_boost_round = 16

    params = {
        "objective": "multiclass",
        "num_class": 2,
        "boosting": "gbdt",
        "num_leaves": 31,
        "num_threads": 0,
        "learning_rate": 0.1,
        "metric": "multi_logloss",
        "seed": 1234,
        "verbose": 0,
    }

    # Read csv file
    df = pd.read_csv("../artifacts/sample-data/Titanic.csv")

    # Data Preprocessing
    categorical_cols = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
    float_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
    X_train, X_test, y_train, y_test, enc, categorical_cols = preprocess_data(df, categorical_cols, float_cols)

    # Model Training
    model, train_time = train_model(
        params, num_boost_round, X_train, X_test, y_train, y_test, categorical_cols
    )

    # Model Evaluation
    loss, acc = evaluate_model(model, X_test, y_test)

    # Record the hold-out scores and the training duration on the active run
    # so they are tracked alongside what autologging captures, not only printed.
    mlflow.log_metrics(
        {
            "test_log_loss": loss,
            "test_accuracy": acc,
            "train_time_sec": train_time,
        }
    )
    print(loss, acc)

In [None]:
# Uncomment the next line to start the MLflow UI locally and browse the logged runs:
#!mlflow ui --backend-store-uri ./mlflowwork