## Log a Scikit-Learn Model

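Train several scikit-learn classifiers and log each model, together with its evaluation metrics, using the `LogModel` class from the Katonic SDK's Log package.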

### Imports

In [1]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, recall_score, f1_score, precision_score
from sklearn.linear_model import LogisticRegression
from katonic.log.logmodel import LogModel

### Experiment Name

In [2]:
# Name of the experiment that every run in this notebook is logged under.
experiment_name = "sklearn_model"

### Initiate LogModel with experiment name

In [4]:
# Open the experiment by name; `source_name` records this notebook's file name.
# Per the MLflow INFO line in this cell's captured output, the experiment is
# created when no experiment with this name exists yet.
lm = LogModel(
    experiment_name,
    source_name="scikit_learn_logging.ipynb",
)

2023/10/24 03:50:48 INFO mlflow.tracking.fluent: Experiment with name 'sklearn_model' does not exist. Creating a new experiment.


### Metadata of the created / existing experiment

In [5]:
# Keep the experiment id around: it is used later to query the logged runs.
exp_id = lm.id

# Print each metadata field with the same labels and in the same order.
for label, value in (
    ("experiment name: ", lm.name),
    ("experiment location: ", lm.location),
    ("experiment id: ", lm.id),
    ("experiment status: ", lm.stage),
):
    print(label, value)

experiment name:  sklearn_model
experiment location:  s3://models/18
experiment id:  18
experiment status:  active


### Artifact path where you want to log your model

In [6]:
# Passed as `artifact_path` to every `lm.model_logging(...)` call in this
# notebook, so all four models are logged under the same artifact path.
artifact_path = "scikit-learn-model"

### Read the data

In [7]:
# Diabetes CSV hosted in the plotly/datasets GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv'

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Get features and label

In [8]:
# Every column except the target is a feature; 'Outcome' is the label.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
x = df.drop(columns="Outcome")
y = df["Outcome"]

### Split the dataset in Train and Test

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=98,
)

### Define Metric

In [10]:
def metric(actual, pred, pred_proba=None):
    """Compute the classification metrics that are logged with each model.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    pred : array-like
        Predicted class labels (e.g. the output of ``model.predict``). Used for
        accuracy, recall, precision and F1.
    pred_proba : array-like, optional
        Predicted probability of the positive class (e.g.
        ``model.predict_proba(X)[:, 1]``). When provided, it is used for
        ``roc_auc_score`` and ``log_loss``, both of which are defined on
        scores / probabilities. When omitted, both metrics are computed from
        ``pred`` exactly as before, which treats every prediction as fully
        confident and yields a coarse AUC and a heavily penalised log loss.

    Returns
    -------
    tuple
        ``(accuracy, roc_auc, log_loss, recall, f1, precision)``, in that order.
    """
    # Fall back to the hard labels only to stay backward compatible with
    # callers that pass two arguments.
    scores = pred if pred_proba is None else pred_proba

    acc_score = accuracy_score(actual, pred)
    recall = recall_score(actual, pred)
    precision_scr = precision_score(actual, pred)
    f1_scr = f1_score(actual, pred)
    auc_roc = roc_auc_score(actual, scores)
    log_los = log_loss(actual, scores)

    return (
        acc_score,
        auc_roc,
        log_los,
        recall,
        f1_scr,
        precision_scr,
    )

### Random Forest

In [11]:
# Random forest limited to depth-2 trees; the fixed random_state makes the
# fitted model reproducible across re-runs.
model_clf = RandomForestClassifier(max_depth=2, random_state=0)
# Kept as a separate last statement so the cell displays the fitted estimator.
model_clf.fit(X_train, y_train)

In [12]:
# Score the random forest on the held-out split.
# NOTE: `metric` is given class labels from `predict`, so the ROC AUC and
# log-loss values are computed from labels, not predicted probabilities.
y_pred = model_clf.predict(X_test)
acc_score, auc_roc, log_los, recall, f1_scr, precision_scr = metric(y_test, y_pred)

# Metric name -> value mapping that is attached to the logged run.
model_mertics = dict(
    accuracy_score=acc_score,
    roc_auc_score=auc_roc,
    log_loss=log_los,
    recall=recall,
    f1_score=f1_scr,
    precision_score=precision_scr,
)

### Log random forest model
- `model_type` can be one of: `scikit-learn`, `xgboost`, `catboost`, `lightgbm`, `prophet`, `keras`, `custom-model`.

In [13]:
# Path of this notebook, built from the kernel's current working directory.
# NOTE(review): `current_working_dir` receives a file path rather than a
# directory — confirm this is what LogModel expects.
notebook_path = f"{os.getcwd()}/scikit_learn_logging.ipynb"

# Log the fitted random forest together with its evaluation metrics.
lm.model_logging(
    model_name="random_forest",
    model_type="scikit-learn",
    model=model_clf,
    artifact_path=artifact_path,
    current_working_dir=notebook_path,
    metrics=model_mertics,
)

Model artifact logged to: s3://models/18/9dbe7b1db90d4dc28cc59fe76a073739/artifacts/sklearn_model_18_scikit-learn-model_random_forest


### Logistic Regression

In [14]:
# Logistic regression with otherwise default settings; `model_clf` is reused,
# so it now refers to this model instead of the random forest.
model_clf = LogisticRegression(random_state=0)
# Kept as a separate last statement so the cell displays the fitted estimator.
model_clf.fit(X_train, y_train)

In [15]:
# Score the logistic regression on the held-out split.
# NOTE: `metric` is given class labels from `predict`, so the ROC AUC and
# log-loss values are computed from labels, not predicted probabilities.
y_pred = model_clf.predict(X_test)
acc_score, auc_roc, log_los, recall, f1_scr, precision_scr = metric(y_test, y_pred)

# Metric name -> value mapping that is attached to the logged run.
model_mertics = dict(
    accuracy_score=acc_score,
    roc_auc_score=auc_roc,
    log_loss=log_los,
    recall=recall,
    f1_score=f1_scr,
    precision_score=precision_scr,
)

### Log Logistic Regression model

In [16]:
# Path of this notebook, built from the kernel's current working directory.
# NOTE(review): `current_working_dir` receives a file path rather than a
# directory — confirm this is what LogModel expects.
notebook_path = f"{os.getcwd()}/scikit_learn_logging.ipynb"

# Log the fitted logistic regression together with its evaluation metrics.
lm.model_logging(
    model_name="logistic_regression",
    model_type="scikit-learn",
    model=model_clf,
    artifact_path=artifact_path,
    current_working_dir=notebook_path,
    metrics=model_mertics,
)

Model artifact logged to: s3://models/18/b7cca315e1b8418bb0c20918c246f589/artifacts/sklearn_model_18_scikit-learn-model_logistic_regression


### AdaBoost

In [17]:
# AdaBoost with otherwise default settings; `model_clf` is reused, so it now
# refers to this model instead of the logistic regression.
model_clf = AdaBoostClassifier(random_state=0)
# Kept as a separate last statement so the cell displays the fitted estimator.
model_clf.fit(X_train, y_train)

In [18]:
# Score the AdaBoost classifier on the held-out split.
# NOTE: `metric` is given class labels from `predict`, so the ROC AUC and
# log-loss values are computed from labels, not predicted probabilities.
y_pred = model_clf.predict(X_test)
acc_score, auc_roc, log_los, recall, f1_scr, precision_scr = metric(y_test, y_pred)

# Metric name -> value mapping that is attached to the logged run.
model_mertics = dict(
    accuracy_score=acc_score,
    roc_auc_score=auc_roc,
    log_loss=log_los,
    recall=recall,
    f1_score=f1_scr,
    precision_score=precision_scr,
)

### Log AdaBoost model

In [19]:
# Path of this notebook, built from the kernel's current working directory.
# NOTE(review): `current_working_dir` receives a file path rather than a
# directory — confirm this is what LogModel expects.
notebook_path = f"{os.getcwd()}/scikit_learn_logging.ipynb"

# Log the fitted AdaBoost classifier together with its evaluation metrics.
lm.model_logging(
    model_name="adaboostclassifier",
    model_type="scikit-learn",
    model=model_clf,
    artifact_path=artifact_path,
    current_working_dir=notebook_path,
    metrics=model_mertics,
)

Model artifact logged to: s3://models/18/6ffe0a62f86a47f4b38ea660d16b36e1/artifacts/sklearn_model_18_scikit-learn-model_adaboostclassifier


### Gradient Boost Model

In [20]:
# Gradient boosting with otherwise default settings; `model_clf` is reused, so
# it now refers to this model instead of the AdaBoost classifier.
model_clf = GradientBoostingClassifier(random_state=0)
# Kept as a separate last statement so the cell displays the fitted estimator.
model_clf.fit(X_train, y_train)

In [21]:
# Score the gradient boosting classifier on the held-out split.
# NOTE: `metric` is given class labels from `predict`, so the ROC AUC and
# log-loss values are computed from labels, not predicted probabilities.
y_pred = model_clf.predict(X_test)
acc_score, auc_roc, log_los, recall, f1_scr, precision_scr = metric(y_test, y_pred)

# Metric name -> value mapping that is attached to the logged run.
model_mertics = dict(
    accuracy_score=acc_score,
    roc_auc_score=auc_roc,
    log_loss=log_los,
    recall=recall,
    f1_score=f1_scr,
    precision_score=precision_scr,
)

### Log Gradient Boosting model

In [22]:
# Path of this notebook, built from the kernel's current working directory.
# NOTE(review): `current_working_dir` receives a file path rather than a
# directory — confirm this is what LogModel expects.
notebook_path = f"{os.getcwd()}/scikit_learn_logging.ipynb"

# Log the fitted gradient boosting classifier together with its evaluation metrics.
lm.model_logging(
    model_name="gradientboostclassifier",
    model_type="scikit-learn",
    model=model_clf,
    artifact_path=artifact_path,
    current_working_dir=notebook_path,
    metrics=model_mertics,
)

Model artifact logged to: s3://models/18/7a5ca1ce150242a0bf6dace6708f366d/artifacts/sklearn_model_18_scikit-learn-model_gradientboostclassifier


### Check Logged Models

In [23]:
# Look up the runs recorded under this experiment and report how many there are.
df_runs = lm.search_runs(exp_id)
run_count = len(df_runs)
print("Number of runs done : ", run_count)

Number of runs done :  4


In [24]:
# Preview the first rows of the runs table (one row per logged model, with the
# logged metrics in the `metrics.*` columns).
df_runs.head()

Unnamed: 0,artifact_uri,end_time,experiment_id,metrics.accuracy_score,metrics.f1_score,metrics.log_loss,metrics.precision_score,metrics.recall,metrics.roc_auc_score,run_id,run_name,start_time,status,tags.data_path,tags.experiment_id,tags.experiment_name,tags.features,tags.mlflow.log-model.history,tags.run_id,tags.version.mlflow
0,s3://models/18/7a5ca1ce150242a0bf6dace6708f366...,2023-10-24 03:51:45.299000+00:00,18,0.818182,0.695652,6.279829,0.761905,0.64,0.771923,7a5ca1ce150242a0bf6dace6708f366d,sklearn_model_18_scikit-learn-model_gradientbo...,2023-10-24 03:51:43.494000+00:00,FINISHED,-,18,sklearn_model,-,"[{""run_id"": ""7a5ca1ce150242a0bf6dace6708f366d""...",7a5ca1ce150242a0bf6dace6708f366d,2.0.1
1,s3://models/18/6ffe0a62f86a47f4b38ea660d16b36e...,2023-10-24 03:51:31.263000+00:00,18,0.792208,0.666667,7.176961,0.695652,0.64,0.752692,6ffe0a62f86a47f4b38ea660d16b36e1,sklearn_model_18_scikit-learn-model_adaboostcl...,2023-10-24 03:51:29.475000+00:00,FINISHED,-,18,sklearn_model,-,"[{""run_id"": ""6ffe0a62f86a47f4b38ea660d16b36e1""...",6ffe0a62f86a47f4b38ea660d16b36e1,2.0.1
2,s3://models/18/b7cca315e1b8418bb0c20918c246f58...,2023-10-24 03:51:26.725000+00:00,18,0.792208,0.636364,7.176941,0.736842,0.56,0.731923,b7cca315e1b8418bb0c20918c246f589,sklearn_model_18_scikit-learn-model_logistic_r...,2023-10-24 03:51:24.986000+00:00,FINISHED,-,18,sklearn_model,-,"[{""run_id"": ""b7cca315e1b8418bb0c20918c246f589""...",b7cca315e1b8418bb0c20918c246f589,2.0.1
3,s3://models/18/9dbe7b1db90d4dc28cc59fe76a07373...,2023-10-24 03:51:19.873000+00:00,18,0.772727,0.477612,7.849727,0.941176,0.32,0.655192,9dbe7b1db90d4dc28cc59fe76a073739,sklearn_model_18_scikit-learn-model_random_forest,2023-10-24 03:51:17.258000+00:00,FINISHED,-,18,sklearn_model,-,"[{""run_id"": ""9dbe7b1db90d4dc28cc59fe76a073739""...",9dbe7b1db90d4dc28cc59fe76a073739,2.0.1
