In [39]:
import pandas as pd
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import optuna

from giskard import Dataset, Model, scan, testing
import pickle

from minio import Minio
from minio.error import S3Error
import warnings

# import clickhouse_connect
import psycopg2

from functools import reduce

import clickhouse_connect

import mlflow 
import mlflow.catboost
import mlflow.sklearn
import mlflow.data
from mlflow.models import infer_signature

from datetime import datetime

# Env variables

In [40]:
from dotenv import load_dotenv
import os

load_dotenv()

SPARK_COMPAT_VERSION = os.getenv('SPARK_COMPAT_VERSION')
SCALA_COMPAT_VERSION = os.getenv('SCALA_COMPAT_VERSION')
CATBOOST_SPARK_VERSION = os.getenv('CATBOOST_SPARK_VERSION')
DB_HOST = os.getenv('DB_HOST')
POSTGRESQL_PORT = os.getenv('POSTGRESQL_PORT')
CLICKHOUSE_PORT = os.getenv('CLICKHOUSE_PORT')
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_NAME = os.getenv('DB_NAME')

ACCESS_KEY=os.getenv('MINIO_ROOT_USER')
SECRET_KEY=os.getenv('MINIO_ROOT_PASSWORD')

# Set feature and target columns

In [42]:
COLUMN_TYPES = {
    'age': 'category',
    'sex': 'category',
    'job': 'category',
    'housing': 'category',
    'credit_amount': 'numeric',
    'duration': 'numeric'
}

TARGET_COLUMN_NAME = 'default'
FEATURE_COLUMNS = [i for i in COLUMN_TYPES.keys()]
FEATURE_TYPES = {i: COLUMN_TYPES[i] for i in COLUMN_TYPES if i != TARGET_COLUMN_NAME}

COLUMNS_TO_SCALE = [key for key in COLUMN_TYPES.keys() if COLUMN_TYPES[key] == "numeric"]
COLUMNS_TO_ENCODE = [key for key in COLUMN_TYPES.keys() if COLUMN_TYPES[key] == "category"]

# Connect to db (data) and get data

In [43]:
# client = clickhouse_connect.get_client(host = CLICKHOUSE_HOST, 
#                                        port = CLICKHOUSE_PORT, 
#                                        user = CLICKHOUSE_USER, 
#                                        password = CLICKHOUSE_PASSWORD)
# query = fr'''
# select {reduce(lambda a,b: a + ', ' + b, FEATURE_COLUMNS)}
# from credit.credit
# '''
# X = pd.DataFrame(client.query(query).named_results())
# X

job_list = {
    0: 'unskilled and non-resident', 
    1: 'unskilled and resident', 
    2: 'skilled', 
    3: 'highly skilled'
}


## Postgresql ==============================================
conn = psycopg2.connect(dbname='credit',
                                user=DB_USER,
                                password=DB_PASSWORD,
                                host='localhost',
                                port=POSTGRESQL_PORT)
cur = conn.cursor()
cur.execute(f"SELECT {reduce(lambda a,b: a + ', ' + b, FEATURE_COLUMNS)} FROM credit;")
X = (
    pd
    .DataFrame(cur.fetchall(), columns=FEATURE_COLUMNS)
    .assign(job = lambda x: x['job'].apply(lambda x: job_list[x]))
)
conn.commit()
conn.close()


conn = psycopg2.connect(dbname='credit',
                                user=DB_USER,
                                password=DB_PASSWORD,
                                host='localhost',
                                port=POSTGRESQL_PORT)
cur = conn.cursor()
cur.execute(f'SELECT cr."{TARGET_COLUMN_NAME}" FROM credit cr;')
y = [x[0] for x in cur.fetchall()]
conn.commit()
conn.close()

df = X.join(pd.DataFrame(y, columns = [TARGET_COLUMN_NAME]))
#=================================================================


# MLflow connection 

In [44]:
warnings.filterwarnings('ignore')
mlflow.set_tracking_uri('http://localhost:5000/')
print("URI", mlflow.get_tracking_uri())

URI http://localhost:5000/


# MLflow experiment name

In [45]:
experiment_name = f'credit_predict_{datetime.now().strftime('01.%m.%Y')}'
try:
    mlflow.create_experiment(experiment_name, artifact_location=f's3://mlflow')
except:
    pass
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://mlflow/', creation_time=1747062720348, experiment_id='39', last_update_time=1747062720348, lifecycle_stage='active', name='credit_predict_01.05.2025', tags={}>

# Model creation

In [46]:
numeric_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])
# 
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, COLUMNS_TO_SCALE),
        ("cat", categorical_transformer, COLUMNS_TO_ENCODE)
    ]
)

In [47]:
X_preproccessed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.25,
    random_state=42
)

### Optimize hyperparams (optuna)

In [48]:
def objective(trial):    
    # CatBoostClassifier hyperparams
    params = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }
    with mlflow.start_run(run_name = f'run_{datetime.now().strftime('%d.%m.%Y_%H:%M:%S')}', nested=True):
        
        # model
        mlflow.log_params(params)
        estimator = CatBoostClassifier(**params, verbose=False)
        
        accuracy = cross_val_score(estimator, X_preproccessed, y, cv=3, scoring= 'accuracy').mean()
        mlflow.log_metric('Accuracy', accuracy) 
        return accuracy


experiment_name = f'credit_predict_{datetime.now().strftime('01.%m.%Y')}'
try:
    mlflow.create_experiment(experiment_name, artifact_location=f's3://mlflow/artifacts_{datetime.now().strftime('%d.%m.%Y_%H:%M')}/')
except:
    pass
mlflow.set_experiment(experiment_name)

with mlflow.start_run(run_name = f'params_opt_{datetime.now().strftime('%d.%m.%Y_%H:%M')}') as run:
    study = optuna.create_study(direction="maximize", study_name=f"params_opt_{datetime.now().strftime('%Y%m%d-%H%M%S')}")

    # Hyperparams searching
    study.optimize(objective, n_trials=3)
    
    # best result is
    params = study.best_params

    estimator = CatBoostClassifier(**params, verbose=False)  
    catboostclassifier = Pipeline(steps = [
        ("preprocessor", preprocessor),
        ("classifier", estimator)
    ])
    # catboostclassifier = CatBoostClassifier(**params, verbose=False)
    catboostclassifier.fit(X_train, y_train)
    
    pred_test = catboostclassifier.predict(X_test)
    signature = infer_signature(X_test, pred_test)

    
    metrics = {'accuracy': accuracy_score(pred_test, y_test),
                'precision': precision_score(pred_test, y_test),
                'recall': recall_score(pred_test, y_test),
                'f1': f1_score(pred_test, y_test),
                'roc_auc': roc_auc_score(y_test, catboostclassifier.predict_proba(X_test)[:,1])
              }

    
    input_example = X_train.iloc[[0], :]
    mlflow.models.infer_signature(input_example, 0, params)
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)

    dataset = mlflow.data.from_pandas(df, name='german_credit', targets='default')
    mlflow.log_input(dataset)
    # mlflow.evaluate(data=df, model_type="classifier", targets=TARGET_COLUMN_NAME, model=catboostclassifier)

    mlflow.sklearn.log_model(sk_model=catboostclassifier, 
                              artifact_path='catboostclassifier', 
                              signature=signature,
                              input_example=input_example
                             )
    # mlflow.sklearn.save_model(sk_model=catboostclassifier, 
    #                            path='catboostclassifier',
    #                           signature=signature,
    #                           input_example=input_example
    #                          )

[I 2025-05-12 19:13:30,391] A new study created in memory with name: params_opt_20250512-191330
[I 2025-05-12 19:13:37,675] Trial 0 finished with value: 0.7129974285662909 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.08594614671522907, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7129974285662909.


🏃 View run run_12.05.2025_19:13:30 at: http://localhost:5000/#/experiments/39/runs/7789c9bad2e84405a1bde47b33b853f5
🧪 View experiment at: http://localhost:5000/#/experiments/39


[I 2025-05-12 19:13:41,383] Trial 1 finished with value: 0.7239784694874515 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.08479310773029916, 'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian'}. Best is trial 1 with value: 0.7239784694874515.


🏃 View run run_12.05.2025_19:13:37 at: http://localhost:5000/#/experiments/39/runs/1949d2b86d3a4c909509224817b849f7
🧪 View experiment at: http://localhost:5000/#/experiments/39


[I 2025-05-12 19:13:44,253] Trial 2 finished with value: 0.7129884375393357 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.09390296274221702, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian'}. Best is trial 1 with value: 0.7239784694874515.


🏃 View run run_12.05.2025_19:13:41 at: http://localhost:5000/#/experiments/39/runs/3d0301b067af46128c42c9b637eedf2a
🧪 View experiment at: http://localhost:5000/#/experiments/39


Downloading artifacts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1748.67it/s]


🏃 View run params_opt_12.05.2025_19:13 at: http://localhost:5000/#/experiments/39/runs/86b9f8d9746c4089ae513e57d386c41e
🧪 View experiment at: http://localhost:5000/#/experiments/39


S3UploadFailedError: Failed to upload C:\Users\D01D0~1.KON\AppData\Local\Temp\tmphkt8wgfl\model\conda.yaml to mlflow/86b9f8d9746c4089ae513e57d386c41e/artifacts/catboostclassifier/conda.yaml: An error occurred (InvalidAccessKeyId) when calling the PutObject operation: The AWS Access Key Id you provided does not exist in our records.

In [49]:
with mlflow.start_run(run_name='run ' + datetime.now().strftime('%d.%m.%Y %H:%M')) as run:
    estimator = CatBoostClassifier(**params, verbose=False)
    
    catboostclassifier = Pipeline(steps = [
        ("preprocessor", preprocessor),
        ("classifier", estimator)
    ])

    
    # catboostclassifier = CatBoostClassifier(**params, verbose=False)
    catboostclassifier.fit(X_train, y_train)
    
    pred_test = catboostclassifier.predict(X_test)
    signature = infer_signature(X_test, pred_test)

    
    metrics = {'accuracy': accuracy_score(pred_test, y_test),
                'precision': precision_score(pred_test, y_test),
                'recall': recall_score(pred_test, y_test),
                'f1': f1_score(pred_test, y_test),
                'roc_auc': roc_auc_score(y_test, catboostclassifier.predict_proba(X_test)[:,1])
              }

    
    input_example = X_train.iloc[[0], :]
    mlflow.models.infer_signature(input_example, 0, params)
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)

    dataset = mlflow.data.from_pandas(df, name='german_credit', targets='default')
    mlflow.log_input(dataset)
    # mlflow.evaluate(data=df, model_type="classifier", targets=TARGET_COLUMN_NAME, model=catboostclassifier)
    
    mlflow.sklearn.log_model(catboostclassifier, 
                              'catboostclassifier', 
                              signature=signature,
                              input_example=input_example
                             )
    # mlflow.sklearn.save_model(sk_model=catboostclassifier, 
    #                            path='catboostclassifier',
    #                           signature=signature,
    #                           input_example=input_example
    #                          )
    
    


Downloading artifacts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1750.86it/s]


🏃 View run run 12.05.2025 19:16 at: http://localhost:5000/#/experiments/39/runs/6229ed93b28e4861b21200035c926729
🧪 View experiment at: http://localhost:5000/#/experiments/39


S3UploadFailedError: Failed to upload C:\Users\D01D0~1.KON\AppData\Local\Temp\tmpjhhwpjnf\model\conda.yaml to mlflow/6229ed93b28e4861b21200035c926729/artifacts/catboostclassifier/conda.yaml: An error occurred (InvalidAccessKeyId) when calling the PutObject operation: The AWS Access Key Id you provided does not exist in our records.

## Wrap dataset with Giskard

In [None]:
raw_data = pd.concat([X_test, y_test], axis = 1)
giskard_dataset = Dataset(
    df = raw_data,
    target=TARGET_COLUMN_NAME,
    name = "German credit scoring dataset",
    cat_columns=COLUMNS_TO_ENCODE
)

## Wrap model with Giskard

In [None]:
giskard_model = Model(
    model=catboostclassifier,
    model_type="classification",     # Either regression, classification or text_generation.
    name="Chunk classification",
    classification_labels=catboostclassifier.classes_,  # Their order MUST be identical to the prediction_function's output order
    feature_names=FEATURE_COLUMNS     # Default: all columns of your dataset
)

## Scan model with Giskard

In [None]:
results = scan(giskard_model, giskard_dataset, verbose=False)
results.to_html("giskard_scan_result.html")

# Save model

In [39]:
with open('model.pkl', 'wb') as f:
    pickle.dump(catboostclassifier, f)

# Minio

In [52]:
def s3_upload_model():
    # Create a client with the MinIO server playground, its access key
    # and secret key.
    client = Minio("localhost:9099",
        access_key=ACCESS_KEY,
        secret_key=SECRET_KEY,
        secure=False
    )

    # The file to upload, change this path if needed
    source_file = "model.pkl"

    # The destination bucket and filename on the MinIO server
    bucket_name = "credit-model-new-new"
    destination_file = "model.pkl"

    # Make the bucket if it doesn't exist.
    found = client.bucket_exists(bucket_name)
    if not found:
        client.make_bucket(bucket_name)
        print("Created bucket", bucket_name)
    else:
        print("Bucket", bucket_name, "already exists")

    # Upload the file, renaming it in the process
    client.fput_object(
        bucket_name, destination_file, source_file,
    )
    print(
        source_file, "successfully uploaded as object",
        destination_file, "to bucket", bucket_name,
    )

try:
    s3_upload_model()
except S3Error as exc:
    print("error occurred.", exc)

Created bucket credit-model-new-new
model.pkl successfully uploaded as object model.pkl to bucket credit-model-new-new
