In [86]:
import pandas as pd
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import optuna

from giskard import Dataset, Model, scan, testing
import pickle

from minio import Minio
from minio.error import S3Error
import warnings

# import clickhouse_connect
import psycopg2

from functools import reduce

import clickhouse_connect

import mlflow 
import mlflow.catboost

from datetime import datetime

# Env variables

In [58]:
from dotenv import load_dotenv
import os

load_dotenv()

SPARK_COMPAT_VERSION = os.getenv('SPARK_COMPAT_VERSION')
SCALA_COMPAT_VERSION = os.getenv('SCALA_COMPAT_VERSION')
CATBOOST_SPARK_VERSION = os.getenv('CATBOOST_SPARK_VERSION')
DB_HOST = os.getenv('DB_HOST')
POSTGRESQL_PORT = os.getenv('POSTGRESQL_PORT')
CLICKHOUSE_PORT = os.getenv('CLICKHOUSE_PORT')
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')
DB_NAME = os.getenv('DB_NAME')

ACCESS_KEY=os.getenv('MINIO_ACCESS_KEY')
SECRET_KEY=os.getenv('MINIO_SECRET_KEY')

# Set feature and target columns

In [7]:
COLUMN_TYPES = {
    'age': 'category',
    'sex': 'category',
    'job': 'category',
    'housing': 'category',
    'credit_amount': 'numeric',
    'duration': 'numeric'
}

TARGET_COLUMN_NAME = 'default'
FEATURE_COLUMNS = [i for i in COLUMN_TYPES.keys()]
FEATURE_TYPES = {i: COLUMN_TYPES[i] for i in COLUMN_TYPES if i != TARGET_COLUMN_NAME}

COLUMNS_TO_SCALE = [key for key in COLUMN_TYPES.keys() if COLUMN_TYPES[key] == "numeric"]
COLUMNS_TO_ENCODE = [key for key in COLUMN_TYPES.keys() if COLUMN_TYPES[key] == "category"]

# Connect to db (data) and get data

In [105]:
# client = clickhouse_connect.get_client(host = CLICKHOUSE_HOST, 
#                                        port = CLICKHOUSE_PORT, 
#                                        user = CLICKHOUSE_USER, 
#                                        password = CLICKHOUSE_PASSWORD)
# query = fr'''
# select {reduce(lambda a,b: a + ', ' + b, FEATURE_COLUMNS)}
# from credit.credit
# '''
# X = pd.DataFrame(client.query(query).named_results())
# X


## Postgresql ==============================================
conn = psycopg2.connect(dbname='credit',
                                user=DB_USER,
                                password=DB_PASSWORD,
                                host='localhost',
                                port=POSTGRESQL_PORT)
cur = conn.cursor()
cur.execute(f"SELECT {reduce(lambda a,b: a + ', ' + b, FEATURE_COLUMNS)} FROM credit;")
X = pd.DataFrame(cur.fetchall(), columns=FEATURE_COLUMNS)
conn.commit()
conn.close()


conn = psycopg2.connect(dbname='credit',
                                user=DB_USER,
                                password=DB_PASSWORD,
                                host='localhost',
                                port=POSTGRESQL_PORT)
cur = conn.cursor()
cur.execute(f'SELECT cr."{TARGET_COLUMN_NAME}" FROM credit cr;')
y = [x[0] for x in cur.fetchall()]
conn.commit()
conn.close()
#=================================================================


# MLflow connection 

In [112]:
warnings.filterwarnings('ignore')
mlflow.set_tracking_uri('http://localhost:5000/')
print("URI", mlflow.get_tracking_uri())
mlflow.set_experiment('credit_prediction')

URI http://localhost:5000/


<Experiment: artifact_location='s3://mlflow/1', creation_time=1746697306437, experiment_id='1', last_update_time=1746697306437, lifecycle_stage='active', name='credit_prediction', tags={}>

# Model creation

In [68]:
numeric_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, COLUMNS_TO_SCALE),
        ("cat", categorical_transformer, COLUMNS_TO_ENCODE)
    ]
)

In [79]:
X_preproccessed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_preproccessed, y,
    test_size = 0.25,
    random_state=42
)

### Optimize hyperparams (optuna)

In [113]:
def objective(trial):    
    # CatBoostClassifier hyperparams
    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }

    # model
    estimator = CatBoostClassifier(**param, verbose=False)
    
    accuracy = cross_val_score(estimator, X_preproccessed, y, cv=3, scoring= 'accuracy').mean()
    return accuracy

#study = optuna.create_study(direction="maximize", study_name="CBC-2023-01-14-14-30", storage='sqlite:///db/CBC-2023-01-14-14-30.db')
# study = optuna.create_study(direction="maximize", study_name=f"{datetime.now().strftime('%Y%m%d-%H%M%S')}")
study = optuna.create_study(direction="maximize", study_name="credit_classifier")

# Hyperparams searching
study.optimize(objective, n_trials=10)

# best result is
params = study.best_params
params

[I 2025-05-08 13:19:19,568] A new study created in memory with name: credit_classifier
[I 2025-05-08 13:21:10,439] Trial 0 finished with value: 0.7119904335473198 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.08865040112550227, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7119904335473198.
[I 2025-05-08 13:21:17,322] Trial 1 finished with value: 0.7089814365263467 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.09890063664170431, 'depth': 1, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli'}. Best is trial 0 with value: 0.7119904335473198.
[I 2025-05-08 13:22:00,074] Trial 2 finished with value: 0.7069854285423148 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.03160657539492338, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli'}. Best is trial 0 with value: 0.7119904335473198.
[I 2025-05-08 13:22:41,867] Trial 3 finished with value: 0.71798145450840

{'objective': 'CrossEntropy',
 'colsample_bylevel': 0.07702939510039258,
 'depth': 3,
 'boosting_type': 'Plain',
 'bootstrap_type': 'Bernoulli'}

In [114]:
with mlflow.start_run(run_name='run ' + datetime.now().strftime('%d.%m.%Y %H:%M')):
    catboostclassifier = CatBoostClassifier(**params, verbose=False)
    catboostclassifier.fit(X_train, y_train)
    
    pred_test = catboostclassifier.predict(X_test)

    
    metrics = {'accuracy': accuracy_score(pred_test, y_test),
                'precision': precision_score(pred_test, y_test),
                'recall': recall_score(pred_test, y_test),
                'f1': f1_score(pred_test, y_test),
                'roc_auc': roc_auc_score(y_test, catboostclassifier.predict_proba(X_test)[:,1])
              }
    # mlflow.models.infer_signature()
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.catboost.log_model(catboostclassifier, 'catboostclassifier')
    
    




🏃 View run run 08.05.2025 13:24 at: http://localhost:5000/#/experiments/1/runs/c4458165d8644c49aa6c984c33fd654f
🧪 View experiment at: http://localhost:5000/#/experiments/1


NoCredentialsError: Unable to locate credentials

## Wrap dataset with Giskard

In [None]:
raw_data = pd.concat([X_test, y_test], axis = 1)
giskard_dataset = Dataset(
    df = raw_data,
    target=TARGET_COLUMN_NAME,
    name = "German credit scoring dataset",
    cat_columns=COLUMNS_TO_ENCODE
)

## Wrap model with Giskard

In [None]:
giskard_model = Model(
    model=catboostclassifier,
    model_type="classification",     # Either regression, classification or text_generation.
    name="Chunk classification",
    classification_labels=catboostclassifier.classes_,  # Their order MUST be identical to the prediction_function's output order
    feature_names=FEATURE_COLUMNS     # Default: all columns of your dataset
)

## Scan model with Giskard

In [None]:
results = scan(giskard_model, giskard_dataset, verbose=False)
results.to_html("giskard_scan_result.html")

# Save model

In [None]:
with open('model.pkl', 'wb') as f:
    pickle.dump(catboostclassifier, f)

# Minio

In [None]:
def s3_upload_model():
    # Create a client with the MinIO server playground, its access key
    # and secret key.
    client = Minio("localhost:9099",
        access_key=ACCESS_KEY,
        secret_key=SECRET_KEY,
        secure=False
    )

    # The file to upload, change this path if needed
    source_file = "model.pkl"

    # The destination bucket and filename on the MinIO server
    bucket_name = "credit-model"
    destination_file = "model.pkl"

    # Make the bucket if it doesn't exist.
    found = client.bucket_exists(bucket_name)
    if not found:
        client.make_bucket(bucket_name)
        print("Created bucket", bucket_name)
    else:
        print("Bucket", bucket_name, "already exists")

    # Upload the file, renaming it in the process
    client.fput_object(
        bucket_name, destination_file, source_file,
    )
    print(
        source_file, "successfully uploaded as object",
        destination_file, "to bucket", bucket_name,
    )

try:
    s3_upload_model()
except S3Error as exc:
    print("error occurred.", exc)