In [10]:
import pandas as pd

import optuna
from optuna.storages import RDBStorage

import mlflow

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

from sklearn.metrics import classification_report

import uuid

In [11]:
def display_classification_report(model, metric, X_train, X_test, y_train, y_test):
    y_train_pred_proba = model.predict_proba(X_train)
    y_test_pred_proba = model.predict_proba(X_test)
    roc_auc_score_train = round(roc_auc_score(y_train, y_train_pred_proba,average='weighted',multi_class='ovr'),2)
    roc_auc_score_test = round(roc_auc_score(y_test, y_test_pred_proba,average='weighted',multi_class='ovr'),2)
    print("ROC AUC Score Train:", roc_auc_score_train)
    print("ROC AUC Score Test:", roc_auc_score_test)
    metric.append(roc_auc_score_train)
    metric.append(roc_auc_score_test)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
  
    precision_train,recall_train,fscore_train,support_train=precision_recall_fscore_support(y_train,y_train_pred,average='weighted')
    precision_test,recall_test,fscore_test,support_test=precision_recall_fscore_support(y_test,y_test_pred,average='weighted')
    
    acc_score_train = round(accuracy_score(y_train,y_train_pred),2)
    acc_score_test = round(accuracy_score(y_test,y_test_pred),2)
    
    metric.append(acc_score_train)
    metric.append(acc_score_test)
    metric.append(round(precision_train,2))
    metric.append(round(precision_test,2))
    metric.append(round(recall_train,2))
    metric.append(round(recall_test,2))
    metric.append(round(fscore_train,2))
    metric.append(round(fscore_test,2))
    
    print('Train Accuracy :',acc_score_train)
    print('Test Accuracy :',acc_score_test)
        
    model_report_train = classification_report(y_train,y_train_pred)
    model_report_test = classification_report(y_test,y_test_pred)
    
    print('Classification Report for Train:\n',model_report_train)
    print('Classification Report for Test:\n',model_report_test)

In [12]:
def objective(trial):
    # Параметры, которые будут оптимизироваться
    C = trial.suggest_loguniform('C', 0.01, 10.0)
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    
    # Загрузка данных и разделение на обучающую и тестовую выборки
    training_data = pd.read_csv('training_data.csv')[['complaint_text', 'Topic']]
    training_data['complaint_text'] = training_data['complaint_text'].astype(str)

    X = training_data.complaint_text
    y = training_data.Topic

    count_vect = CountVectorizer()
    X_vect = count_vect.fit_transform(X)

    tfidf_transformer = TfidfTransformer()
    X_tfidf = tfidf_transformer.fit_transform(X_vect)

    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.25, random_state=40, stratify=y)

    model = LogisticRegression(random_state=40,solver='liblinear', C=C, max_iter=max_iter)
    model.fit(X_train, y_train)
    
    # Вычисление метрики точности на тестовой выборке
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Логирование метрики и параметров с помощью MLflow
    with mlflow.start_run(run_name="trial_" + str(trial.number)):
        mlflow.log_params(trial.params)
        mlflow.log_metric('accuracy', accuracy)
    
    return accuracy

In [13]:
# Подключение к БД PostgreSQL.
storage = RDBStorage(
    url='postgresql://postgres:mysecretpassword@localhost:5432/optuna',
    engine_kwargs={'pool_pre_ping': True}
)

# Set the database connection string for PostgreSQL.
db_uri = "postgresql://postgres:mysecretpassword@localhost:5432/mlflow"

# Configure the MLflow server to use a PostgreSQL backend store and artifact repository.
mlflow.set_tracking_uri(db_uri)
mlflow.set_registry_uri(db_uri)

# Create a new MLflow experiment.
experiment_name = "log_reg_experiment " + str(uuid.uuid4())
mlflow.set_experiment(experiment_name)

# Создание и запуск оптимизации с помощью Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Вывод результатов оптимизации
print('Best trial:')
best_trial = study.best_trial
print('  Value: ', best_trial.value)
print('  Params: ')
for key, value in best_trial.params.items():
    print(f'    {key}: {value}')

2023/10/23 11:14:37 INFO mlflow.tracking.fluent: Experiment with name 'log_reg_experiment 344dc467-6c99-494a-aaf8-d1109737aba1' does not exist. Creating a new experiment.
[I 2023-10-23 11:14:37,715] A new study created in memory with name: no-name-0d68c5fd-d5f5-4f36-9bcb-80be86a63339
  C = trial.suggest_loguniform('C', 0.01, 10.0)
[I 2023-10-23 11:14:42,601] Trial 0 finished with value: 0.8946469248291572 and parameters: {'C': 0.4734019660467526, 'max_iter': 583}. Best is trial 0 with value: 0.8946469248291572.
  C = trial.suggest_loguniform('C', 0.01, 10.0)
[I 2023-10-23 11:14:47,546] Trial 1 finished with value: 0.9060364464692483 and parameters: {'C': 3.7177764506733038, 'max_iter': 806}. Best is trial 1 with value: 0.9060364464692483.
  C = trial.suggest_loguniform('C', 0.01, 10.0)
[I 2023-10-23 11:14:51,496] Trial 2 finished with value: 0.8428246013667426 and parameters: {'C': 0.05305938692746762, 'max_iter': 231}. Best is trial 1 with value: 0.9060364464692483.
  C = trial.sugges

Best trial:
  Value:  0.9071753986332574
  Params: 
    C: 4.279696325492585
    max_iter: 388
