# Обучение модели

In [111]:
# %pip install hyperopt

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import boto3
import pandas as pd
import numpy as np
from io import StringIO
import os

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

import mlflow
import mlflow.sklearn

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

In [4]:
# Инициализация клиента
# print('Инициализация клиента...')
s3 = boto3.client('s3',
                  endpoint_url='http://localhost:9000',
                  aws_access_key_id='minio',
                  aws_secret_access_key='minio123')

In [114]:
# Считывание данных
# print('Считывание данных...')
obj = s3.get_object(Bucket='datasets', Key='kinopoisk_train.csv')
data = obj['Body'].read().decode('utf-8')
df = pd.read_csv(StringIO(data))

In [5]:
# df = pd.read_csv('kinopoisk_train.csv')

In [6]:
# Установка переменных окружения в Unix-подобных системах (Mac, Linux)
os.system('export MLFLOW_TRACKING_URI=http://localhost:5000')
os.system('export MLFLOW_S3_ENDPOINT_URL=http://localhost:9000')

# Установка переменных окружения в Windows
os.environ['MLFLOW_TRACKING_URI'] = 'http://localhost:5000'
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://localhost:9000'
os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

In [19]:
# Алгоритмы (пайплайны)
algs = dict()

algs['text_clf_v_logreg'] = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('logreg', LogisticRegression()),
])

# algs['text_clf_v_mnnb'] = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('mnnb', SGDClassifier(random_state=42)),
# ])

In [20]:
default_params = dict()

for model_name, alg in algs.items():
    default_params[model_name] = alg.get_params()

In [21]:
# Обучение без параметров (тест)
models = dict()

for model_name, alg in algs.items():
    print(f'Обучение {model_name}...')
    models[model_name] = alg.fit(X_train, y_train)

Обучение text_clf_v_logreg...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
# models[text_clf_v_mnnb] = models[text_clf_v_mnnb].fit(X_train, y_train)

In [23]:
# Точность без параметров
accuracy = dict()

for model_name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy[model_name] = accuracy_score(y_test, y_pred)
    print(f'Точность модели {model_name}:', accuracy[model_name])

Точность модели text_clf_v_logreg: 0.7327258921791951


In [24]:
# Распаковка словаря
def f_unpack_dict(dct):
    res = {}
    for (k, v) in dct.items():
        if isinstance(v, dict):
            res = {**res, **f_unpack_dict(v)}
        else:
            res[k] = v
            
    return res

In [25]:
# Создание функций для оптимизации
objectives = dict()

for model_name, model in models.items():
    def objective(params):
        params = f_unpack_dict(params)
        print(params)

        model = models[model_name]
        model.set_params(**params)
        model.fit(X_train, y_train)
        print(model.get_params())

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        return {'loss': -accuracy, 'params': params, 'status': STATUS_OK}
    
    objectives[model_name] = objective

del(objective)

In [26]:
# Оптимизация параметров
spaces = dict()

spaces['text_clf_v_logreg'] = {
        'group_by__logreg__penalty': hp.choice('hyper_param_groups',
            [
                {
                    'logreg__penalty': hp.choice('penalty_block1', ['l2']),
                    'logreg__solver': hp.choice('solver_block1', ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']), # Исключил: ['newton-cholesky']
                    # 'logreg__multi_class':hp.choice('multi_class', ['ovr', 'multinomial']),
                },
                {
                    'logreg__penalty': hp.choice('penalty_block3', ['l1']),
                    'logreg__solver': hp.choice('solver_block3', ['liblinear', 'saga']),
                    # 'logreg__multi_class':hp.choice('multi_class_block3', ['ovr', 'multinomial']),
                },
                {
                    'logreg__penalty': hp.choice('penalty_block4', [None]),
                    'logreg__solver': hp.choice('solver_block4', ['lbfgs', 'newton-cg', 'sag', 'saga']), # Исключил: ['newton-cholesky']
                },
            ]),
        'logreg__class_weight': hp.choice('class_weight', ['balanced', None]),
        # 'logreg__max_iter': hp.choice('max_iter', [100,500]),
    }

# spaces['text_clf_v_mnnb'] = {
#     'mnnb__loss': hp.choice('loss', ['hinge', 'log_loss', 'modified_huber', 
#                                      'squared_hinge', 'perceptron', 'squared_error', 
#                                      'huber', 'epsilon_insensitive', 
#                                      'squared_epsilon_insensitive']),
#     'logreg__dual': hp.choice('dual', [True, False]),
#     'logreg__C': hp.loguniform(label='C', low=-4*np.log(10), high=2*np.log(10)),
#     'logreg__solver': hp.choice('solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'])
# }

all_trails = dict()

for model_name, space in spaces.items():
    print(f'Поиск параметров для {model_name}...')
    all_trails[model_name] = Trials()
    best = fmin(
        fn=objectives[model_name], 
        space=spaces[model_name], 
        algo=tpe.suggest, 
        max_evals=40,
        trials=all_trails[model_name]
    )

Поиск параметров для text_clf_v_logreg...
{'logreg__penalty': 'l1', 'logreg__solver': 'saga', 'logreg__class_weight': 'balanced'}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(class_weight='balanced', penalty='l1', solver='saga'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(class_weight='balanced', penalty='l1', solver='saga'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_




{'logreg__penalty': 'l2', 'logreg__solver': 'lbfgs', 'logreg__class_weight': None}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression())], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_weight': None, 'logreg__dual': False, 'logreg__fit_intercept': True, 'logreg__inte

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



{'logreg__penalty': 'l1', 'logreg__solver': 'saga', 'logreg__class_weight': 'balanced'}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(class_weight='balanced', penalty='l1', solver='saga'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(class_weight='balanced', penalty='l1', solver='saga'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logre




{'logreg__penalty': 'l2', 'logreg__solver': 'saga', 'logreg__class_weight': 'balanced'}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(class_weight='balanced', solver='saga'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(class_weight='balanced', solver='saga'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_w




{'logreg__penalty': 'l2', 'logreg__solver': 'newton-cg', 'logreg__class_weight': 'balanced'}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(class_weight='balanced', solver='newton-cg'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(class_weight='balanced', solver='newton-cg'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, '




{'logreg__penalty': 'l2', 'logreg__solver': 'liblinear', 'logreg__class_weight': 'balanced'}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(class_weight='balanced', solver='liblinear'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(class_weight='balanced', solver='liblinear'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, '




{'logreg__penalty': 'l2', 'logreg__solver': 'lbfgs', 'logreg__class_weight': None}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression())], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_weight': None, 'logreg__dual': False, 'logreg__fit_intercept': True, 'logreg__inte

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



{'logreg__penalty': 'l2', 'logreg__solver': 'liblinear', 'logreg__class_weight': None}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(solver='liblinear'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(solver='liblinear'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_weight': None, 'logreg__dual': False, 'log




{'logreg__penalty': 'l2', 'logreg__solver': 'lbfgs', 'logreg__class_weight': None}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression())], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_weight': None, 'logreg__dual': False, 'logreg__fit_intercept': True, 'logreg__inte

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



{'logreg__penalty': None, 'logreg__solver': 'sag', 'logreg__class_weight': None}  
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(penalty=None, solver='sag'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(penalty=None, solver='sag'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_weight': None, 'logreg__dual':




{'logreg__penalty': None, 'logreg__solver': 'sag', 'logreg__class_weight': None}  
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(penalty=None, solver='sag'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(penalty=None, solver='sag'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_weight': None, 'logreg__dual':




{'logreg__penalty': None, 'logreg__solver': 'sag', 'logreg__class_weight': None}  
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(penalty=None, solver='sag'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(penalty=None, solver='sag'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_weight': None, 'logreg__dual':




{'logreg__penalty': None, 'logreg__solver': 'sag', 'logreg__class_weight': None}  
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(penalty=None, solver='sag'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(penalty=None, solver='sag'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_weight': None, 'logreg__dual':




{'logreg__penalty': None, 'logreg__solver': 'lbfgs', 'logreg__class_weight': None}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(penalty=None))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(penalty=None), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_weight': None, 'logreg__dual': False, 'logreg__fit_interce




{'logreg__penalty': 'l1', 'logreg__solver': 'liblinear', 'logreg__class_weight': None}
{'memory': None, 'steps': [('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('logreg', LogisticRegression(penalty='l1', solver='liblinear'))], 'verbose': False, 'vect': CountVectorizer(), 'tfidf': TfidfTransformer(), 'logreg': LogisticRegression(penalty='l1', solver='liblinear'), 'vect__analyzer': 'word', 'vect__binary': False, 'vect__decode_error': 'strict', 'vect__dtype': <class 'numpy.int64'>, 'vect__encoding': 'utf-8', 'vect__input': 'content', 'vect__lowercase': True, 'vect__max_df': 1.0, 'vect__max_features': None, 'vect__min_df': 1, 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__stop_words': None, 'vect__strip_accents': None, 'vect__token_pattern': '(?u)\\b\\w\\w+\\b', 'vect__tokenizer': None, 'vect__vocabulary': None, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': True, 'tfidf__sublinear_tf': False, 'tfidf__use_idf': True, 'logreg__C': 1.0, 'logreg__class_weight': None,

In [27]:
best_model_name = min(all_trails.items(), key=lambda x: float(x[1].best_trial['result']['loss']))[0]
best_model_trails = all_trails[best_model_name]

print('Лучшая модель:', best_model_name)
print('Точность:', -best_model_trails.best_trial['result']['loss'])

Лучшая модель: text_clf_v_logreg
Точность: 0.7699316628701595


In [28]:
best_model_trails.best_trial

{'state': 2,
 'tid': 24,
 'spec': None,
 'result': {'loss': -0.7699316628701595,
  'params': {'logreg__penalty': None,
   'logreg__solver': 'lbfgs',
   'logreg__class_weight': None},
  'status': 'ok'},
 'misc': {'tid': 24,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'class_weight': [24],
   'hyper_param_groups': [24],
   'penalty_block1': [],
   'penalty_block3': [],
   'penalty_block4': [24],
   'solver_block1': [],
   'solver_block3': [],
   'solver_block4': [24]},
  'vals': {'class_weight': [1],
   'hyper_param_groups': [2],
   'penalty_block1': [],
   'penalty_block3': [],
   'penalty_block4': [0],
   'solver_block1': [],
   'solver_block3': [],
   'solver_block4': [0]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2023, 10, 23, 18, 1, 57, 760000),
 'refresh_time': datetime.datetime(2023, 10, 23, 18, 2, 35, 234000)}

In [29]:
# Создание баккита "mlflow"
try:
    s3.create_bucket(Bucket='mlflow')
    print('Баккит "mlflow" создан')
except s3.exceptions.BucketAlreadyOwnedByYou:
    print('Баккит "mlflow" уже существует')

Баккит "mlflow" уже существует


In [30]:
# Настройка клиента boto3
print('Настройка клиента boto3...')
boto3.setup_default_session(
    aws_access_key_id='minio',
    aws_secret_access_key='minio123',
    region_name='us-west-1'  # или другой регион, если это применимо
)

Настройка клиента boto3...


In [31]:
description = 'Pipeline(steps=' + str([type(s[1]).__name__ for s in models[best_model_name].steps]) + ')'
print(description)

Pipeline(steps=['CountVectorizer', 'TfidfTransformer', 'LogisticRegression'])


In [32]:
print('Логирование в MLflow...')
description = 'Pipeline(steps=' + str([type(s[1]).__name__ for s in models[best_model_name].steps]) + ')'
with mlflow.start_run() as run:
    # Логирование параметров и метрик
    mlflow.log_param("model_type", description)
    mlflow.log_metric("accuracy", -best_model_trails.best_trial['result']['loss'])
    
    # Логирование модели
    mlflow.sklearn.log_model(models[best_model_name], "model", registered_model_name="MyOptimizedModel")

Логирование в MLflow...


Registered model 'MyOptimizedModel' already exists. Creating a new version of this model...
2023/10/23 21:20:12 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: MyOptimizedModel, version 2
Created version '2' of model 'MyOptimizedModel'.


# Запуск Mlflow

In [33]:
import os

# Установка переменных окружения в Unix-подобных системах (Mac, Linux)
os.system('export MLFLOW_TRACKING_URI=http://localhost:5000')
os.system('export MLFLOW_S3_ENDPOINT_URL=http://localhost:9000')

# Установка переменных окружения в Windows
os.environ['MLFLOW_TRACKING_URI'] = 'http://localhost:5000'
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://localhost:9000'
os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'

os.system('mlflow run . --experiment-name=kinopoisk')

2023/10/23 21:20:54 INFO mlflow.utils.conda: === Creating conda environment mlflow-0b5efcd68adffd1ec96421414cd352b3494f6f6b ===


Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... failed



ResolvePackageNotFound: 
  - hyperopt

Traceback (most recent call last):
  File "/root/anaconda3/bin/mlflow", line 8, in <module>
    sys.exit(cli())
             ^^^^^
  File "/root/anaconda3/lib/python3.11/site-packages/click/core.py", line 1128, in __call__
    return self.main(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/anaconda3/lib/python3.11/site-packages/click/core.py", line 1053, in main
    rv = self.invoke(ctx)
         ^^^^^^^^^^^^^^^^
  File "/root/anaconda3/lib/python3.11/site-packages/click/core.py", line 1659, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/anaconda3/lib/python3.11/site-packages/click/core.py", line 1395, in invoke
    return ctx.invoke(self.callback, **ctx.params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/anaconda3/lib/python3.11/site-packages/click/core.py", line 754, in invoke
    return __callback(*args, **

256