# Описание проекта.


***
Новый сервис предоставляет пользователям возможность редактировать и дополнять описания товаров в интернет-магазине, аналогично вики-сообществам. Этот инструмент позволяет клиентам предлагать правки и комментировать изменения других пользователей. Однако, для обеспечения безопасности и качества контента, магазин нуждается в инструменте, способном автоматически выявлять токсичные комментарии и отправлять их на модерацию.
***

**Цель проекта** — обучить модель, способную классифицировать комментарии на позитивные и негативные. В распоряжении имеется набор данных с разметкой о токсичности правок. Задача заключается в построении модели с уровнем метрики качества F1 не менее 0.75.






!['Схема пайплайна'](https://raw.githubusercontent.com/htoniy/DS_NLP/main/01_Negative_comments_identifying/Project%20Algorithm.png)

# Импорт данных.

**Установка библиотек, необходимых для корректной равботы кода.**

Для работы ячейки необходимо ее перевести в формат кода.

!pip install nltk
!pip install spacy
!pip install imblearn
!pip install catboost
!pip install lightgbm
!pip install transformers

**Импорт библиотек.**

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords

import spacy
from spacy.lang.en.examples import sentences 

import warnings
import seaborn as sns
from tqdm import notebook
from tqdm import tqdm
from pprint import pprint
import time
import re

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import SMOTENC

from catboost import Pool, CatBoostClassifier
from catboost.text_processing import Tokenizer

from lightgbm import LGBMClassifier
import lightgbm as lgb

import transformers 

In [2]:
from transformers import BertTokenizer

In [3]:
sns.set_style('darkgrid')
nltk.download('punkt')
nltk.download('wordnet')
warnings.filterwarnings('ignore', category=RuntimeWarning)
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

np.random.seed(42)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/maksimgorskov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/maksimgorskov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maksimgorskov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
twit_df = pd.read_csv('./toxic_comments.csv', index_col = 0)

In [5]:
twit_df.head(5)

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


# Предобработка данных.

## Общий анализ.

In [6]:
twit_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159292 entries, 0 to 159450
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159292 non-null  object
 1   toxic   159292 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.6+ MB


In [7]:
#Анализ уникальных значений целевого признака.
twit_df['toxic'].unique()

array([0, 1])

In [8]:
#Проверка на наличие дубликатов.
if twit_df.duplicated().sum() > 0:
    print('В данных присутствуют дубликаты. Необходима проверка.')
else:
    print('Дубликаты отсутствуют.')

Дубликаты отсутствуют.


In [9]:
#Для удобства текст переводится в нижний регистр.
twit_df['text'] = twit_df['text'].str.lower()

## Леммитизация текста.

В данном разделе проводится очистка текста от стоп слов и лишних символов. Помимо этого, слова текстов приводятся к леммам.

In [10]:
#Функция для чистки текста от лишних символов.
def text_preprocessing(text):
    tokenized = nltk.word_tokenize(text)
    joined = ' '.join(tokenized)
    text_only = re.sub(r"[^a-z0-9!@#\$%\^\&\*_\-,\.' ]", ' ', joined)
    final = ' '.join(text_only.split())
    return final

In [11]:
#Применение функции очистки к фрему данных.
tqdm.pandas()
twit_df['lemm_text'] = twit_df['text'].progress_apply(text_preprocessing)

100%|█████████████████████████████████| 159292/159292 [00:48<00:00, 3291.85it/s]


In [12]:
twit_df.head(3)

Unnamed: 0,text,toxic,lemm_text
0,explanation\nwhy the edits made under my usern...,0,explanation why the edits made under my userna...
1,d'aww! he matches this background colour i'm s...,0,d'aww ! he matches this background colour i 'm...
2,"hey man, i'm really not trying to edit war. it...",0,"hey man , i 'm really not trying to edit war ...."


In [13]:
#Инициализация модели NLP
nlp = spacy.load('en_core_web_sm')
#, exclude=['parser', 'attribute_ruler', 'ner']

In [14]:
#Функция для лемматизации текстов.
def lemmatize(text):
    doc = nlp(text)
    words = []
    for token in doc:
        if (token.is_stop != True) and (token.is_punct != True) and\
            (token.is_space != True) and (token.is_digit != True):
            words.append(token.lemma_)
    return ' '.join(words)

In [15]:
tqdm.pandas()
twit_df['lemm_text'] = twit_df['lemm_text'].progress_apply(lemmatize)

100%|██████████████████████████████████| 159292/159292 [25:15<00:00, 105.08it/s]


In [16]:
twit_df.head(3)

Unnamed: 0,text,toxic,lemm_text
0,explanation\nwhy the edits made under my usern...,0,explanation edit username hardcore metallica f...
1,d'aww! he matches this background colour i'm s...,0,d'aww match background colour m seemingly stuc...
2,"hey man, i'm really not trying to edit war. it...",0,hey man m try edit war guy constantly remove r...


Текст Лемматизирован, лишние символы удалены.

## Вывод:
 - Пропуски в наборе данных отсутствуют.
 - Дубликаты в данных отсутствуют.
 - Текстовые данные приведены к нижнему регистру.
 - В числовых параметрах все значения соответствуют ожидаемым.
 - Дальнейшая предобработка данных не нужна
 - Лемматизированный текст добавлен в датасет.

# Обучение моделей классификации.

**План обучения:**
- Подготовка признаков.
- Обучение ансамбля Логистической регрессии, Случайного леса и SGD моделей c upnsampling.
- Обучение ансамбля Логистической регрессии, Случайного леса и SGD моделей без upsampling.
- Обучение модели SGD без downsampling c GS и Pipeline.
- Ансамбль на базе Случайного леса, Особо случайного леса, Логистической регресии и MLPClassifier.
- Стэккинг на базе случайного леса.
- Обучение модели BERT (DistilBertModel).

Для вы бора наилучшей модели, будут использоваться F1 мера, а также скорость обучения модели.

In [17]:
#Датафрейм для сравнения результатов различных моделей определения токсичных комментариев.
data_analis = {'ML model' : [],
              'F1_score' : [],
              'Learning time':[]}
result_data = pd.DataFrame(data_analis)
result_data

Unnamed: 0,ML model,F1_score,Learning time


## Подготовка признаков.

### Разбиение на тренировочную и тестовую выборки.

Определение целевого признака и обучающий текст.

In [18]:
X = twit_df.drop(['toxic', 'text'], axis = 1)
y = twit_df['toxic']

Разбиение на тестовую, валидационную и тренировочную выборки. Размер тестовой выборки = 10%, размер валидационной выборки - 10%.

In [19]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.2, random_state = 12345)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state = 12345)

In [20]:
X_train.shape

(127433, 1)

### Анализ дисбаланса классов.


In [21]:
twit_df['toxic'].value_counts(normalize=True).to_frame()

Unnamed: 0,toxic
0,0.898388
1,0.101612


Наблюдается сильный дисбаланс классов. Токсичных комментариев в датасете наблюдается всего лишь 10% от общего количества.

Далее в коде создается функция для устранения дисбаланся с помощью библиотеки **imblearn**. Применение функции позволит исключить влияние дисбаланса классов при обучении моделей.

In [22]:
#X - признаки, y - целевая переменная.
def ros_data(X, y):
    ros = RandomOverSampler()
    X_res, y_res =  ros.fit_resample(X, y)
    return X_res, y_res

Устранение дисбаланса в обучающей выборке.

In [23]:
X_res, y_res = ros_data(X_train, y_train)

###  Добавление TF-IDF.

Добавление признаков в тренировочную выборку.

In [24]:
corpus = X_res['lemm_text'].values

In [25]:
count_tf_idf = TfidfVectorizer(stop_words=stopwords) 
tf_idf_train_res = count_tf_idf.fit_transform(corpus) 
print(f'Размер матрицы для обучения: {tf_idf_train_res.shape}')

Размер матрицы для обучения: (228954, 143603)


Добавление признаков в валидационную выборку.

In [26]:
corpus_valid = X_valid['lemm_text'].values
# Преобразование тестовых данных в матрицу TF-IDF с использованием уже обученного векторизатора
tf_idf_valid_res = count_tf_idf.transform(corpus_valid) 
print(f'Размер матрицы для тестирования: {tf_idf_valid_res.shape}')

Размер матрицы для тестирования: (15929, 143603)


### Вывод.

В ходе подготовки признаков выполнены следующие шаги:
- Произведена разбивка на тестовую и обучающую выборки. 
- Сформированы дополнительыне признаки с помощью TF-IDF.
- Устранен дисбаланс классов.


## Обучение моделей классификации.

В качестве базисной модели принимается ансамбль моделей Логистической регрессии, Случайного леса и SGD. Также на жтой базисной модели проверяется влияние устранения дисбаланса в классах методом RandomOverSampler.


### Ансамбль Логистической регрессии, Случайного леса и SGD моделей c RandomOverSampling.

In [27]:
X_train_ans = tf_idf_train_res
y_train_ans = y_res

**Описание ансамбля моделей.**

Для обучения ансамбля моделей выбирается 3 модели классификации (LogisticRegression, RandomForestClassifier, SGDClassifier).

Используется ансамбль моделей с применением метода голосования. estimators - это список пар (name, estimator), где name - это имя модели, а estimator - сама модель. Параметр voting='soft' означает, что используется мягкое голосование, где каждая модель вносит свой вклад в виде вероятностей.

In [28]:
%%time
start_time = time.time()

log_clf = LogisticRegression(solver="liblinear", random_state = 42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state = 42)
sgb_clf = SGDClassifier(max_iter = 1000, random_state = 42, loss='modified_huber')

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('sgb', sgb_clf)],
    voting='soft')

X_valid_ans = tf_idf_valid_res

voting_clf.fit(X_train_ans, y_train_ans)
predict = voting_clf.predict(X_valid_ans)
f_score = f1_score(predict, y_valid)

end_time = time.time()
model_1_time = round(end_time - start_time, 0)
print('{}'.format(f_score))

0.7865982493208573
CPU times: user 55.7 s, sys: 2.9 s, total: 58.6 s
Wall time: 46.2 s


**Внесение результатов модели в сводную таблицу.**

In [29]:
new_result = {'ML model' : 'Ансамбль Логистической регрессии, Случайного леса и SGD моделей c RandomOverSampling.',
              'F1_score' : f_score,
              'Learning time':model_1_time}

result_data = result_data.append(new_result, ignore_index=True)
result_data

  result_data = result_data.append(new_result, ignore_index=True)


Unnamed: 0,ML model,F1_score,Learning time
0,"Ансамбль Логистической регрессии, Случайного л...",0.786598,46.0


### Ансамбль Логистической регрессии, Случайного леса и SGD моделей без RandomOverSampling.

In [30]:
corpus = X_train['lemm_text'].values
count_tf_idf = TfidfVectorizer(stop_words=stopwords) 
tf_idf_train = count_tf_idf.fit_transform(corpus) 
print(f'Размер матрицы для обучения: {tf_idf_train.shape}')

Размер матрицы для обучения: (127433, 143603)


In [31]:
X_train_ans = tf_idf_train
y_train_ans = y_train

In [32]:
corpus_test = X_valid['lemm_text'].values
# Преобразование тестовых данных в матрицу TF-IDF с использованием уже обученного векторизатора
tf_idf_valid = count_tf_idf.transform(corpus_test) 
print(f'Размер матрицы для тестирования: {tf_idf_valid.shape}')

Размер матрицы для тестирования: (15929, 143603)


In [33]:
%%time
start_time = time.time()

log_clf = LogisticRegression(solver="liblinear", random_state = 42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state = 42)
sgb_clf = SGDClassifier(max_iter = 1000, random_state = 42, loss='modified_huber')

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('sgb', sgb_clf)],
    voting='soft')

X_valid_ans = tf_idf_valid

voting_clf.fit(X_train_ans, y_train_ans)
predict = voting_clf.predict(X_valid_ans)
f_score = f1_score(predict, y_valid)

end_time = time.time()
model_2_time = round(end_time - start_time, 0)
print('{}'.format(f_score))

0.7533632286995514
CPU times: user 28.6 s, sys: 2.01 s, total: 30.6 s
Wall time: 23.1 s


In [34]:
new_result = {'ML model' : 'Ансамбль Логистической регрессии, Случайного леса и SGD моделей без RandomOverSampling.',
              'F1_score' : f_score,
              'Learning time':model_2_time}

result_data = result_data.append(new_result, ignore_index=True)
result_data

  result_data = result_data.append(new_result, ignore_index=True)


Unnamed: 0,ML model,F1_score,Learning time
0,"Ансамбль Логистической регрессии, Случайного л...",0.786598,46.0
1,"Ансамбль Логистической регрессии, Случайного л...",0.753363,23.0


**Вывод.**

Устранение дисбаланса классов улучшает метрику на 5%. В последующих моделях будут использованы данные с устраненным дисбалансом классов.

### LogsticalRegressor

In [35]:
X_train_ans = tf_idf_train_res
y_train_ans = y_res
X_valid_ans = tf_idf_valid_res

In [36]:
%%time
start_time = time.time()

# Инициализация модели
lin_reg = LogisticRegression(solver = 'liblinear', max_iter =100, random_state=42, C = 15)

# Обучение на тренировочных данных
lin_reg.fit(X_train_ans, y_train_ans)

#Предсказание на валидационной выборке
predict = lin_reg.predict(X_valid_ans)

f_score = f1_score(predict, y_valid)
f_score

end_time = time.time()
model_3_time = round(end_time - start_time, 0)

print('{}'.format(f_score))

0.7616519174041296
CPU times: user 16.1 s, sys: 4.41 s, total: 20.5 s
Wall time: 2.19 s


In [37]:
new_result = {'ML model' : 'logisticalRegressor',
              'F1_score' : f_score,
              'Learning time':model_3_time}

result_data = result_data.append(new_result, ignore_index=True)
result_data

  result_data = result_data.append(new_result, ignore_index=True)


Unnamed: 0,ML model,F1_score,Learning time
0,"Ансамбль Логистической регрессии, Случайного л...",0.786598,46.0
1,"Ансамбль Логистической регрессии, Случайного л...",0.753363,23.0
2,logisticalRegressor,0.761652,2.0


### CatBoostClassifier.

**Примечания:**

Подбор гиперпараметров с помощью GreedSearch занял более 11 часов, поэтому для оптимизации работы тетрадки при ее перезапуске были оставлены только подобранные гиперпараметры.

In [51]:
def fit_model(train_pool, test_pool, **kwargs):
    model = CatBoostClassifier(
        iterations=2500,
        eval_metric='Accuracy',
        od_type='Iter',
        od_wait=500,
        l2_leaf_reg = 2,
        border_count = 32,
        depth = 5,
        random_state=42,
        boosting_type='Plain',
        **kwargs
    )
    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=500,
        plot=True,
        use_best_model=True
    )

In [47]:
#Создание объекта Pool для обучения
train_pool = Pool(
    data=X_res,
    label=y_res,
    text_features=[0]  
)

In [48]:
#Создание объекта Pool для теста
valid_pool = Pool(
    data=X_valid,
    label=y_valid,
    text_features=[0] 
)

In [52]:
%%time
start_time = time.time()

model_cbc = fit_model(
    train_pool, valid_pool,
    learning_rate=0.1,
    tokenizers=[
        {
            'tokenizer_id': 'Sense',
            'separator_type': 'BySense',
            'lowercasing': 'True',
            'token_types':['Word', 'Number', 'SentenceBreak'],
            'sub_tokens_policy':'SeveralTokens'
        }      
    ],
    dictionaries = [
        {
            'dictionary_id': 'Word',
            'max_dictionary_size': '50000'
        }
    ],
    feature_calcers = [
        'BoW:top_tokens_count=10000'
    ]
)

end_time = time.time()
model_4_time = round(end_time - start_time, 0)

print('{}'.format(f_score))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6597745	test: 0.9306297	best: 0.9306297 (0)	total: 192ms	remaining: 8m
500:	learn: 0.9110345	test: 0.9429343	best: 0.9429343 (497)	total: 1m 31s	remaining: 6m 6s
1000:	learn: 0.9363060	test: 0.9455082	best: 0.9455082 (986)	total: 3m 4s	remaining: 4m 36s
1500:	learn: 0.9529294	test: 0.9476427	best: 0.9477054 (1496)	total: 4m 37s	remaining: 3m 4s
2000:	learn: 0.9632983	test: 0.9492749	best: 0.9494005 (1958)	total: 6m 11s	remaining: 1m 32s
2499:	learn: 0.9686793	test: 0.9505933	best: 0.9507188 (2341)	total: 7m 45s	remaining: 0us

bestTest = 0.9507188147
bestIteration = 2341

Shrink model to first 2342 iterations.
0.7893192488262911
CPU times: user 54min 10s, sys: 2min 52s, total: 57min 2s
Wall time: 7min 49s


In [53]:
#Test data predictions
predicted_valid = model.predict(X_valid)
f_score = f1_score(predicted_valid, y_valid)
f_score

0.7820250948351328

In [54]:
new_result = {'ML model' : 'CatBoostClassifier.',
              'F1_score' : f_score,
              'Learning time':model_4_time}

result_data = result_data.append(new_result, ignore_index=True)
result_data

  result_data = result_data.append(new_result, ignore_index=True)


Unnamed: 0,ML model,F1_score,Learning time
0,"Ансамбль Логистической регрессии, Случайного л...",0.786598,46.0
1,"Ансамбль Логистической регрессии, Случайного л...",0.753363,23.0
2,logisticalRegressor,0.761652,2.0
3,CatBoostClassifier.,0.782025,470.0


### Light GBM.

Модель представляет собой классификатор LGBMClassifier с подобранными гиперпараметрами.


In [55]:
%%time
start_time = time.time()

# Инициализация модели
lgb_model = lgb.LGBMClassifier(learning_rate = 0.1, 
                               n_estimators = 1000, 
                               max_depth = 7, 
                               num_leaves = 62,
                               random_state = 42)



# Обучение на тренировочных данных
lgb_model.fit(X_train_ans, y_train_ans, verbose = 0)

#Предсказание на тестовых данных
predict = lgb_model.predict(X_valid_ans)

f_score = f1_score(predict, y_valid)
f_score

end_time = time.time()
model_5_time = round(end_time - start_time, 0)

print('{}'.format(f_score))

0.7797513321492007
CPU times: user 4min 18s, sys: 34.2 s, total: 4min 52s
Wall time: 31.6 s


Добавление результатов в сводную таблицу.

In [56]:
new_result = {'ML model' : 'LGBMClassifier.',
              'F1_score' : f_score,
              'Learning time':model_5_time}

result_data = result_data.append(new_result, ignore_index=True)
result_data

  result_data = result_data.append(new_result, ignore_index=True)


Unnamed: 0,ML model,F1_score,Learning time
0,"Ансамбль Логистической регрессии, Случайного л...",0.786598,46.0
1,"Ансамбль Логистической регрессии, Случайного л...",0.753363,23.0
2,logisticalRegressor,0.761652,2.0
3,CatBoostClassifier.,0.782025,470.0
4,LGBMClassifier.,0.779751,32.0


### Ансамбль LinearRegressor, LightGBMClassifier, CatBoostClassifier.

Исходя из сводной таблицы метрика F1 у моделей CatBoost, LightGBM и LinearRegressor приблизительно одинаковая. Ниже эти модели объединяются в ансамбль с мягким голосованием.

In [58]:
%%time
start_time = time.time()

model_cbc = fit_model(
    train_pool, valid_pool,
    learning_rate=0.1,
    tokenizers=[
        {
            'tokenizer_id': 'Sense',
            'separator_type': 'BySense',
            'lowercasing': 'True',
            'token_types':['Word', 'Number', 'SentenceBreak'],
            'sub_tokens_policy':'SeveralTokens'
        }      
    ],
    dictionaries = [
        {
            'dictionary_id': 'Word',
            'max_dictionary_size': '50000'
        }
    ],
    feature_calcers = [
        'BoW:top_tokens_count=10000'
    ]
)

lgbm_clf = lgb.LGBMClassifier(learning_rate = 0.1, 
                               n_estimators = 1000, 
                               max_depth = 7, 
                               num_leaves = 62,
                               random_state = 42,
                                verbosity=-1
)
lin_reg = LogisticRegression(solver = 'liblinear', 
                             max_iter =100, 
                             random_state=42, C = 15
)

voting_clf = VotingClassifier(
    estimators=[('cb', model_cbc), ('lgbm', lgbm_clf), ('lin', lin_reg)],
    voting='soft')

X_valid_ans = tf_idf_valid_res

voting_clf.fit(X_train_ans, y_train_ans)
predict = voting_clf.predict(X_valid_ans)
f_score = f1_score(predict, y_valid)

end_time = time.time()
model_6_time = round(end_time - start_time, 0)
print('{}'.format(f_score))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6597745	test: 0.9306297	best: 0.9306297 (0)	total: 190ms	remaining: 7m 55s
500:	learn: 0.9110345	test: 0.9429343	best: 0.9429343 (497)	total: 1m 31s	remaining: 6m 6s
1000:	learn: 0.9363060	test: 0.9455082	best: 0.9455082 (986)	total: 3m 4s	remaining: 4m 35s
1500:	learn: 0.9529294	test: 0.9476427	best: 0.9477054 (1496)	total: 4m 37s	remaining: 3m 4s
2000:	learn: 0.9632983	test: 0.9492749	best: 0.9494005 (1958)	total: 6m 10s	remaining: 1m 32s
2499:	learn: 0.9686793	test: 0.9505933	best: 0.9507188 (2341)	total: 7m 45s	remaining: 0us

bestTest = 0.9507188147
bestIteration = 2341

Shrink model to first 2342 iterations.
0:	learn: 0.6399888	total: 302ms	remaining: 12m 33s
1:	learn: 0.6786341	total: 565ms	remaining: 11m 45s
2:	learn: 0.7057706	total: 817ms	remaining: 11m 19s
3:	learn: 0.7073517	total: 1.07s	remaining: 11m 6s
4:	learn: 0.7091905	total: 1.3s	remaining: 10m 47s
5:	learn: 0.7174105	total: 1.51s	remaining: 10m 28s
6:	learn: 0.7328939	total: 1.73s	remaining: 10m 14s
7:	l

146:	learn: 0.8741145	total: 33s	remaining: 8m 48s
147:	learn: 0.8741669	total: 33.2s	remaining: 8m 47s
148:	learn: 0.8744377	total: 33.5s	remaining: 8m 47s
149:	learn: 0.8746560	total: 33.7s	remaining: 8m 47s
150:	learn: 0.8752894	total: 33.9s	remaining: 8m 47s
151:	learn: 0.8755864	total: 34.1s	remaining: 8m 47s
152:	learn: 0.8757480	total: 34.3s	remaining: 8m 46s
153:	learn: 0.8756300	total: 34.6s	remaining: 8m 46s
154:	learn: 0.8765123	total: 34.8s	remaining: 8m 46s
155:	learn: 0.8769666	total: 35s	remaining: 8m 46s
156:	learn: 0.8771151	total: 35.2s	remaining: 8m 45s
157:	learn: 0.8775300	total: 35.4s	remaining: 8m 45s
158:	learn: 0.8773203	total: 35.7s	remaining: 8m 45s
159:	learn: 0.8776042	total: 35.9s	remaining: 8m 44s
160:	learn: 0.8777877	total: 36.1s	remaining: 8m 44s
161:	learn: 0.8783904	total: 36.3s	remaining: 8m 44s
162:	learn: 0.8787136	total: 36.6s	remaining: 8m 44s
163:	learn: 0.8790237	total: 36.8s	remaining: 8m 43s
164:	learn: 0.8792465	total: 37s	remaining: 8m 43s

302:	learn: 0.9008709	total: 1m 7s	remaining: 8m 7s
303:	learn: 0.9012029	total: 1m 7s	remaining: 8m 7s
304:	learn: 0.9013164	total: 1m 7s	remaining: 8m 7s
305:	learn: 0.9014562	total: 1m 7s	remaining: 8m 6s
306:	learn: 0.9016309	total: 1m 8s	remaining: 8m 6s
307:	learn: 0.9017707	total: 1m 8s	remaining: 8m 6s
308:	learn: 0.9018536	total: 1m 8s	remaining: 8m 6s
309:	learn: 0.9017794	total: 1m 8s	remaining: 8m 5s
310:	learn: 0.9019847	total: 1m 9s	remaining: 8m 5s
311:	learn: 0.9020153	total: 1m 9s	remaining: 8m 5s
312:	learn: 0.9020982	total: 1m 9s	remaining: 8m 5s
313:	learn: 0.9022424	total: 1m 9s	remaining: 8m 4s
314:	learn: 0.9025263	total: 1m 9s	remaining: 8m 4s
315:	learn: 0.9024782	total: 1m 10s	remaining: 8m 4s
316:	learn: 0.9023952	total: 1m 10s	remaining: 8m 4s
317:	learn: 0.9026049	total: 1m 10s	remaining: 8m 3s
318:	learn: 0.9025350	total: 1m 10s	remaining: 8m 3s
319:	learn: 0.9027359	total: 1m 10s	remaining: 8m 3s
320:	learn: 0.9028975	total: 1m 11s	remaining: 8m 3s
321:	l

455:	learn: 0.9174812	total: 1m 40s	remaining: 7m 30s
456:	learn: 0.9176297	total: 1m 40s	remaining: 7m 30s
457:	learn: 0.9176734	total: 1m 41s	remaining: 7m 30s
458:	learn: 0.9177389	total: 1m 41s	remaining: 7m 30s
459:	learn: 0.9178044	total: 1m 41s	remaining: 7m 29s
460:	learn: 0.9176996	total: 1m 41s	remaining: 7m 29s
461:	learn: 0.9176166	total: 1m 41s	remaining: 7m 29s
462:	learn: 0.9175992	total: 1m 42s	remaining: 7m 29s
463:	learn: 0.9178044	total: 1m 42s	remaining: 7m 28s
464:	learn: 0.9179442	total: 1m 42s	remaining: 7m 28s
465:	learn: 0.9181451	total: 1m 42s	remaining: 7m 28s
466:	learn: 0.9182762	total: 1m 42s	remaining: 7m 28s
467:	learn: 0.9184727	total: 1m 43s	remaining: 7m 28s
468:	learn: 0.9184683	total: 1m 43s	remaining: 7m 27s
469:	learn: 0.9185076	total: 1m 43s	remaining: 7m 27s
470:	learn: 0.9186387	total: 1m 43s	remaining: 7m 27s
471:	learn: 0.9186125	total: 1m 44s	remaining: 7m 27s
472:	learn: 0.9186256	total: 1m 44s	remaining: 7m 26s
473:	learn: 0.9185994	total:

609:	learn: 0.9286057	total: 2m 14s	remaining: 6m 55s
610:	learn: 0.9287630	total: 2m 14s	remaining: 6m 55s
611:	learn: 0.9288416	total: 2m 14s	remaining: 6m 55s
612:	learn: 0.9288722	total: 2m 14s	remaining: 6m 54s
613:	learn: 0.9289595	total: 2m 14s	remaining: 6m 54s
614:	learn: 0.9289508	total: 2m 15s	remaining: 6m 54s
615:	learn: 0.9289071	total: 2m 15s	remaining: 6m 54s
616:	learn: 0.9290076	total: 2m 15s	remaining: 6m 53s
617:	learn: 0.9291211	total: 2m 15s	remaining: 6m 53s
618:	learn: 0.9291168	total: 2m 16s	remaining: 6m 53s
619:	learn: 0.9291954	total: 2m 16s	remaining: 6m 53s
620:	learn: 0.9292303	total: 2m 16s	remaining: 6m 53s
621:	learn: 0.9292653	total: 2m 16s	remaining: 6m 52s
622:	learn: 0.9294618	total: 2m 16s	remaining: 6m 52s
623:	learn: 0.9295492	total: 2m 17s	remaining: 6m 52s
624:	learn: 0.9296147	total: 2m 17s	remaining: 6m 52s
625:	learn: 0.9298243	total: 2m 17s	remaining: 6m 52s
626:	learn: 0.9300296	total: 2m 17s	remaining: 6m 51s
627:	learn: 0.9299073	total:

761:	learn: 0.9394507	total: 2m 46s	remaining: 6m 20s
762:	learn: 0.9396473	total: 2m 47s	remaining: 6m 20s
763:	learn: 0.9399224	total: 2m 47s	remaining: 6m 20s
764:	learn: 0.9399617	total: 2m 47s	remaining: 6m 20s
765:	learn: 0.9398875	total: 2m 47s	remaining: 6m 19s
766:	learn: 0.9399224	total: 2m 48s	remaining: 6m 19s
767:	learn: 0.9399923	total: 2m 48s	remaining: 6m 19s
768:	learn: 0.9400404	total: 2m 48s	remaining: 6m 19s
769:	learn: 0.9401583	total: 2m 48s	remaining: 6m 19s
770:	learn: 0.9402631	total: 2m 48s	remaining: 6m 18s
771:	learn: 0.9401801	total: 2m 49s	remaining: 6m 18s
772:	learn: 0.9402849	total: 2m 49s	remaining: 6m 18s
773:	learn: 0.9402456	total: 2m 49s	remaining: 6m 18s
774:	learn: 0.9402413	total: 2m 49s	remaining: 6m 17s
775:	learn: 0.9403548	total: 2m 50s	remaining: 6m 17s
776:	learn: 0.9402500	total: 2m 50s	remaining: 6m 17s
777:	learn: 0.9402675	total: 2m 50s	remaining: 6m 17s
778:	learn: 0.9404072	total: 2m 50s	remaining: 6m 17s
779:	learn: 0.9404466	total:

915:	learn: 0.9482560	total: 3m 20s	remaining: 5m 46s
916:	learn: 0.9483608	total: 3m 20s	remaining: 5m 45s
917:	learn: 0.9483433	total: 3m 20s	remaining: 5m 45s
918:	learn: 0.9485006	total: 3m 20s	remaining: 5m 45s
919:	learn: 0.9486010	total: 3m 21s	remaining: 5m 45s
920:	learn: 0.9485617	total: 3m 21s	remaining: 5m 45s
921:	learn: 0.9485792	total: 3m 21s	remaining: 5m 44s
922:	learn: 0.9486534	total: 3m 21s	remaining: 5m 44s
923:	learn: 0.9486622	total: 3m 21s	remaining: 5m 44s
924:	learn: 0.9487670	total: 3m 22s	remaining: 5m 44s
925:	learn: 0.9489068	total: 3m 22s	remaining: 5m 43s
926:	learn: 0.9489635	total: 3m 22s	remaining: 5m 43s
927:	learn: 0.9489635	total: 3m 22s	remaining: 5m 43s
928:	learn: 0.9490291	total: 3m 22s	remaining: 5m 43s
929:	learn: 0.9491033	total: 3m 23s	remaining: 5m 43s
930:	learn: 0.9489723	total: 3m 23s	remaining: 5m 42s
931:	learn: 0.9490727	total: 3m 23s	remaining: 5m 42s
932:	learn: 0.9491514	total: 3m 23s	remaining: 5m 42s
933:	learn: 0.9491645	total:

1066:	learn: 0.9560174	total: 3m 52s	remaining: 5m 12s
1067:	learn: 0.9560960	total: 3m 52s	remaining: 5m 12s
1068:	learn: 0.9561309	total: 3m 53s	remaining: 5m 12s
1069:	learn: 0.9561571	total: 3m 53s	remaining: 5m 11s
1070:	learn: 0.9561659	total: 3m 53s	remaining: 5m 11s
1071:	learn: 0.9561964	total: 3m 53s	remaining: 5m 11s
1072:	learn: 0.9562838	total: 3m 54s	remaining: 5m 11s
1073:	learn: 0.9562838	total: 3m 54s	remaining: 5m 11s
1074:	learn: 0.9563493	total: 3m 54s	remaining: 5m 10s
1075:	learn: 0.9563493	total: 3m 54s	remaining: 5m 10s
1076:	learn: 0.9563318	total: 3m 54s	remaining: 5m 10s
1077:	learn: 0.9563974	total: 3m 55s	remaining: 5m 10s
1078:	learn: 0.9564585	total: 3m 55s	remaining: 5m 9s
1079:	learn: 0.9565109	total: 3m 55s	remaining: 5m 9s
1080:	learn: 0.9565590	total: 3m 55s	remaining: 5m 9s
1081:	learn: 0.9565153	total: 3m 55s	remaining: 5m 9s
1082:	learn: 0.9565939	total: 3m 56s	remaining: 5m 9s
1083:	learn: 0.9565633	total: 3m 56s	remaining: 5m 8s
1084:	learn: 0.9

1218:	learn: 0.9626737	total: 4m 25s	remaining: 4m 39s
1219:	learn: 0.9627087	total: 4m 26s	remaining: 4m 39s
1220:	learn: 0.9627698	total: 4m 26s	remaining: 4m 39s
1221:	learn: 0.9627917	total: 4m 26s	remaining: 4m 38s
1222:	learn: 0.9628965	total: 4m 26s	remaining: 4m 38s
1223:	learn: 0.9629664	total: 4m 27s	remaining: 4m 38s
1224:	learn: 0.9628572	total: 4m 27s	remaining: 4m 38s
1225:	learn: 0.9628965	total: 4m 27s	remaining: 4m 37s
1226:	learn: 0.9628572	total: 4m 27s	remaining: 4m 37s
1227:	learn: 0.9630013	total: 4m 27s	remaining: 4m 37s
1228:	learn: 0.9630013	total: 4m 28s	remaining: 4m 37s
1229:	learn: 0.9630188	total: 4m 28s	remaining: 4m 37s
1230:	learn: 0.9630013	total: 4m 28s	remaining: 4m 36s
1231:	learn: 0.9629707	total: 4m 28s	remaining: 4m 36s
1232:	learn: 0.9629751	total: 4m 28s	remaining: 4m 36s
1233:	learn: 0.9630450	total: 4m 29s	remaining: 4m 36s
1234:	learn: 0.9630887	total: 4m 29s	remaining: 4m 35s
1235:	learn: 0.9631018	total: 4m 29s	remaining: 4m 35s
1236:	lear

1368:	learn: 0.9684216	total: 4m 58s	remaining: 4m 6s
1369:	learn: 0.9684216	total: 4m 58s	remaining: 4m 6s
1370:	learn: 0.9685046	total: 4m 58s	remaining: 4m 6s
1371:	learn: 0.9685046	total: 4m 58s	remaining: 4m 5s
1372:	learn: 0.9686487	total: 4m 59s	remaining: 4m 5s
1373:	learn: 0.9687273	total: 4m 59s	remaining: 4m 5s
1374:	learn: 0.9686662	total: 4m 59s	remaining: 4m 5s
1375:	learn: 0.9686618	total: 4m 59s	remaining: 4m 4s
1376:	learn: 0.9687404	total: 5m	remaining: 4m 4s
1377:	learn: 0.9687535	total: 5m	remaining: 4m 4s
1378:	learn: 0.9688103	total: 5m	remaining: 4m 4s
1379:	learn: 0.9689195	total: 5m	remaining: 4m 4s
1380:	learn: 0.9689632	total: 5m	remaining: 4m 3s
1381:	learn: 0.9690768	total: 5m 1s	remaining: 4m 3s
1382:	learn: 0.9690855	total: 5m 1s	remaining: 4m 3s
1383:	learn: 0.9690899	total: 5m 1s	remaining: 4m 3s
1384:	learn: 0.9691161	total: 5m 1s	remaining: 4m 2s
1385:	learn: 0.9690287	total: 5m 2s	remaining: 4m 2s
1386:	learn: 0.9690287	total: 5m 2s	remaining: 4m 2s


1519:	learn: 0.9727718	total: 5m 31s	remaining: 3m 33s
1520:	learn: 0.9727980	total: 5m 31s	remaining: 3m 33s
1521:	learn: 0.9727849	total: 5m 31s	remaining: 3m 32s
1522:	learn: 0.9728592	total: 5m 31s	remaining: 3m 32s
1523:	learn: 0.9729334	total: 5m 31s	remaining: 3m 32s
1524:	learn: 0.9729334	total: 5m 32s	remaining: 3m 32s
1525:	learn: 0.9729684	total: 5m 32s	remaining: 3m 32s
1526:	learn: 0.9730732	total: 5m 32s	remaining: 3m 31s
1527:	learn: 0.9731038	total: 5m 32s	remaining: 3m 31s
1528:	learn: 0.9731387	total: 5m 32s	remaining: 3m 31s
1529:	learn: 0.9731212	total: 5m 33s	remaining: 3m 31s
1530:	learn: 0.9731649	total: 5m 33s	remaining: 3m 31s
1531:	learn: 0.9731474	total: 5m 33s	remaining: 3m 30s
1532:	learn: 0.9731474	total: 5m 33s	remaining: 3m 30s
1533:	learn: 0.9731431	total: 5m 34s	remaining: 3m 30s
1534:	learn: 0.9730601	total: 5m 34s	remaining: 3m 30s
1535:	learn: 0.9730688	total: 5m 34s	remaining: 3m 29s
1536:	learn: 0.9730601	total: 5m 34s	remaining: 3m 29s
1537:	lear

1670:	learn: 0.9760258	total: 6m 3s	remaining: 3m
1671:	learn: 0.9760345	total: 6m 3s	remaining: 3m
1672:	learn: 0.9761000	total: 6m 3s	remaining: 2m 59s
1673:	learn: 0.9761262	total: 6m 4s	remaining: 2m 59s
1674:	learn: 0.9760956	total: 6m 4s	remaining: 2m 59s
1675:	learn: 0.9761131	total: 6m 4s	remaining: 2m 59s
1676:	learn: 0.9760869	total: 6m 4s	remaining: 2m 59s
1677:	learn: 0.9761175	total: 6m 5s	remaining: 2m 58s
1678:	learn: 0.9761524	total: 6m 5s	remaining: 2m 58s
1679:	learn: 0.9761524	total: 6m 5s	remaining: 2m 58s
1680:	learn: 0.9761568	total: 6m 5s	remaining: 2m 58s
1681:	learn: 0.9761961	total: 6m 5s	remaining: 2m 57s
1682:	learn: 0.9761655	total: 6m 6s	remaining: 2m 57s
1683:	learn: 0.9762878	total: 6m 6s	remaining: 2m 57s
1684:	learn: 0.9762922	total: 6m 6s	remaining: 2m 57s
1685:	learn: 0.9763752	total: 6m 6s	remaining: 2m 57s
1686:	learn: 0.9763970	total: 6m 7s	remaining: 2m 56s
1687:	learn: 0.9764276	total: 6m 7s	remaining: 2m 56s
1688:	learn: 0.9765062	total: 6m 7s	

1820:	learn: 0.9789608	total: 6m 35s	remaining: 2m 27s
1821:	learn: 0.9789565	total: 6m 36s	remaining: 2m 27s
1822:	learn: 0.9789390	total: 6m 36s	remaining: 2m 27s
1823:	learn: 0.9789739	total: 6m 36s	remaining: 2m 26s
1824:	learn: 0.9789827	total: 6m 36s	remaining: 2m 26s
1825:	learn: 0.9789783	total: 6m 37s	remaining: 2m 26s
1826:	learn: 0.9789914	total: 6m 37s	remaining: 2m 26s
1827:	learn: 0.9789958	total: 6m 37s	remaining: 2m 26s
1828:	learn: 0.9790089	total: 6m 37s	remaining: 2m 25s
1829:	learn: 0.9790875	total: 6m 37s	remaining: 2m 25s
1830:	learn: 0.9791574	total: 6m 38s	remaining: 2m 25s
1831:	learn: 0.9791836	total: 6m 38s	remaining: 2m 25s
1832:	learn: 0.9792054	total: 6m 38s	remaining: 2m 25s
1833:	learn: 0.9791836	total: 6m 38s	remaining: 2m 24s
1834:	learn: 0.9791880	total: 6m 38s	remaining: 2m 24s
1835:	learn: 0.9791923	total: 6m 39s	remaining: 2m 24s
1836:	learn: 0.9791967	total: 6m 39s	remaining: 2m 24s
1837:	learn: 0.9792011	total: 6m 39s	remaining: 2m 23s
1838:	lear

1971:	learn: 0.9807603	total: 7m 8s	remaining: 1m 54s
1972:	learn: 0.9807385	total: 7m 8s	remaining: 1m 54s
1973:	learn: 0.9807385	total: 7m 8s	remaining: 1m 54s
1974:	learn: 0.9807516	total: 7m 9s	remaining: 1m 54s
1975:	learn: 0.9807516	total: 7m 9s	remaining: 1m 53s
1976:	learn: 0.9807560	total: 7m 9s	remaining: 1m 53s
1977:	learn: 0.9807865	total: 7m 9s	remaining: 1m 53s
1978:	learn: 0.9807909	total: 7m 9s	remaining: 1m 53s
1979:	learn: 0.9808914	total: 7m 10s	remaining: 1m 52s
1980:	learn: 0.9808564	total: 7m 10s	remaining: 1m 52s
1981:	learn: 0.9808258	total: 7m 10s	remaining: 1m 52s
1982:	learn: 0.9808346	total: 7m 10s	remaining: 1m 52s
1983:	learn: 0.9808433	total: 7m 11s	remaining: 1m 52s
1984:	learn: 0.9808258	total: 7m 11s	remaining: 1m 51s
1985:	learn: 0.9808258	total: 7m 11s	remaining: 1m 51s
1986:	learn: 0.9808520	total: 7m 11s	remaining: 1m 51s
1987:	learn: 0.9808564	total: 7m 11s	remaining: 1m 51s
1988:	learn: 0.9808171	total: 7m 12s	remaining: 1m 51s
1989:	learn: 0.980

2121:	learn: 0.9825598	total: 7m 40s	remaining: 1m 22s
2122:	learn: 0.9825642	total: 7m 41s	remaining: 1m 21s
2123:	learn: 0.9825642	total: 7m 41s	remaining: 1m 21s
2124:	learn: 0.9825642	total: 7m 41s	remaining: 1m 21s
2125:	learn: 0.9825642	total: 7m 41s	remaining: 1m 21s
2126:	learn: 0.9825729	total: 7m 41s	remaining: 1m 21s
2127:	learn: 0.9825817	total: 7m 42s	remaining: 1m 20s
2128:	learn: 0.9825860	total: 7m 42s	remaining: 1m 20s
2129:	learn: 0.9825423	total: 7m 42s	remaining: 1m 20s
2130:	learn: 0.9825686	total: 7m 42s	remaining: 1m 20s
2131:	learn: 0.9825904	total: 7m 42s	remaining: 1m 19s
2132:	learn: 0.9825948	total: 7m 43s	remaining: 1m 19s
2133:	learn: 0.9825817	total: 7m 43s	remaining: 1m 19s
2134:	learn: 0.9825380	total: 7m 43s	remaining: 1m 19s
2135:	learn: 0.9825642	total: 7m 43s	remaining: 1m 19s
2136:	learn: 0.9825991	total: 7m 44s	remaining: 1m 18s
2137:	learn: 0.9826384	total: 7m 44s	remaining: 1m 18s
2138:	learn: 0.9826079	total: 7m 44s	remaining: 1m 18s
2139:	lear

2274:	learn: 0.9840536	total: 8m 13s	remaining: 48.8s
2275:	learn: 0.9840536	total: 8m 14s	remaining: 48.6s
2276:	learn: 0.9840885	total: 8m 14s	remaining: 48.4s
2277:	learn: 0.9841147	total: 8m 14s	remaining: 48.2s
2278:	learn: 0.9840841	total: 8m 14s	remaining: 48s
2279:	learn: 0.9840623	total: 8m 14s	remaining: 47.8s
2280:	learn: 0.9840274	total: 8m 15s	remaining: 47.5s
2281:	learn: 0.9840274	total: 8m 15s	remaining: 47.3s
2282:	learn: 0.9841060	total: 8m 15s	remaining: 47.1s
2283:	learn: 0.9841191	total: 8m 15s	remaining: 46.9s
2284:	learn: 0.9841366	total: 8m 16s	remaining: 46.7s
2285:	learn: 0.9841060	total: 8m 16s	remaining: 46.5s
2286:	learn: 0.9841016	total: 8m 16s	remaining: 46.2s
2287:	learn: 0.9841453	total: 8m 16s	remaining: 46s
2288:	learn: 0.9841540	total: 8m 16s	remaining: 45.8s
2289:	learn: 0.9841628	total: 8m 17s	remaining: 45.6s
2290:	learn: 0.9841540	total: 8m 17s	remaining: 45.4s
2291:	learn: 0.9841628	total: 8m 17s	remaining: 45.2s
2292:	learn: 0.9841715	total: 8m

2427:	learn: 0.9849227	total: 8m 47s	remaining: 15.6s
2428:	learn: 0.9849577	total: 8m 47s	remaining: 15.4s
2429:	learn: 0.9849577	total: 8m 47s	remaining: 15.2s
2430:	learn: 0.9849664	total: 8m 47s	remaining: 15s
2431:	learn: 0.9849708	total: 8m 47s	remaining: 14.8s
2432:	learn: 0.9849795	total: 8m 48s	remaining: 14.5s
2433:	learn: 0.9849708	total: 8m 48s	remaining: 14.3s
2434:	learn: 0.9849708	total: 8m 48s	remaining: 14.1s
2435:	learn: 0.9849708	total: 8m 48s	remaining: 13.9s
2436:	learn: 0.9849533	total: 8m 48s	remaining: 13.7s
2437:	learn: 0.9849970	total: 8m 49s	remaining: 13.5s
2438:	learn: 0.9849883	total: 8m 49s	remaining: 13.2s
2439:	learn: 0.9850407	total: 8m 49s	remaining: 13s
2440:	learn: 0.9850625	total: 8m 49s	remaining: 12.8s
2441:	learn: 0.9850669	total: 8m 50s	remaining: 12.6s
2442:	learn: 0.9850756	total: 8m 50s	remaining: 12.4s
2443:	learn: 0.9850843	total: 8m 50s	remaining: 12.2s
2444:	learn: 0.9850843	total: 8m 50s	remaining: 11.9s
2445:	learn: 0.9851105	total: 8m

In [59]:
#Добавление результатов в сводную таблицу.
new_result = {'ML model' : 'LogisticalRegressor, LightGBMClassifier, CatBoostClassifier.',
              'F1_score' : f_score,
              'Learning time':model_6_time}

result_data = result_data.append(new_result, ignore_index=True)
result_data

  result_data = result_data.append(new_result, ignore_index=True)


Unnamed: 0,ML model,F1_score,Learning time
0,"Ансамбль Логистической регрессии, Случайного л...",0.786598,46.0
1,"Ансамбль Логистической регрессии, Случайного л...",0.753363,23.0
2,logisticalRegressor,0.761652,2.0
3,CatBoostClassifier.,0.782025,470.0
4,LGBMClassifier.,0.779751,32.0
5,"LogisticalRegressor, LightGBMClassifier, CatBo...",0.804062,1055.0


### Проверка лучшей модели на тестовой выборке.

Преобразуем признаки в тестовой выброки в соответствии с общим пайплайном.

In [60]:
corpus_test = X_test['lemm_text'].values
# Преобразование тестовых данных в матрицу TF-IDF с использованием уже обученного векторизатора
tf_idf_test = count_tf_idf.transform(corpus_test) 
print(f'Размер матрицы для тестирования: {tf_idf_test.shape}')

Размер матрицы для тестирования: (15930, 143603)


In [61]:
X_test_ans = tf_idf_test

predict = voting_clf.predict(X_test_ans)
f_score = f1_score(predict, y_test)
print(f'Метрика F1 на тестовой выборке: {f_score}')

Метрика F1 на тестовой выборке: 0.7876712328767121


# Вывод.

В работе проведены следующие действия:
- Проведена предобработка исходных текстов в составе:
        1) Лемматезация текста (spaCy).
        2) Удаление стоп слов и служеюных символов (spaCy).
- Добавлены дополнительные признаки по методу TF-IDF (Transform).
- Устранен дисбаланс классов методом upsampling. (imblearn)
- Обучены ряд моделей, с выбором лучших из них, с обучением ансамбля моделей из лучших. Обучены следующие модели:
        1) LogisticalRegressor.
        2) CatBoostClassifier.
        3) LightGBM.
        4) Ансамбль моделей из LinearRegressor, CatBoostClassifier, LightGBM.
        
Лучшей моделью для определения токсичности текста стал ансамбль моделей из LinearRegressor, CatBoostClassifier, LightGBM.

В задаче пороговое значение метрики F1 = 0.75. В проекте удалось достичь значения метрики F1 = 0.787 на тесте, что является очень хорошим результатом для поставленной задачи.