# Проект для «Викишоп»

Интернет-магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию. Нужно обучить модель классифицировать комментарии на позитивные и негативные. 
Постройте модель со значением метрики качества *F1* не меньше 0.75. 

## Подготовка

In [27]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from pymystem3 import Mystem
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve
import warnings
from sklearn.utils import shuffle
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
warnings.filterwarnings('ignore')


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
comments = pd.read_csv('/datasets/toxic_comments.csv')

comments

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0


In [14]:
#типы и пропуски

comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
text     159571 non-null object
toxic    159571 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [15]:
#дубликаты

comments.duplicated().sum()

0

In [16]:
%%time

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    text = text.lower()
    cleared_list = re.sub(r'[^a-zA-Z]', ' ', text)
    clear_text=cleared_list.split()
    join_clear=" ".join(clear_text)
    
    lemm_list = lemmatizer.lemmatize(join_clear)
    lemm_text = "".join(lemm_list)
        
    return lemm_text

comments['lemm_text'] = comments['text'].apply(lemmatize_text)

comments = comments.drop(['text'], axis=1)

CPU times: user 8.77 s, sys: 134 ms, total: 8.91 s
Wall time: 8.91 s


In [18]:
#Разобьем выборку по отношению 60/20/20.

target = comments['toxic']
features = comments.drop(['toxic'], axis=1)

features_train, features_valid, target_train, target_valid = train_test_split(features, 
                                                                              target, 
                                                                              test_size=0.4, 
                                                                              random_state=12345)
features_valid, features_test, target_valid, target_test = train_test_split(features_valid, 
                                                                            target_valid, 
                                                                            test_size=0.5,
                                                                            random_state=12345)



In [19]:
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('english'))

count_tf_idf = TfidfVectorizer(stop_words=stopwords)

features_train = count_tf_idf.fit_transform(features_train['lemm_text'])


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
features_valid = count_tf_idf.transform(features_valid['lemm_text'])


In [21]:
features_test = count_tf_idf.transform(features_test['lemm_text'])
print(features_train.shape)
print(features_valid.shape)
print(features_test.shape)

(95742, 125610)
(31914, 125610)
(31915, 125610)


In [22]:
#проверяем баланс классов

display(comments['toxic'].value_counts())
ratio = comments['toxic'].value_counts()[0] / comments['toxic'].value_counts()[1]

ratio

0    143346
1     16225
Name: toxic, dtype: int64

8.834884437596301

In [25]:
#изменение баланса классов


dict_classes={0:1, 1:ratio}
classificator = LogisticRegression(class_weight=dict_classes)
train_f1_ballanced = cross_val_score(classificator, 
                                    features_train, 
                                    target_train, 
                                    cv=3, 
                                    scoring='f1').mean()
print('F1 на CV с балансированными классами', train_f1_ballanced)

F1 на CV с балансированными классами 0.7514247243676125


In [23]:
%%time

classificator = LogisticRegression(class_weight='balanced')
cvs = cross_val_score(classificator, 
                                    features_train, 
                                    target_train, 
                                    cv=3, 
                                    scoring='f1').mean()
print('F1 на CV с балансированными классами', cvs)

F1 на CV с балансированными классами 0.7447119358373469
CPU times: user 18.5 s, sys: 14.2 s, total: 32.7 s
Wall time: 32.7 s


In [28]:
#ресемплирование с уменьшением класса 0

comments_train = comments.iloc[target_train.index]

target_train_class_zero = comments_train[comments_train['toxic'] == 0]['toxic']
target_train_class_one = comments_train[comments_train['toxic'] == 1]['toxic']


target_train_class_zero_downsample = target_train_class_zero.sample(target_train_class_one.shape[0],
                                                                    random_state=12345)
target_train_downsample = pd.concat([target_train_class_zero_downsample, target_train_class_one])

features_train_downsample = comments.iloc[target_train_downsample.index]
features_train_downsample, target_train_downsample = shuffle(features_train_downsample,
                                                             target_train_downsample,
                                                             random_state=12345)
features_train_downsample = count_tf_idf.transform(features_train_downsample['lemm_text']
                                                   .values.astype('U'))
del count_tf_idf
del stopwords

In [29]:
model = LogisticRegression()
model.fit(features_train_downsample,target_train_downsample) 

predicted=model.predict(features_train_downsample)
target_predict = model.predict(features_valid)
scores_train= f1_score(target_train_downsample, predicted)

scores_valid= f1_score(target_valid, target_predict)
print('F1 на cv с уменьшением классов', scores_train)
print('F1 на валидации с уменьшением классов', scores_valid)

F1 на cv с уменьшением классов 0.9418841868494011
F1 на валидации с уменьшением классов 0.6952191235059761


## Обучение

In [24]:
%%time

model = LogisticRegression()
model.fit(features_train,target_train) 
predicted=model.predict(features_train)
target_predict = model.predict(features_valid)
scores=cross_val_score(model,features_train,target_train,cv=3,scoring='f1').mean()

scores_valid= f1_score(target_valid, target_predict)
print('F1 на cv', scores)
print('F1 на валидации', scores_valid)

F1 на cv 0.6735400940998678
F1 на валидации 0.7272385252069224
CPU times: user 15.9 s, sys: 16.1 s, total: 32 s
Wall time: 32 s


In [23]:
%%time

model = CatBoostClassifier(verbose=False, iterations=200)
model.fit(features_train, target_train)
target_predict = model.predict(features_valid)
cv_f1_CBC = cross_val_score(model,
                                         features_train, 
                                         target_train, 
                                         cv=3, 
                                         scoring='f1').mean()
valid_f1_CBC = f1_score(target_valid, target_predict)
print('F1 на cv', cv_f1_CBC)
print('F1 на валидации', valid_f1_CBC)

F1 на cv 0.7241592487621448
F1 на валидации 0.7416954075149755
CPU times: user 37min 33s, sys: 7min 22s, total: 44min 55s
Wall time: 45min 7s


In [32]:
model = CatBoostClassifier(verbose=False, iterations=200)
model.fit(features_train, target_train)
test_predict = model.predict(features_test)

test_f1 = f1_score(target_test, test_predict)

print('F1 на тестовой выборке', test_f1)

F1 на тестовой выборке 0.7347089167280768


In [31]:
model = LogisticRegression()
model.fit(features_train,target_train) 
test_predict = model.predict(features_test)

test_f1 = f1_score(target_test, test_predict)
print('F1 на тестовой выборке', test_f1)

F1 на тестовой выборке 0.7105465742879138


In [32]:
%%time

model = LogisticRegression()
hyperparams = [{'solver':['newton-cg', 'lbfgs', 'liblinear'],
                'C':[0.1, 1, 10],
                'class_weight':[dict_classes]}]


clf = GridSearchCV(model, hyperparams, scoring='f1',cv=3)
clf.fit(features_train, target_train)
print("Лучшие параметры:", clf.best_params_)




Лучшие параметры: {'C': 10, 'class_weight': {0: 1, 1: 8.834884437596301}, 'solver': 'lbfgs'}
CPU times: user 6min 14s, sys: 6min 17s, total: 12min 32s
Wall time: 12min 32s


In [33]:
model = LogisticRegression()
model.set_params(**clf.best_params_)
model.fit(features_train, target_train)
predict_test = model.predict(features_test)

print('F1:', f1_score(target_test, predict_test))


F1: 0.7661913250148544


## Выводы

В ходе работы над проектом было сделано:

1.Загружены и подготовлены данные
2.Данные поделены на обучающую, валидационную, тестовую выборки
3.Поделены данные на обучающую, валидационную и тестовою выборку.
4.Выполнено ресемплирование с уменьшением класса 0
5.Обучены модели - CatBoostClassifier,LogisticRegression

лучший результат показала модель CatBoostClassifier на валидационной выборке


На тестовой выборке метрика  F1 =0.73

