# Wikishop  
## Анализ комментариев на токсичность
### Используя модель логистической регрессии и LightGBM поможем выделить "токсичные" комментарии в корпусе текстов


[1. Загрузка библиотек](#data_download)    
[2. Загурзка данных. Предобработка](#data_preprocessing)  
[3. Логистическая регрессия](#logistic)    
[4. LightGBM](#lightGBM)  
[5. Выводы](#Conclusion)    


<a id='data_download'></a>
### 1. Загрузим необходимые библиотеки

In [None]:
import pandas as pd
import os
import numpy as np


import re
import string

#import torch
#import transformers 

import optuna
import lightgbm as lgb



from sklearn.model_selection import GridSearchCV


import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

from sklearn.model_selection  import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import f1_score, mean_squared_error

import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser','ner'])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliagrobman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
!pip install transformers



In [None]:
pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


<a id='data_preprocessing'></a>
### 2. Загрузка данных. Предобработка

In [None]:
path_global = '/datasets/toxic_comments.csv'
path_local = '/Users/juliagrobman/Downloads/toxic_comments (1).csv'

if os.path.exists(path=path_global):
    data = pd.read_csv(path_global)
elif os.path.exists(path=path_local):
    data = pd.read_csv(path_local)
else: 
    print('ERROR IN PATH')

In [None]:
data.head()

Unnamed: 0,text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [None]:
data['text'] = data['text'].astype('U')

### Наш датасет состоят из 160 тыс строк, столбец 'toxic' содержит целевой признак

### Text preprocessing

#### Уберем из текста знаки пунктуации, стоп-слова, пробелы

In [None]:
def  preprocess_text(text, stopwords):
    # cleaning punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # text to lower
    text = text.lower()

    # removing stopwords
    text = ' '.join([word for word in text.split() if word not in stopwords])

    # removing whitespaces
    text = re.sub(r'\s', ' ', text).strip()

    return text


In [None]:
def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [None]:
data['clean'] = data['text'].apply(lambda x: preprocess_text(x, stopwords=stop_words))

#### Применим лемматизацию

In [187]:
data['lemmatized'] = data['clean'].apply(lambda x: lemmatize_text(x) )

In [188]:
data.head(20)

Unnamed: 0,text,toxic,clean,lemmatized
0,Explanation\nWhy the edits made under my usern...,0,explanation edits made username hardcore metal...,explanation edit make username hardcore metall...
1,D'aww! He matches this background colour I'm s...,0,daww matches background colour im seemingly st...,daww match background colour I m seemingly stu...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man im really trying edit war guy constant...,hey man I m really try edit war guy constantly...
3,"""\nMore\nI can't make any real suggestions on ...",0,cant make real suggestions improvement wondere...,can not make real suggestion improvement wonde...
4,"You, sir, are my hero. Any chance you remember...",0,sir hero chance remember page thats,sir hero chance remember page that s
5,"""\n\nCongratulations from me as well, use the ...",0,congratulations well use tools well · talk,congratulation well use tool well · talk
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,cocksucker piss around work,cocksucker piss around work
7,Your vandalism to the Matt Shirvington article...,0,vandalism matt shirvington article reverted pl...,vandalism matt shirvington article revert plea...
8,Sorry if the word 'nonsense' was offensive to ...,0,sorry word nonsense offensive anyway im intend...,sorry word nonsense offensive anyway I m inten...
9,alignment on this subject and which are contra...,0,alignment subject contrary dulithgow,alignment subject contrary dulithgow


### Разделим данные на обучающую, валидационную и тестовую выборки

In [189]:
features = data['lemmatized']
target = data['toxic']

In [190]:
features_train_full, features_valid, target_train_full, target_valid = train_test_split(features, target, test_size=0.2, shuffle=True)
features_train, features_test, target_train, target_test = train_test_split(features_train_full, target_train_full, test_size=0.2, shuffle=True)

#### Применим if-idf для векторизации текстов

In [191]:
count_tf_idf = TfidfVectorizer()
features_train = count_tf_idf.fit_transform(features_train)

In [192]:
features_valid = count_tf_idf.transform(features_valid)
features_test = count_tf_idf.transform(features_test)

<a id='logistic'></a>
### 3. Логистическая регрессия

#### Обучим логистическую регрессию с гиперпараметрами

In [193]:
params = {
    'penalty' : ['l1','l2'],
    'C': np.logspace(-3,3,7),
    'solver':['lbfg', 'liblinear']
}

model = LogisticRegression(random_state=12345)
grid = GridSearchCV(model, param_grid=params, scoring='f1')
grid.fit(features_train, target_train)
predictions = grid.predict(features_valid)
score = f1_score(predictions, target_valid)
score

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 433, in _check_solver
    raise ValueError("Logistic Regression supports only solvers in %s, got"
ValueError: Logistic Regression supports only solvers in ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'], got lbfg.



0.7732712765957447

In [194]:
grid.best_params_

{'C': 10.0, 'penalty': 'l1', 'solver': 'liblinear'}

In [195]:
# Проверим на тестовой выборке:
predictions = grid.predict(features_test)


In [196]:
score_test = f1_score(target_test, predictions)
print('F1 score {:.2f}'.format(score_test))

F1 score 0.77


#### Попробуем проверить нашу модель на "токсичном" и "нетоксичном" предложении

In [197]:
test_sent = 'look at yourself, you are a fat, black, sodding idiot, thinking just about money'

In [198]:
test_sent_0 = 'I will surely recommend this product to my family and friends, thanks!'

In [199]:
test_sent = preprocess_text(test_sent, stop_words)
test_sent = lemmatize_text(test_sent)

In [200]:
test_sent_0 = preprocess_text(test_sent_0, stop_words)
test_sent_0 = lemmatize_text(test_sent_0)

In [201]:
test_sent_0

'surely recommend product family friend thank'

In [202]:
test = count_tf_idf.transform([test_sent])

In [203]:
test_0 = count_tf_idf.transform([test_sent_0])

In [204]:
grid.predict(test)

array([1])

In [205]:
grid.predict(test_0)

array([0])

### Наша модель хорошо предсказывает токсичные комментарии, f1 выше трешхолда в 0.75

<a id='lightGBM'></a>
### 4.Попробуем другую модель: LightGBM

In [206]:
## Попробуем LightGBM

train_data  = lgb.Dataset(features_train, label=target_train)
test_data = lgb.Dataset(features_test, label=target_test)

parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric' : 'auc',
    'boosting': 'dart',
    'num_leaves': 50,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

classifier = lgb.train(
    parameters,
    train_data,
    valid_sets=test_data,
    num_boost_round=1000,
    early_stopping_rounds=100
    
)
predict = classifier.predict(features_valid)




You can set `force_col_wise=true` to remove the overhead.
[1]	valid_0's auc: 0.809626
[2]	valid_0's auc: 0.876043




[3]	valid_0's auc: 0.900458
[4]	valid_0's auc: 0.909067
[5]	valid_0's auc: 0.921281
[6]	valid_0's auc: 0.924341
[7]	valid_0's auc: 0.926004
[8]	valid_0's auc: 0.926537
[9]	valid_0's auc: 0.927568
[10]	valid_0's auc: 0.929274
[11]	valid_0's auc: 0.930684
[12]	valid_0's auc: 0.931021
[13]	valid_0's auc: 0.932074
[14]	valid_0's auc: 0.932588
[15]	valid_0's auc: 0.933744
[16]	valid_0's auc: 0.933953
[17]	valid_0's auc: 0.933855
[18]	valid_0's auc: 0.934769
[19]	valid_0's auc: 0.935645
[20]	valid_0's auc: 0.93659
[21]	valid_0's auc: 0.936995
[22]	valid_0's auc: 0.936921
[23]	valid_0's auc: 0.937333
[24]	valid_0's auc: 0.937876
[25]	valid_0's auc: 0.938012
[26]	valid_0's auc: 0.9384
[27]	valid_0's auc: 0.939101
[28]	valid_0's auc: 0.939205
[29]	valid_0's auc: 0.939199
[30]	valid_0's auc: 0.939669
[31]	valid_0's auc: 0.940241
[32]	valid_0's auc: 0.94081
[33]	valid_0's auc: 0.940773
[34]	valid_0's auc: 0.941209
[35]	valid_0's auc: 0.94118
[36]	valid_0's auc: 0.941249
[37]	valid_0's auc: 0.9413

In [None]:
pred = [1  if el > 0.4 else 0 for el in list(predict)]

In [None]:
score_light = f1_score(target_valid, pred)

In [None]:
print('F1 score по LightGBM {:.2f}'.format(score_light))

F1 score по LightGBM 0.78


In [None]:
pred_test = classifier.predict(features_test)

In [None]:
def best_f1_score
pred_test_light =  [1  if el > 0.4 else 0 for el in list(pred_test)]

In [None]:
# попробуем LigthGBM на тестовой выборке
score_light_test = f1_score(target_test, pred_test_light)
score_light_test

0.7985777034093287

In [None]:
print('F1 score на тестовой выборке по LightGBM {:.2f}'.format(score_light_test))

F1 score на тестовой выборке по LightGBM 0.80


### Получилось на тестовой выборке немного улучшить показатели логистической регрессии, F1 мера составляет чуть менее 0.8

<a id='Conclusion'></a>
### 5. Выводы
* Зазрузили данные с необработанными комментариями
* Провели предобработку текстов: 
    - убрали знаки пунктуации, стоп-слова, привели к нижнему регистру
    - провели лемматизацию
    - с помощью if-idf провели векторизацию
* Обучили модели Логистической регрессии и LightGBM с различными гиперпараметрами
* Добились f1-score на тестовой выборке 0.80  
* Проверили модели на токсчичном и нетоксичном предложениях, убедились, что модель адекватно предсказывает "токсчичность"