# Предобработка данных

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.probability import FreqDist
from nltk.util import bigrams, ngrams

import re
import string

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arkuchina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/arkuchina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [6]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153164 entries, 0 to 153163
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            153164 non-null  object
 1   comment_text  153164 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


Признак clear означает, что комментарий не относится ни к одному из классов токсичности.

In [8]:
train['clear'] = (train['toxic'] == 0) & (train['severe_toxic'] == 0) & (train['obscene'] == 0) & (train['threat'] == 0) & (train['insult'] == 0) & (train['identity_hate'] == 0)
train['clear'] = train['clear'].astype(int)
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clear
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1


Посмотрим есть ли пересечение классов 

In [9]:
train['includes'] = (train['toxic'] + train['severe_toxic'] + train['obscene'] + train['threat'] + train['insult'] + train['identity_hate'])
train['includes'].describe()

count    159571.000000
mean          0.219952
std           0.748260
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           6.000000
Name: includes, dtype: float64

In [10]:
print('Процент строк, попавших в несколько классов:', (len(train.query("includes > 1")) / len(train))) 
# Их не так много, поэтому просто удалим
train = train.query("includes <= 1")

Процент строк, попавших в несколько классов: 0.06182201026502309


Обработка текста сообщений:

In [11]:
stop_words = set(stopwords.words('english') + list(string.ascii_lowercase))

In [12]:
en_stops = set(stopwords.words('english'))
lemmatizer  = WordNetLemmatizer()

In [15]:
def delete_stop_words_lemmatization_punctiation(row):

    row = re.sub(r"\n", "", row.lower())
    row = re.sub(r"[^\w\s]", ' ', row)

    row_list = row.split(' ')
    
    row_list_withut_stops = [word for word in row_list if word not in en_stops]
    text = [lemmatizer.lemmatize(w) for w in row_list_withut_stops]
    return ' '.join(text)

In [16]:
train['clean_text'] = train['comment_text'].apply(delete_stop_words_lemmatization_punctiation)
test['clean_text'] = test['comment_text'].apply(delete_stop_words_lemmatization_punctiation)

In [17]:
# поиск дубликатов 
train['clean_text'].value_counts()

clean_text
                                                                                                                                                                                                                         8
personal attack                                                                                                                                                                                                          4
  unblock request                                                                                                                                                                                                        4
talking                                                                                                                                                                                                                  4
                                                                                                                 

In [18]:
train.drop_duplicates()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clear,includes,clean_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1,0,explanationwhy edits made username hardcore me...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1,0,aww match background colour seemingly stuck ...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1,0,hey man really trying edit war guy constantl...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1,0,morei make real suggestion improvement wond...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1,0,sir hero chance remember page
...,...,...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,1,0,second time asking view completely cont...
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,1,0,ashamed horrible thing put talk page 128 61 ...
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,1,0,spitzer umm there actual article prostitution...
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0,1,0,look like actually put speedy first version de...


In [19]:
# распределение по классам 
print('toxic:', sum(train['toxic']))
print('severe_toxic:', sum(train['severe_toxic']))
print('obscene:', sum(train['obscene']))
print('threat:', sum(train['threat']))
print('insult:', sum(train['insult']))
print('identity_hate:', sum(train['identity_hate']))
print('clear:', sum(train['clear']))

toxic: 5666
severe_toxic: 0
obscene: 317
threat: 22
insult: 301
identity_hate: 54
clear: 143346


In [20]:
# Пайплайн: сначала определяем, токсик или нет, после этого определяем уровень токсичности 
# поэтому добавим 2 лейбла -  токсик/нет - столбец clear; и toxic_type
train['label'] = train['clear']
train['toxic_type'] = train['includes']

In [21]:
for i, row in train.iterrows():
    if row['toxic']:
        row['toxic_type'] = 1
    elif row['obscene']:
        row['toxic_type'] = 2
    elif row['threat']:
        row['toxic_type'] = 3
    elif row['insult']:
        row['toxic_type'] = 4
    elif row['identity_hate']:
        row['toxic_type'] = 5

In [25]:
train.iloc[:, 10:]

Unnamed: 0,clean_text,label,toxic_type
0,explanationwhy edits made username hardcore me...,1,0
1,aww match background colour seemingly stuck ...,1,0
2,hey man really trying edit war guy constantl...,1,0
3,morei make real suggestion improvement wond...,1,0
4,sir hero chance remember page,1,0
...,...,...,...
159566,second time asking view completely cont...,1,0
159567,ashamed horrible thing put talk page 128 61 ...,1,0
159568,spitzer umm there actual article prostitution...,1,0
159569,look like actually put speedy first version de...,1,0


In [26]:
train.iloc[:, 10:].to_csv('data/train_preprocessed.csv', index=False)