# Подготовка

In [1]:
import pandas as pd
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Tokenizer vocabulary cap and the fixed length every comment is padded or
# truncated to; the model cell reuses both values.
max_comment_len = 50
num_words = 10000

# Labelled training data: the raw comment text plus six binary label columns.
# NOTE(review): absolute, machine-specific path.
train = pd.read_csv('C:/Storage/Dataset/Commentary/train.csv')
y_train = train.loc[
    :,
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
]
comments = train.comment_text

# Build the word index from the training comments only; the same fitted
# tokenizer is reused later to encode the test set and ad-hoc comments.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(comments)

# Encode each comment as a list of word indices, then force a uniform length.
# padding/truncating are spelled out with the library defaults ('pre').
sequences = tokenizer.texts_to_sequences(comments)
x_train = pad_sequences(
    sequences,
    maxlen=max_comment_len,
    padding='pre',
    truncating='pre',
)

# Построение сети

In [3]:
# Multi-label toxicity classifier: embedding -> two stacked LSTMs -> six
# independent sigmoid outputs (one per label column of y_train).
model_lstm = Sequential([
    Embedding(num_words, 128, input_length=max_comment_len),
    # Drops whole embedding channels rather than individual activations.
    SpatialDropout1D(0.5),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(40, return_sequences=True),
    LSTM(40),
    Dense(6, activation='sigmoid'),
])

model_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        # The bare string 'accuracy' is resolved by Keras from the output
        # shape; with a 6-unit output it can become categorical (argmax)
        # accuracy, which is meaningless for multi-label targets and yields
        # a misleadingly high score. BinaryAccuracy scores each label
        # independently at a 0.5 threshold. It is named 'accuracy' so the
        # history keys ('accuracy' / 'val_accuracy') stay the same.
        BinaryAccuracy(name='accuracy'),
        'AUC',
    ]
)

# Keep only the weights of the epoch with the best validation accuracy.
# NOTE(review): absolute, machine-specific path.
model_lstm_save_path = 'C:/Storage/Net/predict_toxicity/predict_toxicity.h5'
checkpoint_callback_lstm = ModelCheckpoint(
    model_lstm_save_path,
    monitor='val_accuracy',
    mode='max',  # higher accuracy is better; stated explicitly instead of 'auto'
    save_best_only=True,
    verbose=1
)

model_lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 128)           1280000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 50, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 50, 40)            27040     
_________________________________________________________________
lstm_1 (LSTM)                (None, 40)                12960     
_________________________________________________________________
dense (Dense)                (None, 6)                 246       
Total params: 1,320,246
Trainable params: 1,320,246
Non-trainable params: 0
_________________________________________________________________


# Обучение сети

In [None]:
# Train for five epochs, holding out 20% of the training data for validation.
# The checkpoint callback writes the best-scoring weights to disk during the
# run; the returned History object is kept for later inspection.
history_lstm = model_lstm.fit(
    x=x_train,
    y=y_train,
    batch_size=512,
    epochs=5,
    callbacks=[checkpoint_callback_lstm],
    validation_split=0.2,
)

Epoch 1/5

Epoch 00001: val_accuracy improved from -inf to 0.99402, saving model to /content/drive/MyDrive/БГУИР/5 семестр/Курсовой проект/сhecking_reviews_for_toxicity.h5
Epoch 2/5

Epoch 00002: val_accuracy did not improve from 0.99402
Epoch 3/5

Epoch 00003: val_accuracy improved from 0.99402 to 0.99405, saving model to /content/drive/MyDrive/БГУИР/5 семестр/Курсовой проект/сhecking_reviews_for_toxicity.h5
Epoch 4/5

Epoch 00004: val_accuracy did not improve from 0.99405
Epoch 5/5

Epoch 00005: val_accuracy did not improve from 0.99405


# Тестирование

In [None]:
# Test comments and their labels live in separate files; join them on 'id'.
test_comments = pd.read_csv('C:/Storage/Dataset/Commentary/test.csv')
test_labels = pd.read_csv('C:/Storage/Dataset/Commentary/test_labels.csv')
test_full = test_comments.merge(test_labels, on='id')

# Drop rows whose 'toxic' label is -1.
# NOTE(review): presumably -1 marks rows without usable labels - confirm.
test = test_full.loc[test_full['toxic'] != -1]

# Encode with the tokenizer fitted on the training comments so the word
# indices match what the network was trained on.
test_sequences = tokenizer.texts_to_sequences(test['comment_text'])
x_test = pad_sequences(test_sequences, maxlen=max_comment_len)
y_test = test[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
]

# Restore the checkpointed weights before scoring the held-out data.
model_lstm.load_weights(model_lstm_save_path)
model_lstm.evaluate(x_test, y_test, verbose=1)

# Проверка

In [None]:
# Sanity check on a single hand-written comment, pushed through the same
# tokenise -> pad pipeline as the training data.
comment = "X-BOX 360 SUKCS BIG BUMM AND LIKES IT UP THE ASS"
sequence = tokenizer.texts_to_sequences(texts=[comment])
data = pad_sequences(sequences=sequence, maxlen=max_comment_len)

# One row of six sigmoid scores, in the label-column order used for y_train:
# toxic, severe_toxic, obscene, threat, insult, identity_hate.
result = model_lstm.predict(x=data)
result

array([[0.9451662 , 0.02548518, 0.6889377 , 0.01368805, 0.5778702 ,
        0.07606437]], dtype=float32)