In [1]:
import pandas  as pd
import numpy as np
import re
import string
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Embedding, LSTM,  Flatten, Input,MaxPooling1D,  GlobalMaxPooling1D,Flatten, Dense, Dropout ,Conv1D
from sklearn.metrics import classification_report, confusion_matrix
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from sklearn.metrics import roc_auc_score , multilabel_confusion_matrix, accuracy_score
from nltk.corpus import wordnet 
import random

In [2]:
print('Načítanie dát...')
data= pd.read_csv(r'data_preprocess.csv')

Načítanie dát...


In [3]:
x = data["comment_text"].fillna("fillna")
y = data["target"].values
SEED = 42
x_train,x_test, y_train ,y_test= train_test_split(x, y, test_size=0.3, random_state=SEED)

print ("Celkový počet trénovacich príkladov je {0} z toho  {1:.2f}% je netoxických a {2:.2f}% je toxických komentárov".format(len(x_train),
                      (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,(len(x_train[y_train == 1]) / (len(x_train)*1.))*100))

print ("Celkový počet testovacich príkladov je {0} z toho {1:.2f}% je netoxických a {2:.2f}% je toxických komentárov".format(len(x_test),
                      (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,(len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

Celkový počet trénovacich príkladov je 1263411 z toho  92.03% je netoxických a 7.97% je toxických komentárov
Celkový počet testovacich príkladov je 541463 z toho 91.94% je netoxických a 8.06% je toxických komentárov


In [4]:
EMBEDDING_DIM = 300 
max_features = 50000 
max_length = 200

In [6]:
print("Načítanie slovníka GloVe")
EMBEDDING_FILE = 'glove.840B.300d.txt' 
embeddings_index = {}
f = open(os.path.join('',EMBEDDING_FILE), encoding = "utf-8")
for line in f:
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

Načítanie slovníka GloVe


In [7]:
print("Tokenizacia")
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(x_train))
sequences_train = tokenizer.texts_to_sequences(x_train)
sequences_test = tokenizer.texts_to_sequences(x_test)

Tokenizacia


In [8]:
x_train_seq = pad_sequences(sequences_train, maxlen=max_length) 
x_test_seq = pad_sequences(sequences_test, maxlen=max_length)

In [9]:
word_index = tokenizer.word_index
print("Emmbedings matica....")
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(num_words)
embedding_matrix.shape 

Emmbedings matica....
328833


(328833, 300)

In [None]:
####MODELOVANIE 
model= Sequential()
e = Embedding(num_words,  EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_length)
model.add(e)
model.add(Conv1D(filters=64, kernel_size=1, padding='valid', activation='relu'))
model.add(MaxPooling1D(2))
model.add(Conv1D(filters=64, kernel_size=1, padding='valid', activation='relu'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

saved_model = "model7030.hdf5"
checkpoint = ModelCheckpoint(saved_model, monitor='val_acc', verbose=1, save_best_only=True, mode='max')


print('Trénovanie modelu...')
history = model.fit(x_train_seq, y_train, batch_size=32, epochs=2, callbacks=[checkpoint], validation_split=0.1)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 300)          98649900  
_________________________________________________________________
conv1d (Conv1D)              (None, 200, 64)           19264     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 100, 64)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 100, 64)           4160      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 50, 64)            0         
_________________________________________________________________
flatten (Flatten)            (None, 3200)              0         
_________________________________________________________________
dropout (Dropout)            (None, 3200)              0

In [12]:
print("Načítanie modelu....")
model = load_model('model7030.hdf5')
print("Vyhodnotenie...")
y_pred = model.predict(x_test_seq)
print('Roc auc skóre je {}'.format(roc_auc_score(y_test,y_pred)))

y_int = np.zeros_like(y_pred)
y_int[y_pred > 0.5] = 1
print('Úspešnosť je {}'.format(accuracy_score(y_test,y_int)))
print(classification_report(y_test, y_int, zero_division=0))
print("Kontigenčná tabuľka")
print(confusion_matrix(y_test, y_int))

Načítanie modelu....
Vyhodnotenie...
Roc auc skóre je 0.9478020227619723
Úspešnosť je 0.9470397792646958
              precision    recall  f1-score   support

           0       0.96      0.99      0.97    497834
           1       0.78      0.48      0.59     43629

    accuracy                           0.95    541463
   macro avg       0.87      0.73      0.78    541463
weighted avg       0.94      0.95      0.94    541463

Kontigenčná tabuľka
[[491988   5846]
 [ 22830  20799]]
