In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn import metrics
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
from keras.utils import pad_sequences

In [5]:
# Pick up whichever tf.distribute strategy is currently active; later cells use
# it to scope model creation and to scale the batch size.
strategy = tf.distribute.get_strategy()

In [6]:
# Read the three CSV files (training, test, validation) into DataFrames.
train, test, validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/jigsaw-toxic-comment-train.csv',
        './data/test.csv',
        './data/validation.csv',
    )
)

In [7]:
# Show the column names of the training frame as loaded from disk.
train.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [8]:
# Keep only the id, the comment text and the `toxic` label.
# Selecting the wanted columns (instead of dropping the other five in place)
# yields the same frame but makes the cell safe to re-run: a second in-place
# drop would raise KeyError because the columns are already gone.
train = train[['id', 'comment_text', 'toxic']]

In [9]:
# `.loc` slices by index label and includes the end label, so this keeps every
# row up to and including label 12000.
train = train.loc[:12000]

In [10]:
# Confirm the (rows, columns) size of the reduced training frame.
train.shape

(12001, 3)

In [11]:
# Preview the first rows of the reduced training frame.
train.head()

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0


Kiểm tra số từ tối đa trong một comment.

In [12]:
# Longest comment measured in whitespace-separated tokens; values are coerced
# to str first so non-string entries do not break the split.
train['comment_text'].map(lambda comment: len(str(comment).split())).max()

1403

Hàm đánh giá mô hình (tính ROC AUC).

In [13]:
def roc_auc(predictions, target):
    """
        Compute the area under the ROC curve for a set of predictions.

        predictions: scores handed to ``metrics.roc_curve`` as the second argument.
        target: ground-truth labels handed to ``metrics.roc_curve`` as the first argument.
        Returns the result of ``metrics.auc`` over the (FPR, TPR) points.
    """
    # The thresholds returned by roc_curve are not needed for the area.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

Chia tập dữ liệu theo tỉ lệ 80/20: 80% để huấn luyện (train) và 20% để kiểm định (validation), phân tầng theo nhãn `toxic`.

In [14]:
# Stratified 80/20 split of the comments and their `toxic` labels; the fixed
# random_state keeps the partition reproducible.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train['comment_text'].values,
    train['toxic'].values,
    test_size=0.2,
    shuffle=True,
    stratify=train['toxic'].values,
    random_state=42,
)

In [15]:
# Length every sequence is padded/truncated to in the later padding cell.
max_len = 1500

# num_words=None leaves the vocabulary size uncapped.
token = text.Tokenizer(num_words=None)

# NOTE(review): the vocabulary is fitted on the training AND validation
# comments together — confirm this is intended.
token.fit_on_texts([*xtrain, *xvalid])

In [16]:
# Convert each split's comments into lists of integer token ids using the
# fitted tokenizer.
xtrain_seq, xvalid_seq = (
    token.texts_to_sequences(comments) for comments in (xtrain, xvalid)
)

In [20]:
# Bring every sequence to exactly max_len ids. 'pre' is the library default for
# both options: zeros are prepended to short sequences and overly long ones
# lose their leading tokens.
xtrain_pad = pad_sequences(xtrain_seq, maxlen=max_len, padding='pre', truncating='pre')
xvalid_pad = pad_sequences(xvalid_seq, maxlen=max_len, padding='pre', truncating='pre')

In [21]:
# Word -> integer id mapping built by the fitted tokenizer; its length is used
# below to size the embedding layer.
word_index = token.word_index

In [26]:
%%time
with strategy.scope():
    # a simple rnn 
    model = Sequential()
    model.add(Embedding(len(word_index) + 1, 300, input_length = max_len))
    model.add(SimpleRNN(100))
    model.add(Dense(1, activation = "sigmoid"))
    model.compile(loss = "binary_crossentropy",optimizer = "adam", metrics=['accuracy'])
model.summary()
    

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1500, 300)         13049100  
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 100)               40100     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 13,089,301
Trainable params: 13,089,301
Non-trainable params: 0
_________________________________________________________________
CPU times: total: 46.9 ms
Wall time: 139 ms


In [27]:
# Train for a single epoch; the global batch size is 64 per replica of the
# active distribution strategy.
model.fit(
    x=xtrain_pad,
    y=ytrain,
    batch_size=64 * strategy.num_replicas_in_sync,
    epochs=1,
)



<keras.callbacks.History at 0x239b13b5de0>

In [28]:
# Score the validation comments with the trained model.
scores = model.predict(xvalid_pad)
# ROC AUC is a fraction in [0, 1]; print it as such. The previous format
# string appended a literal percent sign, so 0.79 was shown as "0.79%".
print("Auc: %.2f" % roc_auc(scores, yvalid))

Auc: 0.79%
