In [5]:
import os
import time
import argparse
import numpy as np
from data_loader.datasets import load_data
from evaluate import evaluate_result

In [25]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Input, SpatialDropout1D
from tensorflow.keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, concatenate
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Dense, Embedding, Input, SpatialDropout1D
from config import inputLen

In [26]:
def textcnn1(tokenizer, class_num=2):
    """Build (without compiling) a multi-branch 1-D convolutional text classifier.

    Token ids are embedded into 20-dimensional vectors and passed through
    spatial dropout. Three parallel ``Conv1D`` branches (kernel widths 1, 3
    and 5; 32 filters each; "same" padding; L2-regularised kernels) read the
    embedded sequence, and each branch is max-pooled with pool size 2. The
    branch outputs are concatenated on the last axis, flattened, and fed
    through dropout -> 256-unit ReLU layer -> dropout -> softmax layer.

    Args:
        tokenizer: Object exposing a ``word_index`` mapping. The embedding
            table gets ``len(tokenizer.word_index) + 1`` rows.
        class_num: Number of units in the final softmax layer.

    Returns:
        An uncompiled ``Model`` taking int32 inputs of shape ``(inputLen,)``.
        ``inputLen`` is read from the enclosing module scope at call time.
    """
    # One extra row beyond the vocabulary size; presumably index 0 is the
    # padding id and real tokens start at 1 -- confirm against the tokenizer.
    vocab_rows = len(tokenizer.word_index) + 1

    token_ids = Input(shape=(inputLen,), dtype='int32')
    embedded = Embedding(vocab_rows, 20, input_length=inputLen)(token_ids)
    embedded = SpatialDropout1D(0.2)(embedded)

    # One convolution + pooling branch per kernel width, all reading the
    # same embedded sequence.
    branches = []
    for width in (1, 3, 5):
        feature_map = Conv1D(
            32,
            width,
            activation='relu',
            padding="same",
            kernel_regularizer=l2(0.0005),
        )(embedded)
        branches.append(MaxPool1D(2)(feature_map))

    merged = concatenate(branches, axis=-1)
    flat = Flatten()(merged)
    flat = Dropout(0.5)(flat)
    hidden = Dense(256, activation='relu')(flat)
    hidden = Dropout(0.5)(hidden)
    probabilities = Dense(
        class_num,
        activation='softmax',
        kernel_regularizer=l2(0.001),
    )(hidden)

    return Model(inputs=token_ids, outputs=probabilities)

In [27]:
# Run configuration. batch_size and epoch_num are forwarded to model.fit;
# model_option chooses which model variant (and its directory) is used.
batch_size = 20
epoch_num = 5
model_option = 1

print("Start Train Job! ")
start = time.time()  # reference point for the elapsed-time report later on

textcnn1_model_dir = 'models'

# Resolve the directory for the selected model first, then derive the
# tokenizer path from it.
# NOTE(review): textcnn2_model_dir / textcnn3_model_dir are not defined in
# the cells visible here, so options 2 and 3 fail with NameError unless
# those names are provided elsewhere.
if model_option == 1:
    selected_model_dir = textcnn1_model_dir
elif model_option == 2:
    selected_model_dir = textcnn2_model_dir
elif model_option == 3:
    selected_model_dir = textcnn3_model_dir
else:
    raise Exception("not supported model_option: {}".format(model_option))

tokenizer_file_path = os.path.join(selected_model_dir, "tokenizer.pickle")

Start Train Job! 


In [19]:
# Fetch the text splits and their labels in one call.
# NOTE(review): the exact meaning of validate=True is defined in
# data_loader.datasets; presumably it requests a separate validation split.
(
    train_datas, val_datas, test_datas,
    train_labels, val_labels, test_labels,
) = load_data(validate=True)

good_df:  (1294531, 2)
bad_df:  (48138, 2)
total:  (98138, 2)
filter label: 1 or 0  (98138, 2)
after drop_duplicates df:  (94636, 2)


In [21]:
from keras_preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Fixed sequence length, in token ids. train_tokenizer_with_val pads or
# truncates every encoded text to this length, and textcnn1 reads the same
# name for its Input shape. This assignment rebinds the `inputLen` that was
# imported from `config` earlier in the notebook.
# The trailing values look like previously tried settings -- TODO confirm.
inputLen = 1024  # 256  # 512

def train_tokenizer_with_val(train_datas, val_datas, test_datas, tokenizer_file_path):
    """Fit a word-level tokenizer, encode all three splits, and pickle the tokenizer.

    The tokenizer vocabulary is built from the training, validation AND test
    texts (in that order), so no token in any split is out-of-vocabulary.
    NOTE(review): fitting on validation/test text lets held-out data shape
    the vocabulary; behaviour is kept as-is, but bear it in mind when
    interpreting the test metrics.

    Each split is then converted to integer id sequences and padded or
    truncated at the end ('post') to the module-level ``inputLen``.

    Args:
        train_datas: Iterable of training texts.
        val_datas: Iterable of validation texts.
        test_datas: Iterable of test texts.
        tokenizer_file_path: Destination file for the pickled tokenizer. Its
            parent directory is created if it does not exist yet.

    Returns:
        Tuple ``(tokenizer, train_datas, val_datas, test_datas)`` where the
        three data items are the padded id sequences of the given splits.
    """
    tokenizer = Tokenizer(num_words=None,
                          # The backslash is escaped explicitly; the value is
                          # the same characters as before, without the
                          # invalid-escape warning that '\]' triggers.
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True,
                          split=" ",
                          char_level=False)

    # Build one shared vocabulary across all splits.
    for texts in (train_datas, val_datas, test_datas):
        tokenizer.fit_on_texts(texts)

    def encode(texts):
        # Texts -> id sequences -> fixed-length rows of length inputLen.
        sequences = tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, inputLen, padding='post', truncating='post')

    train_datas = encode(train_datas)
    val_datas = encode(val_datas)
    test_datas = encode(test_datas)

    # On a fresh checkout the model directory may not exist yet; without
    # this, open(..., "wb") raises FileNotFoundError after all the work above.
    target_dir = os.path.dirname(tokenizer_file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    with open(tokenizer_file_path, "wb") as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)

    return tokenizer, train_datas, val_datas, test_datas

# Fit/save the tokenizer and swap the raw text splits for their padded id
# sequences. NOTE: the split names are rebound here, so re-running this cell
# without re-running the data-loading cell would feed already-encoded data
# back into the tokenizer.
tokenizer, train_datas, val_datas, test_datas = train_tokenizer_with_val(
    train_datas,
    val_datas,
    test_datas,
    tokenizer_file_path,
)

In [28]:
# Number of output classes for the softmax layer.
class_num = 2

# Build the selected model and decide where its weights will be saved.
# NOTE(review): textcnn2 / textcnn3 and their *_model_dir names are not
# defined in the cells visible here, so options 2 and 3 fail with NameError
# unless those names are provided elsewhere.
if model_option == 1:
    model = textcnn1(tokenizer, class_num)
    model_save_path = os.path.join(textcnn1_model_dir, 'model.h5')

elif model_option == 2:
    model = textcnn2(tokenizer, class_num)
    model_save_path = os.path.join(textcnn2_model_dir, 'model.h5')

elif model_option == 3:
    model = textcnn3(tokenizer, class_num)
    model_save_path = os.path.join(textcnn3_model_dir, 'model.h5')

else:
    raise Exception("not supported model_option: {}".format(model_option))

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1024)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1024, 20)     1227760     input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, 1024, 20)     0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 1024, 32)     672         spatial_dropout1d[0][0]          
_______________________________________________________________________________________

In [31]:
# optimizer = Adam(learning_rate=1e-3)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Categorical cross-entropy pairs with the softmax output layer; the label
# arrays are expected to hold one row per sample with one column per class.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Write weights only (not the full model) to model_save_path, and only when
# the monitored metric improves.
checkpoint = ModelCheckpoint(
    model_save_path,
    save_best_only=True,
    save_weights_only=True,
)

# NOTE(review): with epoch_num = 5 and patience=5 the early-stop condition
# does not appear reachable within a single run -- confirm this is intended.
earlystop = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=1,
    mode='min',
    baseline=None,
    restore_best_weights=True,
)

# Left as the last bare expression so the cell still displays the History.
model.fit(
    train_datas,
    train_labels,
    epochs=epoch_num,
    batch_size=batch_size,
    validation_data=(val_datas, val_labels),
    callbacks=[checkpoint, earlystop],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1ed52089b48>

In [32]:
# Wall-clock seconds since `start` was recorded in the configuration cell.
# That interval also covers data loading, tokenisation and any idle time
# between cell executions, not just model.fit.
end = time.time()
elapsed_seconds = end - start
print("Over train job in %f s" % elapsed_seconds)

Over train job in 3848.699727 s


In [33]:
# Reload the weights that the ModelCheckpoint callback wrote to
# model_save_path during training (it was configured with
# save_best_only=True and save_weights_only=True) before evaluating.
model.load_weights(model_save_path)
# Reference labels for the test split; this is an alias, not a copy.
labels_true = test_labels
print("labels_true: ", labels_true.shape)

labels_true:  (9464, 2)


In [34]:
# Run the model on the padded test sequences; each output row holds the
# softmax scores of the final layer, one column per class.
labels_pre = model.predict(test_datas)
print("labels_pre: ", labels_pre.shape)

labels_pre:  (9464, 2)


In [35]:
# Turn each row of class scores into a one-hot row for its highest-scoring
# class. Rounding the scores instead can leave a row with no 1 at all (an
# exact 0.5/0.5 tie rounds half-to-even to [0, 0], and with more than two
# classes any row whose best score is <= 0.5 becomes all zeros), which makes
# to_y raise. argmax always selects exactly one class (the first on ties),
# and re-running this cell on already one-hot rows leaves them unchanged.
labels_pre = np.asarray(labels_pre)
labels_pre = np.eye(labels_pre.shape[-1], dtype=labels_pre.dtype)[labels_pre.argmax(axis=-1)]

def to_y(labels):
    """Convert two-column indicator rows into a list of integer class ids.

    A row maps to ``0`` when its first entry equals 1; otherwise to ``1``
    when its second entry equals 1. The first column is checked first, so a
    row with both entries equal to 1 maps to ``0``.

    Args:
        labels: Indexable, sized collection of rows; each row must support
            indexing at positions 0 and 1.

    Returns:
        A list with one class id (0 or 1) per row, in input order.

    Raises:
        Exception: If a row has a 1 in neither of its first two positions.
    """
    def class_of(row):
        if row[0] == 1:
            return 0
        if row[1] == 1:
            return 1
        raise Exception("not supported result: {}".format(row))

    return [class_of(labels[idx]) for idx in range(len(labels))]

In [36]:
# Collapse the reference labels and the predictions to integer class ids,
# then hand both to the project's metric report.
y_true, y_pre = to_y(labels_true), to_y(labels_pre)

evaluate_result(y_true, y_pre)

Accuracy Score is:  0.9792899408284024
Precision Score is : 0.9828086626479124
Recall Score is : 0.9736783897367839
F1 Score:  0.9782222222222223
AUC Score:  0.9790504026369534
