In [1]:
import pandas as pd
import json, os
import numpy as np

from keras.models import Model, Sequential
from keras.layers import *
from keras.layers.merge import concatenate
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint
from attlayer import AttentionWeightedAverage
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report

Using TensorFlow backend.


In [2]:
def elsa_doc_model(hidden_dim = 64, dropout = 0.5, mode = 'train'):
    """Build the two-branch document-level binary classifier.

    Each branch takes one document as a sequence of sentence embeddings and
    reduces it to a single vector with ``AttentionWeightedAverage``. The two
    vectors are concatenated, passed through a SELU dense layer (followed by
    dropout when ``mode == 'train'``) and a single sigmoid unit.

    Reads the module-level ``nb_maxlen`` and ``nb_feature`` lists. In both,
    index 0 describes the other-language input and index 1 the English input.

    Args:
        hidden_dim: width of the hidden dense layer.
        dropout: dropout rate applied after the hidden layer in train mode.
        mode: ``'train'`` inserts the Dropout layer; any other value omits it.

    Returns:
        An uncompiled Keras ``Model`` whose inputs are ordered
        ``[other_language, english]`` and whose output is one sigmoid score.
    """
    # The inputs are created in the same order as before (English first) so
    # that auto-generated layer names stay unchanged.
    # English branch: must use index 1 for BOTH length and feature size,
    # matching how the English embeddings are padded (nb_maxlen[1]).
    I_en = Input(shape=(nb_maxlen[1], nb_feature[1]), dtype='float32')
    en_out = AttentionWeightedAverage()(I_en)
    # Other-language branch: index 0 for both length and feature size.
    I_ot = Input(shape=(nb_maxlen[0], nb_feature[0]), dtype='float32')
    jp_out = AttentionWeightedAverage()(I_ot)
    O_to = concatenate([jp_out, en_out])
    O_to = Dense(hidden_dim, activation='selu')(O_to)
    if mode == 'train':
        O_to = Dropout(dropout)(O_to)
    # The layer is named 'softmax' although it is a single sigmoid unit; the
    # name is kept as-is so anything that refers to the layer by name still works.
    O_out = Dense(1, activation='sigmoid', name='softmax')(O_to)
    model = Model(inputs=[I_ot, I_en], outputs=O_out)
    return model

In [3]:
# Restrict CUDA to device "3" for this process.
os.environ['CUDA_VISIBLE_DEVICES'] = "3"

# Target language, product category (with leading underscore) and the
# language-pair directory name derived from the target language.
cur_lan = "jp"
cur_cat = "_music"
cur_test = "en_%s/" % cur_lan

# Embedding width and maximum number of sentences per document.
# In both lists, index 0 is the other language and index 1 is English.
#nb_feature = [2348, 2304] # embedding shape for other language and english, please do not change
nb_feature = [2248, 2248]  # please do not change
nb_maxlen = [20, 20]

# Locations of the label files and of the precomputed embeddings.
label_path = "../dataset/Amazon review/"
embed_path = "./embed/"

In [9]:
# Run switches: "train" fits the model, anything else loads weights and evaluates.
mode = "test"
train_chose = True

# Training hyper-parameters.
hidden_dim = 64
dropout = 0.5
batch_size = 32
epochs = 10

# Checkpoint files are keyed by category (leading underscore stripped) and by
# the two characters before the trailing slash of cur_test.
weigh_path = "./ckpt/elsa_doc_%s_%s.hdf5" % (cur_cat[1:], cur_test[-3:-1])
pretrained_path = "./ckpt/elsa_doc_%s_%s.hdf5" % (cur_cat[1:], cur_test[-3:-1])

In [5]:
# Names of the four review splits, in the same order as the label files below.
tags = ["en_test_review",
        "en_train_review",
        cur_test[-3:-1]+"_test_review",
        cur_test[-3:-1]+"_train_review"]

# One list of binary labels per split, derived from `tags` so that the keys
# cannot drift out of sync with the list above.
labes = {tag: [] for tag in tags}

filename = [label_path+cur_test+"en/"+cur_cat[1:]+"_test_review.tsv",
            label_path+cur_test+"en/"+cur_cat[1:]+"_train_review.tsv",
            label_path+cur_test+cur_test[-3:]+cur_cat[1:]+"_test_review.tsv",
            label_path+cur_test+cur_test[-3:]+cur_cat[1:]+"_train_review.tsv"]

for tag, path in zip(tags, filename):
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(path, "r") as data:
        for line in data:
            # The first tab-separated field is the integer rating.
            rating = int(line.strip().split("\t")[0])
            # Ratings above 3 are labelled positive (1), everything else negative (0).
            labes[tag].append(1 if rating > 3 else 0)

# Two placeholder slots per split, filled in by the loading code that follows:
# slot 0 holds the other-language embeddings, slot 1 the English embeddings.
elsa_embedding = dict((name, [np.array([]), np.array([])]) for name in tags)

def roundup(x):
    """Round ``x`` up to the nearest multiple of ten and return it as an int."""
    from math import ceil
    tens = ceil(x / 10.0)
    return 10 * int(tens)

def _load_padded_embedding(path, maxlen):
    """Load one saved embedding file and pad/truncate every document to ``maxlen`` rows.

    Returns the float32 array produced by ``sequence.pad_sequences`` (Keras
    default padding and truncation settings).
    """
    # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary
    # objects, which can execute code. Only load embedding files you produced
    # yourself or otherwise trust.
    raw = np.load(path, allow_pickle=True)
    return sequence.pad_sequences(raw, dtype=np.float32, maxlen=maxlen)


for tag in tags:
    # File stem: the split name with the category inserted after the
    # two-letter language prefix, e.g. "en" + "_music" + "_test_review".
    tmp_tag = tag[:2] + cur_cat + tag[2:]

    # Slot 0: embeddings from the other-language directory.
    elsa_embedding[tag][0] = _load_padded_embedding(
        embed_path+cur_test+cur_test[-3:]+tmp_tag+"_embed.npz.npy", nb_maxlen[0])

    # Slot 1: embeddings from the English directory. pad_sequences already
    # returns an ndarray, so no extra np.array() copy is made.
    vec = _load_padded_embedding(
        embed_path+cur_test+"en/"+tmp_tag+"_embed.npz.npy", nb_maxlen[1])
    elsa_embedding[tag][1] = vec
    print(tag, tmp_tag, vec.shape, vec[0].shape)

en_test_review en_music_test_review (2000, 20, 2248) (20, 2248)
en_train_review en_music_train_review (2000, 20, 2248) (20, 2248)
jp_test_review jp_music_test_review (2000, 20, 2248) (20, 2248)
jp_train_review jp_music_train_review (2000, 20, 2248) (20, 2248)


In [6]:
# Build the elsa_doc model from the run settings and show its layer summary.
elsa_doc = elsa_doc_model(
    hidden_dim=hidden_dim,
    dropout=dropout,
    mode=mode,
)
elsa_doc.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 20, 2248)     0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 20, 2248)     0                                            
__________________________________________________________________________________________________
attention_weighted_average_2 (A (None, 2248)         2248        input_2[0][0]                    
__________________________________________________________________________________________________
attention_weighted_av

In [10]:
if mode == 'train':
    # Keep only the best weights seen so far, judged on validation accuracy.
    # NOTE(review): 'val_acc' is the log key used by older Keras releases; newer
    # ones report 'val_accuracy' — confirm against the installed version.
    ck = ModelCheckpoint(filepath=weigh_path, verbose=0, save_best_only=True, monitor='val_acc')
    elsa_doc.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    # train_chose picks which English split is fitted on; the other English
    # split is used as validation data.
    test_chose = train_chose
    if test_chose:
        fit_tag, val_tag = 'en_train_review', 'en_test_review'
    else:
        fit_tag, val_tag = 'en_test_review', 'en_train_review'
    tmp_x, tmp_y = elsa_embedding[fit_tag], labes[fit_tag]
    test_x, test_y = elsa_embedding[val_tag], labes[val_tag]

    # Labels are stored as Python lists; hand Keras explicit arrays so a list
    # is never mistaken for a list of per-output targets.
    elsa_doc.fit([tmp_x[0], tmp_x[1]], np.asarray(tmp_y),
                 batch_size=batch_size, epochs=epochs,
                 validation_data=([test_x[0], test_x[1]], np.asarray(test_y)),
                 verbose=True, callbacks=[ck])
else:
    elsa_doc.load_weights(filepath=pretrained_path)

    # Evaluate on the test split whose name starts with the two characters
    # before the trailing slash of cur_test.
    target_tag = cur_test[-3:-1] + '_test_review'
    test_x = elsa_embedding[target_tag]
    test_y = labes[target_tag]

    # Flatten the model output to one score per document and threshold at 0.5
    # in one vectorised step; the result is a list of plain ints.
    scores = elsa_doc.predict([test_x[0], test_x[1]])
    predict_total = (np.asarray(scores).ravel() > 0.5).astype(int).tolist()

    # scikit-learn metrics take (y_true, y_pred), same order as the report below.
    acc = accuracy_score(test_y, predict_total)
    print("%s %s Test Accuracy: %s\n" %  (cur_test[:-1], cur_cat[1:], acc))
    print(classification_report(test_y, predict_total))

en_jp music Test Accuracy: 0.7835

              precision    recall  f1-score   support

           0       0.74      0.89      0.80      1000
           1       0.86      0.68      0.76      1000

    accuracy                           0.78      2000
   macro avg       0.80      0.78      0.78      2000
weighted avg       0.80      0.78      0.78      2000

