In [17]:
import h5py
import keras
import numpy as np
import json
import os
import uuid
import yaml

from attlayer import AttentionWeightedAverage
#from avglayer import MaskAverage
from copy import deepcopy
#from finetuning import (sampling_generator, finetuning_callbacks)
from operator import itemgetter
#from global_variables import NB_TOKENS, NB_EMOJI_CLASSES
from keras.layers import *
from keras.layers.merge import concatenate
from keras.layers import Input, Bidirectional, Embedding, Dense, Dropout, SpatialDropout1D, LSTM, Activation
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.regularizers import L1L2 
from pathlib import Path
from sklearn.metrics import classification_report, recall_score, precision_score, f1_score
from os.path import exists

In [18]:
def elsa_architecture(nb_classes, nb_tokens, maxlen, feature_output=False, embed_dropout_rate=0, final_dropout_rate=0, embed_dim=300,
                    embed_l2=1E-6, return_attention=False, load_embedding=False, pre_embedding=None, high=False, LSTM_hidden=512, LSTM_drop=0.5):
    """
    Returns the DeepMoji architecture uninitialized and
    without using the pretrained model weights.
    # Arguments:
        nb_classes: Number of classes in the dataset.
        nb_tokens: Number of tokens in the dataset (i.e. vocabulary size).
        maxlen: Maximum length of a token.
        feature_output: If True the model returns the penultimate
                        feature vector rather than Softmax probabilities
                        (defaults to False).
        embed_dropout_rate: Dropout rate for the embedding layer.
        final_dropout_rate: Dropout rate for the final Softmax layer.
        embed_l2: L2 regularization for the embedding layerl.
        high: use or not the highway network
    # Returns:
        Model with the given parameters.
    """
    class NonMasking(Layer):   
        def __init__(self, **kwargs):   
            self.supports_masking = True  
            super(NonMasking, self).__init__(**kwargs)   

        def build(self, input_shape):   
            input_shape = input_shape   

        def compute_mask(self, input, input_mask=None):   
            # do not pass the mask to the next layers   
            return None   

        def call(self, x, mask=None):   
            return x   

        def get_output_shape_for(self, input_shape):   
            return input_shape 
    # define embedding layer that turns word tokens into vectors
    # an activation function is used to bound the values of the embedding
    model_input = Input(shape=(maxlen,), dtype='int32')
    embed_reg = L1L2(l2=embed_l2) if embed_l2 != 0 else None

    if not load_embedding and pre_embedding is None:
        embed = Embedding(input_dim=nb_tokens, output_dim=embed_dim, mask_zero=True,input_length=maxlen,embeddings_regularizer=embed_reg,
                          name='embedding')
    else:
        embed = Embedding(input_dim=nb_tokens, output_dim=embed_dim, mask_zero=True,input_length=maxlen, weights=[pre_embedding],
                          embeddings_regularizer=embed_reg,trainable=True, name='embedding')
    if high:
        x = NonMasking()(embed(model_input))
    else:
        x = embed(model_input)
    x = Activation('tanh')(x)

    # entire embedding channels are dropped out instead of the
    # normal Keras embedding dropout, which drops all channels for entire words
    # many of the datasets contain so few words that losing one or more words can alter the emotions completely
    if embed_dropout_rate != 0:
        embed_drop = SpatialDropout1D(embed_dropout_rate, name='embed_drop')
        x = embed_drop(x)

    # skip-connection from embedding to output eases gradient-flow and allows access to lower-level features
    # ordering of the way the merge is done is important for consistency with the pretrained model
    lstm_0_output = Bidirectional(LSTM(LSTM_hidden, return_sequences=True, dropout=LSTM_drop), name="bi_lstm_0" )(x)
    lstm_1_output = Bidirectional(LSTM(LSTM_hidden, return_sequences=True, dropout=LSTM_drop), name="bi_lstm_1" )(lstm_0_output)
    x = concatenate([lstm_1_output, lstm_0_output, x])
    if high:
        x = TimeDistributed(Highway(activation='tanh', name="high"))(x)
    # if return_attention is True in AttentionWeightedAverage, an additional tensor
    # representing the weight at each timestep is returned
    weights = None
    x = AttentionWeightedAverage(name='attlayer', return_attention=return_attention)(x)
    #x = MaskAverage(name='attlayer', return_attention=return_attention)(x)
    if return_attention:
        x, weights = x

    if not feature_output:
        # output class probabilities
        if final_dropout_rate != 0:
            x = Dropout(final_dropout_rate)(x)

        if nb_classes > 2:
            outputs = [Dense(nb_classes, activation='softmax', name='softmax')(x)]
        else:
            outputs = [Dense(1, activation='sigmoid', name='softmax')(x)]
    else:
        # output penultimate feature vector
        outputs = [x]

    if return_attention:
        # add the attention weights to the outputs if required
        outputs.append(weights)

    return Model(inputs=[model_input], outputs=outputs)

In [19]:
os.environ['CUDA_VISIBLE_DEVICES'] = "2"
cur_lan = "elsa_pt"
maxlen = 20
batch_size = 250
lr = 0.001
epoch_size = 25000
nb_epochs = 1000
patience = 1
checkpoint_weight_path = "./ckpt"
loss = "categorical_crossentropy"
optim = "adam"
vocab_path = "/data/elsa"
nb_classes=64

In [20]:
LSTM_hidden = 512
LSTM_drop = 0.5
final_dropout_rate = 0.5
embed_dropout_rate = 0.0
high = False
load_embedding = True
embed_dim = 200

In [59]:
steps = int(epoch_size/batch_size)

wv_path = Path(vocab_path).joinpath("{:s}_wv.npy".format(cur_lan)).as_posix()
X_path = Path(vocab_path).joinpath("{:s}_X.npy".format(cur_lan)).as_posix()
y_path = Path(vocab_path).joinpath("{:s}_y.npy".format(cur_lan)).as_posix()

word_vec = np.load(wv_path, allow_pickle=True)
input_vec, input_label = np.load(X_path, allow_pickle=True), np.load(y_path, allow_pickle=True)
nb_tokens, input_len = len(word_vec), len(input_label)

#please modify the checkpoint_weight_path
#checkpoint_weight_path = '/storage1/user/ss/tmoji_ori/weight/tmoji-lstm-checkpoint-%s-h-1.hdf5' % cur_lan

#idx_shuffle = list(range(input_len))
#np.random.shuffle(idx_shuffle)
#idx_train, idx_val, idx_test = idx_shuffle[ :int(input_len*0.7) ], idx_shuffle[int(input_len*0.7):int(input_len*0.9)], idx_shuffle[int(input_len*0.9):]

train_end = int(input_len*0.7)
val_end = int(input_len*0.9)

(X_train, y_train) = (input_vec[:train_end], input_label[:train_end])
(X_val, y_val) = (input_vec[train_end:val_end], input_label[train_end:val_end])
(X_test, y_test) = (input_vec[val_end:], input_label[val_end:])

In [63]:
token2index = json.loads(open("/data/elsa/elsa_pt_vocab.txt", "r").read())

In [83]:
import math
from tqdm import tqdm
from collections import defaultdict, OrderedDict
from operator import itemgetter
from pathlib import Path
from tqdm import tqdm

In [104]:
topn = 64
def calculate_batchsize_maxlen(texts):
    """ Calculates the maximum length in the provided texts and a suitable
        batch size. Rounds up maxlen to the nearest multiple of ten.
    # Arguments:
        texts: List of inputs.
    # Returns:
        Batch size,
        max length
    """
    def roundup(x):
        return int(math.ceil(x / 10.0)) * 10

    print("calculate batch_size and maxlen")
    # Calculate max length of sequences considered
    # Adjust batch_size accordingly to prevent GPU overflow
    lengths = [len(t) for t in texts]
    maxlen = roundup(np.percentile(lengths, 80.0))
    batch_size = 250 if maxlen <= 100 else 50
    print("mean: ", np.mean(lengths), "median: ", np.median(lengths), len(lengths), "avg: ", np.average(lengths))
    print("batch_size: ", batch_size, "maxlen:", maxlen)
    return batch_size, maxlen

def most_common_emoji(emoji_freq_path, topn):
    freq = {line.split()[0]: int(line.split()[1]) for line in open(emoji_freq_path).readlines()}
    freq_topn = sorted(freq.items(), key=itemgetter(1), reverse=True)[:topn]
    emoji_topn = [token2index[freq[0]] for freq in freq_topn]
    return emoji_topn

def as_ids(tokens):
    tokens_as_id = []
    for token in tokens:
        try:
            tokens_as_id.append(token2index[token])
        except KeyError:
            pass
    return tokens_as_id

emoji_topn = most_common_emoji("/data/elsa/elsa_pt_emoji.txt", topn=topn)
emoji_topn.reverse()

# filter out of topn emoji sentences
with open("/data/elsa/elsa_pt_tokens.txt", "r") as fi:
    tidy_data = []
    for line in tqdm(fi):
        tokens = line.split()
        id_tokens = as_ids(tokens)
        if any(emoji in id_tokens for emoji in emoji_topn):
            tidy_data.append(id_tokens)

batch_size, maxlen = calculate_batchsize_maxlen(tidy_data)

X = np.zeros((len(tidy_data), maxlen), dtype='uint32')
y = []
emoji_indices = defaultdict(list)
for i, id_tokens in enumerate(tidy_data):
    each_y = np.zeros(topn)
    for token_id in id_tokens:
        try:
            emoji_index = emoji_topn.index(token_id)
            each_y[emoji_index] = 1
            break
        except ValueError:
            continue

    assert each_y.sum() == 1
    y.append(each_y)

    id_tokens = [t for t in id_tokens if t not in emoji_topn]
    X[i, :len(id_tokens)] = id_tokens[:min(maxlen, len(id_tokens))]

    emoji_indices[emoji_index].append(i)

2325142it [00:40, 57895.84it/s]


calculate batch_size and maxlen
mean:  13.232616437370055 median:  11.0 236157 avg:  13.232616437370055
batch_size:  250 maxlen: 20


In [47]:
freq = {line.split()[0]: int(line.split()[1]) for line in open("/data/elsa/elsa_pt_emoji.txt").readlines()}
freq_topn = sorted(freq.items(), key=itemgetter(1), reverse=True)[:64]
emoji_topn = [token2index[freq[0]] for freq in freq_topn]

In [115]:
emoji_topn.index(890)

18

In [116]:
emoji_indices[18]

[326, 5260, 71451]

In [112]:
token2index['🌒']

798

In [111]:
freq_topn

[('❤', 46647),
 ('😂', 42699),
 ('😍', 33172),
 ('💜', 21324),
 ('😭', 20536),
 ('🤣', 14647),
 ('🙏', 13173),
 ('💕', 11375),
 ('👏', 10599),
 ('💙', 10397),
 ('\U0001f970', 10362),
 ('♀', 9991),
 ('🤦', 9615),
 ('💛', 9437),
 ('💚', 9134),
 ('\U0001f97a', 9050),
 ('😔', 8018),
 ('🔥', 7777),
 ('💖', 7241),
 ('🎶', 7131),
 ('💗', 6429),
 ('♂', 6379),
 ('👍', 6313),
 ('🙌', 6035),
 ('🤔', 5986),
 ('\U0001f92a', 5984),
 ('♥', 5208),
 ('🤤', 5102),
 ('🙄', 4704),
 ('😅', 4623),
 ('🤷', 4553),
 ('😋', 4551),
 ('😘', 4508),
 ('💔', 4367),
 ('✨', 4276),
 ('😈', 4244),
 ('\U0001f929', 4229),
 ('🌕', 4102),
 ('🌒', 4098),
 ('🌗', 4071),
 ('🌖', 4068),
 ('😎', 4066),
 ('😡', 3958),
 ('🤧', 3756),
 ('💓', 3615),
 ('🌘', 3565),
 ('🌑', 3473),
 ('😱', 3458),
 ('🌓', 3413),
 ('🌔', 3397),
 ('💦', 3355),
 ('😉', 3255),
 ('⚠', 3226),
 ('😪', 3206),
 ('🔴', 2966),
 ('😏', 2957),
 ('😴', 2931),
 ('💪', 2890),
 ('😢', 2817),
 ('😩', 2754),
 ('💞', 2743),
 ('✌', 2720),
 ('\U0001f92d', 2557),
 ('⚽', 2545)]

In [60]:
for i in range(64):
    print(i, input_label[:, i].sum())

0 27794.0
1 19683.0
2 17921.0
3 10261.0
4 9483.0
5 6238.0
6 8194.0
7 3128.0
8 3785.0
9 3474.0
10 5792.0
11 1266.0
12 6997.0
13 1762.0
14 2101.0
15 7457.0
16 5611.0
17 4366.0
18 2782.0
19 5340.0
20 1166.0
21 860.0
22 4230.0
23 3606.0
24 4355.0
25 3784.0
26 3253.0
27 3341.0
28 3575.0
29 3285.0
30 2972.0
31 2696.0
32 2304.0
33 2799.0
34 2340.0
35 2199.0
36 2272.0
37 684.0
38 20.0
39 14.0
40 9.0
41 2470.0
42 2123.0
43 1961.0
44 557.0
45 3.0
46 44.0
47 1614.0
48 21.0
49 8.0
50 726.0
51 2545.0
52 1581.0
53 2116.0
54 1813.0
55 2068.0
56 2273.0
57 1684.0
58 1894.0
59 1775.0
60 1129.0
61 1673.0
62 1579.0
63 1301.0


In [13]:
len(set(input_label[int(input_len*0.7):int(input_len*0.9)].argmax(axis=1)))

63

In [6]:
model = elsa_architecture(nb_classes=nb_classes, nb_tokens=nb_tokens, maxlen=maxlen, final_dropout_rate=final_dropout_rate, embed_dropout_rate=embed_dropout_rate, 
                          load_embedding=True, pre_embedding=word_vec, high=high, embed_dim=embed_dim)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 200)      14168400    input_1[0][0]                    
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 20, 200)      0           embedding[0][0]                  
__________________________________________________________________________________________________
bi_lstm_0 (Bidirectio

In [7]:
if optim == 'adam':
    adam = Adam(clipnorm=1, lr=lr)
    model.compile(loss=loss, optimizer=adam, metrics=['accuracy'])
elif optim == 'rmsprop':
    model.compile(loss=loss, optimizer='rmsprop', metrics=['accuracy'])

model.fit(X_train,
          y_train,
          batch_size=batch_size,
          epochs=nb_epochs,
          validation_data=(X_val, y_val),
          callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=patience, verbose=0, mode='auto')],
          verbose=True)

#callbacks = finetuning_callbacks(checkpoint_weight_path, patience, verbose=1)
#for i in range(2):
    #train_gen = sampling_generator(X_train, y_train, batch_size, upsample=False, epoch_size=epoch_size)
    #model.fit_generator(train_gen, steps_per_epoch=steps, epochs=nb_epochs,validation_data=(X_val, y_val),validation_steps=steps, callbacks=callbacks, verbose=True)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 297553 samples, validate on 85015 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000


<keras.callbacks.History at 0x7fe204048940>

In [8]:
_, acc = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=0)

print(acc)

0.6024748294650976


In [15]:
token2index = json.loads(open("/data/elsa/elsa_es_vocab.txt", "r").read())

freq = {line.split()[0]: int(line.split()[1]) for line in open("/data/elsa/elsa_pt_emoji.txt").readlines()}
freq_topn = sorted(freq.items(), key=itemgetter(1), reverse=True)[:nb_classes]

In [10]:
y_pred = model.predict(X_test)

In [16]:
freq_topn

[('❤', 46647),
 ('😂', 42699),
 ('😍', 33172),
 ('💜', 21324),
 ('😭', 20536),
 ('🤣', 14647),
 ('🙏', 13173),
 ('💕', 11375),
 ('👏', 10599),
 ('💙', 10397),
 ('\U0001f970', 10362),
 ('♀', 9991),
 ('🤦', 9615),
 ('💛', 9437),
 ('💚', 9134),
 ('\U0001f97a', 9050),
 ('😔', 8018),
 ('🔥', 7777),
 ('💖', 7241),
 ('🎶', 7131),
 ('💗', 6429),
 ('♂', 6379),
 ('👍', 6313),
 ('🙌', 6035),
 ('🤔', 5986),
 ('\U0001f92a', 5984),
 ('♥', 5208),
 ('🤤', 5102),
 ('🙄', 4704),
 ('😅', 4623),
 ('🤷', 4553),
 ('😋', 4551),
 ('😘', 4508),
 ('💔', 4367),
 ('✨', 4276),
 ('😈', 4244),
 ('\U0001f929', 4229),
 ('🌕', 4102),
 ('🌒', 4098),
 ('🌗', 4071),
 ('🌖', 4068),
 ('😎', 4066),
 ('😡', 3958),
 ('🤧', 3756),
 ('💓', 3615),
 ('🌘', 3565),
 ('🌑', 3473),
 ('😱', 3458),
 ('🌓', 3413),
 ('🌔', 3397),
 ('💦', 3355),
 ('😉', 3255),
 ('⚠', 3226),
 ('😪', 3206),
 ('🔴', 2966),
 ('😏', 2957),
 ('😴', 2931),
 ('💪', 2890),
 ('😢', 2817),
 ('😩', 2754),
 ('💞', 2743),
 ('✌', 2720),
 ('\U0001f92d', 2557),
 ('⚽', 2545)]

In [16]:
len(set(np.argmax(y_test, axis=1)))

63

In [19]:
np.argmax(y_pred, axis=1)[0]

31

In [21]:
len([e[0] for e in freq_topn])

64

In [25]:
y_test.shape
y_pred.shape

(42508, 64)

In [11]:
print(classification_report(y_test.argmax(axis=1), y_pred.argmax(axis=1), target_names=[e[0] for e in freq_topn]))

ValueError: Number of classes, 63, does not match size of target_names, 64. Try specifying the labels parameter