In [2]:
import sys
sys.path.append('./')
import os
import time
import gc
import numpy as np
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Input, SpatialDropout1D
from keras.layers import LSTM, CuDNNLSTM, Activation
from keras.layers import Lambda, Embedding, Conv2D, GlobalMaxPool1D
from keras.layers import add, concatenate
from keras.layers.wrappers import TimeDistributed
from keras.models import Model, load_model
from keras.optimizers import Adagrad
from keras.constraints import MinMaxNorm
from keras.utils import to_categorical


In [3]:
import shutil
# from data import MODELS_DIR
#MODELS_DIR = 'context_vec/models'
#from context_vec.custom_layers import TimestepDropout, Camouflage, Highway, SampledSoftmax

# smi generator

In [4]:
import numpy as np
import keras


class SMIDataGenerator(keras.utils.Sequence):
    """Generates data for Keras"""

    def __len__(self):
        """Denotes the number of batches per epoch"""
        return int(np.ceil(len(self.indices)/self.batch_size))

    def __init__(self, corpus, vocab, sentence_maxlen=100, token_maxlen=50, batch_size=32, shuffle=True, token_encoding='word'):
        """Compiles a Language Model RNN based on the given parameters
        :param corpus: filename of corpus
        :param vocab: filename of vocabulary
        :param sentence_maxlen: max size of sentence
        :param token_maxlen: max size of token in characters
        :param batch_size: number of steps at each batch
        :param shuffle: True if shuffle at the end of each epoch
        :param token_encoding: Encoding of token, either 'word' index or 'char' indices
        :return: Nothing
        """

        self.corpus = corpus
        self.vocab = {line.split()[0]: int(line.split()[1]) for line in open(vocab).readlines()}
        self.sent_ids = corpus
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.sentence_maxlen = sentence_maxlen
        self.token_maxlen = token_maxlen
        self.token_encoding = token_encoding
        self.all_lines = open(corpus, mode='r', encoding="utf-8").readlines()
        with open(self.corpus) as fp:
            self.indices = np.arange(len(fp.readlines()))
            # newlines = [index for index in range(0, len(self.indices), 2)]
            # self.indices = np.delete(self.indices, newlines)

    def __getitem__(self, index):
        """Generate one batch of data"""
        # Generate indexes of the batch
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        # Read sample sequences
        word_indices_batch = np.zeros((len(batch_indices), self.sentence_maxlen), dtype=np.int32)
        if self.token_encoding == 'char':
            word_char_indices_batch = np.full((len(batch_indices), self.sentence_maxlen, self.token_maxlen), 260, dtype=np.int32)

        for i, batch_id in enumerate(batch_indices):
            # Read sentence (sample)
            word_indices_batch[i] = self.all_lines[batch_id].split()
            # word_indices_batch[i] = self.get_token_indices(sent_id=batch_id)
            if self.token_encoding == 'char':
                word_char_indices_batch[i] = self.get_token_char_indices(sent_id=batch_id)

        # Build forward targets
        for_word_indices_batch = np.zeros((len(batch_indices), self.sentence_maxlen), dtype=np.int32)

        padding = np.zeros((1,), dtype=np.int32)

        for i, word_seq in enumerate(word_indices_batch ):
            for_word_indices_batch[i] = np.concatenate((word_seq[1:], padding), axis=0)

        for_word_indices_batch = for_word_indices_batch[:, :, np.newaxis]

        # Build backward targets
        back_word_indices_batch = np.zeros((len(batch_indices), self.sentence_maxlen), dtype=np.int32)

        for i, word_seq in enumerate(word_indices_batch):
            back_word_indices_batch[i] = np.concatenate((padding, word_seq[:-1]), axis=0)

        back_word_indices_batch = back_word_indices_batch[:, :, np.newaxis]

        tmp = [word_indices_batch if 'word'.__eq__(self.token_encoding) else word_char_indices_batch, for_word_indices_batch, back_word_indices_batch]
        return tmp, []

    def on_epoch_end(self):
        """Updates indexes after each epoch"""
        if self.shuffle:
            np.random.shuffle(self.indices)

    def get_token_indices(self, sent_id: int):
        with open(self.corpus) as fp:
            for i, line in enumerate(fp):
                if i == sent_id:
                    token_ids = np.zeros((self.sentence_maxlen,), dtype=np.int64)
                    # Add begin of sentence index
                    token_ids[0] = self.vocab['<bos>']
                    for j, token in enumerate(line.split()[:self.sentence_maxlen - 2]):
                        # print(token)
                        if token.lower() in self.vocab:
                            token_ids[j + 1] = self.vocab[token.lower()]
                        else:
                            token_ids[j + 1] = self.vocab['<unk>']
                    # Add end of sentence index
                    if token_ids[1]:
                        token_ids[j + 2] = self.vocab['<eos>']
                    # print(token_ids)
                    return token_ids

    def get_token_char_indices(self, sent_id: int):
        def convert_token_to_char_ids(token, token_maxlen):
            bos_char = 256  # <begin sentence>
            eos_char = 257  # <end sentence>
            bow_char = 258  # <begin word>
            eow_char = 259  # <end word>
            pad_char = 260  # <pad char>
            char_indices = np.full([token_maxlen], pad_char, dtype=np.int32)
            # Encode word to UTF-8 encoding
            word_encoded = token.encode('utf-8', 'ignore')[:(token_maxlen - 2)]
            # Set characters encodings
            # Add begin of word char index
            char_indices[0] = bow_char
            if token == '<bos>':
                char_indices[1] = bos_char
                k = 1
            elif token == '<eos>':
                char_indices[1] = eos_char
                k = 1
            else:
                # Add word char indices
                for k, chr_id in enumerate(word_encoded, start=1):
                    char_indices[k] = chr_id + 1
            # Add end of word char index
            char_indices[k + 1] = eow_char
            return char_indices

        with open(self.corpus) as fp:
            for i, line in enumerate(fp):
                if i == sent_id:
                    token_ids = np.zeros((self.sentence_maxlen, self.token_maxlen), dtype=np.int32)
                    # Add begin of sentence char indices
                    token_ids[0] = convert_token_to_char_ids('<bos>', self.token_maxlen)
                    # Add tokens' char indices
                    for j, token in enumerate(line.split()[:self.sentence_maxlen - 2]):
                        token_ids[j + 1] = convert_token_to_char_ids(token, self.token_maxlen)
                    # Add end of sentence char indices
                    if token_ids[1]:
                        token_ids[j + 2] = convert_token_to_char_ids('<eos>', self.token_maxlen)
        return token_ids

# model

In [4]:
class Context_vec(object):
    def __init__(self, parameters):
        self._model = None
        self._context_vec_model = None
        self.parameters = parameters
        self.model_dir = parameters['model_dir']
        if not os.path.exists(os.path.join(MODELS_DIR, self.model_dir)):
            os.mkdir(os.path.join(MODELS_DIR, self.model_dir))
        self.compile_context_vec()

    def __del__(self):
        K.clear_session()
        del self._model

    def char_level_token_encoder(self):
        charset_size = self.parameters['charset_size']
        char_embedding_size = self.parameters['char_embedding_size']
        token_embedding_size = self.parameters['hidden_units_size']
        n_highway_layers = self.parameters['n_highway_layers']
        filters = self.parameters['cnn_filters']
        token_maxlen = self.parameters['token_maxlen']

        # Input Layer, word characters (samples, words, character_indices)
        inputs = Input(shape=(None, token_maxlen,), dtype='int32')
        # Embed characters (samples, words, characters, character embedding)
        embeds = Embedding(input_dim=charset_size, output_dim=char_embedding_size)(inputs)
        token_embeds = []
        # Apply multi-filter 2D convolutions + 1D MaxPooling + tanh
        for (window_size, filters_size) in filters:
            convs = Conv2D(filters=filters_size, kernel_size=[window_size, char_embedding_size], strides=(1, 1),
                           padding="same")(embeds)
            convs = TimeDistributed(GlobalMaxPool1D())(convs)
            convs = Activation('tanh')(convs)
            convs = Camouflage(mask_value=0)(inputs=[convs, inputs])
            token_embeds.append(convs)
        token_embeds = concatenate(token_embeds)
        
        # Apply highways networks
        for i in range(n_highway_layers):
            token_embeds = TimeDistributed(Highway())(token_embeds)
            token_embeds = Camouflage(mask_value=0)(inputs=[token_embeds, inputs])
        # Project to token embedding dimensionality
        token_embeds = TimeDistributed(Dense(units=token_embedding_size, activation='linear'))(token_embeds)
        token_embeds = Camouflage(mask_value=0)(inputs=[token_embeds, inputs])

        token_encoder = Model(inputs=inputs, outputs=token_embeds, name='token_encoding')
        return token_encoder

    def compile_context_vec(self, print_summary=False):
        """
        Compiles a Language Model RNN based on the given parameters
        """

        if self.parameters['token_encoding'] == 'word':
            # Train word embeddings from scratch
            word_inputs = Input(shape=(None,), name='word_indices', dtype='int32')
            embeddings = Embedding(self.parameters['vocab_size'], self.parameters['hidden_units_size'], trainable=True, name='token_encoding')
            inputs = embeddings(word_inputs)

            # Token embeddings for Input
            drop_inputs = SpatialDropout1D(self.parameters['dropout_rate'])(inputs)
            lstm_inputs = TimestepDropout(self.parameters['word_dropout_rate'])(drop_inputs)

            # Pass outputs as inputs to apply sampled softmax
            next_ids = Input(shape=(None, 1), name='next_ids', dtype='float32')
            previous_ids = Input(shape=(None, 1), name='previous_ids', dtype='float32')
        elif self.parameters['token_encoding'] == 'char':
            # Train character-level representation
            word_inputs = Input(shape=(None, self.parameters['token_maxlen'],), dtype='int32', name='char_indices')
            inputs = self.char_level_token_encoder()(word_inputs)

            # Token embeddings for Input
            drop_inputs = SpatialDropout1D(self.parameters['dropout_rate'])(inputs)
            lstm_inputs = TimestepDropout(self.parameters['word_dropout_rate'])(drop_inputs)

            # Pass outputs as inputs to apply sampled softmax
            next_ids = Input(shape=(None, 1), name='next_ids', dtype='float32')
            previous_ids = Input(shape=(None, 1), name='previous_ids', dtype='float32')

        # Reversed input for backward LSTMs
        re_lstm_inputs = Lambda(function=Context_vec.reverse)(lstm_inputs)
        mask = Lambda(function=Context_vec.reverse)(drop_inputs)

        # Forward LSTMs
        for i in range(self.parameters['n_lstm_layers']):
            lstm = LSTM(units=self.parameters['lstm_units_size'], return_sequences=True, activation="tanh",
                        recurrent_activation='sigmoid',
                        kernel_constraint=MinMaxNorm(-1 * self.parameters['cell_clip'],
                                                     self.parameters['cell_clip']),
                        recurrent_constraint=MinMaxNorm(-1 * self.parameters['cell_clip'],
                                                        self.parameters['cell_clip'])
                        )(lstm_inputs)
            lstm = Camouflage(mask_value=0)(inputs=[lstm, drop_inputs])
            # Projection to hidden_units_size
            proj = TimeDistributed(Dense(self.parameters['hidden_units_size'], activation='linear',
                                         kernel_constraint=MinMaxNorm(-1 * self.parameters['proj_clip'],
                                                                      self.parameters['proj_clip'])
                                         ))(lstm)
            # Merge Bi-LSTMs feature vectors with the previous ones
            lstm_inputs = add([proj, lstm_inputs], name='f_block_{}'.format(i + 1))
            # Apply variational drop-out between BI-LSTM layers
            lstm_inputs = SpatialDropout1D(self.parameters['dropout_rate'])(lstm_inputs)

        # Backward LSTMs
        for i in range(self.parameters['n_lstm_layers']):
            re_lstm = LSTM(units=self.parameters['lstm_units_size'], return_sequences=True, activation='tanh',
                           recurrent_activation='sigmoid',
                           kernel_constraint=MinMaxNorm(-1 * self.parameters['cell_clip'],
                                                        self.parameters['cell_clip']),
                           recurrent_constraint=MinMaxNorm(-1 * self.parameters['cell_clip'],
                                                           self.parameters['cell_clip'])
                           )(re_lstm_inputs)
            re_lstm = Camouflage(mask_value=0)(inputs=[re_lstm, mask])
            # Projection to hidden_units_size
            re_proj = TimeDistributed(Dense(self.parameters['hidden_units_size'], activation='linear',
                                            kernel_constraint=MinMaxNorm(-1 * self.parameters['proj_clip'],
                                                                         self.parameters['proj_clip'])
                                            ))(re_lstm)
            # Merge Bi-LSTMs feature vectors with the previous ones
            re_lstm_inputs = add([re_proj, re_lstm_inputs], name='b_block_{}'.format(i + 1))
            # Apply variational drop-out between BI-LSTM layers
            re_lstm_inputs = SpatialDropout1D(self.parameters['dropout_rate'])(re_lstm_inputs)

        # Reverse backward LSTMs' outputs = Make it forward again
        re_lstm_inputs = Lambda(function=Context_vec.reverse, name="reverse")(re_lstm_inputs)

        # Project to Vocabulary with Sampled Softmax
        sampled_softmax = SampledSoftmax(num_classes=self.parameters['vocab_size'],
                                         num_sampled=int(self.parameters['num_sampled']),
                                         tied_to=embeddings if self.parameters['weight_tying']
                                         and self.parameters['token_encoding'] == 'word' else None)
        outputs = sampled_softmax([lstm_inputs, next_ids])
        re_outputs = sampled_softmax([re_lstm_inputs, previous_ids])

        self._model = Model(inputs=[word_inputs, next_ids, previous_ids],
                            outputs=[outputs, re_outputs])
        self._model.compile(optimizer=Adagrad(lr=self.parameters['lr'], clipvalue=self.parameters['clip_value']),
                            loss=None)
        if print_summary:
            self._model.summary()

    def train(self, train_data, valid_data):

        # Add callbacks (early stopping, model checkpoint)
        weights_file = os.path.join(MODELS_DIR, self.model_dir, "context_vec_best_weights_{epoch:03d}_{val_loss:.2f}.hdf5")
        save_best_model = ModelCheckpoint(filepath=weights_file, monitor='val_loss', verbose=1,
                                          save_best_only=False, mode='auto')
        # early_stopping = EarlyStopping(patience=self.parameters['patience'], restore_best_weights=True)

        t_start = time.time()

        # Fit Model
        self._model.fit_generator(train_data,
                                  validation_data=valid_data,
                                  epochs=self.parameters['epochs'],
                                  workers=self.parameters['n_threads']
                                  if self.parameters['n_threads'] else os.cpu_count(),
                                  use_multiprocessing=True
                                  if self.parameters['multi_processing'] else False,
                                  callbacks=[save_best_model])

        print('Training took {0} sec'.format(str(time.time() - t_start)))

    def evaluate(self, test_data, batch_size):
        # 查找完整句子
        def unpad(x, y_true, y_pred):
            y_true_unpad = []
            y_pred_unpad = []
            for i, x_i in enumerate(x):
                for j, x_ij in enumerate(x_i):
                    if x_ij == 0:
                        y_true_unpad.append(y_true[i][:j])
                        y_pred_unpad.append(y_pred[i][:j])
                        break
            return np.asarray(y_true_unpad), np.asarray(y_pred_unpad)

        # # Generate samples
        # x, y_true_forward, y_true_backward = [], [], []
        # for i in range(len(test_data)):
        #     test_batch = test_data[i][0]
        #     x.extend(test_batch[0])
        #     y_true_forward.extend(test_batch[1])
        #     y_true_backward.extend(test_batch[2])
        # x = np.asarray(x)
        # y_true_forward = np.asarray(y_true_forward)
        # y_true_backward = np.asarray(y_true_backward)

        # Generate samples
        # x, y_true_forward, y_true_backward = [], [], []
        # y_pred_forward, y_pred_backward = [], []
        # for i in range(len(test_data)):
        for i in range(488):
            test_batch = test_data[i][0]

            # Predict outputs
            y_pred_forward_tmp, y_pred_backward_tmp = self._model.predict([test_batch[0], test_batch[1], test_batch[2]])

            # Unpad sequences
            y_true_forward, y_pred_forward = unpad(test_batch[0], test_batch[1], y_pred_forward_tmp)
            y_true_backward, y_pred_backward = unpad(test_batch[0], test_batch[2], y_pred_backward_tmp)

            # Compute and print perplexity
            # print('{}， Forward Langauge Model Perplexity: {}'.format(i, context_vec.perplexity(y_pred_forward, y_true_forward)))
            # print('{}， Backward Langauge Model Perplexity: {}'.format(i, context_vec.perplexity(y_pred_backward, y_true_backward)))
            print('{}， avg {}'.format(i,
                                      (Context_vec.perplexity(y_pred_backward, y_true_backward) + Context_vec.perplexity(y_pred_forward, y_true_forward)) / 2
                                      ))
            del test_batch, y_true_backward, y_pred_backward, \
                y_pred_forward_tmp, y_pred_backward_tmp, y_true_forward, y_pred_forward
            gc.collect()
            # x.extend(test_batch[0])
        #     y_true_forward.extend(test_batch[1])
        #     y_true_backward.extend(test_batch[2])
        #
        #     y_pred_forward.extend(y_pred_forward_tmp)
        #     y_pred_backward.extend(y_pred_backward_tmp)
        #
        # x = np.asarray(x)
        # y_true_forward = np.asarray(y_true_forward)
        # y_true_backward = np.asarray(y_true_backward)
        #
        # y_pred_forward = np.asarray(y_pred_forward)
        # y_pred_backward = np.asarray(y_pred_backward)

        # # Unpad sequences
        # y_true_forward, y_pred_forward = unpad(x, y_true_forward, y_pred_forward)
        # y_true_backward, y_pred_backward = unpad(x, y_true_backward, y_pred_backward)



    def wrap_multi_context_vec_encoder(self, print_summary=False, save=False):
        """
        Wrap context_vec meta-model encoder, which returns an array of the 3 intermediate context_vec outputs
        :param print_summary: print a summary of the new architecture
        :param save: persist model
        :return: None
        """

        context_vec_embeddings = list()
        context_vec_embeddings.append(concatenate([self._model.get_layer('token_encoding').output, self._model.get_layer('token_encoding').output],
                                           name='context_vec_embeddings_level_0'))
        for i in range(self.parameters['n_lstm_layers']):
            context_vec_embeddings.append(concatenate([self._model.get_layer('f_block_{}'.format(i + 1)).output,
                                                Lambda(function=Context_vec.reverse)
                                                (self._model.get_layer('b_block_{}'.format(i + 1)).output)],
                                               name='context_vec_embeddings_level_{}'.format(i + 1)))

        camos = list()
        for i, context_vec_embedding in enumerate(context_vec_embeddings):
            camos.append(Camouflage(mask_value=0.0, name='camo_context_vec_embeddings_level_{}'.format(i + 1))([context_vec_embedding,
                                                                                                         self._model.get_layer(
                                                                                                             'token_encoding').output]))

        self._context_vec_model = Model(inputs=[self._model.get_layer('word_indices').input], outputs=camos)

        if print_summary:
            self._context_vec_model.summary()

        if save:
            self._context_vec_model.save(os.path.join(MODELS_DIR, self.model_dir, 'context_vec_Encoder.hd5'))
            print('context_vec Encoder saved successfully')

    def save(self, sampled_softmax=True, model_dir="model"):
        """
        Persist model in disk
        :param sampled_softmax: reload model using the full softmax function
        :return: None
        """
        if not sampled_softmax:
            self.parameters['num_sampled'] = self.parameters['vocab_size']
        self.compile_context_vec()
        best_name = ""
        min_loss = 100000
        for file_name in os.listdir(os.path.join(MODELS_DIR, self.model_dir)):
            if "0." in file_name:
                tmp_loss = float(file_name.split("_")[-1].split(".")[-2])
                if tmp_loss < min_loss:
                    min_loss = tmp_loss
                    best_name = file_name
        shutil.copyfile(os.path.join(MODELS_DIR, self.model_dir, best_name),
                        os.path.join(MODELS_DIR, self.model_dir, 'context_vec_best_weights.hdf5'))
        print(" best model name is :" + best_name)
        self._model.load_weights(os.path.join(MODELS_DIR, self.model_dir, 'context_vec_best_weights.hdf5'))
        self._model.save(os.path.join(MODELS_DIR, model_dir, 'context_vec_LM_EVAL.hd5'))
        print('context_vec Language Model saved successfully')

    def load(self, sampled_softmax=False):
        if not sampled_softmax:
            self.parameters['num_sampled'] = self.parameters['vocab_size']
        self.compile_context_vec()
        self._model.load_weights(os.path.join(MODELS_DIR, self.model_dir, 'context_vec_best_weights.hdf5'))
        # self._model = load_model(os.path.join(MODELS_DIR, self.model_dir, 'context_vec_Encoder.hd5'),
        #                          custom_objects={'TimestepDropout': TimestepDropout,
        #                                          'Camouflage': Camouflage})

    def load_context_vec_encoder(self):
        self._context_vec_model = load_model(os.path.join(MODELS_DIR, self.model_dir, 'context_vec_Encoder.hd5'),
                                      custom_objects={'TimestepDropout': TimestepDropout,
                                                      'Camouflage': Camouflage})

    def get_outputs(self, test_data, output_type='word', state='last'):
        """
       Wrap context_vec meta-model encoder, which returns an array of the 3 intermediate context_vec outputs
       :param test_data: data generator
       :param output_type: "word" for word vectors or "sentence" for sentence vectors
       :param state: 'last' for 2nd LSTMs outputs or 'mean' for mean-pooling over inputs, 1st LSTMs and 2nd LSTMs
       :return: None
       """
        # Generate samples
        preds1 = []
        # for i in range(len(test_data)):
        for i in range(10):
            x = test_data[i][0]
            # print(x[0])
            tmp = np.asarray(self._context_vec_model.predict(np.asarray(x[0]))).swapaxes(0,1)
            preds1.extend(tmp)
            del tmp

        preds = np.array(preds1)
        del preds1
        if state == 'last':
            context_vec_vectors = preds[:,-1,:,:]
        elif state == 'all':
            context_vec_vectors = preds
        else:
            context_vec_vectors = np.mean(preds, axis=1)

        if output_type == 'word':
            return context_vec_vectors
        else:
            return np.mean(context_vec_vectors, axis=2)

    def get_outputs_Bylist(self, test_data, output_typeBy='word', state='last'):
        """
       Wrap context_vec meta-model encoder, which returns an array of the 3 intermediate context_vec outputs
       :param test_data: data generator
       :param output_type: "word" for word vectors or "sentence" for sentence vectors
       :param state: 'last' for 2nd LSTMs outputs or 'mean' for mean-pooling over inputs, 1st LSTMs and 2nd LSTMs
       :return: None
       """
        # Generate samples
        preds = []
        for i in range(len(test_data)):
            x = test_data[i][0]
            # print(x[0])
            tmp = np.asarray(self._context_vec_model.predict(np.asarray(x[0]))).swapaxes(0,1)
            preds.extend(tmp)

        return preds


    def get_predict(self, test_data, output_type='word', state='last'):
        # x = []
        # for i in range(len(test_data)):
        #     test_batch = test_data[0]
        #     x.extend(test_batch[0])

        preds = np.asarray(self._context_vec_model.predict(np.asarray(test_data)))
        if state == 'last':
            context_vec_vectors = preds[-1]
        else:
            context_vec_vectors = np.mean(preds, axis=0)

        if output_type == 'words':
            return context_vec_vectors
        else:
            return np.mean(context_vec_vectors, axis=1)

        
    @staticmethod
    def reverse(inputs, axes=1):
        return K.reverse(inputs, axes=axes)

    @staticmethod
    def perplexity(y_pred, y_true):
        if len(y_pred) == 0 or len(y_true) == 0:
            return -1
        cross_entropies = []
        for y_pred_seq, y_true_seq in zip(y_pred, y_true):
            # Reshape targets to one-hot vectors
            y_true_seq = to_categorical(y_true_seq, y_pred_seq.shape[-1])
            # Compute cross_entropy for sentence words
            cross_entropy = K.categorical_crossentropy(K.tf.convert_to_tensor(y_true_seq, dtype=K.tf.float32),
                                                       K.tf.convert_to_tensor(y_pred_seq, dtype=K.tf.float32))
            cross_entropies.extend(cross_entropy.eval(session=K.get_session()))

        # Compute mean cross_entropy and perplexity
        cross_entropy = np.mean(np.asarray(cross_entropies), axis=-1)

        return pow(2.0, cross_entropy)

# get_tox_data

In [5]:
import sys
sys.path.append('./')
import pandas as pd
from sklearn.externals import joblib
import numpy as np
import os

# NR-AR	NR-AR-LBD	NR-AhR	NR-Aromatase
# NR-ER	NR-ER-LBD	NR-PPAR-gamma	SR-ARE
# SR-ATAD5	SR-HSE	SR-MMP	SR-p53
dict_label = {"NR-AR":0,
              "NR-AR-LBD":1,
              "NR-AhR":2,
              "NR-Aromatase":3,
              "NR-ER":4,
              "NR-ER-LBD":5,
              "NR-PPAR-gamma":6,
              "SR-ARE":7,
              "SR-ATAD5":8,
              "SR-HSE":9,
              "SR-MMP":10,
              "SR-p53":11,}


# step 1
# tox21.csv에서 smiles 문자열 추출
filepath="C:/Users/Administrator/Desktop/tox21/tox21.csv"
df = pd.read_csv(filepath, header=0, encoding="gbk")
all_label = []
all_smi = []
w_file = open("tox/tox.smi", mode='w',encoding="utf-8")
for line in df.values:
    smi = line[13].strip()
    if len(smi) <= 0:
        break
    all_label.append(line[:12])
    all_smi.append(smi)
    w_file.write(smi+"\n")
w_file.close()


# step 2
#mol2vec 임베딩 생성
#mol2vec 패키지로 smiles문자열 -> mol2vec 임베딩으로 변환.??
adb = "mol2vec corpus -i tox/tox.smi -o tox/tox.cp -r 1 -j 4 --uncommon UNK --threshold 3"
d = os.popen(adb)
f = d.read()
print(f)


# step 3
#train한 vocab
vocab_path = "data/datasets/my_smi/smi_tran.vocab"
vocab = {line.split()[0]: int(line.split()[1]) for line in open(vocab_path).readlines()}

sentence_maxlen = 80

w_file = open("tox/tox_tran.cp_UNK", mode='w', encoding="utf-8")
label = []
smi = []
index = -1
mols_path = "tox/tox.cp_UNK"
mols_file = open(mols_path, mode='r',encoding="utf-8")
while True:
    line = mols_file.readline().strip()
    index += 1
    if "None".__eq__(line.strip()) or "UNK".__eq__(line.strip()):
        continue
    if not line:
        break
    token_ids = np.zeros((sentence_maxlen,), dtype=np.int64)
    # Add begin of sentence index
    token_ids[0] = vocab['<bos>']
    for j, token in enumerate(line.split()[:sentence_maxlen - 2]):
        # print(token)
        if token.lower() in vocab:
            token_ids[j + 1] = vocab[token.lower()]
        else:
            token_ids[j + 1] = vocab['<unk>']
    # Add end of sentence index
    if token_ids[1]:
        token_ids[j + 2] = vocab['<eos>']
    # print(token_ids)
    label.append(all_label[index])
    smi.append(all_smi[index])
    w_file.write(" ".join(str(i) for i in token_ids).strip()+"\n")
w_file.close()

joblib.dump(label, "tox/label.pkl")
joblib.dump(smi, "tox/smi.pkl")

# step 4
import os
import keras.backend as K
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# from data import DATA_SET_DIR
from context_vec.smi_generator import SMIDataGenerator
from context_vec.smi_model import context_vec
import tensorflow as tf
from tensorflow import keras
from sklearn.externals import joblib

config = tf.ConfigProto()
config.gpu_options.allow_growth = False
sess = tf.Session(config=config)
keras.backend.set_session(sess)

parameters = {
    'multi_processing': False,
    'n_threads': 4,
    'cuDNN': False,
    'test_dataset': "tox/tox_tran.cp_UNK",
    'vocab': 'my_smi/smi_tran.vocab',
    'model_dir': "smi_context_vec_best",
    'vocab_flag': False,
    'uncommon_threshold': 3,
    # 'vocab_size': 28914,
    # 'vocab_size': 748,
    'vocab_size': 13576,
    # 'vocab_size': 121,
    'num_sampled': 100,
    # 'charset_size': 262,
    'sentence_maxlen': 80,
    'token_maxlen': 50,
    'token_encoding': 'word',
    'epochs': 1000,
    'patience': 2,
    'batch_size': 512,
    'test_batch_size': 512,
    'clip_value': 1,
    'cell_clip': 5,
    'proj_clip': 5,
    'lr': 0.2,
    'shuffle': False,
    'n_lstm_layers': 2,
    'n_highway_layers': 2,
    'cnn_filters': [[1, 32],
                    [2, 32],
                    [3, 64],
                    [4, 128],
                    [5, 256],
                    [6, 512],
                    [7, 512]
                    ],
    'lstm_units_size': 300,
    'hidden_units_size': 150,
    'char_embedding_size': 16,
    'dropout_rate': 0.1,
    'word_dropout_rate': 0.05,
    'weight_tying': True,
}

test_generator = SMIDataGenerator(parameters['test_dataset'],
                                os.path.join("data/datasets", parameters['vocab']),
                                sentence_maxlen=parameters['sentence_maxlen'],
                                token_maxlen=parameters['token_maxlen'],
                                batch_size=parameters['test_batch_size'],
                                shuffle=parameters['shuffle'],
                                token_encoding=parameters['token_encoding'])

# Compile context_vec
context_vec_model = context_vec(parameters)
context_vec_model.compile_context_vec()

# context_vec_model.load(sampled_softmax=False)
#
# # Evaluate Bidirectional Language Model
# context_vec_model.evaluate(test_generator, parameters['test_batch_size'])
#
# # Build context_vec meta-model to deploy for production and persist in disk
# context_vec_model.wrap_multi_context_vec_encoder(print_summary=True)

# Load context_vec encoder
context_vec_model.load_context_vec_encoder()

# Get context_vec embeddings to feed as inputs for downstream tasks
context_vec_embeddings = context_vec_model.get_outputs(test_generator, output_type='word', state='all')
print(context_vec_embeddings.shape)

# 保存x
joblib.dump(context_vec_embeddings, "tox/tox_embed.pkl")

ImportError: cannot import name 'joblib' from 'sklearn.externals' (C:\Users\Administrator\anaconda3\lib\site-packages\sklearn\externals\__init__.py)

# tox train

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import torch.utils.data as data
import pandas as pd
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_curve
import numpy as np
import math
import random
from sklearn import metrics
# from utils.util import *
# from utils.model import *
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTM(nn.Module):
    """搭建rnn网络"""
    def __init__(self, out_num, input_size=300, task_type='sing', att=False):
        super(LSTM, self).__init__()
        self.matrix = nn.Parameter(torch.tensor([0.33, 0.33, 0.33]), requires_grad=True)
        self.input_size = input_size
        self.out_num = out_num * 2 if "muti".__eq__(task_type) else out_num
        self.att = att

        self.fc = nn.Linear(self.input_size, 1024)
        self.lstm = nn.LSTM(
            input_size=1024,
            hidden_size=1024,
            num_layers=2,
            batch_first=True,)
            # bidirectional=True)
        # self.fc1 = nn.Linear(512, 1024)
        # self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, self.out_num)
        self.dropout = nn.Dropout(p=0.3)
        # self.sig = nn.Sigmoid()
        # self.bn1 = nn.BatchNorm1d(1024)
        # self.bn2 = nn.BatchNorm1d(512)
        # self.bn3 = nn.BatchNorm1d(128)

    def attention_net(self, x, query, mask=None):
        d_k = query.size(-1)  # d_k为query的维度

        # query:[batch, seq_len, hidden_dim*2], x.t:[batch, hidden_dim*2, seq_len]
        #         print("query: ", query.shape, x.transpose(1, 2).shape)  # torch.Size([128, 38, 128]) torch.Size([128, 128, 38])
        # 打分机制 scores: [batch, seq_len, seq_len]
        scores = torch.matmul(query, x.transpose(1, 2)) / math.sqrt(d_k)
        #         print("score: ", scores.shape)  # torch.Size([128, 38, 38])

        # 对最后一个维度 归一化得分
        alpha_n = F.softmax(scores, dim=-1)
        #         print("alpha_n: ", alpha_n.shape)    # torch.Size([128, 38, 38])
        # 对权重化的x求和
        # [batch, seq_len, seq_len]·[batch,seq_len, hidden_dim*2] = [batch,seq_len,hidden_dim*2] -> [batch, hidden_dim*2]
        context = torch.matmul(alpha_n, x).sum(1)

        return context, alpha_n

    def forward(self, x):
        # bs = len(x)
        # length = np.array([t.shape[0] for t in x])
        #
        # x, orderD = pack_sequences(x)
        # print(self.matrix[0],self.matrix[1],self.matrix[2])
        x = x.to(device)
        x = self.matrix[0] * x[:, 0, :, :] + self.matrix[1] * x[:, 1, :, :] + self.matrix[2] * x[:, 2, :, :]
        x = self.fc(x.to(device)).to(device)
        # changed_length1 = length[orderD]
        # x = pack_padded_sequence(x, changed_length1, batch_first=True)

        out,(h_n, c_n) = self.lstm(x.to(device))     #h_state是之前的隐层状态
        # out = torch.cat((h_n[-1, :, :], h_n[-2, :, :]), dim=-1)
        # out1 = unpack_sequences(rnn_out, orderD)
        # for i in range(bs):
        #     out1[i,length[i]:-1,:] = 0

        if self.att:
            query = self.dropout(out)

            # 加入attention机制
            out, alpha_n = self.attention_net(out, query)

        else:
            out = torch.mean(out,dim=1).squeeze().cuda()
            # out = out[:,-1,:]


        #进行全连接
        # out = self.fc1(out[:,-1,:])
        # out = F.relu(out)
        # out = self.bn1(F.dropout(out, p=0.3))
        # out = self.fc2(out)
        # out = F.relu(out)
        # out = self.bn2(F.dropout(out, p=0.3))
        out = self.fc3(out)
        out = F.relu(out)
        out = self.dropout(out)
        out = self.fc4(out)
        # return F.softmax(out,dim=-1)
        return out

class MyDataset(data.Dataset):
    def __init__(self, compound, y, smi):
        super(MyDataset, self).__init__()
        self.compound = compound
        # self.compound = torch.FloatTensor(compound)
        # self.y = torch.FloatTensor(y)
        self.y = y
        self.smi = smi

    def __getitem__(self, item):
        return self.compound[item], self.y[item], self.smi[item]


    def __len__(self):
        return len(self.compound)

def split_multi_label(x, y, smi, k_fold, name):
    y = np.array(y).astype(float)
    all_smi = np.array(smi)
    # save_path = 'tox/'+str(k_fold)+'-fold-index.pkl'
    # if os.path.isfile(save_path):
    #     index = joblib.load(save_path)
    #     train_split_x = x[index["train_index"]]
    #     train_split_y = y[index["train_index"]]
    #     val_split_x = x[index["val_index"]]
    #     val_split_y = y[index["val_index"]]
    #     test_split_x = x[index["test_index"]]
    #     test_split_y = y[index["test_index"]]
    #     train_weights = joblib.load('tox/train_weights.pkl')
    #     return train_split_x, train_split_y, val_split_x, val_split_y, test_split_x, test_split_y, train_weights

    kf = KFold(5, False, 100)
    all_train_index = [[],[],[],[],[]]
    all_train_index_weights = [[] for i in range(y.shape[1])]
    all_val_index = [[],[],[],[],[]]
    all_test_index = [[],[],[],[],[]]
    for task_index in range(y.shape[-1]):
        negative_index = np.where(y[:, task_index] == 0)[0]
        positive_index = np.where(y[:, task_index] == 1)[0]
        train_index = [[],[],[],[],[]]
        val_index = [[],[],[],[],[]]
        test_index = [[],[],[],[],[]]
        for k, tmp in enumerate(kf.split(negative_index)):
            # train_tmp is  the index ofnegative_index
            train_tmp, test_tmp = tmp
            train_index[k].extend(negative_index[train_tmp])
            num_t = int(len(test_tmp)/2)
            val_index[k].extend(negative_index[test_tmp[:num_t]])
            test_index[k].extend(negative_index[test_tmp[num_t:]])
        for k, tmp in enumerate(kf.split(positive_index)):
            train_tmp, test_tmp = tmp
            train_index[k].extend(positive_index[train_tmp])
            num_t = int(len(test_tmp)/2)
            val_index[k].extend(positive_index[test_tmp[:num_t]])
            test_index[k].extend(positive_index[test_tmp[num_t:]])

        all_train_index_weights[task_index] = [(len(negative_index) + len(positive_index)) / len(negative_index),
                                               (len(negative_index) + len(positive_index)) / len(positive_index)]

        if task_index == 0:
            all_train_index = train_index
            all_val_index = val_index
            all_test_index = test_index
        else:
            all_train_index = [list(set(all_train_index[i]).union(set(t))) for i, t in enumerate(train_index)]
            all_val_index = [list(set(all_val_index[i]).union(set(t))) for i, t in enumerate(val_index)]
            all_test_index = [list(set(all_test_index[i]).union(set(t))) for i, t in enumerate(test_index)]
    for i in range(5):
        joblib.dump({"train_index":all_train_index[i],
                     "val_index": all_val_index[i],
                     "test_index": all_test_index[i],
                     }, name+'/'+str(i+1)+'-fold-index.pkl')
    joblib.dump(all_train_index_weights, name+'/weights.pkl')
    train_split_x = x[all_train_index[k_fold]]
    train_split_y = y[all_train_index[k_fold]]
    train_split_smi = all_smi[all_train_index[k_fold]]
    val_split_x = x[all_val_index[k_fold]]
    val_split_y = y[all_val_index[k_fold]]
    val_split_smi = all_smi[all_val_index[k_fold]]
    test_split_x = x[all_test_index[k_fold]]
    test_split_y = y[all_test_index[k_fold]]
    test_split_smi = all_smi[all_test_index[k_fold]]
    return train_split_x, train_split_y, train_split_smi,\
           val_split_x, val_split_y, val_split_smi,\
           test_split_x, test_split_y, test_split_smi, all_train_index_weights


if __name__ == '__main__':
    # 设置超参数
    input_size = 512
    hidden_size = 512  # 定义超参数rnn的循环神经元个数，个数为32个
    learning_rate = 0.01  # 定义超参数学习率
    epoch_num = 2000
    batch_size = 128
    best_loss = 10000
    test_best_loss = 10000
    weight_decay = 1e-5
    momentum = 0.9

    b = 0.2
    dict_label = {"NR-AR": 0,
                  "NR-AR-LBD": 1,
                  "NR-AhR": 2,
                  "NR-Aromatase": 3,
                  "NR-ER": 4,
                  "NR-ER-LBD": 5,
                  "NR-PPAR-gamma": 6,
                  "SR-ARE": 7,
                  "SR-ATAD5": 8,
                  "SR-HSE": 9,
                  "SR-MMP": 10,
                  "SR-p53": 11, }
    tasks = list(dict_label.keys())
    tasks_num = len(tasks)

    seed = 188
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    y = joblib.load("tox/label.pkl")
    y = np.array(y).astype(float)
    print(y.shape)
    all_smi = joblib.load("tox/smi.pkl")

    x = joblib.load("tox/tox_embed.pkl")

    # 5-Fold
    train_split_x, train_split_y, train_split_smi, \
    val_split_x, val_split_y, val_split_smi, \
    test_split_x, test_split_y, test_split_smi, weights = split_multi_label(x, y, all_smi, 3, 'tox')

    data_train = MyDataset(train_split_x, train_split_y, train_split_smi)
    dataset_train = data.DataLoader(dataset=data_train, batch_size=batch_size, shuffle=True)

    data_val = MyDataset(val_split_x, val_split_y, val_split_smi)
    dataset_val = data.DataLoader(dataset=data_val, batch_size=batch_size, shuffle=True)

    data_test = MyDataset(test_split_x, test_split_y, test_split_smi)
    dataset_test = data.DataLoader(dataset=data_test, batch_size=batch_size, shuffle=True)

    rnn = LSTM(tasks_num, task_type="muti", input_size=300).to(device)
    # 设置优化器和损失函数
    #使用adam优化器进行优化，输入待优化参数rnn.parameters，优化学习率为learning_rate
    optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate, weight_decay=weight_decay,
                                momentum=momentum)
    # optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate, weight_decay=weight_decay)
    # optimizer = torch.optim.Adadelta(rnn.parameters(), lr=learning_rate, weight_decay = weight_decay, rho=0.9)
    # optimizer = torch.optim.RMSprop(rnn.parameters(), lr=learning_rate, weight_decay = weight_decay)

    # loss_function = F.cross_entropy
    # loss_function = F.nll_loss
    loss_function = [nn.CrossEntropyLoss(torch.Tensor(weight).to(device), reduction='mean') for weight in weights]
    # loss_function = nn.BCELoss()
    # loss_function = nn.BCEWithLogitsLoss()

    # 按照以下的过程进行参数的训练
    for epoch in range(epoch_num):
        avg_loss = 0
        sum_loss = 0
        rnn.train()
        y_true_task = {}
        y_pred_task = {}
        y_pred_task_score = {}
        for index, tmp in enumerate(dataset_train):
            tmp_compound, tmp_y, tmp_smi = tmp
            # tmp_y = tmp_y.float()
            optimizer.zero_grad()
            outputs = rnn(tmp_compound.to(device))
            loss = 0
            for i in range(len(tasks)):
                validId = np.where((tmp_y[:, i].cpu().numpy() == 0) | (tmp_y[:, i].cpu().numpy() == 1))[0]
                if len(validId) == 0:
                    continue
                y_pred = outputs[:, i * 2:(i + 1) * 2][torch.tensor(validId).to(device)]
                y_label = tmp_y[:, i][torch.tensor(validId).to(device)]

                # y_pred = torch.sigmoid(y_pred).view(-1)
                # y_label = F.one_hot(y_label, 2).float().to(device)
                loss += loss_function[i](y_pred.to(device), y_label.long().to(device))

                pred_lable = F.softmax(y_pred.detach().cpu(), dim=-1)[:, 1].view(-1).numpy()
                # pred_lable = np.zeros_like(y_pred.cpu().detach().numpy(), dtype=int)
                # pred_lable[np.where(np.asarray(y_pred.cpu().detach().numpy()) > 0.5)] = 1
                try:
                    y_true_task[i].extend(y_label.cpu().numpy())
                    y_pred_task[i].extend(pred_lable)
                    # y_pred_task_score[i].extend(y_pred)
                except:
                    y_true_task[i] = []
                    y_pred_task[i] = []
                    # y_pred_task_score[i] = []
                    y_true_task[i].extend(y_label.cpu().numpy())
                    y_pred_task[i].extend(pred_lable)
                    # y_pred_task_score[i].extend(y_pred.cpu().detach().numpy())

                # flood = (loss - b).abs() + b

            loss.backward()
            optimizer.step()

            sum_loss += loss
            # print("epoch:", epoch, "index: ", index,"loss:", loss.item())
        avg_loss = sum_loss / (index + 1)
        # cm = [metrics.confusion_matrix(y_true_task[i], y_pred_task[i]) for i in range(len(tasks))]
        trn_roc = [metrics.roc_auc_score(y_true_task[i], y_pred_task[i]) for i in range(len(tasks))]
        trn_prc = [metrics.auc(precision_recall_curve(y_true_task[i], y_pred_task[i])[1],
                               precision_recall_curve(y_true_task[i], y_pred_task[i])[0]) for i in range(len(tasks))]
        # acc = [metrics.accuracy_score(y_true_task[i], y_pred_task[i]) for i in range(len(tasks))]
        # recall = [metrics.recall_score(y_true_task[i], y_pred_task[i]) for i in range(len(tasks))]
        # specificity = [cm[i][0, 0] / (cm[i][0, 0] + cm[i][0, 1]) for i in range(len(tasks))]

        print("epoch:", epoch, "   train  "  "avg_loss:", avg_loss.item(),
              # "acc: ", np.array(acc).mean(),
              # "recall: ", np.array(recall).mean(),
              # "specificity: ", np.array(specificity).mean(),
              " train_auc: ", np.array(trn_roc).mean(),
              " train_pr: ", np.array(trn_prc).mean())

        with torch.no_grad():
            rnn.eval()
            val_sum_loss = []
            y_true_task = {}
            y_pred_task = {}
            y_pred_task_score = {}
            for index, tmp in enumerate(dataset_val):
                tmp_compound, tmp_y, tmp_smi = tmp
                loss = 0
                outputs = rnn(tmp_compound)
                # out_label = F.softmax(outputs, dim=1)
                # pred = out_label.data.max(1, keepdim=True)[1].view(-1).cpu().numpy()
                # pred_score = [x[tmp_y.cpu().detach().numpy()[i]] for i, x in enumerate(out_label.cpu().detach().numpy())]
                # y_pred.extend(pred)
                # y_pred_score.extend(pred_score)
                for i in range(tasks_num):
                    validId = np.where((tmp_y[:, i].cpu().numpy() == 0) | (tmp_y[:, i].cpu().numpy() == 1))[0]
                    if len(validId) == 0:
                        continue
                    y_pred = outputs[:, i * 2:(i + 1) * 2][torch.tensor(validId)].to(device)
                    y_label = tmp_y[:, i][torch.tensor(validId)].long().to(device)

                    # y_pred = torch.sigmoid(y_pred).view(-1)
                    # y_label = F.one_hot(y_label, 2).float().to(device)
                    loss += loss_function[i](y_pred, y_label)

                    pred_lable = F.softmax(y_pred.detach().cpu(), dim=-1)[:, 1].view(-1).numpy()
                    # pred_lable = np.zeros_like(y_pred.cpu().detach().numpy(), dtype=int)
                    # pred_lable[np.where(np.asarray(y_pred.cpu().detach().numpy()) > 0.5)] = 1
                    try:
                        y_true_task[i].extend(y_label.cpu().numpy())
                        y_pred_task[i].extend(pred_lable)
                        # y_pred_task_score[i].extend(y_pred)
                    except:
                        y_true_task[i] = []
                        y_pred_task[i] = []
                        # y_pred_task_score[i] = []
                        y_true_task[i].extend(y_label.cpu().numpy())
                        y_pred_task[i].extend(pred_lable)
                        # y_pred_task_score[i].extend(y_pred.cpu().detach().numpy())

                val_sum_loss.append(loss.cpu().detach().numpy())

            val_avg_loss = np.array(val_sum_loss).mean()

            trn_roc = [metrics.roc_auc_score(y_true_task[i], y_pred_task[i]) for i in range(tasks_num)]
            trn_prc = [metrics.auc(precision_recall_curve(y_true_task[i], y_pred_task[i])[1],
                                   precision_recall_curve(y_true_task[i], y_pred_task[i])[0]) for i in
                       range(tasks_num)]
            # acc = [metrics.accuracy_score(y_true_task[i], y_pred_task[i]) for i in range(tasks_num)]
            # recall = [metrics.recall_score(y_true_task[i], y_pred_task[i]) for i in range(tasks_num)]
            # specificity = [cm[i][0, 0] / (cm[i][0, 0] + cm[i][0, 1]) for i in range(tasks_num)]

            print("epoch:", epoch, "   val  "  "avg_loss:", val_avg_loss,
                  # "acc: ", np.array(acc).mean(),
                  # "recall: ", np.array(recall).mean(),
                  # "specificity: ", np.array(specificity).mean(),
                  # " val_auc: ", trn_roc,
                  " val_auc: ", np.array(trn_roc).mean(),
                  # " val_pr: ", trn_prc,
                  " val_pr: ", np.array(trn_prc).mean())

            # 保存模型
            if val_avg_loss < test_best_loss:
                test_best_loss = val_avg_loss
                PATH = 'tox/lstm_net.pth'
                print("test save model")
                torch.save(rnn.state_dict(), PATH)

                with torch.no_grad():
                    rnn.eval()
                    test_sum_loss = []
                    y_true_task = {}
                    y_pred_task = {}
                    y_pred_task_score = {}
                    for index, tmp in enumerate(dataset_test):
                        tmp_compound, tmp_y, tmp_smi = tmp
                        loss = 0
                        outputs = rnn(tmp_compound)
                        # out_label = F.softmax(outputs, dim=1)
                        # pred = out_label.data.max(1, keepdim=True)[1].view(-1).cpu().numpy()
                        # pred_score = [x[tmp_y.cpu().detach().numpy()[i]] for i, x in enumerate(out_label.cpu().detach().numpy())]
                        # y_pred.extend(pred)
                        # y_pred_score.extend(pred_score)
                        for i in range(tasks_num):
                            validId = np.where((tmp_y[:, i].cpu().numpy() == 0) | (tmp_y[:, i].cpu().numpy() == 1))[0]
                            if len(validId) == 0:
                                continue
                            y_pred = outputs[:, i * 2:(i + 1) * 2][torch.tensor(validId)].to(device)
                            y_label = tmp_y[:, i][torch.tensor(validId)].long().to(device)

                            # y_pred = torch.sigmoid(y_pred).view(-1)
                            # y_label = F.one_hot(y_label, 2).float().to(device)
                            loss += loss_function[i](y_pred, y_label)

                            y_pred_s = F.softmax(y_pred.detach().cpu(), dim=-1)[:, 1].view(-1).numpy()

                            pred_lable = np.zeros_like(y_pred_s, dtype=int)
                            pred_lable[np.where(np.asarray(y_pred_s) > 0.5)] = 1
                            try:
                                y_true_task[i].extend(y_label.cpu().numpy())
                                y_pred_task[i].extend(pred_lable)
                                y_pred_task_score[i].extend(y_pred_s)
                            except:
                                y_true_task[i] = []
                                y_pred_task[i] = []
                                y_pred_task_score[i] = []

                                y_true_task[i].extend(y_label.cpu().numpy())
                                y_pred_task[i].extend(pred_lable)
                                y_pred_task_score[i].extend(y_pred_s)

                        test_sum_loss.append(loss.cpu().detach().numpy())

                    trn_roc = [metrics.roc_auc_score(y_true_task[i], y_pred_task_score[i]) for i in range(tasks_num)]
                    trn_prc = [metrics.auc(precision_recall_curve(y_true_task[i], y_pred_task_score[i])[1],
                                           precision_recall_curve(y_true_task[i], y_pred_task_score[i])[0]) for i in
                               range(tasks_num)]
                    # print(len(trn_roc))
                    # print(sum(y_true_task[0]))
                    # print(sum(y_pred_task[0]))
                    acc = [metrics.accuracy_score(y_true_task[i], y_pred_task[i]) for i in range(tasks_num)]
                    # recall = [metrics.recall_score(y_true_task[i], y_pred_task[i]) for i in range(tasks_num)]
                    # specificity = [cm[i][0, 0] / (cm[i][0, 0] + cm[i][0, 1]) for i in range(tasks_num)]

                    print("epoch:", epoch, "   test  "  "avg_loss:", np.array(test_sum_loss).mean(),
                          "acc: ", np.array(acc).mean(),
                          # "recall: ", np.array(recall).mean(),
                          # "specificity: ", np.array(specificity).mean(),
                          # " test_auc: ", trn_roc,
                          " test_auc: ", np.array(trn_roc).mean(),
                          # " test_pr: ", trn_prc,
                          " test_pr: ", np.array(trn_prc).mean())