In [1]:
!pip install keras==2.3.1

Looking in indexes: http://mirrors.tencentyun.com/pypi/simple
Collecting keras==2.3.1
  Downloading http://mirrors.tencentyun.com/pypi/packages/ad/fd/6bfe87920d7f4fd475acd28500a42482b6b84479832bdc0fe9e589a60ceb/Keras-2.3.1-py2.py3-none-any.whl (377 kB)
[K     |████████████████████████████████| 377 kB 9.4 MB/s eta 0:00:01
Installing collected packages: keras
Successfully installed keras-2.3.1


In [68]:
import os
import keras.backend as K

from data import DATA_SET_DIR
from elmo.lm_generator import LMDataGenerator
from elmo.model import ELMo

from tqdm import tqdm

In [74]:
# Split the full token file into train / validation / test files by line position.
# The split is positional (no shuffling): the same input always yields the same files.
TRAIN_END = 1750000  # lines [0, TRAIN_END) go to the training split
VALID_END = 2000000  # lines [TRAIN_END, VALID_END) go to validation; the rest to test


def write_lines(path, lines):
    """Write every string in `lines` to `path`, each followed by a newline.

    Args:
        path: Destination file path; the file is created or overwritten.
        lines: Sequence of strings that do not carry their own line terminator.
    """
    with open(path, 'w') as f:
        for line in tqdm(lines):
            f.write(line)
            f.write('\n')


with open('./data/datasets/txt/advertiser_id.all.tokens', 'r') as f:
    # rstrip('\n') instead of [:-1]: a final line without a trailing newline
    # must not lose its last real character.
    data = [line.rstrip('\n') for line in tqdm(f.readlines())]

print('划分数据集')
write_lines('./data/datasets/txt/advertiser_id.train.tokens', data[:TRAIN_END])
write_lines('./data/datasets/txt/advertiser_id.valid.tokens', data[TRAIN_END:VALID_END])
write_lines('./data/datasets/txt/advertiser_id.test.tokens', data[VALID_END:])

100%|██████████| 4000000/4000000 [00:03<00:00, 1104970.88it/s]
  4%|▎         | 61941/1750000 [00:00<00:02, 619403.21it/s]

划分数据集


100%|██████████| 1750000/1750000 [00:02<00:00, 700794.78it/s]
100%|██████████| 250000/250000 [00:00<00:00, 636351.00it/s]
100%|██████████| 2000000/2000000 [00:02<00:00, 697327.92it/s]


In [75]:
# Build the vocabulary from the training and validation data.
# The output file has one "<token> <id>" pair per line.

# Reserved tokens; their position in this tuple is their id.
SPECIAL_TOKENS = ('<pad>', '<bos>', '<eos>', '<unk>')

data = []
for split_path in ('./data/datasets/txt/advertiser_id.train.tokens',
                   './data/datasets/txt/advertiser_id.valid.tokens'):
    with open(split_path, 'r') as f:
        for line in tqdm(f.readlines()):
            # rstrip('\n') instead of [:-1]: a final line without a trailing
            # newline must not lose its last real character.
            data.append(line.rstrip('\n'))

# Collect distinct tokens directly into a set instead of first building one
# list holding every token occurrence of the corpus.
words = set()
for line in data:
    words.update(line.split(' '))
# Reserved tokens appearing in the corpus must keep their reserved ids, so none
# of them may be re-numbered below.
words -= set(SPECIAL_TOKENS)
# NOTE(review): an empty line or a double space yields an empty-string token
# here; confirm whether the vocab reader tolerates a line with an empty word.

vocab = {token: idx for idx, token in enumerate(SPECIAL_TOKENS)}
# Sort so that ids are identical on every run; iterating a set of strings
# directly gives an order that can change between interpreter sessions.
for idx, word in enumerate(tqdm(sorted(words)), start=len(SPECIAL_TOKENS)):
    vocab[word] = idx

with open('./data/datasets/txt/advertiser_id.vocab', 'w') as f:
    for word, idx in tqdm(vocab.items()):
        f.write('{} {}\n'.format(word, idx))

100%|██████████| 1750000/1750000 [00:01<00:00, 1056098.59it/s]
100%|██████████| 250000/250000 [00:00<00:00, 965795.72it/s]


In [90]:
# Show the CPU count reported by the OS; the full training cell passes this
# same value as 'n_threads'.
os.cpu_count()

8

In [None]:
# Full run: train, persist, evaluate and export an ELMo language model on the
# advertiser_id train / valid / test splits.

vocab_file = 'txt/advertiser_id.vocab'
vocab_path = os.path.join(DATA_SET_DIR, vocab_file)

# Derive the vocabulary size from the vocab file on disk rather than
# hard-coding it, so the model stays in sync when the vocab is regenerated.
# Each non-blank line is "<token> <id>"; the size is the largest id + 1.
with open(vocab_path, 'r') as f:
    vocab_size = 1 + max(int(line.split()[-1]) for line in f if line.strip())

parameters = {
    'multi_processing': True,
    'n_threads': os.cpu_count(),
    # Uses a private Keras backend helper; truthy when at least one GPU is listed.
    'cuDNN': bool(K.tensorflow_backend._get_available_gpus()),
    'train_dataset': 'txt/advertiser_id.train.tokens',
    'valid_dataset': 'txt/advertiser_id.valid.tokens',
    'test_dataset': 'txt/advertiser_id.test.tokens',
    'vocab': vocab_file,
    'vocab_size': vocab_size,
    'num_sampled': 1000,
    'charset_size': 262,
    'sentence_maxlen': 100,
    'token_maxlen': 50,
    'token_encoding': 'word',
    'epochs': 10,
    'patience': 2,
    'batch_size': 16,
    'clip_value': 1,
    'cell_clip': 5,
    'proj_clip': 5,
    'lr': 0.2,
    'shuffle': True,
    'n_lstm_layers': 2,
    'n_highway_layers': 2,
    'cnn_filters': [[1, 32],
                    [2, 32],
                    [3, 64],
                    [4, 128],
                    [5, 256],
                    [6, 512],
                    [7, 512]
                    ],
    'lstm_units_size': 400,
    'hidden_units_size': 200,
    'char_embedding_size': 16,
    'dropout_rate': 0.1,
    'word_dropout_rate': 0.05,
    'weight_tying': True,
}

# Set-up Generators
# All three generators share the same vocabulary and batching options; only
# the dataset file differs.
# NOTE(review): the validation and test generators also receive
# shuffle=parameters['shuffle']; if LMDataGenerator reorders samples, the
# embeddings returned by get_outputs() may not follow file order - confirm.
generator_options = dict(sentence_maxlen=parameters['sentence_maxlen'],
                         token_maxlen=parameters['token_maxlen'],
                         batch_size=parameters['batch_size'],
                         shuffle=parameters['shuffle'],
                         token_encoding=parameters['token_encoding'])

train_generator = LMDataGenerator(os.path.join(DATA_SET_DIR, parameters['train_dataset']),
                                  vocab_path,
                                  **generator_options)

val_generator = LMDataGenerator(os.path.join(DATA_SET_DIR, parameters['valid_dataset']),
                                vocab_path,
                                **generator_options)

test_generator = LMDataGenerator(os.path.join(DATA_SET_DIR, parameters['test_dataset']),
                                 vocab_path,
                                 **generator_options)

# Compile ELMo
print('compile')
elmo_model = ELMo(parameters)
elmo_model.compile_elmo(print_summary=True)

# Train ELMo
print('train')
elmo_model.train(train_data=train_generator, valid_data=val_generator)

# Persist ELMo Bidirectional Language Model in disk
print('save')
elmo_model.save(sampled_softmax=False)

# Evaluate Bidirectional Language Model
print('evaluate')
elmo_model.evaluate(test_generator)

# Build ELMo meta-model to deploy for production and persist in disk
print('??')
elmo_model.wrap_multi_elmo_encoder(print_summary=True, save=True)

# Load ELMo encoder
print('load')
elmo_model.load_elmo_encoder()

# Get ELMo embeddings to feed as inputs for downstream tasks
elmo_embeddings = elmo_model.get_outputs(test_generator, output_type='word', state='mean')

# BUILD & TRAIN NEW KERAS MODEL FOR DOWNSTREAM TASK (E.G., TEXT CLASSIFICATION)


compile


  'be expecting any data to be passed to {0}.'.format(name))
  'be expecting any data to be passed to {0}.'.format(name))


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_indices (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
token_encoding (Embedding)      (None, None, 200)    10967400    word_indices[0][0]               
__________________________________________________________________________________________________
spatial_dropout1d_11 (SpatialDr (None, None, 200)    0           token_encoding[0][0]             
__________________________________________________________________________________________________
timestep_dropout_3 (TimestepDro (None, None, 200)    0           spatial_dropout1d_11[0][0]       
____________________________________________________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
   71/54688 [..............................] - ETA: 22:24:58 - loss: 120.5444

Process ForkPoolWorker-10:
Process ForkPoolWorker-2:
Process ForkPoolWorker-8:
Process ForkPoolWorker-16:
Process ForkPoolWorker-14:
Process ForkPoolWorker-12:
Process ForkPoolWorker-6:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/envs/tensorflow_py3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/envs/tensorflow_py3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/envs/tensorflow_py3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/envs/tensorflow_py3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/envs/tensorflow_py3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/envs/tensorflow_py3/lib/pyt

In [61]:
# Demo run: the same pipeline as the full run, but on the small demo token
# file (used for train, validation and test alike), single-process, batch size 1.

vocab_file = 'txt/advertiser_id.demo.vocab'
vocab_path = os.path.join(DATA_SET_DIR, vocab_file)

# Derive the vocabulary size from the demo vocab file itself. The in-kernel
# `vocab` dict is not built from this file, so len(vocab) only matches when a
# leftover variable from an earlier session happens to be in scope.
# Assumes the demo vocab uses "<token> <id>" lines like the main vocab file -
# TODO confirm. The size is the largest id + 1.
with open(vocab_path, 'r') as f:
    vocab_size = 1 + max(int(line.split()[-1]) for line in f if line.strip())

parameters = {
    'multi_processing': False,
    'n_threads': 4,
    # Uses a private Keras backend helper; truthy when at least one GPU is listed.
    'cuDNN': bool(K.tensorflow_backend._get_available_gpus()),
    'train_dataset': 'txt/advertiser_id.demo.tokens',
    'valid_dataset': 'txt/advertiser_id.demo.tokens',
    'test_dataset': 'txt/advertiser_id.demo.tokens',
    'vocab': vocab_file,
    'vocab_size': vocab_size,
    'num_sampled': 1000,
    'charset_size': 262,
    'sentence_maxlen': 100,
    'token_maxlen': 50,
    'token_encoding': 'word',
    'epochs': 10,
    'patience': 2,
    'batch_size': 1,
    'clip_value': 1,
    'cell_clip': 5,
    'proj_clip': 5,
    'lr': 0.2,
    'shuffle': True,
    'n_lstm_layers': 2,
    'n_highway_layers': 2,
    'cnn_filters': [[1, 32],
                    [2, 32],
                    [3, 64],
                    [4, 128],
                    [5, 256],
                    [6, 512],
                    [7, 512]
                    ],
    'lstm_units_size': 400,
    'hidden_units_size': 200,
    'char_embedding_size': 16,
    'dropout_rate': 0.1,
    'word_dropout_rate': 0.05,
    'weight_tying': True,
}

# Set-up Generators
# All three generators share the same vocabulary and batching options; only
# the dataset key differs (here all three keys point at the same demo file).
# NOTE(review): the validation and test generators also receive
# shuffle=parameters['shuffle']; if LMDataGenerator reorders samples, the
# embeddings returned by get_outputs() may not follow file order - confirm.
generator_options = dict(sentence_maxlen=parameters['sentence_maxlen'],
                         token_maxlen=parameters['token_maxlen'],
                         batch_size=parameters['batch_size'],
                         shuffle=parameters['shuffle'],
                         token_encoding=parameters['token_encoding'])

train_generator = LMDataGenerator(os.path.join(DATA_SET_DIR, parameters['train_dataset']),
                                  vocab_path,
                                  **generator_options)

val_generator = LMDataGenerator(os.path.join(DATA_SET_DIR, parameters['valid_dataset']),
                                vocab_path,
                                **generator_options)

test_generator = LMDataGenerator(os.path.join(DATA_SET_DIR, parameters['test_dataset']),
                                 vocab_path,
                                 **generator_options)

# Compile ELMo
print('compile')
elmo_model = ELMo(parameters)
elmo_model.compile_elmo(print_summary=True)

# Train ELMo
print('train')
elmo_model.train(train_data=train_generator, valid_data=val_generator)

# Persist ELMo Bidirectional Language Model in disk
print('save')
elmo_model.save(sampled_softmax=False)

# Evaluate Bidirectional Language Model
print('evaluate')
elmo_model.evaluate(test_generator)

# Build ELMo meta-model to deploy for production and persist in disk
print('??')
elmo_model.wrap_multi_elmo_encoder(print_summary=True, save=True)

# Load ELMo encoder
print('load')
elmo_model.load_elmo_encoder()

# Get ELMo embeddings to feed as inputs for downstream tasks
elmo_embeddings = elmo_model.get_outputs(test_generator, output_type='word', state='mean')

# BUILD & TRAIN NEW KERAS MODEL FOR DOWNSTREAM TASK (E.G., TEXT CLASSIFICATION)


compile


  'be expecting any data to be passed to {0}.'.format(name))
  'be expecting any data to be passed to {0}.'.format(name))


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_indices (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
token_encoding (Embedding)      (None, None, 200)    252000      word_indices[0][0]               
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, None, 200)    0           token_encoding[0][0]             
__________________________________________________________________________________________________
timestep_dropout_1 (TimestepDro (None, None, 200)    0           spatial_dropout1d_1[0][0]        
____________________________________________________________________________________________

Exception ignored in: <bound method ELMo.__del__ of <elmo.model.ELMo object at 0x7f31ac238588>>
Traceback (most recent call last):
  File "/home/tione/notebook/ELMo-keras/elmo/model.py", line 29, in __del__
    K.clear_session()
  File "/opt/conda/envs/tensorflow_py3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 414, in clear_session
    tf_keras_backend.clear_session()
  File "/opt/conda/envs/tensorflow_py3/lib/python3.6/site-packages/tensorflow_core/python/keras/backend.py", line 232, in clear_session
    ops.reset_default_graph()
  File "/opt/conda/envs/tensorflow_py3/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 5852, in reset_default_graph
    raise AssertionError("Do not use tf.reset_default_graph() to clear "
AssertionError: Do not use tf.reset_default_graph() to clear nested graphs. If you need a cleared graph, exit the nesting and create a new graph.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.39525, saving model to /home/tione/notebook/ELMo-keras/data/models/elmo_best_weights.hdf5
Epoch 2/10

Epoch 00002: val_loss did not improve from 1.39525
Epoch 3/10

Epoch 00003: val_loss did not improve from 1.39525
Epoch 4/10

Epoch 00004: val_loss did not improve from 1.39525
Epoch 5/10

Epoch 00005: val_loss did not improve from 1.39525
Epoch 6/10

Epoch 00006: val_loss did not improve from 1.39525
Epoch 7/10

Epoch 00007: val_loss did not improve from 1.39525
Epoch 8/10

Epoch 00008: val_loss did not improve from 1.39525
Epoch 9/10

Epoch 00009: val_loss did not improve from 1.39525
Epoch 10/10

Epoch 00010: val_loss did not improve from 1.39525
Training took 72.30446577072144 sec
save


  'be expecting any data to be passed to {0}.'.format(name))


ELMo Language Model saved successfully
evaluate
Forward Langauge Model Perplexity: 61232.054662230556
Backward Langauge Model Perplexity: 16723.494255278456
??
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
word_indices (InputLayer)       (None, None)         0                                            
__________________________________________________________________________________________________
token_encoding (Embedding)      (None, None, 200)    252000      word_indices[0][0]               
__________________________________________________________________________________________________
spatial_dropout1d_6 (SpatialDro (None, None, 200)    0           token_encoding[0][0]             
__________________________________________________________________________________________________
timestep_dropout_2 (TimestepDro



array([[-2.5028853 ,  3.0094678 ,  5.5926957 , ...,  1.5612016 ,
        -0.11298636,  0.47743785],
       [-3.2986498 ,  3.639497  ,  6.418512  , ...,  2.088767  ,
         0.02384794,  0.9044669 ],
       [-3.80541   ,  4.118277  ,  7.1731715 , ...,  2.4383476 ,
         0.1849048 ,  1.23097   ],
       ...,
       [-3.160855  ,  3.628611  ,  6.444164  , ...,  2.0874894 ,
        -0.00931854,  0.88850296],
       [-3.5049515 ,  3.9199624 ,  6.8536725 , ...,  2.2042162 ,
         0.11237423,  1.0343972 ],
       [-4.549523  ,  4.703183  ,  7.9132166 , ...,  2.8450491 ,
         0.28287318,  1.5878644 ]], dtype=float32)