In [1]:
# подгружаемые библиотеки
import numpy as np
import tensorflow as tf
import re

from bpemb import BPEmb

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
vocab_size = 3000
emb_dim = 100
max_len = 100 # TODO посчитать максимальную длинну выборки
buffer_size = 101# размер буфера данных в батчах

In [3]:
bpemb_ru = BPEmb(lang='ru', vs=vocab_size , dim=emb_dim)

# bpemb_ru.encode_ids('.,!?:;')
#>>> [2898, 679, 2978, 2985, 2947, 2963]

def parse_fn(line, bpemb = bpemb_ru):

    sequence = np.array(bpemb.encode_ids(line)).astype(np.int32)

    feature = np.hstack(([1],sequence[:-1]))
    
    labels = ( #TODO изменить кастыли с циферками 2898-, 679-. 2978-! 2985-? 2947-: 2963-; 
              (sequence == 2898)*1 + 
              (sequence == 679)*2 + 
              (sequence == 2978)*3 + 
              (sequence == 2985)*4 + 
              (sequence == 2947)*5 + 
              (sequence == 2963)*6
             ).astype(np.int32) 
    
    return (feature, len(feature)), (labels, len(labels))


def generator_fn(data_file_url):
    with open(data_file_url, 'r') as file:
        for row in file:
            yield parse_fn(row[:-1])


def input_fn(data_file_url, params, mode):

    shapes = (([None], ()), ([None], ()))
    types = ((tf.int32, tf.int32), (tf.int32, tf.int32))
    defaults = ((0, 0), (0, 0))

    dataset = tf.data.Dataset.from_generator(lambda : generator_fn(data_file_url),
                                             output_shapes=shapes, output_types=types)
    
    if mode == 'train':
        dataset = dataset.shuffle(buffer_size=params['train_size'], reshuffle_each_iteration=True)
        dataset = dataset.repeat(params['num_epochs'])
    
    dataset = dataset.repeat(params['num_epochs'])
    return (dataset.padded_batch(params['batch_size'], shapes, defaults).prefetch(1))

In [4]:
# модель
def model_fn(features, labels, mode, params):
    

    # матрица эмбеддингов decoder-а
    embeddings = tf.get_variable('embedding_matrix',
                                 shape=[params['vocab_size'], params['embedding_dim']],
                                 dtype=tf.float32)
    
    # decoder
    cell  = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(num_units=params['lstm_hidden_dim']) 
                          for _ in range(params['num_layers'])])
    
    # началное состояние
    initial_state = tuple([tf.tile(tf.constant([[0]], tf.float32), [params['batch_size'], params['lstm_hidden_dim']]) 
                           for _ in range(params['num_layers'])]) 

    if (mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL):
        
        sequences, lengths = features
        
        target, target_lengths = labels
    
        sequences_embedded = tf.nn.embedding_lookup(params=embeddings, ids=sequences)
        
        train_helper = tf.contrib.seq2seq.TrainingHelper(inputs=sequences_embedded, 
                                                          sequence_length=lengths)
        
        decoder = tf.contrib.seq2seq.BasicDecoder(cell=cell, 
                                                  helper=train_helper, 
                                                  initial_state=initial_state
                                                 )
        
        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder,
                                                          maximum_iterations=params['max_iter'], 
                                                          impute_finished=True,
                                                          )

        logits = outputs.rnn_output
        sample_id = outputs.sample_id
        
        mask = tf.sequence_mask(lengths, dtype=tf.float32)
        
        metrics = {
            'acc': tf.metrics.accuracy(target, sample_id, mask),
            'f1_score' : tf.contrib.metrics.f1_score(target, sample_id, mask),
        }
        
        
        loss = tf.contrib.seq2seq.sequence_loss(logits=logits, 
                                                targets=target, 
                                                weights=mask, 
                                                average_across_timesteps=True, 
                                                average_across_batch=True)

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)

        elif mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.AdamOptimizer
            optimizer = optimizer(learning_rate=params['learning_rate'])
            train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

            return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
    
    
    elif mode == tf.estimator.ModeKeys.PREDICT:
        pass

In [5]:
params = {
    'vocab_size': vocab_size,
    'train_size': 256*buffer_size,
    'num_layers': 2,
    'embedding_dim': emb_dim,
    'lstm_hidden_dim': 300,
    'max_iter': max_len,
    'batch_size': 256,
    'num_epochs': 1,
    'learning_rate': 1e-3
}

config = tf.estimator.RunConfig(model_dir='project1',
                                save_checkpoints_steps = 200,
                               save_checkpoints_secs = None)
model = tf.estimator.Estimator(model_fn=model_fn, params=params, config=config)

INFO:tensorflow:Using config: {'_model_dir': 'project1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 200, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fbcfb87f9b0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [7]:
model.train(lambda: input_fn('flibusta_full_train.txt', params=params, mode='train'))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into project1/model.ckpt.
INFO:tensorflow:loss = 5.705028, step = 1
INFO:tensorflow:global_step/sec: 1.05024
INFO:tensorflow:loss = 3.7850385, step = 101 (95.218 sec)
INFO:tensorflow:Saving checkpoints for 200 into project1/model.ckpt.
INFO:tensorflow:global_step/sec: 1.08625
INFO:tensorflow:loss = 3.7645552, step = 201 (92.060 sec)
INFO:tensorflow:global_step/sec: 0.969008
INFO:tensorflow:loss = 3.7629735, step = 301 (103.198 sec)
INFO:tensorflow:Saving checkpoints for 400 into project1/model.ckpt.
INFO:tensorflow:global_step/sec: 1.0692
INFO:tensorflow:loss = 3.7611403, step = 401 (93.528 sec)


KeyboardInterrupt: 

In [None]:
# обучение и валидация

In [None]:
# дешифровка 

In [None]:
#