In [1]:
import numpy as np
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from gensim import corpora
from collections import defaultdict
import pickle
import os

In [2]:
UNDEFINED = "_"
MAX_WORD_LENGTH = 20


def read_gikrya(path):
    """
    Reading format:
    row_index<TAB>form<TAB>lemma<TAB>POS<TAB>tag
    """
    morpho_map = {"POS":{UNDEFINED: 0, 
                         0: UNDEFINED}}
    
    
    sentences = []
    vocab = {}    
    with open(path, 'r') as f:
        sentence = []
        for line in f:
            splits = line.strip().split('\t')            
            if len(splits) == 5:
                form, lemma, POS, tags = splits[1:]
                if POS not in  morpho_map["POS"]:
                    morpho_map["POS"][POS] = len(morpho_map["POS"]) // 2 
                    morpho_map["POS"][morpho_map["POS"][POS]] =  POS
                tags_list = [("POS", POS)]
                if tags != "_":
                    for tag_val in tags.split("|"):
                        tag, val = tag_val.split("=")
                        tags_list.append((tag, val))
                        if tag not in morpho_map:
                            morpho_map[tag] = {UNDEFINED: 0,
                                               0: UNDEFINED}
                        if val not in morpho_map[tag]:
                            morpho_map[tag][val] = len(morpho_map[tag]) // 2 
                            morpho_map[tag][morpho_map[tag][val]] = val
#                 else:
#                     tags_list.append(tags)
                if form not in vocab:
                    vocab[form] = form
                sentence.append((vocab[form], lemma, tags_list) )
                
                    
            elif len(sentence) > 0:
                sentences.append(sentence)
                sentence = []
    return sentences, morpho_map       
 
    
def split_word(word, stemmer):
    flex = word[len(stemmer.stem(word)):]
    if len(flex):
        return word[:-len(flex)], flex
    return word, "empty"


def sentences_to_features(sentences, token2id, neighbors=3, undef_token="undefined_token"):
    arrays = [sentence_to_features(sent, token2id,  neighbors=neighbors,
                                  undef_token=undef_token) for sent in sentences]
    return np.vstack(arrays)


def sentence_to_features(sentence, token2id, 
                         neighbors=3, undef_token="undefined_token"):
    """
    Делает из предложения 
    матрицу id слов, где  строка соответствует словам предложения:
    в каждой строке состоит из neighbors id слов из левого контекста,
    потом id слова, затем neighbors id слов правого контекста
    0 - зарезерврован для паддинга, в словаре не должно быть слов с id 0
    """
    X = np.ones(shape=(len(sentence), neighbors * 2 + 1), dtype=np.int) * len(token2id)
    id_seq = np.zeros(shape=(len(sentence) + 2*neighbors,), dtype=np.int)
    for idx, token in enumerate(sentence):
        num = token2id.get(token, token2id[undef_token])
        # assert num != 0
        id_seq[idx+neighbors] = num
    for idx in range(len(sentence)):
        X[idx, :] = id_seq[idx:idx + X.shape[1]]
    return X   
        
        
def build_vocab(sentences, min_freq=0, max_size=10000, undefined_id=0):
    """ 
    Строит словарь из слов встертившихся более min_freq раз,
    но размеров  не более max_size, в случае бОльшего количества токенов
    отбрасываются менее частотные токены, undefined_id - id первого токена в словаре,
    который будет называться "undefined_token"
    """
    offset = undefined_id
    token2id = {"undefined_token": offset}
    id2token = {offset: "undefined_token"}    
    
    counter = defaultdict(int)    
    for sentence in sentences:
        for token in sentence:
            counter[token] += 1
    sorted_tokens = [t_f[0]  for t_f in 
                     sorted([t_f for t_f in counter.items() if t_f[1] >= min_freq],
                           key=lambda tf: -tf[1])]                     
    
    for token in sorted_tokens[:max_size - len(token2id)]:
        offset += 1
        token2id[token] = offset
        id2token[offset] = token
    return token2id, id2token      
    
    
    
def simple_word2vec(word):
    pass


def build_morpho_vocab(morpho_map):
    morpho_сats = sorted([key for key in morpho_map.keys()])
    # чисто для удобства POS сделаем первым
    morpho_сats.insert(0, morpho_сats.pop(morpho_сats.index("POS"))) 
    abs_idx = 0
    tag2id = {}
    id2tag = {}
    for cat in morpho_сats:
        vals = [pair[0] for pair in sorted(list(morpho_map[cat].items()), 
                                           key=lambda p: p[1])]
        for val in vals:
            tag2id[(cat, val)] = abs_idx
            id2tag[abs_idx] = (cat, val)
            abs_idx += 1
    return tag2id, id2tag  


def tagsets_to_one_hot(tagsets, morpho_map, cat_order):    
    # при частых запусках не оптимально так:
    # cats = set([cat for cat, val in tag2id.keys()])
    y = [np.zeros(shape=(len(tagsets), len(morpho_map[cat]) // 2), dtype=np.int) 
         for cat in cat_order]
    
    for one_hot in y:
        one_hot[:, 0] = 1       
        
    for idx, tagset in enumerate(tagsets):                    
        for cat, tag in tagset:
            # не очень эффективно индекс искать постоянно
            
            cat_id = cat_order.index(cat)    
            if cat_id >= 0:
                y[cat_id][idx, 0] = 0
                y[cat_id][idx, morpho_map[cat].get(tag, 0)] = 1            
    return y
        
    
def preproc_dataset(full_tag_sentences, stemmer):    
    sentences = []
    flexes = []
    token_tags = []
    tokens = []
    
    for sent in full_tag_sentences:
        temp_sent = []
        temp_flexes = []
        for token_info in sent:
            token = token_info[0].lower()
            tokens.append(token)
            splits = split_word(token, stemmer)
            temp_sent.append(splits[0])
            temp_flexes.append(splits[1])
            token_tags.append(token_info[2])  # надо бы переделать под стиль sentences или?          
        sentences.append(temp_sent)
        flexes.append(temp_flexes)    
    return sentences, flexes, token_tags, tokens
    

       
def make_dataset(path, stemmer,
                      morpho_map, cat_order, undef_token,
                      token2id, flex2id, char2id,
                      neighbors=3):
    full_tag_sentences, _ = read_gikrya(path)    
    sentences, flexes, token_tags, tokens = preproc_dataset(full_tag_sentences, stemmer)
    X_stem = sentences_to_features(sentences, token2id,
                                   neighbors=neighbors, undef_token=undef_token)
    X_flex = sentences_to_features(flexes, flex2id, 
                                   neighbors=neighbors,
                                   undef_token=undef_token)
    X = chars_to_features(tokens, char2id)
    y = tagsets_to_one_hot(token_tags, morpho_map, cat_order)
    return X_stem, X_flex, X, y, sentences, flexes, token_tags, full_tag_sentences 


def chars_to_features(tokens, char2id):
    X = np.ones(shape=(len(tokens), MAX_WORD_LENGTH), dtype=np.int) * len(char2id)
    for idx, token in enumerate(tokens):
        for chid in range(min(MAX_WORD_LENGTH, len(token))):
            X[idx, chid] = char2id.get(token[-chid-1], 0)
    return X        
    
        


def add_tags_to_sentences(full_tag_sentences, y, morpho_map, cat_order):
    new_full_tag_sents = []
    idx = 0
    for full_tag_sent in full_tag_sentences:
        new_full_tag = []   
        for token_info in full_tag_sent:
            tags = []
            for cat, oh_val in zip(cat_order, y):
                
                ntag = oh_val.shape[1]
                tags.append((cat,
                            [morpho_map[cat][i] for i in range(ntag) if oh_val[idx, i]==1][0]))
            new_full_tag.append((token_info[0],
                                '_',
                                tags))
            idx += 1
        new_full_tag_sents.append(new_full_tag)
    return new_full_tag_sents


def probs_to_one_hot(probs):
    one_hot = np.zeros_like(probs, dtype=np.int)
    for row in range(one_hot.shape[0]):
        one_hot[row, np.argmax(probs[row, :])] =1
    return one_hot


def many_probs_to_one_hot(probs):
    return [probs_to_one_hot(prob) for prob in probs]


def write_gikrya(path, full_tags):
    with open(path, 'w') as f:
        idx = 0
        for sentence in full_tags:
            for i, token_info in enumerate(sentence):
                f.write("{}\t{}\t{}\t{}\n".format(i+1,
                                                token_info[0],
                                                token_info[1],
                                                tagset2str(token_info[2])))
            f.write("\n")
            
                

                    
def tagset2str(tagset):
    POS = ""
    tags_list = []
    for tag, val in tagset:
        if  tag == "POS":
            POS = val
        else:
            if val != UNDEFINED:
                tags_list.append("{}={}".format(tag, val))
    tags = "_"
    if len(tags_list) > 0:
        tags = "|".join(tags_list)
    return "{}\t{}".format(POS, tags)
        
    
def read_embeddings(vocab_path, emb_path):    
    token2id = {}
    tokens = open(vocab_path, "r").read().strip().split("\n")    
    for i, ch in enumerate(tokens):
        token2id[ch] = i
    vecs = np.load(emb_path)
    rnd_vec = np.random.uniform(size=vecs.shape[1]) 
    return token2id, np.vstack((vecs, rnd_vec / np.linalg.norm(rnd_vec))),  tokens[0]

In [3]:
"sdfa"[-2]

'f'

In [49]:
# test part
# token2id, id2token = buil_vocab(sentences_for_w2v, min_freq=1000, max_size=1000)
# sent = sentences_for_w2v[0]
# print(" ".join(sent))
# print(" ".join(map(lambda t: str(token2id.get(t)), sent)))
# sentence_to_features(sent, token2id)
# full_tag_sentences, morpho_map = read_gikrya(gikrya_path)
# gikrya_path = "../morphoRuEval-2017/Baseline/source/gikrya_train.txt"
# full_tag_sentences, morpho_map = read_gikrya(gikrya_path)
# tag2id, id2tag = build_morpho_vocab(morpho_map)
# cat_order = sorted([key for key in morpho_map.keys()])
# token_tags[0]
# yy = tagsets_to_one_hot(token_tags, morpho_map, cat_order)
# morpho_map["POS"]["NOUN"]
# one_hots = many_probs_to_one_hot(predicted)
# test_predicted_ft = add_tags_to_sentences()
# test_predicted = add_tags_to_sentences(full_tag_test, one_hots, morpho_map, cat_order)
# write_gikrya("../morphoRuEval-2017/Baseline/predict/gikrya_test_nnet.txt", test_predicted)
# s =[[(x, 0, 0) for x in """Комната
# в
# весёлых
# пробегах
# огней
# .""".split("\n")]]

# x, xf, _ = preproc_dataset(s,  stemmer)
# x = sentences_to_features(x, token2id)
# xf = sentences_to_features(xf, flex2id)

predicted = model.predict([X_stem_test, X_flex_test, X_test])
one_hots = many_probs_to_one_hot(predicted)
test = add_tags_to_sentences(full_tag_test, one_hots, morpho_map, cat_order)
write_gikrya("../morphoRuEval-2017/Baseline/predict/gikrya_test_nnet2.txt", test)

In [17]:
# model = model_from_json(open('../models/model_3/model_json.arch', 'r').read())
model.load_weights('../models/model_4/weights.model')

In [56]:
prefix = "../morphoRuEval-2017/test_collection/" 
# tests = ["VK.txt"]#, "JZ.txt", "Lenta.txt"]
tests = ["JZ.txt", "Lenta.txt"]

for p in tests:
    path = os.path.join(prefix, p)
    X_stem_test, X_flex_test, X_test, y_test, sentences, flexes,\
    token_tags, full_tag_test = make_dataset(path, stemmer, 
                                                morpho_map, cat_order,undef_token, 
                                                token2id, flex2id, char2id, neighbors=neighbors)
    predicted = model.predict([X_stem_test, X_flex_test, X_test], batch_size=512)
    one_hots = many_probs_to_one_hot(predicted)
    test = add_tags_to_sentences(full_tag_test, one_hots, morpho_map, cat_order)
    write_gikrya("../morphoRuEval-2017/test_collection/tagged/{}".format(p), test)

In [3]:
1+1 

2

In [7]:
prefix = "../models/"
morpho_map, cat_order =  pickle.load(open("../models/morpho.pickle", 'rb'))
token2id, token_vecs, undef_token = read_embeddings(os.path.join(prefix, "stem2id"),
                                      os.path.join(prefix, "stem_embeddings.npy"))

flex2id, flex_vecs, _ = read_embeddings(os.path.join(prefix, "flex2id"),
                                     os.path.join(prefix, "flex_embeddings.npy"))

# lemm2id, lemm_vecs, _ = read_embeddings(os.path.join(prefix, "lemm2id"),
#                                      os.path.join(prefix, "lemm_embeddings.npy"))

char2id = {}
chars = open("char2id", "r").read().strip().split("\n")
for i, ch in enumerate(chars):
    char2id[ch] = i

stemmer = SnowballStemmer("russian")

In [8]:
char2id["o"]

13

In [10]:
neighbors = 3
gikrya_path = "../morphoRuEval-2017/Baseline/source/gikrya_train.txt"
# gikrya_path = "../JointMorphoClosed.txt"
X_stem_train, X_flex_train, X_train, y_train, sentences, flexes,\
token_tags, full_tag_sentences=make_dataset(gikrya_path, stemmer, 
                                            morpho_map, cat_order, undef_token, 
                                            token2id, flex2id, char2id, neighbors=neighbors)

In [11]:
gikrya_test = "../morphoRuEval-2017/Baseline/source/gikrya_test.txt"
X_stem_test, X_flex_test, X_test, y_test, sentences, flexes,\
token_tags, full_tag_test = make_dataset(gikrya_test, stemmer, 
                                            morpho_map, cat_order,undef_token, 
                                            token2id, flex2id, char2id, neighbors=neighbors)

In [27]:
X_train.shape, X_flex_train.shape, X_train.shape, len(y_train)

((815884, 20), (815884, 7), (815884, 20), 13)

In [8]:
# X_train.shape, X_flex_train.shape, X_train.shape, len(y_train)

In [9]:
X_test.shape, X_flex_test.shape, X_test.shape, len(y_test)

((270264, 20), (270264, 11), (270264, 20), 13)

In [7]:
len(token2id), len(flex2id), len(char2id)

(60000, 450, 50)

In [52]:
X_stem, X_flex, X = np.vstack((X_stem_train, X_stem_test)),\
                                   np.vstack((X_flex_train, X_flex_test)),\
                                   np.vstack((X_train, X_test))

y = [np.vstack((train, test)) for train, test in zip(y_train, y_test)]

In [32]:
token_vecs.shape

(50223, 128)

# A PLAN:
Итак пока идея такая: берем [mnist_hierarchical_rnn.py](https://github.com/fchollet/keras/blob/master/examples/mnist_hierarchical_rnn.py) 
и юзаем
На вход подаем вектор стемма и отдельно вектор флексии, думаю для этого можно использовать
embedding layers

TO DO:
- ~~запилить хотя бы просто CountVectorizer для формирования интов на входы
embeddings~~ 

> запилил build_vocab

- ~~запилить преобразования целевых тэгов в onehot или еще как~~
- ~~поднять сеть HierarchicalRNN со стеммаим на входе  и только POS на выходе~~
- ~~допилить туда флексии~~
- ~~попробовать поменять LSTM на SimpleRNN~~

> учиться чуть быстрее(примерно в 4 раза) результаты чуть медленнее сходятся (примерно в 2 раз)..

- ~~попробовать GRU ?~~
- ~~допилить классификаторы для остальных тэггов~~
- ~~make ud format output for eval task results~~
- ~~прописать коллбэки для earl
y_stopping и сохранения лучшей модели~~
- попробовать контекст побольше?
- настройка геракла
- сделать замену пунктуации на тэг PUNCT
- сделать скрипт обучения модели, вход: [коллекция, директория для модели, [тест для валидации]]
выход: сохраненная модель
- сделать скрипт разметки вход: [коллекция, модель] выход: ud разметка
- проверить UNDEFINED
- ???
- profit

#### Литература:

1. [A Hierarchical Neural Autoencoder for Paragraphs and Documents](https://arxiv.org/pdf/1506.01057.pdf)

In [61]:
from __future__ import print_function

import keras
from keras.datasets import mnist
from keras.models import Model
from keras.layers import Input, Dense, TimeDistributed, Embedding, Bidirectional, Merge
from keras.layers import LSTM, SimpleRNN, GRU, Dropout, RepeatVector
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import model_from_json
import os


In [27]:
# X = X_train
# X_flex = X_flex_train
# y = y_train[:, :len(morpho_map["POS"])]



# Embedding dimensions.
# tok2vec_dim = 128
# flex2vec_dim = 32
# char2vec_dim = 20

token_hidden = 128
flex_hidden = 64
char_hidden = 128
token_vecs.shape

(60001, 200)

In [31]:
len(ch), (X.shape)

(80, (1086148, 20))

In [75]:
stem_in = Input(shape=(X_stem_train.shape[1],))
flex_in = Input(shape=(X_flex_train.shape[1],))
x_in = Input(shape=(X_train.shape[1],))

# stem_only_in = Input(shape=(1,))
# flex_only_in = Input(shape=(1,))

stem_embedding = Embedding(input_dim=token_vecs.shape[0], 
                           output_dim=token_vecs.shape[1],
                           weights=[token_vecs])
# root_embedding.trainable = False
#root_embedding_r = Reshape((X_train.shape[1], tok2vec_dim, 1))(root_embedding)
#root_encoded_col = TimeDistributed(LSTM(token_hidden,
#                                  dropout_U=0.1, 
#                                  dropout_W=0.1))(root_embedding_r)


flex_embedding = Embedding(input_dim=flex_vecs.shape[0],
                           output_dim=flex_vecs.shape[1],
                           weights=[flex_vecs])
# flex_embedding.trainable = False
#flex_embedding_r = Reshape((X_flex_train.shape[1], flex2vec_dim, 1))(flex_embedding)
#flex_encoded_col = TimeDistributed(LSTM(flex_hidden,
#                                  dropout_U=0.1, 
#                                  dropout_W=0.1))(flex_embedding_r)
char_embedding = Embedding(input_dim=len(char2id) + 1,
                           output_dim=char2vec_dim)


encoded_stem = Bidirectional(LSTM(token_hidden,
                                   dropout_U=0.1, 
                                   dropout_W=0.1))(stem_embedding(stem_in))


encoded_flex = Bidirectional(LSTM(flex_hidden,
                                   dropout_U=0.1, 
                                   dropout_W=0.1))(flex_embedding(flex_in))                


encoded_x = Bidirectional(LSTM(char_hidden,
                                   dropout_U=0.1, 
                                   dropout_W=0.1))(char_embedding(x_in))    


# stem_vec = keras.layers.Flatten()(root_embedding(stem_only_in))
# flex_vec = keras.layers.Flatten()(flex_embedding(flex_only_in))
# x_vec = keras.layers.Flatten()(flex_embedding(flex_only_in))


merge_encoded = keras.layers.merge([encoded_stem, 
                                    encoded_flex,
                                    encoded_x],
                                    mode='concat')


drop = Dropout(0.25)(merge_encoded)
output = Dense(output_dim=400, activation='softmax')(drop)
dropped_output = Dropout(0.5)(output)
# prediction = Dense(output_dim=num_classes, activation='softmax')(merge_encoded)
# pos_out = Dense(output_dim=morpho_map['POS'], activation='softmax')(dropped_output)
# repited_hidden = RepeatVector()
predictions = [Dense(output_dim=tag_y.shape[1], activation='softmax', name=cat)(dropped_output)
              for cat, tag_y in zip(cat_order, y_train)]



In [64]:
# morpho_map['POS']
x = model.layers[-1]
x

'dense_84'

In [76]:
model = Model([stem_in, flex_in, x_in], predictions)
model.compile(loss='categorical_crossentropy', 
              optimizer='rmsprop',
             metrics=['accuracy'])

In [23]:
model_path = "../models/model_4/"
open(os.path.join(model_path, 'model_json.arch'), 'w' ).write(model.to_json())
model_checkpoint = ModelCheckpoint(os.path.join(model_path,'weights.model'),
                                   monitor='val_loss', verbose=1,
                                   save_best_only=True, mode='auto')

early_stopping = EarlyStopping( monitor='val_loss', patience=5, verbose=1, mode='auto')

KeyError: 'input_6_ib-0'

###  model.fit  происходит здесь:

In [77]:
cat_order

['Animacy',
 'Case',
 'Degree',
 'Form',
 'Gender',
 'Mood',
 'Number',
 'POS',
 'Person',
 'Tense',
 'Variant',
 'VerbForm',
 'Voice']

In [80]:
batch_size = 512
epochs = 15
model.fit([X_stem, X_flex, X], y, validation_split=0.07,
          #validation_data=([X_stem_test, X_flex_test, X_test], y_test),
          batch_size=batch_size, nb_epoch=epochs, 
          verbose=2) #, callbacks=[model_checkpoint, early_stopping])

Train on 1360643 samples, validate on 102415 samples
Epoch 1/15
564s - loss: 7.9048 - Animacy_loss: 0.5036 - Case_loss: 1.1747 - Degree_loss: 0.4203 - Form_loss: 0.1219 - Gender_loss: 0.9080 - Mood_loss: 0.3448 - Number_loss: 0.7662 - POS_loss: 1.7803 - Person_loss: 0.4858 - Tense_loss: 0.4314 - Variant_loss: 0.1412 - VerbForm_loss: 0.4640 - Voice_loss: 0.3625 - Animacy_acc: 0.8305 - Case_acc: 0.6250 - Degree_acc: 0.8802 - Form_acc: 0.9918 - Gender_acc: 0.6571 - Mood_acc: 0.9170 - Number_acc: 0.6637 - POS_acc: 0.4382 - Person_acc: 0.8896 - Tense_acc: 0.9028 - Variant_acc: 0.9860 - VerbForm_acc: 0.8918 - Voice_acc: 0.9083 - val_loss: 4.0095 - val_Animacy_loss: 0.2227 - val_Case_loss: 0.7628 - val_Degree_loss: 0.1569 - val_Form_loss: 0.0688 - val_Gender_loss: 0.6095 - val_Mood_loss: 0.1104 - val_Number_loss: 0.4256 - val_POS_loss: 0.9201 - val_Person_loss: 0.2916 - val_Tense_loss: 0.1533 - val_Variant_loss: 0.0529 - val_VerbForm_loss: 0.1153 - val_Voice_loss: 0.1196 - val_Animacy_acc: 0.

<keras.callbacks.History at 0x7f68faa620f0>

In [37]:
# model.layers

[<keras.engine.topology.InputLayer at 0x7f6992b77080>,
 <keras.engine.topology.InputLayer at 0x7f6992b77048>,
 <keras.layers.embeddings.Embedding at 0x7f6992b77320>,
 <keras.layers.embeddings.Embedding at 0x7f6992b77518>,
 <keras.layers.wrappers.Bidirectional at 0x7f6992b77438>,
 <keras.layers.wrappers.Bidirectional at 0x7f69918541d0>,
 <keras.engine.topology.Merge at 0x7f69931c3518>,
 <keras.layers.core.Dropout at 0x7f6991845630>,
 <keras.layers.core.Dense at 0x7f6997d55860>,
 <keras.layers.core.Dropout at 0x7f6997cfbc88>,
 <keras.layers.core.Dense at 0x7f6997de80b8>,
 <keras.layers.core.Dense at 0x7f6997e0fba8>,
 <keras.layers.core.Dense at 0x7f6997e0a710>,
 <keras.layers.core.Dense at 0x7f6997e06550>,
 <keras.layers.core.Dense at 0x7f6997e13668>,
 <keras.layers.core.Dense at 0x7f6997dfddd8>,
 <keras.layers.core.Dense at 0x7f6997df9978>,
 <keras.layers.core.Dense at 0x7f6997e18da0>,
 <keras.layers.core.Dense at 0x7f6a1be500b8>,
 <keras.layers.core.Dense at 0x7f6a1d5b68d0>,
 <keras.la

In [79]:
batch_size = 512
epochs = 1
model.fit([X_stem, X_flex, X], y, validation_split=0.05,
          # validation_data=([X_stem_test, X_flex_test, X_test], y_test)
          batch_size=batch_size, nb_epoch=epochs,
          verbose=2, callbacks=[model_checkpoint, early_stopping])

NameError: name 'model_checkpoint' is not defined

In [21]:
model.layers

[<keras.engine.topology.InputLayer at 0x7fe1e6af8588>,
 <keras.engine.topology.InputLayer at 0x7fe1e6af8550>,
 <keras.engine.topology.InputLayer at 0x7fe1e6af8748>,
 <keras.layers.embeddings.Embedding at 0x7fe1e6af8f28>,
 <keras.layers.embeddings.Embedding at 0x7fe1e5f94160>,
 <keras.layers.embeddings.Embedding at 0x7fe1e5f94198>,
 <keras.layers.wrappers.Bidirectional at 0x7fe1e5f94358>,
 <keras.layers.wrappers.Bidirectional at 0x7fe16d10a438>,
 <keras.layers.wrappers.Bidirectional at 0x7fe16cf58470>,
 <keras.engine.topology.Merge at 0x7fe16d1110b8>,
 <keras.layers.core.Dropout at 0x7fe16cf58780>,
 <keras.layers.core.Dense at 0x7fe16ccab1d0>,
 <keras.layers.core.Dropout at 0x7fe16ccab748>,
 <keras.layers.core.Dense at 0x7fe16cca49b0>,
 <keras.layers.core.Dense at 0x7fe16cccec88>,
 <keras.layers.core.Dense at 0x7fe16ccd28d0>,
 <keras.layers.core.Dense at 0x7fe16ccc28d0>,
 <keras.layers.core.Dense at 0x7fe16cca4e80>,
 <keras.layers.core.Dense at 0x7fe16ccfcf60>,
 <keras.layers.core.Dense

In [36]:
model.fit([X_train, X_flex_train, X_train], y_train, validation_split=0.1,
          batch_size=batch_size, nb_epoch=epochs,
          verbose=2)

ValueError: Error when checking model input: expected input_4 to have shape (None, 11) but got array with shape (815884, 20)

In [None]:
model_path = ""
json_string = model.to_json()
open(os.path.join(model_path, "json_model"), "w").write(json_string)

In [515]:
one_hots = many_probs_to_one_hot(predicted)

In [544]:
predicted[7][0], one_hots[7][0]

(array([  1.74051713e-12,   9.99907136e-01,   9.10107519e-06,
          5.71419914e-05,   1.72988200e-06,   5.51274637e-10,
          1.09171277e-12,   4.06685899e-07,   3.89459203e-08,
          2.32082493e-05,   1.72790578e-07,   1.04286542e-07,
          7.78666674e-07,   2.22505481e-09], dtype=float32),
 array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [64]:
np.mean(pred_POS ==  y.argmax(axis=1))

0.97397359618743151


### SOME RESULTS:

#### первые шаги:
gikrya_train, вход: флексии + стеммы, выход: POS, словарь токенов max_size=2000, словарь флексий max_size=500, val_split=0.1, промежуточный результат:

>Train on 734295 samples, validate on 81589 samples

>Epoch 1/5: 2436s - loss: 0.3060 - acc: 0.9024 - val_loss: 0.1234 - val_acc: 0.9630

>Epoch 2/5: 2407s - loss: 0.1219 - acc: 0.9637 - val_loss: 0.1003 - val_acc: 0.9687

>Epoch 3/5: 2390s - loss: 0.1023 - acc: 0.9694 - val_loss: 0.0910 - val_acc: 0.9727

>Epoch 4/5: 2401s - loss: 0.0938 - acc: 0.9724 - val_loss: 0.0852 - val_acc: 0.9752

> Epoch 5/5: 2397s - loss: 0.0888 - acc: 0.9744 - val_loss: 0.0853 - val_acc: 0.9764

+

>Epoch 6: 2397s - loss: 0.0866 - acc: 0.9754 - val_loss: 0.0901 - val_acc: 0.9757

#### Замена LSTM на Simple RNN
> Epoch 2: 663s - loss: 0.1494 - acc: 0.9551 - val_loss: 0.1181 - val_acc: 0.9645

> Epoch 3: 664s - loss: 0.1340 - acc: 0.9598 - val_loss: 0.1116 - val_acc: 0.9674

+

> Epoch 4: 663s - loss: 0.1266 - acc: 0.9624 - val_loss: 0.1075 - val_acc: 0.9672


#### Замена GRU

>Train on 734295 samples, validate on 81589 samples

>Epoch 1/2: 1922s - loss: 0.2765 - acc: 0.9138 - val_loss: 0.1233 - val_acc: 0.9627

>Epoch 2/2: 1912s - loss: 0.1249 - acc: 0.9627 - val_loss: 0.0975 - val_acc: 0.9706

#### GRU all outputs:

Train on 734295 samples, validate on 81589 samples

> Epoch 3: 
2899s - loss: 0.7679 - dense_12_loss: 0.0605 - dense_13_loss: 0.1723 - dense_14_loss: 0.0552 - dense_15_loss: 0.0025 - dense_16_loss: 0.1316 - dense_17_loss: 0.0200 - dense_18_loss: 0.1102 - dense_19_loss: 0.1123 - dense_20_loss: 0.0242 - dense_21_loss: 0.0242 - dense_22_loss: 0.0117 - dense_23_loss: 0.0228 - dense_24_loss: 0.0203 - dense_12_acc: 0.9812 - dense_13_acc: 0.9423 - dense_14_acc: 0.9819 - dense_15_acc: 0.9993 - dense_16_acc: 0.9582 - dense_17_acc: 0.9940 - dense_18_acc: 0.9623 - dense_19_acc: 0.9659 - dense_20_acc: 0.9926 - dense_21_acc: 0.9925 - dense_22_acc: 0.9964 - dense_23_acc: 0.9932 - dense_24_acc: 0.9938 - val_loss: 0.6724 - val_dense_12_loss: 0.0527 - val_dense_13_loss: 0.1537 - val_dense_14_loss: 0.0475 - val_dense_15_loss: 0.0019 - val_dense_16_loss: 0.1114 - val_dense_17_loss: 0.0184 - val_dense_18_loss: 0.1004 - val_dense_19_loss: 0.0976 - val_dense_20_loss: 0.0211 - val_dense_21_loss: 0.0209 - val_dense_22_loss: 0.0090 - val_dense_23_loss: 0.0206 - val_dense_24_loss: 0.0173 - val_dense_12_acc: 0.9832 - val_dense_13_acc: 0.9469 - val_dense_14_acc: 0.9844 - val_dense_15_acc: 0.9995 - val_dense_16_acc: 0.9650 - val_dense_17_acc: 0.9945 - val_dense_18_acc: 0.9670 - val_dense_19_acc: 0.9697 - val_dense_20_acc: 0.9943 - val_dense_21_acc: 0.9937 - val_dense_22_acc: 0.9974 - val_dense_23_acc: 0.9941 - val_dense_24_acc: 0.9948

>Epoch 4:
2892s - loss: 0.7051 - dense_12_loss: 0.0549 - dense_13_loss: 0.1598 - dense_14_loss: 0.0513 - dense_15_loss: 0.0022 - dense_16_loss: 0.1193 - dense_17_loss: 0.0182 - dense_18_loss: 0.1010 - dense_19_loss: 0.1039 - dense_20_loss: 0.0221 - dense_21_loss: 0.0222 - dense_22_loss: 0.0109 - dense_23_loss: 0.0208 - dense_24_loss: 0.0184 - dense_12_acc: 0.9832 - dense_13_acc: 0.9467 - dense_14_acc: 0.9833 - dense_15_acc: 0.9994 - dense_16_acc: 0.9626 - dense_17_acc: 0.9946 - dense_18_acc: 0.9658 - dense_19_acc: 0.9685 - dense_20_acc: 0.9933 - dense_21_acc: 0.9933 - dense_22_acc: 0.9967 - dense_23_acc: 0.9940 - dense_24_acc: 0.9946 - val_loss: 0.6327 - val_dense_12_loss: 0.0487 - val_dense_13_loss: 0.1437 - val_dense_14_loss: 0.0461 - val_dense_15_loss: 0.0016 - val_dense_16_loss: 0.1043 - val_dense_17_loss: 0.0196 - val_dense_18_loss: 0.0904 - val_dense_19_loss: 0.0913 - val_dense_20_loss: 0.0180 - val_dense_21_loss: 0.0209 - val_dense_22_loss: 0.0085 - val_dense_23_loss: 0.0214 - val_dense_24_loss: 0.0182 - val_dense_12_acc: 0.9846 - val_dense_13_acc: 0.9505 - val_dense_14_acc: 0.9841 - val_dense_15_acc: 0.9996 - val_dense_16_acc: 0.9680 - val_dense_17_acc: 0.9944 - val_dense_18_acc: 0.9700 - val_dense_19_acc: 0.9720 - val_dense_20_acc: 0.9951 - val_dense_21_acc: 0.9938 - val_dense_22_acc: 0.9974 - val_dense_23_acc: 0.9941 - val_dense_24_acc: 0.9947


eval на gikrya_test:

> 149081 меток из 171550, точность 86.90%

> 8454 предложений из 20787, точность 40.67%

#### LSTM 
eval на gikrya_test:

> 149674 меток из 171550, точность 87.25%

> 8751 предложений из 20787, точность 42.10%


This is an example of using Hierarchical RNN (HRNN) to classify MNIST digits.
HRNNs can learn across multiple levels of temporal hiearchy over a complex sequence.
Usually, the first recurrent layer of an HRNN encodes a sentence (e.g. of word vectors)
into a  sentence vector. The second recurrent layer then encodes a sequence of
such vectors (encoded by the first layer) into a document vector. This
document vector is considered to preserve both the word-level and
sentence-level structure of the context.
# References
    - [A Hierarchical Neural Autoencoder for Paragraphs and Documents](https://arxiv.org/abs/1506.01057)
        Encodes paragraphs and documents with HRNN.
        Results have shown that HRNN outperforms standard
        RNNs and may play some role in more sophisticated generation tasks like
        summarization or question answering.
    - [Hierarchical recurrent neural network for skeleton based action recognition](http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7298714)
        Achieved state-of-the-art results on skeleton based action recognition with 3 levels
        of bidirectional HRNN combined with fully connected layers.
In the below MNIST example the first LSTM layer first encodes every
column of pixels of shape (28, 1) to a column vector of shape (128,). The second LSTM
layer encodes then these 28 column vectors of shape (28, 128) to a image vector
representing the whole image. A final Dense layer is added for prediction.
After 5 epochs: train acc: 0.9858, val acc: 0.9864

In [186]:
# # Training parameters.
# batch_size = 32
# num_classes = 10
# epochs = 5

# # Embedding dimensions.
# row_hidden = 128
# col_hidden = 129

# # The data, shuffled and split between train and test sets.
# (x_train, y_train), (x_test, y_test) = mnist.load_data()#"/home/users1/keras_datasets/")

# # Reshapes data to 4D for Hierarchical RNN.
# x_train = x_train.reshape(x_train.shape[0], 28, 28, 1)
# x_test = x_test.reshape(x_test.shape[0], 28, 28, 1)
# x_train = x_train.astype('float32')
# x_test = x_test.astype('float32')
# x_train /= 255
# x_test /= 255
# print('x_train shape:', x_train.shape)
# print(x_train.shape[0], 'train samples')
# print(x_test.shape[0], 'test samples')

# # Converts class vectors to binary class matrices.
# y_train = keras.utils.np_utils.to_categorical(y_train, num_classes)
# y_test = keras.utils.np_utils.to_categorical(y_test, num_classes)

# row, col, pixel = x_train.shape[1:]

# # 4D input.
# x = Input(shape=(row, col, pixel))

# # Encodes a row of pixels using TimeDistributed Wrapper.
# encoded_rows = TimeDistributed(LSTM(row_hidden))(x)

# # Encodes columns of encoded rows.
# encoded_columns = LSTM(col_hidden)(encoded_rows)

# Final predictions and model.
# prediction = Dense(num_classes, activation='softmax')(encoded_columns)
# model = Model(x, prediction)
# model.compile(loss='categorical_crossentropy',
#               optimizer='rmsprop',
#               metrics=['accuracy'])

# Training.
# model.fit(x_train, y_train,
#           batch_size=batch_size, nb_epoch=epochs,
#           verbose=2, validation_data=(x_test, y_test))

# # Evaluation.
# scores = model.evaluate(x_test, y_test, verbose=0)
# print('Test loss:', scores[0])
# print('Test accuracy:', scores[1])

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples




(TensorShape([Dimension(None), Dimension(28), Dimension(128)]),
 TensorShape([Dimension(None), Dimension(129)]))

array([[1, 2],
       [3, 4]])