# Витин код для Диалоговской морфологии (последовательный и работающий...)

In [1]:
import numpy as np
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from gensim import corpora
from collections import defaultdict
import pickle
import pprint
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Model
from keras.layers import Input, Dense, TimeDistributed, Embedding, Bidirectional, Merge
from keras.layers import LSTM, SimpleRNN, GRU, Dropout, RepeatVector
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import model_from_json
import os

Using TensorFlow backend.


 ## Сперва подготовим все для обучения

### 1) pickle со словарем всех категорий и их id

In [2]:
UNDEFINED = "_"
MAX_WORD_LENGTH = 20

def read_gikrya(path):
    """
    Читает гикря трайн/тест файл, вытаскивает всю инфу 
    записывает в словарик все категории и их значения (+ индексы для них генерируются)
    и предложения собирает
    Reading format:
    row_index<TAB>form<TAB>lemma<TAB>POS<TAB>tag
    return sentences: array of sentence arrays. each sentence array has this structure:
           [('рука', # token
           'рука', # lemma
           [('POS', 'NOUN'), # Gramms
            ('Animacy', 'Inan'),
            ('Case', 'Nom'),
            ('Gender', 'Fem'),
            ('Number', 'Sing')]), () ... () ]
    morpho_map: { ... 
    'Number': {0: '_', 1: 'Sing', 2: 'Plur', 'Plur': 2, 'Sing': 1, '_': 0}, ... }
    """
    
    morpho_map = {"POS":{UNDEFINED: 0, 
                         0: UNDEFINED}}
    sentences = []
    vocab = {}    
    with open(path, 'r') as f:
        
        sentence = []
        for line in f:
            splits = line.strip().split('\t')      
            if len(splits) == 4:
                splits.insert(0, 1)
            if len(splits) == 5:
                form, lemma, POS, tags = splits[1:]
                if POS not in  morpho_map["POS"]:
                    morpho_map["POS"][POS] = len(morpho_map["POS"]) // 2 
                    morpho_map["POS"][morpho_map["POS"][POS]] =  POS
                tags_list = [("POS", POS)]
                if tags != "_":
                    for tag_val in tags.split("|"):
                        tag, val = tag_val.split("=")
                        tags_list.append((tag, val))
                        if tag not in morpho_map:
                            morpho_map[tag] = {UNDEFINED: 0,
                                               0: UNDEFINED}
                        if val not in morpho_map[tag]:
                            morpho_map[tag][val] = len(morpho_map[tag]) // 2 
                            morpho_map[tag][morpho_map[tag][val]] = val
                if form not in vocab:
                    vocab[form] = form
                sentence.append((vocab[form], lemma, tags_list) )
            elif len(sentence) > 0:
                sentences.append(sentence)
                sentence = []
    return sentences, morpho_map


In [3]:
path_to_tagged = "../data/gikrya_train.txt"
path_to_write_morpho = "../models/morpho.pickle"
sentences_full, morpho_map = read_gikrya(path_to_tagged)

cat_order = sorted([key for key in morpho_map.keys()]) # sort categories
pickle.dump((morpho_map, cat_order), open(path_to_write_morpho, 'wb')) # save dict and categories

### 2)  стеммы и флексии 

Их нужно во-первых создать из тренировочных файлов

во-вторых сделать word2vec модели из данных большого корпуса (кажется новостного)

In [35]:
def split_word(word, stemmer):
    """Отрезает флексию от слова. 
    Возвращает стем и флексию"""
    flex = word[len(stemmer.stem(word)):]
    if len(flex):
        return word[:-len(flex)], flex
    return word, "empty"

def preproc_dataset(full_tag_sentences, stemmer):
    """ 
    :param full_tag_sentences: array of sentence arrays. each sentence array has this structure:
           [('рука', # token
           'рука', # lemma
           [('POS', 'NOUN'), # Gramms
            ('Animacy', 'Inan'),
            ('Case', 'Nom'),
            ('Gender', 'Fem'),
            ('Number', 'Sing')]), () ... () ]
    :param stemmer: Snowball Stemmer
    return: sentences - array of stem's sentences 
    flexes - array of flex's sentences
    token_tags - array of tags's sentences
    """
    sentences = []
    flexes = []
    token_tags = []
    tokens = []
    
    for sent in full_tag_sentences:
        temp_sent = []
        temp_flexes = []
        for token_info in sent:
            token = token_info[0].lower()
            tokens.append(token)
            splits = split_word(token, stemmer) # tuple of (stem, flex)
            temp_sent.append(splits[0])
            temp_flexes.append(splits[1])
            token_tags.append(token_info[2])  # надо бы переделать под стиль sentences или?          
        sentences.append(temp_sent)
        flexes.append(temp_flexes)    
    return sentences, flexes, token_tags, tokens

In [36]:
# Stemmer берем/ Обычный сноубол из nltk
# и из тренировочного датасета делаем наборы флексий, стемм и т.д.
stemmer = SnowballStemmer("russian")
sentences, flexes, token_tags, tokens = preproc_dataset(sentences_full, stemmer)
print(sentences[:1])
print(flexes[:1])
print(token_tags[:1])
print(tokens[:1])

[['чья-т', 'рук', 'легл', 'ем', 'на', 'плеч', '.']]
[['о', 'а', 'а', 'у', 'empty', 'о', 'empty']]
[[('POS', 'DET'), ('Case', 'Nom'), ('Gender', 'Fem'), ('Number', 'Sing')]]
['чья-то']


Берем тексты уже готовые для эмбеддингов стемм и флексий. Кажется это были новости Лента

In [6]:
def read_corpus(path):
    sentences = []
    with open(path, 'r') as f:
        for line in f:
            sentences.append(line.strip().lower().split())
    return sentences

def get_tokens(sentences):
    tokens = []
    for sent in sentences:
        for token in sent:
            tokens.append(token)
    return tokens

stem_path = '../for_embedding/allTexts_stemmas.txt'
flex_path = '../for_embedding/allTexts_flexias.txt'

stemmas = read_corpus(stem_path)
flexias = read_corpus(flex_path)

# array of tokens
stemmas = get_tokens(stemmas)
flexias = get_tokens(flexias)

Из флексий и стеммов сделать словари, перекодировать их в id

In [8]:
UNDEFINED_TOKEN = "undefined_token"

def build_vocab(sentences, min_freq=0, max_size=10000, undefined_id=0):
    """ 
    Строит словарь из слов встертившихся более min_freq раз,
    но размеров  не более max_size, в случае бОльшего количества токенов
    отбрасываются менее частотные токены, undefined_id - id первого токена в словаре,
    который будет называться "undefined_token"
    """
    offset = undefined_id
    token2id = {UNDEFINED_TOKEN: offset}
    id2token = {offset: UNDEFINED_TOKEN}    
    
    counter = defaultdict(int)    
    for sentence in sentences:
        for token in sentence:
            counter[token] += 1
    sorted_tokens = [t_f[0]  for t_f in 
                     sorted([t_f for t_f in counter.items() if t_f[1] >= min_freq],
                           key=lambda tf: -tf[1])]                     
    
    for token in sorted_tokens[:max_size - len(token2id)]:
        offset += 1
        token2id[token] = offset
        id2token[offset] = token
    return token2id, id2token 

# ну вообще-то скорее stem2id
token2id, id2token = build_vocab(stemmas, 
                                 min_freq=1,
                                 max_size=80000)
flex2id, id2flex = build_vocab(flexias, 
                               min_freq=2, 
                               max_size=500)

len(token2id), len(flex2id)

(1514, 26)

Тепероь обучить word2vec модели и сохранить их и словари:

In [12]:
def write_vecs(path, vecs_path, id2token, w2v_model):
    # косяк с тем чтон undefined token не 0
    vecs = np.zeros(shape=(len(token2id), w2v_model.vector_size))
    with open(path, 'w') as f:
        for tid in range(len(id2token)):
            vecs[tid, :] = w2v_model[id2token[tid]]
            f.write(id2token[tid])
            f.write("\n")
    np.save(vecs_path, vecs)

# ???
stem2stem = {}
for stem in token2id.keys():
    stem2stem[stem] = stem

flex2flex = {}
for flex in flex2id.keys():
    flex2flex[flex] = flex

new_sents = [[stem2stem.get(token, UNDEFINED_TOKEN) for token in sent] for sent in sentences]
new_flexes = [[flex2flex.get(token, UNDEFINED_TOKEN) for token in sent] for sent in flexes]
len(stem2stem), len(flex2flex)

(1514, 26)

In [14]:
#  сделать word2vec модели
stem_model = Word2Vec(new_sents, size=200, sg=1, workers=5, iter=10, min_count=1)
flex_model = Word2Vec(new_flexes, size=128, sg=1, workers=5, iter=10, min_count=1)

In [15]:
# записать модели
prefix = "../models"
write_vecs(os.path.join(prefix,"stem2id"),
           os.path.join(prefix, "stem_embeddings"),
           id2token, stem_model)

write_vecs(os.path.join(prefix, "flex2id"), 
           os.path.join(prefix, "flex_embeddings"),
           id2flex, flex_model)

KeyError: "word 'н' not in vocabulary"

## Само обучение модели (когда все эмбеддинги уже есть)

In [39]:
def chars_to_features(tokens, char2id):
    X = np.zeros(shape=(len(tokens), MAX_WORD_LENGTH), dtype=np.int)
    for idx, token in enumerate(tokens):
        for chid in range(min(MAX_WORD_LENGTH, len(token))):
            X[idx, -chid-1] = char2id.get(token[-chid-1], len(char2id))
    return X

def sentences_to_features(sentences, token2id, neighbors=3, undef_token="undefined_token"):
    arrays = [sentence_to_features(sent, token2id,  neighbors=neighbors,
                                  undef_token=undef_token) for sent in sentences]
    return np.vstack(arrays)

def sentence_to_features(sentence, token2id, 
                         neighbors=3, undef_token="undefined_token"):
    """
    Делает из предложения 
    матрицу id слов, где  строка соответствует словам предложения:
    в каждой строке состоит из neighbors id слов из левого контекста,
    потом id слова, затем neighbors id слов правого контекста
    0 - зарезерврован для паддинга, в словаре не должно быть слов с id 0
    """
    X = np.ones(shape=(len(sentence), neighbors * 2 + 1), dtype=np.int) * len(token2id)
    id_seq = np.zeros(shape=(len(sentence) + 2*neighbors,), dtype=np.int)
    for idx, token in enumerate(sentence):
        num = token2id.get(token, token2id[undef_token])
        id_seq[idx+neighbors] = num
    for idx in range(len(sentence)):
        X[idx, :] = id_seq[idx:idx + X.shape[1]]
    return X

def tagsets_to_one_hot(tagsets, morpho_map, cat_order):    
    # при частых запусках не оптимально так:
    # cats = set([cat for cat, val in tag2id.keys()])
    y = [np.zeros(shape=(len(tagsets), len(morpho_map[cat]) // 2), dtype=np.int) 
         for cat in cat_order]
    
    for one_hot in y:
        one_hot[:, 0] = 1       
        
    for idx, tagset in enumerate(tagsets):                    
        for cat, tag in tagset:
            # не очень эффективно индекс искать постоянно
            
            cat_id = cat_order.index(cat)    
            if cat_id >= 0:
                y[cat_id][idx, 0] = 0
                y[cat_id][idx, morpho_map[cat].get(tag, 0)] = 1            
    return y

def make_dataset(path, stemmer,
                      morpho_map, cat_order, undef_token,
                      token2id, flex2id, char2id,
                      neighbors=3):
    full_tag_sentences, _ = read_gikrya(path)    # up
    print('Done')
    sentences, flexes, token_tags, tokens = preproc_dataset(full_tag_sentences, stemmer) # up 
    print('Preprocessed')
    X_stem = sentences_to_features(sentences, token2id,
                                   neighbors=neighbors, undef_token=undef_token)
    print("Stems are ready")
    X_flex = sentences_to_features(flexes, flex2id, 
                                   neighbors=neighbors,
                                   undef_token=undef_token)
    print("Flexes are ready")
    X = chars_to_features(tokens, char2id)
    y = tagsets_to_one_hot(token_tags, morpho_map, cat_order)
    return X_stem, X_flex, X, y, sentences, flexes, token_tags, full_tag_sentences


def read_embeddings(vocab_path, emb_path):
    """Считываем файлы уже гтовых эмбеддингов
     return: 
     data2id - dictionary {char: id}
     vectors - 
     tokens - только undefined_token тут важен
     """
    token2id = {}
    tokens = open(vocab_path, "r").read().strip().split("\n")    
    for i, ch in enumerate(tokens):
        token2id[ch] = i
    vecs = np.load(emb_path)
    rnd_vec = np.random.uniform(size=vecs.shape[1]) 
    return token2id, np.vstack((vecs, rnd_vec / np.linalg.norm(rnd_vec))),  tokens[0]


In [50]:
# загружаем уже готовый pickle со словарем
morpho_map, cat_order =  pickle.load(open("../models/morpho.pickle", 'rb'))
# загружаем уже готовые эмбеддинги стемов, флексий и лемм 
token2id, token_vecs, undef_token = read_embeddings(os.path.join(prefix, "stem2id"),
                                      os.path.join(prefix, "stem_embeddings.npy"))

flex2id, flex_vecs, _ = read_embeddings(os.path.join(prefix, "flex2id"),
                                     os.path.join(prefix, "flex_embeddings.npy"))

char2id = {}
chars = open("char2id", "r").read().strip().split("\n")
for i, ch in enumerate(chars):
    char2id[ch] = i

stemmer = SnowballStemmer("russian")

neighbors = 3
gikrya_path = "../data/gikrya_train.txt"
X_stem_train, X_flex_train, X_train, y_train, sentences, flexes,\
token_tags, full_tag_sentences = make_dataset(gikrya_path, stemmer, 
                                            morpho_map, cat_order, undef_token, 
                                            token2id, flex2id, char2id, neighbors=neighbors)

gikrya_test = "../data/gikrya_test.txt"
X_stem_test, X_flex_test, X_test, y_test, sentences, flexes,\
token_tags, full_tag_test = make_dataset(gikrya_test, stemmer, 
                                            morpho_map, cat_order,undef_token, 
                                            token2id, flex2id, char2id, neighbors=neighbors)

# как-то это совмещается
X_stem, X_flex, X = np.vstack((X_stem_train, X_stem_test)),\
                                   np.vstack((X_flex_train, X_flex_test)),\
                                   np.vstack((X_train, X_test))

y = [np.vstack((train, test)) for train, test in zip(y_train, y_test)]

Done
Preprocessed
Stems are ready
Flexes are ready
Done
Preprocessed
Stems are ready
Flexes are ready


In [51]:
token_hidden = 128
flex_hidden = 64
char2vec_dim = 42
char_hidden = 512
stem_hidden = 128
flex_hidden = 128

char_in = Input(shape=(X.shape[1],))
char_embedding = Embedding(input_dim=len(char2id) + 1,
                           output_dim=char2vec_dim)       

encoded_char = Bidirectional(LSTM(char_hidden,
                                   dropout_U=0.2, 
                                   dropout_W=0.2))(char_embedding(char_in))    


stem_in = Input(shape=(X_stem_train.shape[1],))
stem_embedding = Embedding(input_dim=token_vecs.shape[0],
                           output_dim=token_vecs.shape[1],
                           weights=[token_vecs])       

encoded_stem = LSTM(stem_hidden,
                    dropout_U=0.2, 
                    dropout_W=0.2)(stem_embedding(stem_in))

merged = keras.layers.merge([encoded_char, encoded_stem], mode='concat')
# merged = keras.layers.merge([encoded_char, encoded_flex], mode='concat')
pos_predict = Dense(output_dim=y[cat_order.index('POS')].shape[1], 
            activation='softmax')(merged)

  
  name=name)


In [53]:
# компилим модель
model = Model([char_in, stem_in], pos_predict)
model.compile(loss='categorical_crossentropy', 
              optimizer='rmsprop',
             metrics=['accuracy'])

test_fraction = 0.07
shuffled_indicies = np.arange(X.shape[0])
np.random.shuffle(shuffled_indicies)
split_index = int(X.shape[0] * (1 - test_fraction))
train = shuffled_indicies[:split_index]
test = shuffled_indicies[split_index:]

# Запустить модель двойная со стемами
batch_size = 256
epochs = 10
model.fit([X[train], X_stem[train]], y[cat_order.index('POS')][train],          
          validation_data=([X[test], X_stem[test]], y[cat_order.index('POS')][test]),          
          batch_size=batch_size,           
          nb_epoch=epochs, 
          verbose=2)



Train on 1010117 samples, validate on 76031 samples
Epoch 1/10


KeyboardInterrupt: 

## Варианты моделей

#### Было несколько её модификаций. Просто со стемами была лучше
результаты: лучший за 10 эпох 0.9874

In [42]:
token_hidden = 128
flex_hidden = 64
char2vec_dim = 42
char_hidden = 512
stem_hidden = 128
flex_hidden = 128

char_in = Input(shape=(X_train.shape[1],))
char_embedding = Embedding(input_dim=len(char2id) + 1,
                           output_dim=char2vec_dim)       

encoded_char = Bidirectional(LSTM(char_hidden,
                                   dropout_U=0.2, 
                                   dropout_W=0.2))(char_embedding(char_in))    


stem_in = Input(shape=(X_stem_train.shape[1],))
stem_embedding = Embedding(input_dim=token_vecs.shape[0],
                           output_dim=token_vecs.shape[1],
                           weights=[token_vecs])       

encoded_stem = LSTM(stem_hidden,
                    dropout_U=0.2, 
                    dropout_W=0.2)(stem_embedding(stem_in))

merged = keras.layers.merge([encoded_char, encoded_stem], mode='concat')
pos_predict = Dense(output_dim=y_train[cat_order.index('POS')].shape[1], 
            activation='softmax')(merged)

  
  name=name)


#### Но были и другие версии:

In [54]:
# это для примера, не запускать!
# 1) Простая модификация. результаты: лучший за 20 эпох 0.976
token_hidden = 128
flex_hidden = 64
char2vec_dim = 42
char_hidden = 512

char_in = Input(shape=(X_train.shape[1],))
char_embedding = Embedding(input_dim=len(char2id) + 1,
                           output_dim=char2vec_dim)

encoded_char = Bidirectional(LSTM(char_hidden, 
                                  dropout_U=0.2, 
                                  dropout_W=0.2))(char_embedding(char_in))

pos_predict = Dense(output_dim=y[cat_order.index('POS')].shape[1], 
                    activation='softmax',)(encoded_char)

  


In [55]:
# пример с флексиями! не запускать
# результаты: лучший за 10 эпох 0.9773
token_hidden = 128
flex_hidden = 64
char2vec_dim = 42
char_hidden = 512
stem_hidden = 128
flex_hidden = 128
char_in = Input(shape=(X_train.shape[1],))
char_embedding = Embedding(input_dim=len(char2id) + 1, output_dim=char2vec_dim)

encoded_char = Bidirectional(LSTM(char_hidden, dropout_U=0.2, dropout_W=0.2))(char_embedding(char_in))

flex_in = Input(shape=(X_flex_train.shape[1],))
flex_embedding = Embedding(input_dim=flex_vecs.shape[0], 
                           output_dim=flex_vecs.shape[1], weights=[flex_vecs])
encoded_flex = LSTM(char_hidden, 
                    dropout_U=0.2, 
                    dropout_W=0.2)(flex_embedding(flex_in))

merged = keras.layers.merge([encoded_char, encoded_flex], 
                            mode='concat') 

pos_predict = Dense(output_dim=y[cat_order.index('POS')].shape[1], activation='softmax')(merged)


  if sys.path[0] == '':
  name=name)


In [None]:
# готовим всё для fit модели
test_fraction = 0.07
shuffled_indicies = np.arange(X.shape[0])
np.random.shuffle(shuffled_indicies)
split_index = int(X.shape[0] * (1 - test_fraction))
train = shuffled_indicies[:split_index]
test = shuffled_indicies[split_index:]

# fit для варианта с флексиями:
batch_size = 256
epochs = 10
model.fit([X[train], X_flex[train]], y[cat_order.index('POS')][train],          
          validation_data=([X[test], X_flex[test]], y[cat_order.index('POS')][test]),          
          batch_size=batch_size,           
          nb_epoch=epochs, 
          verbose=2)

# fit кажется для простого варианта
# batch_size = 256
# epochs = 10
# model.fit(X[train], y[cat_order.index('POS')][train],          
#          validation_data=(X[test], y[cat_order.index('POS')][test]),          
#          batch_size=batch_size, nb_epoch=epochs, 
#          verbose=2)
