In [5]:
import numpy as np
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from gensim import corpora
from collections import defaultdict
import pickle
import os
import pprint

In [6]:
# Placeholder used for a missing morphological value: it is the id-0 entry of
# every category map built by read_gikrya and is skipped by tagset2str.
UNDEFINED = "_"
# Number of character columns per token produced by chars_to_features.
MAX_WORD_LENGTH = 20


def read_gikrya(path):
    """
    Read a tab-separated, morphologically annotated corpus.

    Expected line format:
        row_index<TAB>form<TAB>lemma<TAB>POS<TAB>tags
    where ``tags`` is either "_" or a "|"-separated list of ``name=value``
    pairs. Any line that does not split into exactly five fields (for
    example a blank line) closes the current sentence.

    Returns
    -------
    sentences : list
        One list per sentence; every token is ``(form, lemma, tags_list)``
        where ``tags_list`` is a list of ``(category, value)`` pairs that
        always starts with ``("POS", POS)``.
    morpho_map : dict
        ``category -> dict`` storing both directions of the value/id mapping
        (``value -> id`` and ``id -> value``). Id 0 is UNDEFINED.

    Side effect: pretty-prints ``morpho_map`` before returning.
    """
    morpho_map = {"POS": {UNDEFINED: 0, 0: UNDEFINED}}

    def _register(category, value):
        # Every value occupies two entries (value -> id and id -> value),
        # hence the next free id is half of the current map size.
        mapping = morpho_map.setdefault(category, {UNDEFINED: 0, 0: UNDEFINED})
        if value not in mapping:
            new_id = len(mapping) // 2
            mapping[value] = new_id
            mapping[new_id] = value

    sentences = []
    # Maps each form to its first-seen string object so repeated forms share
    # one object instead of one copy per occurrence.
    vocab = {}
    # The corpus contains Cyrillic text; do not depend on the locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        sentence = []
        for line in f:
            splits = line.strip().split('\t')
            if len(splits) == 5:
                form, lemma, POS, tags = splits[1:]
                _register("POS", POS)
                tags_list = [("POS", POS)]
                if tags != UNDEFINED:
                    for tag_val in tags.split("|"):
                        tag, val = tag_val.split("=")
                        tags_list.append((tag, val))
                        _register(tag, val)
                if form not in vocab:
                    vocab[form] = form
                sentence.append((vocab[form], lemma, tags_list))
            elif len(sentence) > 0:
                sentences.append(sentence)
                sentence = []
        # A file that does not end with a separator line would otherwise
        # silently lose its last sentence.
        if len(sentence) > 0:
            sentences.append(sentence)
    pprint.pprint(morpho_map)
    return sentences, morpho_map
 

def split_word(word, stemmer):
    """
    Split ``word`` into ``(stem_part, ending)`` using the length of
    ``stemmer.stem(word)`` as the cut position.

    When nothing is left after the cut, the word is returned whole and the
    ending is the literal marker "empty".
    """
    cut = len(stemmer.stem(word))
    if cut < len(word):
        return word[:cut], word[cut:]
    return word, "empty"


def sentences_to_features(sentences, token2id, neighbors=3, undef_token="undefined_token"):
    """
    Build the context-window id matrix for every sentence and stack the
    per-sentence matrices vertically into a single array (one row per token).
    """
    per_sentence = []
    for sent in sentences:
        per_sentence.append(sentence_to_features(sent, token2id,
                                                 neighbors=neighbors,
                                                 undef_token=undef_token))
    return np.vstack(per_sentence)


def sentence_to_features(sentence, token2id,
                         neighbors=3, undef_token="undefined_token"):
    """
    Turn a sentence into a matrix of token ids with one row per token.

    Each row holds ``neighbors`` ids of the left context, the id of the token
    itself, then ``neighbors`` ids of the right context
    (``2 * neighbors + 1`` columns in total). Positions that fall outside the
    sentence are filled with 0, so 0 acts as the padding id; tokens missing
    from ``token2id`` get the id of ``undef_token``.

    Returns an integer array of shape ``(len(sentence), 2 * neighbors + 1)``.
    """
    window = 2 * neighbors + 1
    n_tokens = len(sentence)
    # The builtin ``int`` dtype is what the removed ``np.int`` alias stood for.
    id_seq = np.zeros(shape=(n_tokens + 2 * neighbors,), dtype=int)
    if n_tokens:
        undef_id = token2id[undef_token]
        id_seq[neighbors:neighbors + n_tokens] = [token2id.get(token, undef_id)
                                                  for token in sentence]
    X = np.zeros(shape=(n_tokens, window), dtype=int)
    for idx in range(n_tokens):
        # Row idx is the window of the padded id sequence centred on token idx.
        X[idx, :] = id_seq[idx:idx + window]
    return X

        
def build_vocab(sentences, min_freq=0, max_size=10000, undefined_id=0):
    """
    Build a vocabulary from tokens that occur at least ``min_freq`` times.

    The vocabulary holds at most ``max_size`` entries including the special
    "undefined_token"; when there are more candidate tokens, the least
    frequent ones are dropped. ``undefined_id`` is the id given to
    "undefined_token"; the remaining tokens get consecutive ids after it in
    order of decreasing frequency (ties keep first-seen order).

    Returns ``(token2id, id2token)``.
    """
    token2id = {"undefined_token": undefined_id}
    id2token = {undefined_id: "undefined_token"}

    counter = defaultdict(int)
    for sentence in sentences:
        for token in sentence:
            counter[token] += 1

    frequent = [token for token, freq in counter.items() if freq >= min_freq]
    # list.sort is stable, so equally frequent tokens keep first-seen order.
    frequent.sort(key=lambda token: -counter[token])

    # Clamp at zero: a negative bound would slice from the end of the list
    # and let almost every token through.
    room = max(0, max_size - len(token2id))
    for next_id, token in enumerate(frequent[:room], start=undefined_id + 1):
        token2id[token] = next_id
        id2token[next_id] = token
    return token2id, id2token

def build_morpho_vocab(morpho_map):
    """
    Flatten per-category value maps into one global tag index.

    ``morpho_map`` is ``category -> mapping`` as built by read_gikrya, where
    each mapping may contain both ``value -> id`` and ``id -> value``
    entries. Only the ``value -> id`` direction is used: sorting both
    directions together would compare ints with strings, which raises
    TypeError on Python 3.

    Categories are visited in sorted order with "POS" moved to the front;
    within a category values are ordered by their id.

    Returns ``(tag2id, id2tag)`` where the tags are ``(category, value)``
    pairs and the ids are consecutive integers starting at 0.
    Raises ValueError if ``morpho_map`` has no "POS" category.
    """
    categories = sorted(morpho_map.keys())
    # Purely for convenience, POS goes first.
    categories.insert(0, categories.pop(categories.index("POS")))

    tag2id = {}
    id2tag = {}
    abs_idx = 0
    for cat in categories:
        forward = [(val, local_id) for val, local_id in morpho_map[cat].items()
                   if isinstance(local_id, int)]
        forward.sort(key=lambda pair: pair[1])
        for val, _ in forward:
            tag2id[(cat, val)] = abs_idx
            id2tag[abs_idx] = (cat, val)
            abs_idx += 1
    return tag2id, id2tag


def tagsets_to_one_hot(tagsets, morpho_map, cat_order):
    """
    Encode per-token tag sets as one one-hot matrix per category.

    Parameters
    ----------
    tagsets : sequence of sequences of ``(category, value)`` pairs, one per token.
    morpho_map : ``category -> mapping`` holding both ``value -> id`` and
        ``id -> value`` entries (hence ``len(mapping) // 2`` columns).
    cat_order : list of categories; fixes the order of the returned matrices.

    Returns
    -------
    list of integer arrays, one per category in ``cat_order``, each of shape
    ``(len(tagsets), number_of_values)``. A token with no value for a
    category, or with a value missing from the map, has column 0 set.
    Categories that are not listed in ``cat_order`` are ignored.
    """
    n_rows = len(tagsets)
    y = [np.zeros(shape=(n_rows, len(morpho_map[cat]) // 2), dtype=int)
         for cat in cat_order]

    # Every row starts out as "undefined" (column 0).
    for one_hot in y:
        one_hot[:, 0] = 1

    # Resolve category positions once; setdefault keeps the first position
    # if a category is repeated, matching list.index.
    cat_position = {}
    for position, cat in enumerate(cat_order):
        cat_position.setdefault(cat, position)

    for idx, tagset in enumerate(tagsets):
        for cat, tag in tagset:
            cat_id = cat_position.get(cat)
            if cat_id is None:
                # Category not covered by cat_order: skip it rather than
                # raising, as the original ``cat_id >= 0`` guard intended.
                continue
            y[cat_id][idx, 0] = 0
            y[cat_id][idx, morpho_map[cat].get(tag, 0)] = 1
    return y
        
    
def preproc_dataset(full_tag_sentences, stemmer):
    """
    Lower-case every token and split it into a stem part and an ending.

    Returns four values:
      * list of sentences, each a list of stem parts;
      * list of sentences, each a list of endings;
      * flat list (over all sentences) of each token's tag list;
      * flat list (over all sentences) of lower-cased tokens.
    """
    stem_sentences, flex_sentences = [], []
    flat_tags, flat_tokens = [], []

    for sent in full_tag_sentences:
        stems_here, flexes_here = [], []
        for token_info in sent:
            lowered = token_info[0].lower()
            flat_tokens.append(lowered)
            stem_part, ending = split_word(lowered, stemmer)
            stems_here.append(stem_part)
            flexes_here.append(ending)
            # Tags are kept flat, not grouped by sentence.
            flat_tags.append(token_info[2])
        stem_sentences.append(stems_here)
        flex_sentences.append(flexes_here)
    return stem_sentences, flex_sentences, flat_tags, flat_tokens
    

       
def make_dataset(path, stemmer,
                      morpho_map, cat_order, undef_token,
                      token2id, flex2id, char2id,
                      neighbors=3):
    """
    Entry point: read an annotated corpus and build all model inputs/targets.

    Returns, in order: stem context ids, ending context ids, character ids,
    list of one-hot target matrices, stem sentences, ending sentences, flat
    per-token tag lists and the sentences as read from ``path``.
    The morpho map produced while reading ``path`` is discarded; the
    ``morpho_map`` argument is the one used for the targets.
    """
    tagged_sentences = read_gikrya(path)[0]
    stems, endings, tags_per_token, flat_tokens = preproc_dataset(tagged_sentences,
                                                                  stemmer)
    context_kwargs = dict(neighbors=neighbors, undef_token=undef_token)
    X_stem = sentences_to_features(stems, token2id, **context_kwargs)
    X_flex = sentences_to_features(endings, flex2id, **context_kwargs)
    X = chars_to_features(flat_tokens, char2id)
    y = tagsets_to_one_hot(tags_per_token, morpho_map, cat_order)
    return (X_stem, X_flex, X, y,
            stems, endings, tags_per_token, tagged_sentences)


def chars_to_features(tokens, char2id):
    """
    Encode each token as a fixed-width row of character ids.

    The last ``MAX_WORD_LENGTH`` characters of a token are written
    right-aligned into its row; unused leading columns stay 0. Characters
    missing from ``char2id`` get the id ``len(char2id)``.

    Returns an integer array of shape ``(len(tokens), MAX_WORD_LENGTH)``.
    """
    # The builtin ``int`` dtype is what the removed ``np.int`` alias stood for.
    X = np.zeros(shape=(len(tokens), MAX_WORD_LENGTH), dtype=int)
    unknown_id = len(char2id)
    for row, token in enumerate(tokens):
        # Walk the token from its last character backwards.
        for offset in range(1, min(MAX_WORD_LENGTH, len(token)) + 1):
            X[row, -offset] = char2id.get(token[-offset], unknown_id)
    return X


def add_tags_to_sentences(full_tag_sentences, y, morpho_map, cat_order):
    """
    Rebuild tagged sentences from one-hot predictions.

    ``y`` holds one matrix per category of ``cat_order``; its rows follow the
    tokens of ``full_tag_sentences`` in reading order. For every token the
    first column equal to 1 in each matrix is translated back to a value via
    ``morpho_map[category][column]``.

    Returns sentences of ``(form, '_', [(category, value), ...])`` tuples;
    the lemma slot is always the literal '_'.
    """
    tagged_sentences = []
    row = 0
    for sent in full_tag_sentences:
        tagged = []
        for token_info in sent:
            token_tags = []
            for cat, one_hot in zip(cat_order, y):
                hot_columns = [col for col in range(one_hot.shape[1])
                               if one_hot[row, col] == 1]
                token_tags.append((cat, morpho_map[cat][hot_columns[0]]))
            tagged.append((token_info[0], '_', token_tags))
            row += 1
        tagged_sentences.append(tagged)
    return tagged_sentences


def probs_to_one_hot(probs):
    """
    Convert a 2-D array of per-row scores into an integer one-hot array of
    the same shape, with a single 1 at each row's argmax (the first maximum
    wins on ties).
    """
    # The builtin ``int`` dtype is what the removed ``np.int`` alias stood for.
    one_hot = np.zeros_like(probs, dtype=int)
    for row in range(one_hot.shape[0]):
        one_hot[row, np.argmax(probs[row, :])] = 1
    return one_hot


def many_probs_to_one_hot(probs):
    """Apply probs_to_one_hot to every array in ``probs`` and return the results as a list."""
    return list(map(probs_to_one_hot, probs))


def write_gikrya(path, full_tags):
    """
    Write tagged sentences to ``path`` in the tab-separated corpus format.

    Each token becomes one line
        position<TAB>form<TAB>lemma<TAB>POS<TAB>tags
    with positions counted from 1 inside each sentence; every sentence is
    followed by an empty line. ``full_tags`` is a list of sentences whose
    tokens are ``(form, lemma, tagset)`` tuples.
    """
    # Forms are Cyrillic text; do not depend on the locale's default encoding.
    with open(path, 'w', encoding='utf-8') as f:
        for sentence in full_tags:
            for position, token_info in enumerate(sentence, start=1):
                f.write("{}\t{}\t{}\t{}\n".format(position,
                                                token_info[0],
                                                token_info[1],
                                                tagset2str(token_info[2])))
            f.write("\n")
            
                    
def tagset2str(tagset):
    """
    Render a list of ``(category, value)`` pairs as ``POS<TAB>tags``.

    The "POS" pair supplies the first field (empty string when absent; the
    last one wins if repeated). Every other pair whose value is not
    UNDEFINED is rendered as ``category=value``; these are joined with "|",
    or replaced by "_" when there are none.
    """
    pos_value = ""
    features = []
    for name, value in tagset:
        if name == "POS":
            pos_value = value
            continue
        if value == UNDEFINED:
            continue
        features.append("{}={}".format(name, value))
    feature_str = "|".join(features) if features else "_"
    return "{}\t{}".format(pos_value, feature_str)
        
    
def read_embeddings(vocab_path, emb_path):
    """
    Load a vocabulary file and its embedding matrix.

    ``vocab_path`` is a text file with one token per line; a token's id is
    its zero-based line number. ``emb_path`` is a ``.npy`` file holding a
    2-D array of vectors.

    Returns ``(token2id, vectors, first_token)`` where ``vectors`` is the
    loaded matrix with one extra row appended at the end: a random vector
    scaled to unit length. ``first_token`` is the token on the first line of
    the vocabulary file.
    """
    # Use a context manager so the handle is closed, and a fixed encoding so
    # Cyrillic tokens do not depend on the locale.
    with open(vocab_path, "r", encoding="utf-8") as vocab_file:
        tokens = vocab_file.read().strip().split("\n")
    token2id = {token: idx for idx, token in enumerate(tokens)}
    vecs = np.load(emb_path)
    rnd_vec = np.random.uniform(size=vecs.shape[1])
    return token2id, np.vstack((vecs, rnd_vec / np.linalg.norm(rnd_vec))), tokens[0]

In [4]:
# Predict tags for the held-out split and write them in the corpus format.
# NOTE(review): the recorded output of this cell is a NameError on `model`,
# which is only created in cells further down; on a fresh kernel this cell
# cannot run at this position.
# NOTE(review): three inputs are passed here, while the Model built later in
# the notebook is declared with two inputs ([char_in, stem_in]) -- confirm
# which model this cell is meant for.
predicted = model.predict([X_stem_test, X_flex_test, X_test])
one_hots = many_probs_to_one_hot(predicted)
test = add_tags_to_sentences(full_tag_test, one_hots, morpho_map, cat_order)
write_gikrya("../morphoRuEval-2017/Baseline/predict/gikrya_test_nnet2.txt", test)

NameError: name 'model' is not defined

In [None]:
# Restore trained weights into the already existing `model` object.
# NOTE(review): the commented-out line would rebuild the architecture from
# '../models/model_3/', while the weights are read from '../models/model_4/'
# -- confirm that the two belong together.
# model = model_from_json(open('../models/model_3/model_json.arch', 'r').read())
model.load_weights('../models/model_4/weights.model')

In [3]:
# Tag every file of the test collection and write the result next to it
# under "tagged/".
# NOTE(review): the recorded output of this cell is a NameError on `stemmer`,
# which is created in a later cell; `model`, `morpho_map`, `cat_order`,
# `undef_token`, `token2id`, `flex2id`, `char2id` and `neighbors` also come
# from cells further down.
prefix = "../morphoRuEval-2017/test_collection/" 
# tests = ["VK.txt"]#, "JZ.txt", "Lenta.txt"]
tests = ["JZ.txt", "Lenta.txt"]

for p in tests:
    path = os.path.join(prefix, p)
    # make_dataset returns eight values; y_test, sentences, flexes and
    # token_tags are not used afterwards in this loop.
    X_stem_test, X_flex_test, X_test, y_test, sentences, flexes,\
    token_tags, full_tag_test = make_dataset(path, stemmer, 
                                                morpho_map, cat_order,undef_token, 
                                                token2id, flex2id, char2id, neighbors=neighbors)
    predicted = model.predict([X_stem_test, X_flex_test, X_test], batch_size=512)
    one_hots = many_probs_to_one_hot(predicted)
    test = add_tags_to_sentences(full_tag_test, one_hots, morpho_map, cat_order)
    write_gikrya("../morphoRuEval-2017/test_collection/tagged/{}".format(p), test)

NameError: name 'stemmer' is not defined

In [3]:
# Load everything the feature builders need: the morphology maps, the stem
# and ending vocabularies with their embeddings, the character vocabulary
# and the stemmer.
prefix = "../models/"
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open("../models/morpho.pickle", 'rb') as morpho_file:
    morpho_map, cat_order = pickle.load(morpho_file)
token2id, token_vecs, undef_token = read_embeddings(os.path.join(prefix, "stem2id"),
                                      os.path.join(prefix, "stem_embeddings.npy"))

flex2id, flex_vecs, _ = read_embeddings(os.path.join(prefix, "flex2id"),
                                     os.path.join(prefix, "flex_embeddings.npy"))

# lemm2id, lemm_vecs, _ = read_embeddings(os.path.join(prefix, "lemm2id"),
#                                      os.path.join(prefix, "lemm_embeddings.npy"))

# One character per line; a character's id is its zero-based line number.
# NOTE(review): unlike the other resources this file is read from the
# current working directory, not from `prefix` -- confirm that is intended.
with open("char2id", "r", encoding="utf-8") as char_file:
    chars = char_file.read().strip().split("\n")
char2id = {ch: i for i, ch in enumerate(chars)}

stemmer = SnowballStemmer("russian")

In [4]:
# Build the inverse id -> character mapping and show which entry has id 0.
# The bare `char2id` expression is not the last statement of the cell, so it
# displays nothing.
char2id
id2char = {chid:ch for ch, chid in char2id.items()}
id2char[0]

'undefined_token'

In [5]:
# Build features and targets for the training corpus.
# `neighbors` is the number of context tokens taken on each side of a token.
neighbors = 3
gikrya_path = "../morphoRuEval-2017/Baseline/source/gikrya_train.txt"
# gikrya_path = "../JointMorphoClosed.txt"
# NOTE: `sentences`, `flexes` and `token_tags` are overwritten by the next
# cell, which unpacks into the same names.
X_stem_train, X_flex_train, X_train, y_train, sentences, flexes,\
token_tags, full_tag_sentences=make_dataset(gikrya_path, stemmer, 
                                            morpho_map, cat_order, undef_token, 
                                            token2id, flex2id, char2id, neighbors=neighbors)

In [6]:
# Build features and targets for the held-out corpus with the same
# vocabularies and context size as the training set.
gikrya_test = "../morphoRuEval-2017/Baseline/source/gikrya_test.txt"
X_stem_test, X_flex_test, X_test, y_test, sentences, flexes,\
token_tags, full_tag_test = make_dataset(gikrya_test, stemmer, 
                                            morpho_map, cat_order,undef_token, 
                                            token2id, flex2id, char2id, neighbors=neighbors)

In [7]:
# Concatenate the train and test corpora row-wise; a random split over the
# combined rows is drawn in a later cell.
X_stem, X_flex, X = np.vstack((X_stem_train, X_stem_test)),\
                                   np.vstack((X_flex_train, X_flex_test)),\
                                   np.vstack((X_train, X_test))

# One stacked target matrix per morphological category.
y = [np.vstack((train, test)) for train, test in zip(y_train, y_test)]

# A PLAN:

- запилить POS tag only, вход - слово, выход - POS
- запилить POS tag only, вход - последовательность слов, выход - последовательность POS

In [8]:
from __future__ import print_function

import keras
from keras.datasets import mnist
from keras.models import Model
from keras.layers import Input, Dense, TimeDistributed, Embedding, Bidirectional, Merge
from keras.layers import LSTM, SimpleRNN, GRU, Dropout, RepeatVector
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import model_from_json
import os

Using Theano backend.
ERROR (theano.gpuarray): Could not initialize pygpu, support disabled
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/theano/gpuarray/__init__.py", line 164, in <module>
    use(config.device)
  File "/usr/local/lib/python3.5/dist-packages/theano/gpuarray/__init__.py", line 151, in use
    init_dev(device)
  File "/usr/local/lib/python3.5/dist-packages/theano/gpuarray/__init__.py", line 60, in init_dev
    sched=config.gpuarray.sched)
  File "pygpu/gpuarray.pyx", line 614, in pygpu.gpuarray.init (pygpu/gpuarray.c:9419)
  File "pygpu/gpuarray.pyx", line 566, in pygpu.gpuarray.pygpu_init (pygpu/gpuarray.c:9110)
  File "pygpu/gpuarray.pyx", line 1021, in pygpu.gpuarray.GpuContext.__cinit__ (pygpu/gpuarray.c:13472)
pygpu.gpuarray.GpuArrayException: Unknown device error: -1


In [9]:
# Network hyper-parameters.
# NOTE(review): `flex_hidden` is assigned twice (64, then 128), so only 128
# takes effect; `token_hidden` and `flex_hidden` are not used by the active
# code in this cell.
token_hidden = 128
flex_hidden = 64
char2vec_dim = 42
char_hidden = 512
stem_hidden = 128
flex_hidden = 128

# Character branch: one row of character ids per token.
char_in = Input(shape=(X_train.shape[1],))
# input_dim is len(char2id) + 1 because chars_to_features gives unknown
# characters the id len(char2id).
char_embedding = Embedding(input_dim=len(char2id) + 1,
                           output_dim=char2vec_dim)       

encoded_char = Bidirectional(LSTM(char_hidden,
                                   dropout_U=0.2, 
                                   dropout_W=0.2))(char_embedding(char_in))    


# Stem branch: window of stem ids around the token, with the embedding layer
# initialised from the loaded stem vectors.
stem_in = Input(shape=(X_stem_train.shape[1],))
stem_embedding = Embedding(input_dim=token_vecs.shape[0],
                           output_dim=token_vecs.shape[1],
                           weights=[token_vecs])       

encoded_stem = LSTM(stem_hidden,
                    dropout_U=0.2, 
                    dropout_W=0.2)(stem_embedding(stem_in))    


# Disabled alternative: a branch over ending ids in place of the stem branch.
# flex_in = Input(shape=(X_flex_train.shape[1],))
# flex_embedding = Embedding(input_dim=flex_vecs.shape[0],
#                            output_dim=flex_vecs.shape[1],
#                            weights=[flex_vecs])       

# encoded_flex = LSTM(char_hidden,
#                     dropout_U=0.2, 
#                     dropout_W=0.2)(flex_embedding(flex_in))    


# Concatenate both encodings and predict the POS class with a softmax layer
# whose width is the number of columns of the POS target matrix.
merged = keras.layers.merge([encoded_char, encoded_stem], mode='concat')
# merged = keras.layers.merge([encoded_char, encoded_flex], mode='concat')
pos_predict = Dense(output_dim=y[cat_order.index('POS')].shape[1], 
            activation='softmax')(merged)

In [11]:
# Two-input model: character ids first, stem-context ids second; the order
# of the arrays passed to fit/predict has to follow this order.
model = Model([char_in, stem_in], pos_predict)
model.compile(loss='categorical_crossentropy', 
              optimizer='rmsprop',
             metrics=['accuracy'])

###  model.fit  происходит здесь:

In [12]:
# Random row-level split of the stacked data: the last `test_fraction` of a
# shuffled index array is held out for validation.
# NOTE(review): no random seed is set, so the split differs between runs.
# NOTE(review): `test` is also used in other cells for the tagged sentences
# returned by add_tags_to_sentences; here it is rebound to an index array.
test_fraction = 0.07
shuffled_indicies = np.arange(X.shape[0])
np.random.shuffle(shuffled_indicies)
split_index = int(X.shape[0] * (1 - test_fraction))
train = shuffled_indicies[:split_index]
test = shuffled_indicies[split_index:]

In [None]:
batch_size = 256
epochs = 10
# The model is declared as Model([char_in, stem_in], ...): the second input
# goes through the stem embedding initialised from token_vecs, so it must
# receive the stem-context ids (X_stem), not the ending ids (X_flex).
model.fit([X[train], X_stem[train]], y[cat_order.index('POS')][train],
          validation_data=([X[test], X_stem[test]], y[cat_order.index('POS')][test]),
          batch_size=batch_size,
          nb_epoch=epochs,
          verbose=2)

Train on 1010117 samples, validate on 76031 samples
Epoch 1/10


In [58]:
# Train on the character features only (a single input array).
# NOTE(review): the Model defined in this notebook takes two inputs
# ([char_in, stem_in]); the log recorded for this cell presumably belongs to
# an earlier single-input variant of the model -- confirm before re-running.
batch_size = 256
epochs = 10
model.fit(X[train], y[cat_order.index('POS')][train],          
          validation_data=(X[test], y[cat_order.index('POS')][test]),          
          batch_size=batch_size, nb_epoch=epochs, 
          verbose=2)

Train on 1010117 samples, validate on 76031 samples
Epoch 1/10
547s - loss: 0.0855 - acc: 0.9664 - val_loss: 0.0887 - val_acc: 0.9660
Epoch 2/10
548s - loss: 0.0856 - acc: 0.9663 - val_loss: 0.0886 - val_acc: 0.9664
Epoch 3/10
549s - loss: 0.0858 - acc: 0.9664 - val_loss: 0.0877 - val_acc: 0.9663
Epoch 4/10
549s - loss: 0.0859 - acc: 0.9664 - val_loss: 0.0908 - val_acc: 0.9670
Epoch 5/10
549s - loss: 0.0862 - acc: 0.9663 - val_loss: 0.0901 - val_acc: 0.9668
Epoch 6/10
548s - loss: 0.0863 - acc: 0.9661 - val_loss: 0.0879 - val_acc: 0.9676
Epoch 7/10
548s - loss: 0.0871 - acc: 0.9661 - val_loss: 0.0895 - val_acc: 0.9667
Epoch 8/10
548s - loss: 0.0874 - acc: 0.9660 - val_loss: 0.0909 - val_acc: 0.9636
Epoch 9/10
548s - loss: 0.0873 - acc: 0.9662 - val_loss: 0.0952 - val_acc: 0.9635
Epoch 10/10
548s - loss: 0.0878 - acc: 0.9659 - val_loss: 0.0928 - val_acc: 0.9629


<keras.callbacks.History at 0x7fc6369e32b0>

## A PLAN:
- ~~запилить POS tag only, вход - слово, выход - POS~~
- попробовать помимо самого слова скармливать весь контекст посимвольно
- потестить гиперпараметры
- придумать способ сэмплирования последовательности, чтобы сэмпл был из одного предложения и теги сразу проставлялись для всей последовательности
- запилить POS tag only, вход - последовательность слов, выход - последовательность POS
- потестить гиперпараметры
- сравнить подходы


## Results:
- простой вариант:

> token_hidden = 128

> flex_hidden = 64

> char2vec_dim = 42

> char_hidden = 512

> char_in = Input(shape=(X_train.shape[1],))

> char_embedding = Embedding(input_dim=len(char2id) + 1,                           
>                            output_dim=char2vec_dim)       
> encoded_char = Bidirectional(LSTM(char_hidden,
>                            dropout_U=0.2, 
>                            dropout_W=0.2))(char_embedding(char_in))    

> pos_predict = Dense(output_dim=y[cat_order.index('POS')].shape[1], 
>                     activation='softmax',)(encoded_char)

результаты: лучший за 20 эпох 0.976

- со стеммами:

> token_hidden = 128
flex_hidden = 64
char2vec_dim = 42
char_hidden = 512
stem_hidden = 128
flex_hidden = 128
char_in = Input(shape=(X_train.shape[1],))
char_embedding = Embedding(input_dim=len(char2id) + 1, output_dim=char2vec_dim)       
encoded_char = Bidirectional(LSTM(char_hidden,                                   dropout_U=0.2, dropout_W=0.2))(char_embedding(char_in))    
stem_in = Input(shape=(X_stem_train.shape[1],))
stem_embedding = Embedding(input_dim=token_vecs.shape[0], output_dim=token_vecs.shape[1],  weights=[token_vecs])       
encoded_stem = LSTM(stem_hidden, dropout_U=0.2, dropout_W=0.2)(stem_embedding(stem_in))    

результаты: лучший за 10 эпох 0.9874

- с флексиями:

> token_hidden = 128
flex_hidden = 64
char2vec_dim = 42
char_hidden = 512
stem_hidden = 128
flex_hidden = 128
char_in = Input(shape=(X_train.shape[1],))
char_embedding = Embedding(input_dim=len(char2id) + 1,                           output_dim=char2vec_dim)       
encoded_char = Bidirectional(LSTM(char_hidden,                                   dropout_U=0.2,  dropout_W=0.2))(char_embedding(char_in))    
flex_in = Input(shape=(X_flex_train.shape[1],))
flex_embedding = Embedding(input_dim=flex_vecs.shape[0],                           output_dim=flex_vecs.shape[1], weights=[flex_vecs])     
encoded_flex = LSTM(char_hidden, dropout_U=0.2, dropout_W=0.2)(flex_embedding(flex_in))    
merged = keras.layers.merge([encoded_char, encoded_flex], mode='concat')
pos_predict = Dense(output_dim=y[cat_order.index('POS')].shape[1],             activation='softmax')(merged)

результаты: лучший за 10 эпох 0.9773

In [None]:
# NOTE(review): load_weights is called without a file path here; an earlier
# cell loads '../models/model_4/weights.model' -- confirm the intended path
# before running this cell.
model.load_weights()