In [1]:
import numpy as np

In [2]:
# np.random.randint(0, high=10)
# !export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
# !export CUDA_HOME=/usr/local/cuda
# import tensorflow as tf

In [3]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import tensorflow as tf
import numpy as np
import random
import pprint
from collections import Counter, defaultdict


class Evaluation(object):

    def __init__(self, golden_set, system_set):
        """Format of golden and system_set: array with little arrays of on length for every word
        Like [['plen', 'inan', 'ipf', 'nom', 'comp', 'm', 'indic', 'sg', 'persn', 'S-PRO', '3p',
        'praet', 'tran', 'ger', 'act'], 
        ['plen', 'inan', 'ipf', 'nom', 'comp', 'm', 'indic', 'sg', 'persn', 'S', '1p', 
        'praet', 'intr', 'ger', 'act'],...]
        """
        self.golden_set = golden_set
        self.predicted_set = system_set
        self.tags_dictionary = defaultdict(int)
        self.golden_tags_dict = defaultdict(int)
        self.all_number = len(golden_set)
        self.accuracy_real_tag = 0

        for one_array in golden_set:
            for one_tag in one_array:
                self.golden_tags_dict[one_tag] += 1

    def count_accuracy(self):
        for gold, system in zip(self.golden_set, self.predicted_set):
            one_word_set = []
            for i in range(len(gold)):
                if gold[i] == system[i]:
                    self.tags_dictionary[gold[i]] += 1
                    one_word_set.append(1)
                else:
                    if gold[i] == 'UNDEFINED':
                        one_word_set.append(1)
            if len(one_word_set) == len(gold):
                self.accuracy_real_tag += 1

        print("ALL tags: ", self.accuracy_real_tag, self.all_number, self.accuracy_real_tag/self.all_number)

        for key, value in sorted(self.tags_dictionary.items()):
            real_tag_number = self.golden_tags_dict.get(key)
            print("One tag accuracy: ", key, value, real_tag_number, value/real_tag_number)

#=====================

GRADIENT_CLIP_NORM = 15.0

In [60]:
def make_projection(tensor, output_dim=None, input_dim=None):
    if input_dim is None:
        input_dim = tensor.get_shape()[-1].value
    if output_dim is None:
        output_dim = input_dim
    s = tf.shape(tensor)
    rank = len(tensor.get_shape())
    assert rank in (2,3)
    if rank == 2:
        inshape = tf.pack([s[0], input_dim])
        outshape = tf.pack([s[0], output_dim])
    elif rank == 3:
        inshape = tf.pack([s[0] * s[1], input_dim])
        outshape = tf.pack([s[0], s[1], output_dim])
    flat_tensor = tf.reshape(tensor, inshape)
    proj = tf.Variable(tf.truncated_normal([input_dim, output_dim], dtype=tf.float32, stddev=0.1))
    bias = tf.Variable(tf.zeros([output_dim], dtype=tf.float32))
    projected_tensor = tf.matmul(flat_tensor, proj) + bias
    result = tf.reshape(projected_tensor, outshape)
    if rank == 2:
        result.set_shape([None, output_dim])
    elif rank == 3:
        result.set_shape([None, None, output_dim])
    return result

def bidirectional_dynamic_rnn(cell_forward, initial_state_forward, cell_backward, initial_state_backward, inputs, sequence_lengths):
    with tf.variable_scope("forward"):
        forward, fw_state = tf.nn.dynamic_rnn(cell=cell_forward, inputs=inputs, sequence_length=sequence_lengths, initial_state=initial_state_forward,
                                dtype=tf.float32, time_major=True, swap_memory=True)
    with tf.variable_scope("backward"):
        sequence_lengths64 = tf.cast(sequence_lengths, dtype=tf.int64) # weird requirement of tf.reverse_sequence
        inputs_reversed = tf.reverse_sequence(input=inputs, seq_lengths=sequence_lengths64, seq_dim=0, batch_dim=1)
        backward_reversed, bw_state = tf.nn.dynamic_rnn(cell=cell_backward, inputs=inputs_reversed, sequence_length=sequence_lengths, initial_state=initial_state_backward,
                                dtype=tf.float32, time_major=True, swap_memory=True)
        backward = tf.reverse_sequence(input=backward_reversed, seq_lengths=sequence_lengths64, seq_dim=0, batch_dim=1)
    output = tf.concat(2, [forward, backward])
    return output, fw_state, bw_state

def get_optimizer(optimizer, loss):
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, GRADIENT_CLIP_NORM)
    return optimizer.apply_gradients(zip(gradients, v))

def compute_masked_multihead_loss(bottleneck, target, mask):
    masks = tf.unpack(tf.transpose(mask, perm=(2, 0, 1)))
    targets = tf.unpack(tf.transpose(target, perm=(2, 0, 1)))
    assert len(cats) == len(masks)
    result = [] # will be list of triples (mean cross-entropy, number of correct top-1 choices, number of valid examples)
    logits = {}
    for i,(cat, num_targets) in enumerate(catlens):
        logits_i = make_projection(bottleneck, output_dim=num_targets)
        mask_i = masks[i]
        target_i = targets[i]
        xent_i = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_i, labels=target_i)
        correct_i = tf.nn.in_top_k(predictions=tf.reshape(logits_i, [-1, num_targets]), targets=tf.reshape(target_i, [-1]), k=1)
        xent_i = tf.reduce_mean(tf.boolean_mask(xent_i, mask_i))
        correct_i = tf.reduce_sum(tf.cast(tf.boolean_mask(correct_i, tf.reshape(mask_i, [-1])), tf.int32))
        n_i = tf.reduce_sum(tf.cast(mask_i, tf.int32))
        xent_i = tf.select(tf.equal(n_i, 0), 0.0, xent_i)
        result.append((xent_i, correct_i, n_i))
        logits[cat] = tf.exp(logits_i) / tf.reduce_sum(tf.exp(logits_i), keep_dims=True, reduction_indices=2)
    return result, logits

#=============================================================

def word2vec(word):
    """Transform word into a fixed-sized int32 vector (MAX_WORD_LENGTH) with values
    up to len(charmap)"""
    result = np.zeros([MAX_WORD_LENGTH], dtype=np.int32)
    word = [charmap["BEGIN"]] + list(word) + [charmap["END"]]
    if len(word) > MAX_WORD_LENGTH:
        prefix, suffix = word[:MAX_PS_LENGTH], word[-MAX_PS_LENGTH:]
        word = prefix + suffix
    for i, c in enumerate(word):
        result[i] = charmap[c] if c in charmap else charmap["UNKNOWN"]
    return result


def sentence2example(sentence):
    """Take a sentence: [(word, feats)] and stack these vectors into two matrices, correspondingly"""
    ws, fs = list(zip(*sentence))
    return np.stack(ws), np.stack(fs)


def make_dataset(fn):
    dataset = []
    print(fn)
    with open(fn, 'r') as f:
        sentence = []
        for l in f:
            l = l.strip().split()
            assert len(l) == len(cats) + 1
            if l[0] == "SENTENCE":
                assert cats == tuple(l[1:])  # check that the ordering is the same
                if len(sentence):
                    dataset.append(sentence2example(sentence))
                sentence = []
            else:
                word = word2vec(l[0])
                feats = np.asarray(list(map(int, l[1:])), dtype=np.int32)
                sentence.append((word, feats))
    if len(sentence):
        dataset.append(sentence2example(sentence))  # the last example has no closing tag
    # print(dataset)
    return dataset


def read_gikrya(path):
    """
    Reading format:
    row_index<TAB>form<TAB>lemma<TAB>POS<TAB>tag"""
    tags_map = {}
    POS_map = {}
    sentences = []
    with open(path, 'r') as f:
        sentence = []
        for line in f:
            splits = line.strip().split('\t')            
            if len(splits) == 5:
                form, lemma, POS, tags = splits[1:]
                if POS not in POS_map:
                    POS_map[POS] = len(POS_map) 
                tags_list = []
                if tags != "_":
                    for tag_val in tags.split("|"):
                        tag, val = tag_val.split("=")
                        tags_list.append((tag, val))
                        if tag not in tags_map:
                            tags_map[tag] = {}
                        if val not in tags_map[tag]:
                            tags_map[tag][val] = len(tags_map[tag])                            
                else:
                    tags_list.append(tags)
                sentence.append((form, lemma, POS, tags_list) )
            elif len(sentence) > 0:
                sentences.append(sentence)
                sentence = []
    return sentences, POS_map, tags_map       
                    
    
def build_vocab(gikrya_sents, max_words=None, min_freq=None):
    

In [61]:
path = "../morphoRuEval-2017/Baseline/source/gikrya_train.txt"
sentences, POS_map, tags_map = read_gikrya(path)

In [58]:
# sentences[-1]
devset[0][0].shape, devset[0][1].shape


((18, 20), (18, 15))

In [5]:
MAX_WORDS_IN_BATCH = 200


# Actually this is *not* a real upper bound -- if a sentence is longer than MAX_WORDS_IN_BATCH,
# a 1-example minibatch will be formed solely by this sentence to avoid data loss.
# Be aware of this if you face OOM errors.

def stack_and_mask(examples_list):
    """CONVENTION to save memory: mask (i.e. valid inputs) is equivalent to non-zero targets."""

    def pad(x, to_length):
        if x.shape[0] == to_length: return x
        return np.pad(x, pad_width=((0, to_length - x.shape[0]), (0, 0)), mode='constant', constant_values=0)

    feats, targets = list(zip(*examples_list))
    sequence_lengths = list(map(lambda x: x.shape[0], feats))
    maxlen = max(sequence_lengths)
    feats, targets = list(map(lambda mats: np.stack([pad(x, maxlen) for x in mats]), [feats, targets]))
    feats, targets = list(map(lambda x: x.swapaxes(0, 1), [feats, targets]))  # Time x Batch x Features
    return feats, targets, np.asarray(sequence_lengths, dtype=np.int32)


def batcher(dataset):
    batch, t = [], 0
    for d in dataset:
        batch.append(d)
        t = max(t, d[0].shape[0])
        # because of padding, all elems in the batch have the same length
        # (which is equal to the max elem length)
        if t * len(batch) < MAX_WORDS_IN_BATCH:
            batch.append(d)
        else:
            yield stack_and_mask(batch)  # actually, we can do it later and yield just the batch list
            batch, t = [], 0
    if len(batch):
        yield stack_and_mask(batch)

        raise StopIteration

In [6]:
#==========================================

prefix = "../../Anglicizm/taggers/"

random.seed(777)

MAX_PS_LENGTH = 10 # maximal prefix or suffix length
MAX_WORD_LENGTH = 2 * MAX_PS_LENGTH
# words shorter than MAX_WORD_LENGTH will be truncated in the middle
# so that both prefix and suffix are MAX_PS_LENGTH

charmap = {"EMPTY":0, "BEGIN":1, "END":2}
with open(os.path.join(prefix, "chars.txt"), 'r') as charfile:
    num = len(charmap)
    for line in charfile.readlines():
        char = line.split()[0]
        for ch in char.split('|'):
            charmap[ch] = num
        num += 1
charmap["UNKNOWN"] = num

# pprint.pprint(sorted(charmap.items()))


In [7]:
catlens = Counter()
catmap = defaultdict(dict)

with open(os.path.join(prefix,"output_map.txt"), 'r') as mf:
    for l in mf:
        cat, catname, catint = l.strip().split()
        catlens.update([cat])
        catmap[cat][int(catint)] = catname
catlens = sorted(catlens.most_common())

#[('Gender', 4),('Mood', 3), ('Number', 3), ..]
pprint.pprint(catlens)


[('AdForm', 3),
 ('Animacy', 3),
 ('Aspect', 3),
 ('Case', 13),
 ('Degree', 4),
 ('Gender', 5),
 ('Mood', 4),
 ('Number', 3),
 ('Other', 6),
 ('POS', 19),
 ('Person', 4),
 ('Tense', 4),
 ('Transition', 3),
 ('VerbForm', 4),
 ('Voice', 4)]


In [51]:
tags_map, POS_map

({'Animacy': {'Anim': 1, 'Inan': 0},
  'Case': {'Acc': 2, 'Dat': 1, 'Gen': 5, 'Ins': 4, 'Loc': 3, 'Nom': 0},
  'Degree': {'Cmp': 1, 'Pos': 0},
  'Form': {'Digit': 0},
  'Gender': {'Fem': 0, 'Masc': 1, 'Neut': 2},
  'Mood': {'Imp': 1, 'Ind': 0},
  'Number': {'Plur': 1, 'Sing': 0},
  'Person': {'1': 1, '2': 2, '3': 0},
  'Tense': {'Notpast': 1, 'Past': 0, 'Pres': 2},
  'Variant': {'Short': 0},
  'VerbForm': {'Conv': 2, 'Fin': 0, 'Inf': 1},
  'Voice': {'Act': 0, 'Mid': 1}},
 {'ADJ': 8,
  'ADP': 4,
  'ADV': 10,
  'CONJ': 7,
  'DET': 0,
  'H': 11,
  'INTJ': 12,
  'NOUN': 1,
  'NUM': 9,
  'PART': 6,
  'PRON': 3,
  'PUNCT': 5,
  'VERB': 2})

In [46]:
catmap

defaultdict(dict,
            {'AdForm': {0: 'UNDEFINED', 1: 'brev', 2: 'plen'},
             'Animacy': {0: 'UNDEFINED', 1: 'anim', 2: 'inan'},
             'Aspect': {0: 'UNDEFINED', 1: 'pf', 2: 'ipf'},
             'Case': {0: 'UNDEFINED',
              1: 'nom',
              2: 'voc',
              3: 'gen',
              4: 'gen2',
              5: 'dat',
              6: 'acc',
              7: 'dat2',
              8: 'ins',
              9: 'loc',
              10: 'loc2',
              11: 'acc2',
              12: 'adnum'},
             'Degree': {0: 'UNDEFINED', 1: 'comp', 2: 'supr', 3: 'comp2'},
             'Gender': {0: 'UNDEFINED', 1: 'm', 2: 'f', 3: 'n', 4: 'm-f'},
             'Mood': {0: 'UNDEFINED', 1: 'indic', 2: 'imper', 3: 'imper2'},
             'Number': {0: 'UNDEFINED', 1: 'sg', 2: 'pl'},
             'Other': {0: 'UNDEFINED',
              1: 'persn',
              2: 'patrn',
              3: 'famn',
              4: 'zoon',
              5: '0'},
          

In [9]:
cats = list(zip(*catlens))[0]

trainfn, devfn, testfn = list(map(lambda x: os.path.join(prefix, x) + "_half.txt", ["train", "dev", "test"]))
trainset, devset, testset = list(map(make_dataset, [trainfn, devfn, testfn])) # already matrix
print(len(trainset), len(devset), len(testset))

def get_batches(dataset, randomize):
    sorted_dataset = sorted(dataset, key=lambda x: random.random() if randomize else x[0].shape[0])
    return [b for b in batcher(sorted_dataset)]

dev, test = list(map(lambda x: get_batches(x, randomize=False), [devset, testset]))
print(list(map(len, [dev, test])))


../../Anglicizm/taggers/train_half.txt
../../Anglicizm/taggers/dev_half.txt
../../Anglicizm/taggers/test_half.txt
127807 85204 42449
[8334, 4140]


In [59]:
dev[0][1].shape

(1, 201, 15)

In [11]:
#===========NN======================================

graph = tf.Graph()

model_filename = "/home/user1/projects/Anglicizm/taggers/mystem_half.model.chkpt"

alphabet_size = len(charmap)
num_outputs = len(cats)

char_embedding_size = alphabet_size
conv_filter_widths = [3, 4, 5]  # shingle sizes
num_conv_maps = 128
rnn_cell_size = 128
rnn_projection_size = 128

In [None]:
with graph.as_default():
    # inputs
    input_words = tf.placeholder(tf.int32, shape=(None, None, MAX_WORD_LENGTH))  # time x batch x chars
    input_sequence_lengths = tf.placeholder(tf.int32, shape=(None,))  # batch
    output_targets = tf.placeholder(tf.int32, shape=(None, None, num_outputs))  # time x batch x output_id
    output_targets_mask = tf.placeholder(tf.bool, shape=(None, None, num_outputs))  # time x batch x output_id

    # shapes
    max_sequence_length = tf.shape(input_words)[0]
    batch_size = tf.shape(input_words)[1]

    # computation

    # first we embed chars into some dense space
    char_embeddings = tf.Variable(tf.random_uniform([alphabet_size, char_embedding_size],
                                                    minval=-np.sqrt(3), maxval=np.sqrt(3)),dtype=tf.float32)

    input_words_embedded = tf.nn.embedding_lookup(params=char_embeddings,ids=input_words)  # time x batch x chars x feats

    # now we convolve over them to reduce dimensionality
    rnn_inputs = []
    for conv_filter_width in conv_filter_widths:
        filter_tensor = tf.Variable(
            tf.truncated_normal([1, conv_filter_width, char_embedding_size, num_conv_maps], stddev=0.1),dtype=tf.float32)  # filter_height x filter_width x in_channels x out_channels
        input_words_shingled = tf.nn.conv2d(input=input_words_embedded, filter=filter_tensor,
                                            strides=[1, 1, 1, 1], padding="VALID")
        input_words_shingled_pooled = tf.nn.max_pool(value=input_words_shingled,
                                                     ksize=[1, 1, input_words_shingled.get_shape()[2], 1],
                                                     strides=[1, 1, 1, 1], padding="VALID")
        input_words_shingled_pooled = tf.squeeze(input_words_shingled_pooled, squeeze_dims=[2])
        rnn_inputs.append(input_words_shingled_pooled)
    rnn_inputs = tf.concat(2, rnn_inputs)

    # ready to build the rnn
    cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=rnn_cell_size, num_proj=rnn_projection_size, state_is_tuple=True,initializer=None)
    cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=rnn_cell_size, num_proj=rnn_projection_size, state_is_tuple=True,initializer=None)
    rnn_output, _, _ = bidirectional_dynamic_rnn(cell_forward=cell_fw, initial_state_forward=None,
                                                 cell_backward=cell_bw, initial_state_backward=None,
                                                 inputs=rnn_inputs, sequence_lengths=input_sequence_lengths)

    multihead_loss, predictions = compute_masked_multihead_loss(bottleneck=rnn_output, target=output_targets,
                                                                mask=output_targets_mask)
    xents, correct_counts, ns = zip(*multihead_loss)
    loss = tf.reduce_sum(xents)
    accuracies = {cat: (correct_counts[i], ns[i]) for i, cat in enumerate(cats)}
    optimizer = get_optimizer(tf.train.AdamOptimizer(learning_rate=1e-4), loss)
    saver = tf.train.Saver()

num_epochs = 15
best_val = None
logfile = open("mystem_half_logger.log", "w")

from itertools import starmap

def log(obj):
    print (obj)
    print(logfile, obj)

with tf.Session(graph=graph, config=tf.ConfigProto(log_device_placement=True)) as session:
    tf.initialize_all_variables().run()
    # saver.restore(session, model_filename)
    print('Initialized')

    def do_epoch(batches, do_backprop, mode):
        acc_loss, counter = np.float128(0), np.uint64(0)
        guesses = [[np.uint64(0), np.uint64(0)] for cat in cats]
        num_steps = len(batches)
        for step, batch in enumerate(batches):
            feats, tgts, seq_lens = batch
            feed_dict = {
                input_words: feats,
                input_sequence_lengths: seq_lens,
                output_targets: tgts,
                output_targets_mask: (tgts != 0)
            }
            acc_fetches = [accuracies[cat] for cat in cats] #correct_counts

            if do_backprop:
                fetch = session.run([optimizer, loss] + acc_fetches, feed_dict=feed_dict)
                step_loss = fetch[1]
                accs = fetch[2:]
            else:
                fetch = session.run([loss] + acc_fetches, feed_dict=feed_dict)
                step_loss = fetch[0]
                accs = fetch[1:]
            acc_loss += step_loss * seq_lens.sum()
            counter += seq_lens.sum()
            for i, (correct, n) in enumerate(accs):
                guesses[i][0] += correct
                guesses[i][1] += n
            if step%100 == 0:    
                print ("\rAverage loss at step %d / %d: %f" % (step, num_steps, acc_loss / counter))
        acc_loss /= counter
        guesses = sorted(zip(cats, list(starmap(lambda x,y: (round(x / y, 3), int(y)) if y != 0 else "UNDEFINED", guesses))))
        res = "%s: epoch avg loss: % f" % (mode, acc_loss)
        log(res)
        log(guesses)
        return acc_loss

    for epoch in range(num_epochs + 1):
        print("Starting epoch %d" % epoch)
        val = do_epoch(dev, do_backprop=False, mode="VALID") # здесь оцениваем модель, обученную на train, на dev сете
        if best_val is None:
            # saverpath = saver.save(session, model_filename)
            # print(saverpath)
            # saver.restore(session, model_filename)
            best_val = val
        elif best_val < val:
            saver.restore(session, model_filename)
        elif best_val > val:
            best_val = val
            saver.save(session, model_filename)
        if epoch != num_epochs:
            train = get_batches(trainset, randomize=True)
            tr = do_epoch(train, do_backprop=True, mode="TRAIN") # обучаем модель
        logfile.flush()

    saver.restore(session, model_filename)
    log("FINAL RUN ON DEDICATED TESTSET")
    do_epoch(test, do_backprop=False, mode="TEST")


    def do_test(text):

        POS_results = []
        system_results = []
        words = text.split()
        inputs = np.stack([word2vec(word) for word in words])
        inputs = np.expand_dims(inputs, 1)
        lens = inputs.shape[0] * np.ones([1])
        with tf.Session(graph=graph) as session:
            saver.restore(session, model_filename)
            feed_dict = {
                input_words: inputs,
                input_sequence_lengths: lens,
            }
            res = session.run(list(predictions.values()), feed_dict=feed_dict)
            res = list(map(lambda x: (np.argmax(x, 2), np.max(x, 2)), res))
            res = sorted(zip(predictions.keys(), res))
            for i, w in enumerate(words):
                # print('Word: ', w)
                one_word_res = []
                for line in res:
                    cat = line[0]
                    smth = line[1]
                    indexs, scores = smth[0], smth[1]
                    # print(cat, catmap[cat][indexs[i][0]], scores[i][0])
                    one_word_res.append(catmap[cat][indexs[i][0]])
                    if cat == 'POS':
                        POS_results.append(w+"_"+catmap[cat][indexs[i][0]])
                system_results.append(one_word_res)
        #print(system_results)
        #print(' '.join(POS_results))
        return system_results


    main_cats = 'AdForm Animacy Aspect Case Degree Gender Mood Number Other POS Person Tense Transition VerbForm Voice'
    with open('test_processed_corpus.txt', 'r', encoding='utf-8') as my_testset:
        golden_tags = []  # array of words where every word is another array and has gold categories
        golden_words = []
        for line in my_testset.readlines():
            if 'SENTENCE' not in line:
                word = line.split()[0]
                categ = line.split()[1:]
                word_tags = []
                for real_cat, value in zip(main_cats.split(), categ):
                    tag = catmap[real_cat][int(value)]
                    word_tags.append(tag)
                golden_tags.append(word_tags)
                golden_words.append(word)
        #print(golden_words)
        #print(golden_tags)

    system_results = do_test(' '.join(golden_words))
    # do_test(u"всё на свете должно происходить медленно и неправильно чтобы не сумел загордиться человек чтобы человек был грустен и растерян .")
    print(len(golden_tags), len(system_results))

    eval_obj = Evaluation(golden_tags, system_results)
    eval_obj.count_accuracy()