In [101]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import classification_report
from gensim.models import KeyedVectors
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

In [102]:
def dummy_generator(n=1000, length=20):
    """Yield `n` random dummy sentences for smoke-testing the tagger.

    Args:
        n: Number of sentences to yield.
        length: Number of tokens in every sentence.

    Yields:
        A list of `length` token dicts with the keys
        'lemma'   -- random int in [1, 1000),
        'surface' -- the string 'dummy_<position in sentence>',
        'label'   -- random int in [0, 10).
    """
    # The original looped over a hard-coded range(1000), ignoring `n`.
    for _ in range(n):
        dummy_data = []
        for i in range(length):
            token = {
                'lemma': int(np.random.uniform(1, 1000)),
                'surface': 'dummy_' + str(i),
                'label': int(np.random.uniform(0, 10)),
            }
            dummy_data.append(token)
        yield dummy_data


# Show one generated sentence as the cell output.
next(dummy_generator())

[{'lemma': 270, 'surface': 'dummy_0', 'label': 9},
 {'lemma': 684, 'surface': 'dummy_1', 'label': 7},
 {'lemma': 806, 'surface': 'dummy_2', 'label': 0},
 {'lemma': 996, 'surface': 'dummy_3', 'label': 7},
 {'lemma': 109, 'surface': 'dummy_4', 'label': 6},
 {'lemma': 672, 'surface': 'dummy_5', 'label': 3},
 {'lemma': 773, 'surface': 'dummy_6', 'label': 7},
 {'lemma': 650, 'surface': 'dummy_7', 'label': 3},
 {'lemma': 265, 'surface': 'dummy_8', 'label': 4},
 {'lemma': 37, 'surface': 'dummy_9', 'label': 8},
 {'lemma': 496, 'surface': 'dummy_10', 'label': 4},
 {'lemma': 451, 'surface': 'dummy_11', 'label': 9},
 {'lemma': 37, 'surface': 'dummy_12', 'label': 2},
 {'lemma': 695, 'surface': 'dummy_13', 'label': 3},
 {'lemma': 526, 'surface': 'dummy_14', 'label': 7},
 {'lemma': 172, 'surface': 'dummy_15', 'label': 6},
 {'lemma': 558, 'surface': 'dummy_16', 'label': 1},
 {'lemma': 760, 'surface': 'dummy_17', 'label': 1},
 {'lemma': 276, 'surface': 'dummy_18', 'label': 1},
 {'lemma': 99, 'surface'

In [205]:
# Build the TF1 graph for a BiLSTM sequence tagger: token + casing + character
# features are concatenated per token, passed through stacked BiLSTMs and
# scored either with a per-token softmax or a linear-chain CRF.
tf.reset_default_graph()

# Character inventory; only its length is used in this cell (number of rows of
# the character embedding matrix).
charEntries = " 0123456789abcdefghijklmnopqrstuvwxyzäöåABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÅ" + \
                ".,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|\u2013\u2014\u201C\u201D"

# Casing classes (was defined twice with identical contents; one copy kept).
casingEntries = ['PADDING', 'other', 'numeric', 'mainly_numeric', 'allLower',
                 'allUpper', 'mainly_allUpper', 'initialUpper', 'contains_upper',
                 'contains_digit']
labelEntries = ['B-PER', 'B-LOC', 'B-ORG', 'B-PRO', 'B-OTH',
                'I-PER', 'I-LOC', 'I-ORG', 'I-PRO', 'I-OTH', 'O']

charEmbedding = 'cnn'   # 'lstm' selects the BiLSTM char encoder, anything else the CNN
classifier = 'crf'      # 'softmax' or 'crf'

max_word_len = 50       # characters per token (last axis of chars_input)
char_emb_dim = 300      # width of the character embedding
lstm_sizes = (100, 100)  # hidden size of each stacked BiLSTM layer
dropout_rate = 0.25     # applied to both inputs and outputs of every LSTM cell

sess = tf.Session()
with sess.as_default():
    sentence_length = tf.placeholder(tf.int32, [None], name='sentence_length')

    # --- token embeddings -------------------------------------------------
    tokens_input = tf.placeholder(tf.int32, [None, None], name='words_input')
    # Was named "W_char", which collided with the real char embedding below.
    # NOTE(review): an earlier draft created this matrix with trainable=False;
    # confirm whether the pretrained vectors should be fine-tuned.
    W = tf.Variable(tf.random_uniform([10000, 300], -1.0, 1.0), name="W_token")
    embeddings = tf.placeholder(tf.float32, [10000, 300])
    # In graph mode assign() only builds an op; keep a handle so it can be run:
    #   sess.run(embedding_init, feed_dict={embeddings: pretrained_matrix})
    embedding_init = W.assign(embeddings)
    tokens = tf.nn.embedding_lookup(W, tokens_input, name='tokens')
    print(tokens.shape)

    # --- casing embeddings ------------------------------------------------
    casing_input = tf.placeholder(tf.int32, [None, None], name='casing_input')
    W = tf.Variable(tf.random_uniform([len(casingEntries), 30], -1.0, 1.0), name="W_case")
    casings = tf.nn.embedding_lookup(W, casing_input, name='casings')
    print(casings.shape)

    # --- character features -----------------------------------------------
    chars_input = tf.placeholder(tf.int32, [None, None, max_word_len], name='char_input')
    W = tf.Variable(tf.random_uniform([len(charEntries), char_emb_dim], -1.0, 1.0), name="W_char")
    chars = tf.nn.embedding_lookup(W, chars_input, name='char_emd')
    print(chars.shape)
    n_tokens = tf.shape(chars_input)[-2]  # dynamic sentence length
    if charEmbedding == 'lstm':
        # Fold (batch, token) into one axis so every token is its own sequence
        # of characters. The original reshaped `casings` here by mistake.
        chars = tf.reshape(chars, [-1, max_word_len, char_emb_dim])
        lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(30, name="char_fw_lstm")
        lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(30, name="char_bw_lstm")
        # One vector per token is needed, so use the final hidden states rather
        # than all per-character outputs (which would not fit the reshape below).
        _, (state_fw, state_bw) = \
            tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, chars, dtype=tf.float32)
        chars = tf.concat([state_fw.h, state_bw.h], axis=-1)
        chars = tf.reshape(chars, [-1, n_tokens, 60])
    else:
        chars = tf.layers.Conv2D(30, [1, 30], padding='same', name='char_cnn')(chars)
        # Pool over the character axis only. A scalar `strides=50` would also
        # stride over the token axis and silently drop tokens.
        chars = tf.layers.MaxPooling2D([1, max_word_len], strides=[1, max_word_len],
                                       name="char_pooling")(chars)
        chars = tf.reshape(chars, [-1, n_tokens, 30])
    print(chars.shape)

    # --- stacked BiLSTM encoder -------------------------------------------
    label = tf.placeholder(tf.int32, [None, None, 1])
    input_nodes = [tokens, casings, chars]
    merged = tf.concat(input_nodes, axis=2)
    print(merged.shape)
    merged_input_shape = tf.shape(merged)
    # NOTE(review): dropout is baked into the graph, so it is also active at
    # inference time; feed the keep probability through a placeholder if needed.
    for cnt, size in enumerate(lstm_sizes, start=1):
        lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(size, name="merged_fw_lstm_" + str(cnt))
        lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(size, name="merged_bw_lstm_" + str(cnt))
        lstm_fw_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_fw_cell,
                                                     input_keep_prob=1 - dropout_rate,
                                                     output_keep_prob=1 - dropout_rate)
        lstm_bw_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_bw_cell,
                                                     input_keep_prob=1 - dropout_rate,
                                                     output_keep_prob=1 - dropout_rate)
        (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            lstm_fw_cell, lstm_bw_cell, merged,
            sequence_length=sentence_length, dtype=tf.float32)
        merged = tf.concat([output_fw, output_bw], axis=-1)

    print(merged.shape)
    # Flatten to (batch * time, features) for the dense output layer.
    merged = tf.reshape(merged, [-1, 2 * lstm_sizes[-1]])

    # --- output layer and loss --------------------------------------------
    if classifier == 'softmax':
        # sparse_softmax_cross_entropy_with_logits applies softmax itself, so
        # the dense layer must emit raw logits (no activation).
        merged = tf.layers.Dense(len(labelEntries), dtype=tf.float32)(merged)
        merged = tf.reshape(merged, [-1, merged_input_shape[-2], 1, len(labelEntries)])
        # NOTE(review): padded time steps are not masked out of this loss.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=merged)
    elif classifier == 'crf':
        merged = tf.layers.Dense(len(labelEntries), name="hidden_lin_layer")(merged)
        merged = tf.reshape(merged, [-1, merged_input_shape[-2], len(labelEntries)])
        # Squeeze only the trailing singleton axis; a bare tf.squeeze would
        # also remove the batch or time axis whenever its size is 1.
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            merged, tf.squeeze(label, axis=-1), sentence_length)
        loss = -log_likelihood
        print(merged.shape)
    else:
        raise ValueError("classifier must be 'softmax' or 'crf', got %r" % (classifier,))

lossFct = tf.reduce_mean(loss)


(?, ?, 300)
(?, ?, 30)
(?, ?, 50, 300)
(?, ?, 30)
(?, ?, 360)
(?, ?, 200)
(?, ?, 11)


In [179]:
# Streaming precision / recall / F1 over `y_true` and `y_pred`.
# TODO: `y_true` and `y_pred` are not defined in any visible cell; define them
# (e.g. flattened gold and predicted label tensors) before running this cell.
#
# tf.metrics.* return a (value_tensor, update_op) pair. Run the update ops on
# every batch, then read the value tensors. Their accumulators are local
# variables, so run tf.local_variables_initializer() first.
# NOTE(review): tf.metrics.precision/recall cast their inputs to bool, i.e.
# they score a binary "non-zero vs zero" task, not per-class NER metrics.
precision, precision_update = tf.metrics.precision(y_true, y_pred)
recall, recall_update = tf.metrics.recall(y_true, y_pred)
# Guard the denominator so F1 is 0 (not NaN) when precision + recall == 0.
f1_score = 2 * precision * recall / tf.maximum(precision + recall, 1e-12)

In [201]:
# Static shape of the CRF transition matrix. `transition_params` is only
# created by the graph-building cell when classifier == 'crf'; the recorded
# output (11, 11) matches len(labelEntries) in both dimensions.
print(transition_params.shape)



(11, 11)


ValueError: setting an array element with a sequence.