In [None]:
import tensorflow as tf
# from tensorflow.keras import Model, Input
# from tensorflow.keras.layers import LSTM, Embedding, Dense
# from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
# from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# import numpy as np
# from sklearn.metrics import classification_report
import json
import functools


from pathlib import Path
Path('results').mkdir(exist_ok=True)



# path = Path(__file__).parent

# import sys
# sys.path.append(".")
# from datasets.dataset import Dataset

# class LSTMCRF:

#     def __init__(self, vocabulary_size, labels_size, embeddings_size=50, lstm_units=100, dropout=0.2):
#         input_word = Input(shape=(embeddings_size,))
#         model = Embedding(input_dim=vocabulary_size, output_dim=embeddings_size)(input_word)
#         model = SpatialDropout1D(dropout)(model)
#         model = Bidirectional(LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=dropout))(model)
#         out = TimeDistributed(Dense(labels_size, activation="softmax"))(model)
#         self.model = Model(input_word, out)
#         self.checkpoint = ModelCheckpoint("{}/model.h5".format(path), monitor='val_loss',verbose=1, save_best_only=True, save_weights_only=True, mode='min')

#     def fit(self, data):
#         self.model.compile(optimizer="adam",
#             loss="sparse_categorical_crossentropy",
#             metrics=["accuracy"])

#         X_train, X_test, y_train, y_test = data.Xy

#         self.model.fit(
#             x=X_train,
#             y=y_train,
#             validation_data=(X_test,y_test),
#             batch_size=32,
#             epochs=3,
#             callbacks=[self.checkpoint, EarlyStopping()],
#             verbose=1
#         )

#     def test(self, data):
#         _, X, _, y = data.Xy
#         self.model.evaluate(X, y)
#         n_sequences, sequence_len = X.shape
#         predictions = self.model.predict(X)
#         flat_predictions = []
#         flat_gold = []
#         with open("{}/predictions.txt".format(path), "w") as f:
#             f.write(f"WORD\tPREDICTION\tGOLD STANDARD\n")
#             for sequence_i in range(n_sequences):
#                 f.write("\n")
#                 for word_i in range(sequence_len):
#                     word = data.idx2word[X[sequence_i, word_i]]
#                     prediction_i = np.argmax(predictions[sequence_i, word_i, :])
#                     prediction = data.idx2tag[prediction_i]
#                     flat_predictions.append(prediction)
#                     truth = data.idx2tag[y[sequence_i, word_i]]
#                     flat_gold.append(truth)
#                     f.write(f"{word}\t{prediction}\t{truth}\n")
#         with open("{}/metric-report.txt".format(path), "w") as f:
#             report = classification_report(flat_gold, flat_predictions)
#             f.write(report)

#     def save(self):
#         # serialize model to JSON
#         model_json = self.model.to_json()
#         with open("{}/model.json".format(path), "w") as json_file:
#             json_file.write(model_json)
#         # serialize weights to HDF5
#         self.model.save_weights("{}/model.h5".format(path))
#         print("Saved model to disk")


# if __name__ == "__main__":
#     dataset = Dataset(path="./data/processed_data/gmb-1.0.0.csv", maxlen=50)
#     model = LSTMCRF(dataset.vocabulary_size, dataset.labels_size)
#     model.fit(dataset)
#     model.test(dataset)
#     model.save()
#     # print(path)


DATADIR = './data/processed_data/gmb'


def parse_fn(line_words, line_tags):
    # Encode in Bytes for TF
    words = [w.encode() for w in line_words.strip().split()]
    tags = [t.encode() for t in line_tags.strip().split()]
    assert len(words) == len(tags), "Words and tags lengths don't match"
    return (words, len(words)), tags

def generator_fn(words, tags):
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)

def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    params = params if params is not None else {}
    shapes = (([None], ()), [None])
    types = ((tf.string, tf.int32), tf.string)
    defaults = (('<pad>', 0), 'O')

    dataset = tf.data.Dataset.from_generator(
        functools.partial(generator_fn, words, tags),
        output_shapes=shapes, output_types=types)

    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])

    dataset = (dataset
               .padded_batch(params.get('batch_size', 20), shapes, defaults)
               .prefetch(1))
    return dataset


def model_fn(features, labels, mode, params):
    input_word = Input(shape=(embeddings_size,))
    model = Embedding(input_dim=vocabulary_size, output_dim=embeddings_size)(input_word)
    model = SpatialDropout1D(dropout)(model)
    model = Bidirectional(LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=dropout))(model)
    out = TimeDistributed(Dense(labels_size, activation="softmax"))(model)
    self.model = Model(input_word, out)
    self.checkpoint = ModelCheckpoint("results/model.h5", monitor='val_loss',verbose=1, save_best_only=True, save_weights_only=True, mode='min')
    keras_estimator = tf.keras.estimator.model_to_estimator(
        keras_model=model, model_dir=model_dir)
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, train_op=train_op)


if __name__ == '__main__':

    params = {
        "dim": 300,
        "dropout": 0.5,
        # "num_oov_buckets": 1,
        "epochs": 3,
        "batch_size": 32,
        "buffer": 15000,
        "lstm_size": 100,
        "words": str(Path(DATADIR, "vocabulary.txt")),
        "tags": str(Path(DATADIR, "tags.txt"))
    }

    with Path("results/params.json").open("w") as f:
        json.dump(params, f, indent=4, sort_keys=True)

    def fwords(name):
        return str(Path(DATADIR, "{}.sentences.txt".format(name)))

    def ftags(name):
        return str(Path(DATADIR, "{}.labels.txt".format(name)))

    # Estimator, train and evaluate
    train_inpf = functools.partial(input_fn, fwords('train'), ftags('train'),
                                   params, shuffle_and_repeat=True)
    eval_inpf = functools.partial(input_fn, fwords('test'), ftags('test'))

    cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
    estimator = tf.estimator.Estimator(model_fn, 'results/model', cfg, params)
    Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True) # TODO take this outside?
    # hook = tf.contrib.estimator.stop_if_no_increase_hook(
    #     estimator, 'f1', 500, min_steps=8000, run_every_secs=120)
    train_spec = tf.estimator.TrainSpec(input_fn=train_inpf) #, hooks=[hook])
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # # Write predictions to file
    # def write_predictions(name):
    #     Path('results/score').mkdir(parents=True, exist_ok=True)
    #     with Path('results/score/{}.preds.txt'.format(name)).open('wb') as f:
    #         test_inpf = functools.partial(input_fn, fwords(name), ftags(name))
    #         golds_gen = generator_fn(fwords(name), ftags(name))
    #         preds_gen = estimator.predict(test_inpf)
    #         for golds, preds in zip(golds_gen, preds_gen):
    #             ((words, _), tags) = golds
    #             for word, tag, tag_pred in zip(words, tags, preds['tags']):
    #                 f.write(b' '.join([word, tag, tag_pred]) + b'\n')
    #             f.write(b'\n')

    for name in ['train', 'test']:
        write_predictions(name)

In [None]:
DATADIR = '../data/processed_data/gmb'
from pathlib import Path
import functools

def parse_fn(line_words, line_tags):
    # Encode in Bytes for TF
    words = [w.encode() for w in line_words.strip().split()]
    tags = [t.encode() for t in line_tags.strip().split()]
    assert len(words) == len(tags), "Words and tags lengths don't match"
    return (words, len(words)), tags

def generator_fn(words, tags):
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)

def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    params = params if params is not None else {}
    shapes = (([None], ()), [None])
    types = ((tf.string, tf.int32), tf.string)
    defaults = (('<pad>', 0), 'O')

    dataset = tf.data.Dataset.from_generator(
        functools.partial(generator_fn, words, tags),
        output_shapes=shapes, output_types=types)

    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])

    dataset = (dataset
               .padded_batch(params.get('batch_size', 20), shapes, defaults)
               .prefetch(1))
    return dataset

params = {
    "dim": 300,
    "dropout": 0.5,
    # "num_oov_buckets": 1,
    "epochs": 3,
    "batch_size": 32,
    "buffer": 15000,
    "lstm_size": 100,
    "words": str(Path(DATADIR, "vocabulary.txt")),
    "tags": str(Path(DATADIR, "tags.txt"))
}

def fwords(name):
    return str(Path(DATADIR, "{}.sentences.csv".format(name)))

def ftags(name):
    return str(Path(DATADIR, "{}.labels.csv".format(name)))

# Estimator, train and evaluate
train_inpf = functools.partial(input_fn, fwords('train'), ftags('train'),
                               params, shuffle_and_repeat=True)
eval_inpf = functools.partial(input_fn, fwords('test'), ftags('test'))

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

embeddings_size=50
vocabulary_size=100
dropout=0.2
lstm_units=100
labels_size=10

input_word = Input(shape=(embeddings_size,))
model = Embedding(input_dim=vocabulary_size, output_dim=embeddings_size)(input_word)
model = SpatialDropout1D(dropout)(model)
model = Bidirectional(LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=dropout))(model)
out = TimeDistributed(Dense(labels_size, activation="softmax"))(model)
model = Model(input_word, out)
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])




import tensorflow as tf

words,tags=fwords('train'), ftags('train')
shapes = (([None], ()), [None])
types = ((tf.string, tf.int32), tf.string)

dataset = tf.data.Dataset.from_generator(
    functools.partial(generator_fn, words, tags),
    output_shapes=shapes, output_types=types)


model.fit(x=train_inpf,
          y=y_train,
          validation_data=(X_test,y_test),
          batch_size=32,
          epochs=3,
          callbacks=[self.checkpoint, EarlyStopping()],
          verbose=1
         )

In [None]:
next(dataset)

In [None]:
# import tensorflow as tf


# shapes = ([None], [None])
# types = (tf.string, tf.string)
# defaults = ('<pad>''O')

# def generator_fn(words, tags):
#     with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
#         for line_words, line_tags in zip(f_words, f_tags):
#             yield parse_fn(line_words, line_tags)

# dataset = tf.data.Dataset.from_generator(
#     functools.partial(generator_fn, words, tags),
#     output_shapes=shapes, output_types=types)


# dataset = (dataset.padded_batch(32, shapes, defaults).prefetch(1))

# dataset

In [None]:
import tensorflow as tf


def fwords(name):
    return str(Path(DATADIR, "{}.sentences.csv".format(name)))

def ftags(name):
    return str(Path(DATADIR, "{}.labels.csv".format(name)))

def generator_fn(words, tags):
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)
            
            
def gen_train_series(sentences, labels):
    for sentence, label in sentences, labels:
        yield sentence, label

series = tf.data.Dataset.from_generator(gen_train_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))



In [None]:
i=0
for a in functools.partial(generator_fn, words, tags)():
    print(a)
    break
a.shape

In [73]:
with Path(params['tags']).open() as f:
    tag2idx = {
        "<PAD>": "0",
        "<UNK>": "1"
    }    
    for i, t in enumerate(f,2):
        tag2idx[t.strip()] = i
        print(i)
#     _ = [tag2idx[t.strip()] = i for i, t in enumerate(f,2)]

tag2idx
    
# for i, t in enumerate(f,2):
#     print(t)
# _ = [tag2idx[t.strip()] = i for i, t in enumerate(f,2)]

2
3
4
5
6
7
8
9
10
11


{'<PAD>': '0',
 '<UNK>': '1',
 'O': 2,
 'I-TTL': 3,
 'I-TIM': 4,
 'I-PER': 5,
 'I-PCT': 6,
 'I-ORG': 7,
 'I-MON': 8,
 'I-LOC': 9,
 'I-DAT': 10,
 'I-ART': 11}

In [74]:
word2idx["UNK"]

0

In [122]:
import numpy as np
import functools
from pathlib import Path
DATADIR = '../data/processed_data/gmb'
import tensorflow as tf

def tags_dictionaries():
    with Path(params['tags']).open() as f:
        tag2idx = {t.strip(): i for i, t in enumerate(f)} 
        idx2tag = {i: t for t, i in tag2idx.items()}
    return tag2idx, idx2tag

def words_dictionaries():
    with Path(params['words']).open() as f:
        word2idx = {t.strip(): i for i, t in enumerate(f,1)}
        word2idx["UNK"]=0
        word2idx["ENDPAD"]=len(word2idx)
        idx2word = {i: t for t, i in word2idx.items()}
    return word2idx, idx2word, len(word2idx)
        
def fwords(name):
    return str(Path(DATADIR, "{}.sentences.csv".format(name)))

def ftags(name):
    return str(Path(DATADIR, "{}.labels.csv".format(name)))

def parse_fn(line_words, line_tags):
#     words = [w.encode() for w in line_words.strip().split()]
#     tags = [t.encode() for t in line_tags.strip().split()]
#     print(line_words.strip().split())
    words = np.array([word2idx.get(w, 0) for w in line_words.strip().split()])
    tags = np.array([tag2idx[t] for t in line_tags.strip().split()])
    
    assert len(words) == len(tags), "Words and tags lengths don't match"
#     return (words, len(words)), tags
    return words, tags

def generator_fn(words, tags):
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)
                        
def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    params = params if params is not None else {}
    output_signature = (
        tf.TensorSpec(shape=([None]), dtype=tf.int32),
        tf.TensorSpec(shape=([None]), dtype=tf.int32))

    dataset = tf.data.Dataset.from_generator(
        functools.partial(generator_fn, words, tags),
        output_signature=output_signature
    )
    
    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])

    shapes = (tf.TensorShape([None]),tf.TensorShape([None]))        
    dataset = (dataset
               .padded_batch(batch_size=32, 
                             padded_shapes=([params["max_len"]], [params["max_len"]]), 
                             padding_values=(params['vocab_size']-1,params['pad_index'])
                            )
               .prefetch(1))
    return dataset

params = {
    "dim": 100,
    "dropout": 0.5,
    # "num_oov_buckets": 1,
    "max_len": 60,
    "epochs": 3,
    "batch_size": 32,
    "buffer": 15000,
    "lstm_size": 100,
    "words": str(Path(DATADIR, "vocabulary.txt")),
    "tags": str(Path(DATADIR, "tags.txt"))
}

tag2idx, idx2tag = tags_dictionaries()
word2idx, idx2word, vocab_size = words_dictionaries()

params['vocab_size']=vocab_size
params['pad_index']=tag2idx['O']


from tensorflow.keras import Model, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, Lambda
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

embeddings_size=50
# vocabulary_size=100
dropout=0.2
lstm_units=100
labels_size=10


# input_word = Input(shape=(params["max_len"],))
# model = Embedding(input_dim=params['vocab_size'], output_dim=embeddings_size, input_length=params["max_len"])(input_word)
# model = SpatialDropout1D(dropout)(model)
# model = Bidirectional(LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=dropout))(model)
# out = TimeDistributed(Dense(labels_size, activation="softmax"))(model)
# model = Model(input_word, out)
# model.compile(optimizer="adam",
#               loss="sparse_categorical_crossentropy",
#               metrics=["accuracy"])
# model.summary()


# from keras_crf import CRF

# input_word = Input(shape=(params["max_len"],))
# sequence_mask = tf.keras.layers.Lambda(lambda x: tf.greater(x, 0))(input_word)
# model = Embedding(input_dim=params['vocab_size'], output_dim=embeddings_size, input_length=params["max_len"])(input_word)
# model = SpatialDropout1D(dropout)(model)
# model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=dropout))(model)
# logits = Dense(labels_size, activation="softmax")(model)

# # crf = CRF(params["max_len"])
# crf = CRF(len(idx2tag))
# outputs = crf(logits, mask=sequence_mask)
# model = Model(input_word, outputs)

# model.compile(
#     optimizer="adam", 
#     loss=crf.neg_log_likelihood,
#     metrics=[crf.accuracy]
#     )
# model.summary()

# from keras_crf import CRF

# sequence_input = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='sequence_input')
# sequence_mask = tf.keras.layers.Lambda(lambda x: tf.greater(x, 0))(sequence_input)
# outputs = tf.keras.layers.Embedding(21128, 128)(sequence_input)
# outputs = tf.keras.layers.Dense(256)(outputs)
# crf = CRF(7)
# # mask is important to compute sequence length in CRF
# outputs = crf(outputs, mask=sequence_mask)
# model = tf.keras.Model(inputs=sequence_input, outputs=outputs)
# model.compile(
#     loss=crf.neg_log_likelihood,
#     metrics=[
#         crf.accuracy
#     ],
#     optimizer=tf.keras.optimizers.Adam(4e-5)
# )
# model.summary()


input_word = Input(shape=(params["max_len"],))
input_mask = Lambda(lambda x: tf.greater(x, 0))(input_word)
model = Embedding(input_dim=params['vocab_size'], output_dim=params["embeddings_dim"], input_length=params["max_len"])(input_word)
model = SpatialDropout1D(params["dropout"])(model)
model = Bidirectional(LSTM(units=params["lstm_size"], return_sequences=True, recurrent_dropout=params["dropout"]))(model)
logits = Dense(params["labels_size"], activation=None)(model)
crf = CRF(params["labels_size"])
outputs = crf(logits, mask=input_mask)
model = tf.keras.Model(inputs=input_word, outputs=outputs)
model.compile(loss=crf.neg_log_likelihood, metrics=[crf.accuracy], optimizer='adam')
    
    
# optimizer = tf.keras.optimizers.Adam(4e-5)

# model.compile(optimizer=optimizer, loss=crf.loss, metrics=[crf.accuracy])


dataset = functools.partial(input_fn, fwords('train'), ftags('train'), params, shuffle_and_repeat=True)()
model.fit(dataset, epochs=1)


KeyError: 'embeddings_dim'

In [127]:
import numpy as np
import functools
from pathlib import Path
DATADIR = '../data/processed_data/gmb'
import tensorflow as tf

def tags_dictionaries():
    with Path(params['tags']).open() as f:
        tag2idx = {t.strip(): i for i, t in enumerate(f)} 
        idx2tag = {i: t for t, i in tag2idx.items()}
    return tag2idx, idx2tag, len(tag2idx)

def words_dictionaries():
    with Path(params['words']).open() as f:
        word2idx = {t.strip(): i for i, t in enumerate(f,1)}
        word2idx["UNK"]=0
        word2idx["ENDPAD"]=len(word2idx)
        idx2word = {i: t for t, i in word2idx.items()}
    return word2idx, idx2word, len(word2idx)
        
def fwords(name):
    return str(Path(DATADIR, "{}.sentences.csv".format(name)))

def ftags(name):
    return str(Path(DATADIR, "{}.labels.csv".format(name)))

def parse_fn(line_words, line_tags):
    words = np.array([word2idx.get(w, 0) for w in line_words.strip().split()])
    tags = np.array([tag2idx[t] for t in line_tags.strip().split()])
    
    assert len(words) == len(tags), "Words and tags lengths don't match"
    return words, tags

def generator_fn(words, tags):
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)
                        
def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    params = params if params is not None else {}
    output_signature = (
        tf.TensorSpec(shape=([None]), dtype=tf.int32),
        tf.TensorSpec(shape=([None]), dtype=tf.int32))

    dataset = tf.data.Dataset.from_generator(
        functools.partial(generator_fn, words, tags),
        output_signature=output_signature
    )
    
    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])

    shapes = (tf.TensorShape([None]),tf.TensorShape([None]))        
    dataset = (dataset
               .padded_batch(batch_size=32, 
                             padded_shapes=([params["max_len"]], [params["max_len"]]), 
                             padding_values=(params['vocab_size']-1,params['pad_index'])
                            )
               .prefetch(1))
    return dataset

params = {
    "dim": 100,
    "dropout": 0.5,
    # "num_oov_buckets": 1,
    "max_len": 60,
    "epochs": 3,
    "batch_size": 32,
    "buffer": 15000,
    "lstm_size": 100,
    "words": str(Path(DATADIR, "vocabulary.txt")),
    "tags": str(Path(DATADIR, "tags.txt")),
    "embeddings_dim": 50
}

tag2idx, idx2tag, tags_len = tags_dictionaries()
word2idx, idx2word, vocab_size = words_dictionaries()

params['vocab_size']=vocab_size
params['pad_index']=tag2idx['O']
params["labels_size"]=tags_len

from tensorflow.keras import Model, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, Lambda
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

embeddings_size=50
dropout=0.2
lstm_units=100
labels_size=10

input_word = Input(shape=(params["max_len"],))
input_mask = Lambda(lambda x: tf.greater(x, 0))(input_word)
model = Embedding(input_dim=params['vocab_size'], output_dim=params["embeddings_dim"], input_length=params["max_len"])(input_word)
model = SpatialDropout1D(params["dropout"])(model)
model = Bidirectional(LSTM(units=params["lstm_size"], return_sequences=True, recurrent_dropout=params["dropout"]))(model)
logits = Dense(params["labels_size"], activation=None)(model)
crf = CRF(params["labels_size"])
outputs = crf(logits, mask=input_mask)
model = tf.keras.Model(inputs=input_word, outputs=outputs)
model.compile(loss=crf.neg_log_likelihood, metrics=[crf.accuracy], optimizer='adam')

dataset = functools.partial(input_fn, fwords('train'), ftags('train'), params, shuffle_and_repeat=True)()
model.fit(dataset, epochs=1)




<tensorflow.python.keras.callbacks.History at 0x7fc175921ac8>

In [129]:
valid_dataset = functools.partial(input_fn, fwords('test'), ftags('test'), params, shuffle_and_repeat=False)()
predictions = model.predict(valid_dataset)



In [187]:
with open(fwords('test')) as f:
    test_words = f.readlines()
    
with open(ftags('test')) as f:
    test_tags = f.readlines()    

i = np.random.randint(0, predictions.shape[0]) #659
# i = 0
words = test_words[i].split()
tags = test_tags[i].split()
p = np.argmax(predictions[i], axis=-1)
# y_true = y_test[i]
print("{:15}{:5}\t {}\n".format("Word", "True", "Pred"))
print("-" *30)
for word, true_tag, pred in zip(words, tags, p[:len(words)]):
    print("{:15}{}\t{}".format(word, true_tag, idx2tag[pred]))

Word           True 	 Pred

------------------------------
"The           O	O
demonstrators  O	O
Sunday         I-DAT	I-LOC
called         O	O
on             O	O
the            O	O
government     O	O
to             O	O
ease           O	O
restrictions   O	O
on             O	O
political      O	O
activities     O	O
at             O	O
universities   O	O
,              O	O
and            O	O
to             O	O
allow          O	O
free           O	O
and            O	O
fair           O	O
student        O	O
elections      O	O
."             O	O


In [223]:
import csv
y_true, y_pred = [],[]
target_names = list(idx2tag.values())

def write_predictions(name):
    Path('results/score').mkdir(parents=True, exist_ok=True)
    with open('results/score/{}.preds.csv'.format(name), 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        golds_gen = generator_fn(fwords(name), ftags(name))
        dataset = functools.partial(input_fn, fwords(name), ftags(name), params, shuffle_and_repeat=False)()
        preds_gen = model.predict(dataset)        
        for golds, preds in zip(golds_gen, preds_gen):
            
            (words, tags) = golds
            preds = np.argmax(preds, axis=-1)[:len(words)]
            for word, tag, tag_pred in zip(words, tags, preds):
                y_true.append(idx2tag[tag])
                y_pred.append(idx2tag[tag_pred])
                writer.writerow([idx2word[word], idx2tag[tag], idx2tag[tag_pred]])
            
            
            
from sklearn.metrics import classification_report

for name in ['test']:
    write_predictions(name)    
    
print(classification_report(y_true, y_pred))    

              precision    recall  f1-score   support

       I-DAT       0.42      0.32      0.36       566
       I-LOC       0.46      0.53      0.49       771
       I-MON       0.00      0.00      0.00        44
       I-ORG       0.40      0.04      0.07       654
       I-PCT       0.00      0.00      0.00        23
       I-PER       0.46      0.03      0.05       423
       I-TIM       0.00      0.00      0.00        13
       I-TTL       0.00      0.00      0.00         1
           O       0.93      1.00      0.96     14121

    accuracy                           0.88     16616
   macro avg       0.29      0.21      0.21     16616
weighted avg       0.85      0.88      0.85     16616



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [224]:
# target_names = list(idx2tag.values())
# # classification_report(y_true, y_pred, target_names=target_names)
# print(classification_report(y_true, y_pred))

In [225]:
# y_true
# from collections import Counter
# Counter(y_true)
# # print(Counter(y_pred))

In [226]:
# target_names

In [126]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # tag2idx, idx2tag
# # word2idx, idx2word, vocab_size
# # word2idx

# word2num = word2idx
# label2ner = idx2tag
# max_len = 60

# in_sentence = "Tigray , Amhara and Oromia regions ."
# words = in_sentence.split()

# sentence = []
# for i in words:
#     if i in word2num:
#         sentence.append(word2num[i])
#     else:
#         sentence.append(word2num['UNK'])
# print(sentence)
# print("\n")
# sentence = pad_sequences([sentence], maxlen=max_len, value=vocab_size-1)
# y_pred = model.predict(sentence)
# # print(y_pred[0])
# print(y_pred[1][0])
# # y_pred = model.predict(sentence).argmax(-1)[sentence > 0]
# # ner_dict = {
# #     "PER": '',
# #     "LOC": '',
# #     "ORG": '',
# #     "O": ''
# # }
# # ner_list = {
# #     "PER": [],
# #     "LOC": [],
# #     "ORG": [],
# #     "O": []
# # }
# # print(y_pred)
# # for i in range(len(words)):
# #     ner = label2ner[y_pred[i]][-3:]
# #     ner_dict[ner] += words[i]
# #     for n, s in ner_dict.items():
# #         if n != ner and s:
# #             ner_list[n].append(s)
# #             ner_dict[n] = ''

# # print("predict result: {}".format(ner_list))


In [98]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [59]:
for i in range(predictions.shape[0]):
    for token in predictions[i]:
        if np.argmax(token)>0:
            print(np.argmax(token))

In [38]:
dataset = functools.partial(input_fn, fwords('test'), ftags('test'), params, shuffle_and_repeat=True)()
i=0
for a in dataset:
    print(a)
    break
    i+=1
print(i)

(<tf.Tensor: shape=(32, 60), dtype=int32, numpy=
array([[1204,   95,  212, ..., 9232, 9232, 9232],
       [2573, 4046,  331, ..., 9232, 9232, 9232],
       [1078,  366,  331, ..., 9232, 9232, 9232],
       ...,
       [   0,   32,  873, ..., 9232, 9232, 9232],
       [   0, 2937,   23, ..., 9232, 9232, 9232],
       [  41,  585,  586, ..., 9232, 9232, 9232]], dtype=int32)>, <tf.Tensor: shape=(32, 60), dtype=int32, numpy=
array([[7, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 5, ..., 0, 0, 0]], dtype=int32)>)
0


In [None]:
b = a[0][4].numpy()
[idx2word[t] for t in b]

In [None]:
# idx2word.keys()

In [None]:
list(word2idx.items())[-2]

In [None]:
# functools.partial(input_fn, fwords('train'), ftags('train'), params, shuffle_and_repeat=True)()

words = fwords('train')
tags = ftags('train')
output_signature = (
    tf.TensorSpec(shape=([None]), dtype=tf.int32),
    tf.TensorSpec(shape=([None]), dtype=tf.int32))

dataset = tf.data.Dataset.from_generator(
    functools.partial(generator_fn, words, tags),
    output_signature=output_signature
)

In [None]:
for i in dataset:
    if len(i[0])> 50:
        print(len(i[0]))
#     break

In [None]:
import os
os.path.dirname(os.path.abspath(__file__))