In [None]:
import tensorflow as tf
# from tensorflow.keras import Model, Input
# from tensorflow.keras.layers import LSTM, Embedding, Dense
# from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
# from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# import numpy as np
# from sklearn.metrics import classification_report
import json
import functools


from pathlib import Path
Path('results').mkdir(exist_ok=True)



# path = Path(__file__).parent

# import sys
# sys.path.append(".")
# from datasets.dataset import Dataset

# class LSTMCRF:

#     def __init__(self, vocabulary_size, labels_size, embeddings_size=50, lstm_units=100, dropout=0.2):
#         input_word = Input(shape=(embeddings_size,))
#         model = Embedding(input_dim=vocabulary_size, output_dim=embeddings_size)(input_word)
#         model = SpatialDropout1D(dropout)(model)
#         model = Bidirectional(LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=dropout))(model)
#         out = TimeDistributed(Dense(labels_size, activation="softmax"))(model)
#         self.model = Model(input_word, out)
#         self.checkpoint = ModelCheckpoint("{}/model.h5".format(path), monitor='val_loss',verbose=1, save_best_only=True, save_weights_only=True, mode='min')

#     def fit(self, data):
#         self.model.compile(optimizer="adam",
#             loss="sparse_categorical_crossentropy",
#             metrics=["accuracy"])

#         X_train, X_test, y_train, y_test = data.Xy

#         self.model.fit(
#             x=X_train,
#             y=y_train,
#             validation_data=(X_test,y_test),
#             batch_size=32,
#             epochs=3,
#             callbacks=[self.checkpoint, EarlyStopping()],
#             verbose=1
#         )

#     def test(self, data):
#         _, X, _, y = data.Xy
#         self.model.evaluate(X, y)
#         n_sequences, sequence_len = X.shape
#         predictions = self.model.predict(X)
#         flat_predictions = []
#         flat_gold = []
#         with open("{}/predictions.txt".format(path), "w") as f:
#             f.write(f"WORD\tPREDICTION\tGOLD STANDARD\n")
#             for sequence_i in range(n_sequences):
#                 f.write("\n")
#                 for word_i in range(sequence_len):
#                     word = data.idx2word[X[sequence_i, word_i]]
#                     prediction_i = np.argmax(predictions[sequence_i, word_i, :])
#                     prediction = data.idx2tag[prediction_i]
#                     flat_predictions.append(prediction)
#                     truth = data.idx2tag[y[sequence_i, word_i]]
#                     flat_gold.append(truth)
#                     f.write(f"{word}\t{prediction}\t{truth}\n")
#         with open("{}/metric-report.txt".format(path), "w") as f:
#             report = classification_report(flat_gold, flat_predictions)
#             f.write(report)

#     def save(self):
#         # serialize model to JSON
#         model_json = self.model.to_json()
#         with open("{}/model.json".format(path), "w") as json_file:
#             json_file.write(model_json)
#         # serialize weights to HDF5
#         self.model.save_weights("{}/model.h5".format(path))
#         print("Saved model to disk")


# if __name__ == "__main__":
#     dataset = Dataset(path="./data/processed_data/gmb-1.0.0.csv", maxlen=50)
#     model = LSTMCRF(dataset.vocabulary_size, dataset.labels_size)
#     model.fit(dataset)
#     model.test(dataset)
#     model.save()
#     # print(path)


DATADIR = './data/processed_data/gmb'


def parse_fn(line_words, line_tags):
    # Encode in Bytes for TF
    words = [w.encode() for w in line_words.strip().split()]
    tags = [t.encode() for t in line_tags.strip().split()]
    assert len(words) == len(tags), "Words and tags lengths don't match"
    return (words, len(words)), tags

def generator_fn(words, tags):
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)

def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    params = params if params is not None else {}
    shapes = (([None], ()), [None])
    types = ((tf.string, tf.int32), tf.string)
    defaults = (('<pad>', 0), 'O')

    dataset = tf.data.Dataset.from_generator(
        functools.partial(generator_fn, words, tags),
        output_shapes=shapes, output_types=types)

    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])

    dataset = (dataset
               .padded_batch(params.get('batch_size', 20), shapes, defaults)
               .prefetch(1))
    return dataset


def model_fn(features, labels, mode, params):
    input_word = Input(shape=(embeddings_size,))
    model = Embedding(input_dim=vocabulary_size, output_dim=embeddings_size)(input_word)
    model = SpatialDropout1D(dropout)(model)
    model = Bidirectional(LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=dropout))(model)
    out = TimeDistributed(Dense(labels_size, activation="softmax"))(model)
    self.model = Model(input_word, out)
    self.checkpoint = ModelCheckpoint("results/model.h5", monitor='val_loss',verbose=1, save_best_only=True, save_weights_only=True, mode='min')
    keras_estimator = tf.keras.estimator.model_to_estimator(
        keras_model=model, model_dir=model_dir)
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, train_op=train_op)


if __name__ == '__main__':

    params = {
        "dim": 300,
        "dropout": 0.5,
        # "num_oov_buckets": 1,
        "epochs": 3,
        "batch_size": 32,
        "buffer": 15000,
        "lstm_size": 100,
        "words": str(Path(DATADIR, "vocabulary.txt")),
        "tags": str(Path(DATADIR, "tags.txt"))
    }

    with Path("results/params.json").open("w") as f:
        json.dump(params, f, indent=4, sort_keys=True)

    def fwords(name):
        return str(Path(DATADIR, "{}.sentences.txt".format(name)))

    def ftags(name):
        return str(Path(DATADIR, "{}.labels.txt".format(name)))

    # Estimator, train and evaluate
    train_inpf = functools.partial(input_fn, fwords('train'), ftags('train'),
                                   params, shuffle_and_repeat=True)
    eval_inpf = functools.partial(input_fn, fwords('test'), ftags('test'))

    cfg = tf.estimator.RunConfig(save_checkpoints_secs=120)
    estimator = tf.estimator.Estimator(model_fn, 'results/model', cfg, params)
    Path(estimator.eval_dir()).mkdir(parents=True, exist_ok=True) # TODO take this outside?
    # hook = tf.contrib.estimator.stop_if_no_increase_hook(
    #     estimator, 'f1', 500, min_steps=8000, run_every_secs=120)
    train_spec = tf.estimator.TrainSpec(input_fn=train_inpf) #, hooks=[hook])
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_inpf, throttle_secs=120)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

    # # Write predictions to file
    # def write_predictions(name):
    #     Path('results/score').mkdir(parents=True, exist_ok=True)
    #     with Path('results/score/{}.preds.txt'.format(name)).open('wb') as f:
    #         test_inpf = functools.partial(input_fn, fwords(name), ftags(name))
    #         golds_gen = generator_fn(fwords(name), ftags(name))
    #         preds_gen = estimator.predict(test_inpf)
    #         for golds, preds in zip(golds_gen, preds_gen):
    #             ((words, _), tags) = golds
    #             for word, tag, tag_pred in zip(words, tags, preds['tags']):
    #                 f.write(b' '.join([word, tag, tag_pred]) + b'\n')
    #             f.write(b'\n')

    for name in ['train', 'test']:
        write_predictions(name)

In [47]:
DATADIR = '../data/processed_data/gmb'
from pathlib import Path
import functools

def parse_fn(line_words, line_tags):
    # Encode in Bytes for TF
    words = [w.encode() for w in line_words.strip().split()]
    tags = [t.encode() for t in line_tags.strip().split()]
    assert len(words) == len(tags), "Words and tags lengths don't match"
    return (words, len(words)), tags

def generator_fn(words, tags):
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)

def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    params = params if params is not None else {}
    shapes = (([None], ()), [None])
    types = ((tf.string, tf.int32), tf.string)
    defaults = (('<pad>', 0), 'O')

    dataset = tf.data.Dataset.from_generator(
        functools.partial(generator_fn, words, tags),
        output_shapes=shapes, output_types=types)

    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])

    dataset = (dataset
               .padded_batch(params.get('batch_size', 20), shapes, defaults)
               .prefetch(1))
    return dataset

params = {
    "dim": 300,
    "dropout": 0.5,
    # "num_oov_buckets": 1,
    "epochs": 3,
    "batch_size": 32,
    "buffer": 15000,
    "lstm_size": 100,
    "words": str(Path(DATADIR, "vocabulary.txt")),
    "tags": str(Path(DATADIR, "tags.txt"))
}

def fwords(name):
    return str(Path(DATADIR, "{}.sentences.csv".format(name)))

def ftags(name):
    return str(Path(DATADIR, "{}.labels.csv".format(name)))

# Estimator, train and evaluate
train_inpf = functools.partial(input_fn, fwords('train'), ftags('train'),
                               params, shuffle_and_repeat=True)
eval_inpf = functools.partial(input_fn, fwords('test'), ftags('test'))

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

embeddings_size=50
vocabulary_size=100
dropout=0.2
lstm_units=100
labels_size=10

input_word = Input(shape=(embeddings_size,))
model = Embedding(input_dim=vocabulary_size, output_dim=embeddings_size)(input_word)
model = SpatialDropout1D(dropout)(model)
model = Bidirectional(LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=dropout))(model)
out = TimeDistributed(Dense(labels_size, activation="softmax"))(model)
model = Model(input_word, out)
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])




import tensorflow as tf

words,tags=fwords('train'), ftags('train')
shapes = (([None], ()), [None])
types = ((tf.string, tf.int32), tf.string)

dataset = tf.data.Dataset.from_generator(
    functools.partial(generator_fn, words, tags),
    output_shapes=shapes, output_types=types)


model.fit(x=train_inpf,
          y=y_train,
          validation_data=(X_test,y_test),
          batch_size=32,
          epochs=3,
          callbacks=[self.checkpoint, EarlyStopping()],
          verbose=1
         )

NameError: name 'y_train' is not defined

In [29]:
next(dataset)

TypeError: 'FlatMapDataset' object is not an iterator

In [48]:
# import tensorflow as tf


# shapes = ([None], [None])
# types = (tf.string, tf.string)
# defaults = ('<pad>''O')

# def generator_fn(words, tags):
#     with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
#         for line_words, line_tags in zip(f_words, f_tags):
#             yield parse_fn(line_words, line_tags)

# dataset = tf.data.Dataset.from_generator(
#     functools.partial(generator_fn, words, tags),
#     output_shapes=shapes, output_types=types)


# dataset = (dataset.padded_batch(32, shapes, defaults).prefetch(1))

# dataset

<PrefetchDataset shapes: ((None, None), (None, None)), types: (tf.string, tf.string)>

In [50]:
import tensorflow as tf


def fwords(name):
    return str(Path(DATADIR, "{}.sentences.csv".format(name)))

def ftags(name):
    return str(Path(DATADIR, "{}.labels.csv".format(name)))

def generator_fn(words, tags):
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)
            
            
def gen_train_series(sentences, labels):
    for sentence, label in sentences, labels:
        yield sentence, label

series = tf.data.Dataset.from_generator(gen_train_series,output_types=(tf.int32, tf.int32),output_shapes = ((None, None)))



In [60]:
i=0
for a in functools.partial(generator_fn, words, tags)():
    print(a)
    break
a.shape

(([b'Ethiopia', b'has', b'reported', b'18', b'new', b'cases', b'of', b'polio', b'as', b'it', b'begins', b'a', b'nationwide', b'vaccination', b'program', b'targeting', b'more', b'than', b'16', b'million', b'children', b'under', b'the', b'age', b'of', b'five', b'.'], 27), [b'I-ORG', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O', b'O'])


AttributeError: 'tuple' object has no attribute 'shape'

In [42]:
import numpy as np
import functools
from pathlib import Path
DATADIR = '../data/processed_data/gmb'
import tensorflow as tf

def tags_dictionaries():
    with Path(params['tags']).open() as f:
        tag2idx = {t.strip(): i for i, t in enumerate(f)} 
        idx2tag = {i: t for t, i in tag2idx.items()}
    return tag2idx, idx2tag

def words_dictionaries():
    with Path(params['words']).open() as f:
        word2idx = {t.strip(): i for i, t in enumerate(f,1)}
        word2idx["UNK"]=0
        word2idx["ENDPAD"]=len(word2idx)
        idx2word = {i: t for t, i in word2idx.items()}
    return word2idx, idx2word, len(word2idx)
        
def fwords(name):
    return str(Path(DATADIR, "{}.sentences.csv".format(name)))

def ftags(name):
    return str(Path(DATADIR, "{}.labels.csv".format(name)))

def parse_fn(line_words, line_tags):
#     words = [w.encode() for w in line_words.strip().split()]
#     tags = [t.encode() for t in line_tags.strip().split()]
#     print(line_words.strip().split())
    words = np.array([word2idx.get(w, 0) for w in line_words.strip().split()])
    tags = np.array([tag2idx[t] for t in line_tags.strip().split()])
    
    assert len(words) == len(tags), "Words and tags lengths don't match"
#     return (words, len(words)), tags
    return words, tags

def generator_fn(words, tags):
    with Path(words).open('r') as f_words, Path(tags).open('r') as f_tags:
        for line_words, line_tags in zip(f_words, f_tags):
            yield parse_fn(line_words, line_tags)
                        
def input_fn(words, tags, params=None, shuffle_and_repeat=False):
    params = params if params is not None else {}
    output_signature = (
        tf.TensorSpec(shape=([None]), dtype=tf.int32),
        tf.TensorSpec(shape=([None]), dtype=tf.int32))

    dataset = tf.data.Dataset.from_generator(
        functools.partial(generator_fn, words, tags),
        output_signature=output_signature
    )
    
    if shuffle_and_repeat:
        dataset = dataset.shuffle(params['buffer']).repeat(params['epochs'])

    shapes = (tf.TensorShape([None]),tf.TensorShape([None]))        
    dataset = (dataset
               .padded_batch(batch_size=32, 
                             padded_shapes=([params["max_len"]], [params["max_len"]]), 
                             padding_values=(params['vocab_size']-1,params['pad_index'])
                            )
               .prefetch(1))
    return dataset

params = {
    "dim": 100,
    "dropout": 0.5,
    # "num_oov_buckets": 1,
    "max_len": 60,
    "epochs": 3,
    "batch_size": 32,
    "buffer": 15000,
    "lstm_size": 100,
    "words": str(Path(DATADIR, "vocabulary.txt")),
    "tags": str(Path(DATADIR, "tags.txt"))
}

tag2idx, idx2tag = tags_dictionaries()
word2idx, idx2word, vocab_size = words_dictionaries()

params['vocab_size']=vocab_size
params['pad_index']=tag2idx['O']


from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

embeddings_size=50
# vocabulary_size=100
dropout=0.2
lstm_units=100
labels_size=10


input_word = Input(shape=(params["max_len"],))
model = Embedding(input_dim=params['vocab_size'], output_dim=embeddings_size, input_length=params["max_len"])(input_word)
model = SpatialDropout1D(dropout)(model)
model = Bidirectional(LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=dropout))(model)
out = TimeDistributed(Dense(labels_size, activation="softmax"))(model)
model = Model(input_word, out)
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

dataset = functools.partial(input_fn, fwords('train'), ftags('train'), params, shuffle_and_repeat=True)()
model.fit(dataset, epochs=1)


9233
Epoch 1/3
Epoch 2/3
 63/316 [====>.........................] - ETA: 15s - loss: 0.1125 - accuracy: 0.9605

KeyboardInterrupt: 

In [None]:
# model.summary()

In [28]:
# for i in range(100):
#     for _ in functools.partial(input_fn, fwords('train'), ftags('train'), params, shuffle_and_repeat=True)():
#         pass
# #     break
# # a

In [44]:
dataset = functools.partial(input_fn, fwords('train'), ftags('train'), params, shuffle_and_repeat=True)()
for a in dataset:
    print(a)
    break

(<tf.Tensor: shape=(32, 60), dtype=int32, numpy=
array([[1039,  189,  331, ..., 9232, 9232, 9232],
       [1079,  387, 8058, ..., 9232, 9232, 9232],
       [  41,   99, 1063, ..., 9232, 9232, 9232],
       ...,
       [  41,  692, 4638, ..., 9232, 9232, 9232],
       [  41,  277, 3072, ..., 9232, 9232, 9232],
       [   0, 5189, 5190, ..., 9232, 9232, 9232]], dtype=int32)>, <tf.Tensor: shape=(32, 60), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 3, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 5, 5, ..., 0, 0, 0]], dtype=int32)>)


In [48]:
b = a[0][4].numpy()
[idx2word[t] for t in b]

['The',
 'Kremlin',
 'Friday',
 'said',
 'Mr.',
 'Blair',
 'expressed',
 'regret',
 'that',
 'intensive',
 'work',
 'on',
 'forming',
 'a',
 'new',
 'Cabinet',
 'would',
 'prevent',
 'him',
 'from',
 'attending',
 'the',
 'ceremonies',
 'Monday',
 '.',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD',
 'ENDPAD']

In [None]:
# idx2word.keys()

In [221]:
list(word2idx.items())[-2]

('survive', 9231)

In [15]:
# functools.partial(input_fn, fwords('train'), ftags('train'), params, shuffle_and_repeat=True)()

words = fwords('train')
tags = ftags('train')
output_signature = (
    tf.TensorSpec(shape=([None]), dtype=tf.int32),
    tf.TensorSpec(shape=([None]), dtype=tf.int32))

dataset = tf.data.Dataset.from_generator(
    functools.partial(generator_fn, words, tags),
    output_signature=output_signature
)

In [23]:
for i in dataset:
    if len(i[0])> 50:
        print(len(i[0]))
#     break

55
55
55


In [50]:
import os
os.path.dirname(os.path.abspath(__file__))

NameError: name '__file__' is not defined