In [100]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
    
class Dataset(object):
    def __init__(self, path, maxlen):
        """
        Defines self.df as a df with the loaded dataset (for now) ... TODO complete docstring
        Args:
            path: path to the dataset file to load
        """         
        self.data = pd.read_csv(path)
        self.data = self.data.fillna(method="ffill")
        self.maxlen = maxlen
        
        self.vocabulary = list(set(self.data["Word"].values)) + ["ENDPAD"]
        self.vocabulary_size = len(self.vocabulary)
        self.labels = list(set(self.data["Tag"].values))
        self.labels_size = len(self.labels)

        self.word2idx = {w: i + 1 for i, w in enumerate(self.vocabulary)}
        self.idx2word = {i + 1: w for i, w in enumerate(self.vocabulary)}
        self.tag2idx = {t: i for i, t in enumerate(self.labels)}
        self.idx2tag = {i: t for i, t in enumerate(self.labels)}        
        self.sentences = self.make_sentences()
        
    def make_sentences(self):
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        grouped = self.data.groupby("Sentence #").apply(agg_func)
        sentences = [s for s in grouped]
        return sentences
    
    def make_sequences(self):
        X = [[self.word2idx[w[0]] for w in s] for s in self.sentences]
        y = [[self.tag2idx[w[2]] for w in s] for s in self.sentences]
        return X,y
        
    def pad_sequences(self, X, y):
        X = pad_sequences(maxlen=self.maxlen, sequences=X, padding="post", value=self.vocabulary_size)
        y = pad_sequences(maxlen=self.maxlen, sequences=y, padding="post", value=self.tag2idx["O"])
        return X,y

    @property
    def Xy(self, test_size=0.2, random_state=42):
        X, y = self.make_sequences()
        X, y = self.pad_sequences(X, y)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        return X_train, X_test, y_train, y_test


In [50]:
ds = Dataset(path="../data/processed_data/gmb-1.0.0.csv", maxlen=50)

In [51]:
X_train, X_test, y_train, y_test = ds.Xy

In [48]:
X_train[0]

array([ 6848,  7592,  3234,  5170,  6972,  5035,  3877, 10325, 10325,
       10325, 10325, 10325, 10325, 10325, 10325, 10325, 10325, 10325,
       10325, 10325, 10325, 10325, 10325, 10325, 10325, 10325, 10325,
       10325, 10325, 10325, 10325, 10325, 10325, 10325, 10325, 10325,
       10325, 10325, 10325, 10325, 10325, 10325, 10325, 10325, 10325,
       10325, 10325, 10325, 10325, 10325], dtype=int32)

In [47]:
X_test.shape

(848, 50)

In [101]:
import numpy as np
import sys
sys.path.append("..")
from models.lstm_crf.train import LSTMCRF

ds = Dataset(path="../data/processed_data/gmb-1.0.0.csv", maxlen=50)
model = LSTMCRF(ds.vocabulary_size, ds.labels_size)

idx2word = ds.idx2word
idx2tag = ds.idx2tag
_, X, _, y = ds.Xy
n_sequences, sequence_len = X.shape

# LSTMCRF(dataset.vocabulary_size, dataset.labels_size)


predictions = model.model.predict(X)
flat_predictions = []
flat_gold = []

with open("./predictions.txt", "w") as f:
#     f.write(f"WORD\tPREDICTION\tGOLD STANDARD\n")
    for sequence_i in range(n_sequences)[0:1]:
        for word_i in range(sequence_len):
            word = idx2word[X[sequence_i, word_i]]
            prediction_i = np.argmax(predictions[sequence_i, word_i, :])
            prediction = ds.idx2tag[prediction_i]
            flat_predictions.append(prediction)
            truth = ds.idx2tag[y[sequence_i, word_i]]
            flat_gold.append(truth)
#             f.write(f"{word}\t{prediction}\t{truth}\n")
        


InvalidArgumentError:  indices[24,14] = 10326 is not in [0, 10326)
	 [[node model_8/embedding_8/embedding_lookup (defined at <ipython-input-101-12f116d18f69>:17) ]] [Op:__inference_predict_function_9574]

Errors may have originated from an input operation.
Input Source operations connected to node model_8/embedding_8/embedding_lookup:
 model_8/embedding_8/embedding_lookup/9072 (defined at /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/contextlib.py:81)

Function call stack:
predict_function


In [None]:
sequence_len

In [102]:
sequence_i, word_i = 0,0
idx2word[X[sequence_i, word_i]]
prediction_i = np.argmax(predictions[sequence_i, word_i, :])
ds.idx2tag[prediction_i]
ds.idx2tag[y[sequence_i, word_i]]

'O'

In [103]:
ds.idx2tag[y[sequence_i, word_i]]

'O'

In [108]:
ds.vocabulary_size-1
idx2word

{1: 'exploring',
 2: 'Garden',
 3: 'forcefully',
 4: 'toxic',
 5: 'coup',
 6: 'Total',
 7: 'southern',
 8: 'ends',
 9: 'Parlemannews',
 10: 'individuals',
 11: 'commemorations',
 12: 'controversy',
 13: '1990s',
 14: '1499',
 15: 'Arctic',
 16: 'widening',
 17: 'Jean-',
 18: 'Great',
 19: 'clocking',
 20: 'Antonio',
 21: 'questioning',
 22: 'Nadi',
 23: 'transparency',
 24: 'Lech',
 25: 'sixteen',
 26: 'name',
 27: 'Specific',
 28: 'Town',
 29: 'feuding',
 30: 'Haitian',
 31: 'lawsuit',
 32: 'agreed',
 33: 'kidnap',
 34: 'Congo',
 35: 'terrorism',
 36: 'ISAF',
 37: 'Fund',
 38: 'Security',
 39: 'Indonesia-Malaysia',
 40: 'Herald',
 41: 'hamper',
 42: 'talked',
 43: 'jazz',
 44: 'Qingdao',
 45: 'referred',
 46: 'breaks',
 47: 'dependency',
 48: '54',
 49: 'claiming',
 50: 'Organiation',
 51: 'surprised',
 52: 'denied',
 53: 'doctor',
 54: 'Part',
 55: 'nomination',
 56: 'conducting',
 57: 'Qalat',
 58: 'Robin',
 59: 'abundant',
 60: 'brisk',
 61: 'Solomon',
 62: 'VIII',
 63: 'establishe

In [105]:
ds.Xy

(array([[ 6848,  7592,  3234, ..., 10326, 10326, 10326],
        [ 6554,  7483,  5472, ..., 10326, 10326, 10326],
        [ 9723,  8040,  6640, ..., 10326, 10326, 10326],
        ...,
        [ 5458,  8272,  4480, ..., 10326, 10326, 10326],
        [ 3926,  4157,  8040, ..., 10326, 10326, 10326],
        [ 5807,  5615,  4006, ..., 10326, 10326, 10326]], dtype=int32),
 array([[ 1071,  8040,  6993, ..., 10326, 10326, 10326],
        [ 1821,  2779,  6648, ..., 10326, 10326, 10326],
        [ 4651,  6809,  7241, ..., 10326, 10326, 10326],
        ...,
        [ 6554, 10086,  5501, ..., 10326, 10326, 10326],
        [ 4028,  9058,  9120, ..., 10326, 10326, 10326],
        [ 4225,  7604,  3991, ..., 10326, 10326, 10326]], dtype=int32),
 array([[8, 0, 2, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int32),
 array([[0, 0, 8, ..., 0, 0, 0],
 

In [93]:
ds.idx2tag

{0: 'O',
 1: 'I-MON',
 2: 'I-PER',
 3: 'I-ART',
 4: 'I-PCT',
 5: 'I-LOC',
 6: 'I-DAT',
 7: 'I-TIM',
 8: 'I-ORG',
 9: 'I-TTL'}

'O'

In [89]:
flat_predictions

['I-DAT',
 'I-DAT',
 'I-PCT',
 'I-DAT',
 'I-PCT',
 'I-PCT',
 'I-PCT',
 'O',
 'I-PCT',
 'I-PCT',
 'I-PCT',
 'I-TIM',
 'I-DAT',
 'O',
 'O',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PER',
 'I-PCT',
 'I-PCT']

In [90]:
flat_gold

['O',
 'O',
 'I-ORG',
 'I-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']