In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
    
class Dataset(object):
    """Token-level NER dataset backed by a CSV file.

    The CSV must provide the columns ``Sentence #``, ``Word``, ``POS`` and
    ``Tag`` (the only columns this class reads). Missing cells are
    forward-filled right after loading.

    Attributes:
        data: DataFrame read from ``path`` with missing cells forward-filled.
        maxlen: target length handed to Keras ``pad_sequences``.
        vocabulary: sorted distinct ``Word`` values followed by ``"ENDPAD"``.
        vocabulary_size: ``len(vocabulary)``.
        labels: sorted distinct ``Tag`` values.
        labels_size: ``len(labels)``.
        word2idx, idx2word: word <-> integer id mappings; ids start at 1.
        tag2idx, idx2tag: tag <-> integer id mappings; ids start at 0.
        sentences: list of sentences, each a list of ``(word, pos, tag)``.
    """

    def __init__(self, path, maxlen):
        """Load the CSV at ``path`` and build vocabulary, tag set and sentences.

        Args:
            path: path to the dataset CSV file to load.
            maxlen: sequence length used later by :meth:`pad_sequences`.
        """
        # DataFrame.ffill() is the supported spelling; fillna(method="ffill")
        # is deprecated since pandas 2.1 and no longer accepted by pandas 3.
        self.data = pd.read_csv(path).ffill()
        self.maxlen = maxlen

        # Sorting makes the word/tag ids reproducible. Iterating a set of
        # strings follows the per-process hash seed, so list(set(...)) could
        # assign different ids on every kernel restart. key=str keeps the sort
        # from failing should a non-string value (e.g. a leading NaN) remain.
        self.vocabulary = sorted(set(self.data["Word"].tolist()), key=str) + ["ENDPAD"]
        self.vocabulary_size = len(self.vocabulary)
        self.labels = sorted(set(self.data["Tag"].tolist()), key=str)
        self.labels_size = len(self.labels)

        # Word ids are shifted by one, so they span 1..vocabulary_size and
        # "ENDPAD" (last entry) gets id == vocabulary_size.
        # NOTE(review): a consumer sizing an embedding table from
        # vocabulary_size would need vocabulary_size + 1 rows - TODO confirm
        # against the model code.
        self.word2idx = {w: i + 1 for i, w in enumerate(self.vocabulary)}
        self.idx2word = {i + 1: w for i, w in enumerate(self.vocabulary)}
        self.tag2idx = {t: i for i, t in enumerate(self.labels)}
        self.idx2tag = {i: t for i, t in enumerate(self.labels)}
        self.sentences = self.make_sentences()

    def make_sentences(self):
        """Group the token rows by ``Sentence #``.

        Returns:
            A list with one entry per sentence (in sorted ``Sentence #``
            order, the groupby default); each entry is a list of
            ``(word, pos, tag)`` tuples in row order.
        """
        # Iterating the groups directly avoids groupby.apply on a frame that
        # still contains the grouping column (deprecated in recent pandas).
        sentences = []
        for _, rows in self.data.groupby("Sentence #"):
            sentences.append(list(zip(rows["Word"].tolist(),
                                      rows["POS"].tolist(),
                                      rows["Tag"].tolist())))
        return sentences

    def make_sequences(self):
        """Convert every sentence to integer ids.

        Returns:
            ``(X, y)``: two lists of lists; ``X`` holds word ids and ``y`` the
            matching tag ids, one inner list per sentence (unpadded).
        """
        X = [[self.word2idx[w[0]] for w in s] for s in self.sentences]
        y = [[self.tag2idx[w[2]] for w in s] for s in self.sentences]
        return X, y

    def pad_sequences(self, X, y):
        """Post-pad word and tag id sequences to ``self.maxlen``.

        Words are padded with ``self.vocabulary_size`` (the id of "ENDPAD"),
        tags with the id of the ``"O"`` tag.

        Raises:
            KeyError: if the dataset contains no ``"O"`` tag.
        """
        # Inside a method the bare name resolves to the module-level Keras
        # function, not to this method.
        X = pad_sequences(maxlen=self.maxlen, sequences=X, padding="post", value=self.vocabulary_size)
        y = pad_sequences(maxlen=self.maxlen, sequences=y, padding="post", value=self.tag2idx["O"])
        return X, y

    def split(self, test_size=0.2, random_state=42):
        """Build padded id sequences and split them into train and test sets.

        Args:
            test_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)``.
        """
        X, y = self.make_sequences()
        X, y = self.pad_sequences(X, y)
        return train_test_split(X, y, test_size=test_size, random_state=random_state)

    @property
    def Xy(self, test_size=0.2, random_state=42):
        """``(X_train, X_test, y_train, y_test)`` with the default split.

        Attribute access cannot pass arguments, so this always uses
        ``test_size=0.2`` and ``random_state=42``; call :meth:`split` to choose
        other values. The split is recomputed on every access.
        """
        return self.split(test_size=test_size, random_state=random_state)


In [None]:
# Load the GMB CSV and build the vocabulary, tag set and sentence list;
# maxlen=50 is the length later handed to Keras pad_sequences.
ds = Dataset(path="../data/processed_data/gmb-1.0.0.csv", maxlen=50)

In [None]:
# Xy is a property: every access rebuilds the id sequences, pads them and
# re-runs train_test_split (test_size=0.2, random_state=42).
X_train, X_test, y_train, y_test = ds.Xy

In [None]:
# First training example: word ids, post-padded with ds.vocabulary_size
# (the id assigned to "ENDPAD").
X_train[0]

In [None]:
# (number of test sequences, padded length); the second value should equal
# the maxlen passed to Dataset (50).
X_test.shape

In [None]:
import numpy as np
import sys
sys.path.append("..")
from models.lstm_crf.train import LSTMCRF

# Rebuild the dataset and instantiate the model used for this evaluation pass.
ds = Dataset(path="../data/processed_data/gmb-1.0.0.csv", maxlen=50)
model = LSTMCRF(ds.vocabulary_size, ds.labels_size)

idx2word, idx2tag = ds.idx2word, ds.idx2tag
# ds.Xy yields (X_train, X_test, y_train, y_test); only the test split is kept.
_, X, _, y = ds.Xy
n_sequences, sequence_len = X.shape

predictions = model.model.predict(X)
flat_predictions, flat_gold = [], []

# NOTE(review): "w" mode creates/truncates predictions.txt, yet nothing is
# written to it in this cell (the write calls were commented out).
with open("./predictions.txt", "w") as f:
    # NOTE(review): only the first test sequence (if any) is walked - confirm
    # whether the restriction is intentional.
    for sequence_i in range(min(n_sequences, 1)):
        word_ids = X[sequence_i]
        gold_ids = y[sequence_i]
        tag_scores = predictions[sequence_i]
        for word_i in range(sequence_len):
            word = idx2word[word_ids[word_i]]
            # The highest-scoring tag id for this token is the prediction.
            prediction_i = np.argmax(tag_scores[word_i, :])
            prediction = ds.idx2tag[prediction_i]
            truth = ds.idx2tag[gold_ids[word_i]]
            flat_predictions.append(prediction)
            flat_gold.append(truth)
        


In [None]:
# Second dimension of the test matrix X unpacked in the evaluation cell.
sequence_len

In [None]:
# Spot-check one token of the test split: (word, predicted tag, gold tag).
# A notebook cell only displays its last expression, so the three look-ups are
# gathered into a single tuple instead of being evaluated and discarded.
sequence_i, word_i = 0, 0
prediction_i = np.argmax(predictions[sequence_i, word_i, :])
(
    idx2word[X[sequence_i, word_i]],
    ds.idx2tag[prediction_i],
    ds.idx2tag[y[sequence_i, word_i]],
)

In [None]:
# Gold tag of the token selected by sequence_i / word_i (set in the previous cell).
ds.idx2tag[y[sequence_i, word_i]]

In [None]:
# NOTE(review): a cell only displays its last expression, so the value of
# `ds.vocabulary_size-1` is computed and discarded; what gets shown is the
# entire idx2word dict (one entry per vocabulary item).
ds.vocabulary_size-1
idx2word

In [None]:
# Accessing the property recomputes sequences, padding and the split, then
# displays the (X_train, X_test, y_train, y_test) tuple of arrays.
ds.Xy

In [None]:
# Mapping from integer tag id to tag string, built in Dataset.__init__.
ds.idx2tag

In [None]:
# Predicted tag strings collected by the evaluation cell; that cell only
# walks the first test sequence.
flat_predictions

In [None]:
# flat_gold

In [12]:
from pathlib import Path

def read_file(input_sentences, input_labels, callback=None):
    """Read a sentences file and its parallel labels file line by line.

    Args:
        input_sentences: path of the text file holding the input lines
            (presumably one sentence per line - confirm against the data).
        input_labels: path of the text file whose i-th line belongs to the
            i-th line of ``input_sentences``.
        callback: currently unused; kept so existing call sites keep working.

    Returns:
        ``(features, labels)``: two lists of strings of equal length, one
        entry per line pair, with the trailing newline removed. Both lists are
        empty when either file is empty. If the files differ in length the
        extra lines of the longer one are ignored (``zip`` semantics).
    """
    # Separate names for the accumulators: reusing `input_labels` for the list
    # would overwrite the path before the labels file is opened.
    features = []
    labels = []
    with Path(input_sentences).open('r') as sentences_file, Path(input_labels).open('r') as labels_file:
        # File objects are iterated lazily; no need to load both files first.
        for line_words, line_tags in zip(sentences_file, labels_file):
            features.append(line_words.rstrip("\n"))
            labels.append(line_tags.rstrip("\n"))
    return features, labels

# Paths are relative to the notebook's directory, like the CSV path given to
# Dataset earlier, so the cell does not depend on one user's home directory.
sentences = "../data/processed_data/gmb/train.sentences.csv"
labels = "../data/processed_data/gmb/train.labels.csv"

# line_words, line_tags = read_file(sentences, labels)

# The labels handle gets its own name: binding it to `labels` would replace the
# path string with a (soon closed) file object and break any re-run of the cell.
with Path(sentences).open('r') as features, Path(labels).open('r') as labels_file:
    for line_words, line_tags in zip(features, labels_file):
        data = (line_words, line_tags)
        # Character count of the sentence line, trailing newline included.
        print(len(data[0]))
            

154
105
132
144
44
136
131
141
146
199
166
37
25
32
57
76
135
254
123
80
97
82
110
112
205
96
137
43
138
98
153
103
118
45
117
103
105
75
132
117
164
60
121
106
174
52
161
74
141
22
145
102
71
126
67
152
130
34
132
125
125
125
105
88
61
165
185
132
96
208
102
114
156
187
150
234
145
69
122
63
122
100
104
107
102
103
153
157
163
178
173
87
96
189
57
139
146
124
144
107
82
120
142
93
128
67
103
79
67
87
126
58
85
86
76
133
97
94
73
77
115
78
128
145
235
186
133
71
156
154
40
25
151
148
161
118
164
111
119
119
84
95
87
136
128
120
94
77
75
151
127
55
152
141
37
101
62
118
109
126
105
85
160
126
51
155
107
134
135
46
121
99
104
145
150
34
124
67
128
112
180
153
149
178
78
145
36
173
105
124
91
135
167
111
122
146
58
123
86
57
76
83
153
114
107
90
95
146
62
84
91
155
124
162
185
104
117
46
91
51
102
102
112
80
121
27
130
145
114
221
146
234
109
114
56
95
126
173
126
146
55
85
131
167
153
59
102
89
46
131
176
149
163
165
96
69
151
250
90
168
144
128
160
127
92
144
134
153
175
49
74
32
113
90

135
67
82
164
93
50
120
51
83
52
177
127
112
80
100
98
183
150
79
91
64
129
139
29
131
65
142
92
125
89
58
56
185
162
60
96
114
138


In [19]:
import argparse

def add_arguments(argv=None):
    """Build the training argument parser and parse the arguments.

    Args:
        argv: optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is what the
            original zero-argument call did.

    Returns:
        An ``argparse.Namespace`` holding one attribute per option declared
        below. Unrecognised arguments are ignored rather than rejected
        (``parse_known_args``), so the arguments a notebook kernel is started
        with do not abort parsing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default='bilstm-crf', choices=[
        'bilstm-crf'
    ])
#     parser.add_argument('--vocab_file', type=str, required=True)
#     parser.add_argument('--label_file', type=str, required=True)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--dropout', type=float, default=0.1)
    parser.add_argument('--train_input_sentences_file', type=str, default=None)
    parser.add_argument('--train_input_labels_file', type=str, default=None)
    parser.add_argument('--train_batch_size', type=int, default=32)
    parser.add_argument('--valid_input_sentences_file', type=str, default=None)
    parser.add_argument('--valid_input_labels_file', type=str, default=None)
    parser.add_argument('--valid_batch_size', type=int, default=32)
    parser.add_argument('--vocab_size', type=int, default=None)
    parser.add_argument('--embedding_size', type=int, default=None)
    parser.add_argument('--hidden_size', type=int, default=256)
    # Passing argv through lets callers (and notebooks) parse an explicit list.
    args, _ = parser.parse_known_args(argv)
    return args

# Namespace of hyper-parameters; every option keeps its declared default
# unless a matching flag is present on the command line.
args = add_arguments()

In [22]:
# num_classes is not one of the options declared in add_arguments(); it is
# attached to the Namespace by hand here.
# NOTE(review): 10 is a magic number - presumably the tag count; confirm.
args.num_classes=10

In [23]:
# Display the Namespace, including the manually attached num_classes.
args

Namespace(dropout=0.1, embedding_size=None, epochs=10, hidden_size=256, model='bilstm-crf', num_classes=10, train_batch_size=32, train_input_labels_file=None, train_input_sentences_file=None, valid_batch_size=32, valid_input_labels_file=None, valid_input_sentences_file=None, vocab_size=None)