# Libraries and imports

In [1]:
import nltk
from nltk.corpus import treebank

In [2]:
import keras

Using TensorFlow backend.


In [3]:
import numpy as np

In [4]:
import sys
# Make the modules under ../src importable (presumably features, extractor,
# inspector and scores, which are imported further down -- confirm).
sys.path.append('../src')

# Hyperparameters and Constants

In [5]:
# When True, the training cell loads a previously saved model from disk
# instead of fitting a new one.
CACHED = True

SEQUENCE_LEN = 50    # window length passed to vectorize() and to the Embedding input_length
EMBEDDING_DIM = 128  # output dimension of the Embedding layer
BATCH_SIZE = 128     # batch size used by model.fit and model.evaluate
N_EPOCHS = 1         # number of single-epoch fit() passes run when training

# Prepares the training data

In [6]:
tagged_words = list(treebank.tagged_words(tagset='universal'))

all_words = [w[0] for w in tagged_words]
all_tags = [w[1] for w in tagged_words]

distinct_words = list(set(all_words))
distinct_tags = list(set(all_tags))

n_words = len(distinct_words)
n_tags = len(distinct_tags)

print 'All words:'
print all_words[:15]
print 'Total:', len(all_words)
print 'Distinct:', len(distinct_words)
print  ''
print 'All tags:'
print all_tags[:15]
print 'Total:', len(all_tags)
print 'Distinct:', len(distinct_tags)

All words:
[u'Pierre', u'Vinken', u',', u'61', u'years', u'old', u',', u'will', u'join', u'the', u'board', u'as', u'a', u'nonexecutive', u'director']
Total: 100676
Distinct: 12408

All tags:
[u'NOUN', u'NOUN', u'.', u'NUM', u'NOUN', u'ADJ', u'.', u'VERB', u'VERB', u'DET', u'NOUN', u'ADP', u'DET', u'ADJ', u'NOUN']
Total: 100676
Distinct: 12


In [7]:
# Lookup tables between vocabulary items and their integer ids, in both
# directions, for words and for tags.
word2idx = {w: i for (i, w) in enumerate(distinct_words)}
idx2word = {i: w for (w, i) in word2idx.items()}

tag2idx = {t: i for (i, t) in enumerate(distinct_tags)}
# The inverse tag table must be derived from tag2idx. Building it from
# word2idx (as was done before) makes idx2tag a copy of idx2word, so decoding
# a predicted tag id would return a word instead of a tag.
idx2tag = {i: t for (t, i) in tag2idx.items()}

In [8]:
N_train = int(len(all_words) * .8)
N_test  = len(all_words) - N_train
print 'Size Training Set:', N_train
print 'Size Testing Set:', N_test

words_train = [word2idx[w] for w in all_words[:N_train]]
words_test = [word2idx[w] for w in all_words[N_train:]]

tags_train = [tag2idx[w] for w in all_tags[:N_train]]
tags_test = [tag2idx[w] for w in all_tags[N_train:]]

Size Training Set: 80540
Size Testing Set: 20136


## Vectorize: slice the id sequences into fixed-length windows

In [9]:
def vectorize(seq, window_size):
    """Stack consecutive, overlapping windows of `seq` into a NumPy array.

    Window k is ``seq[k:k + window_size]`` for every k in
    ``range(len(seq) - window_size)``. The last full window, the one starting
    at ``len(seq) - window_size``, is therefore not emitted.

    Args:
        seq: sliceable sequence of items (here, lists of integer ids).
        window_size: number of items per window.

    Returns:
        ``np.array`` built from the list of windows. When
        ``len(seq) <= window_size`` no window is produced and the result is
        an empty array.
    """
    n_windows = len(seq) - window_size
    windows = [seq[start:start + window_size] for start in range(n_windows)]
    return np.array(windows)

X_train = vectorize(words_train, SEQUENCE_LEN)
X_test  = vectorize(words_test, SEQUENCE_LEN)

y_train = vectorize(tags_train, SEQUENCE_LEN)
y_test = vectorize(tags_test, SEQUENCE_LEN)

y_train = keras.utils.to_categorical(y_train, n_tags)
y_train = np.reshape(y_train, (X_train.shape[0], X_train.shape[1], n_tags))

y_test = keras.utils.to_categorical(y_test, n_tags)
y_test = np.reshape(y_test, (X_test.shape[0], X_test.shape[1], n_tags))

print 'X train shape:'
print X_train.shape
print 'X test shape:'
print X_test.shape

print 'X train sample:'
print X_train[:2,]

print 'y train shape:'
print y_train.shape
print 'y test shape:'
print y_test.shape

print 'y train sample:'
print y_train[:2,]

X train shape:
(80490, 50)
X test shape:
(20086, 50)
X train sample:
[[12196  4523  6347 10526   964   638  6347 12148  6949  3674  3698  1492
   4820 11003 10612 10085  7948   519  5525  4523  6669 11193 10722  4938
  10676  6347  3674  7075  1997  7137   519  8996 11911  6347  9862   964
    638 10371  1271 11193 10722  2254 12046  4628 12150  6347  3790  8568
   4449  4820]
 [ 4523  6347 10526   964   638  6347 12148  6949  3674  3698  1492  4820
  11003 10612 10085  7948   519  5525  4523  6669 11193 10722  4938 10676
   6347  3674  7075  1997  7137   519  8996 11911  6347  9862   964   638
  10371  1271 11193 10722  2254 12046  4628 12150  6347  3790  8568  4449
   4820 11003]]
y train shape:
(80490, 50, 12)
y test shape:
(20086, 50, 12)
y train sample:
[[[ 0.  1.  0. ...,  0.  0.  0.]
  [ 0.  1.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  ..., 
  [ 0.  0.  0. ...,  0.  0.  1.]
  [ 0.  0.  0. ...,  0.  0.  0.]
  [ 0.  0.  0. ...,  0.  0.  0.]]

 [[ 0.  1.  0. ...,  0.

# Creates the architecture

In [10]:
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import TimeDistributed, Bidirectional
from keras.layers import LSTM
from keras.models import load_model

In [11]:
# Upper-case aliases for the vocabulary sizes used by the layers below.
NTAGS = n_tags
NWORDS = n_words

# Sequence tagger: word ids -> embeddings -> bidirectional LSTM -> LSTM ->
# per-time-step softmax over the tags. Both recurrent layers return the full
# sequence so that every position receives a prediction.
model = Sequential([
    Embedding(NWORDS, EMBEDDING_DIM, input_length=SEQUENCE_LEN),
    Bidirectional(LSTM(128, dropout=0.1, return_sequences=True)),
    LSTM(128, dropout=0.1, return_sequences=True),
    TimeDistributed(Dense(NTAGS, activation='softmax')),
])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

In [12]:
# Single on-disk location for the persisted model, shared by the save and the
# load branch. Training used to save to '../models/pos_tag_normal' while the
# cached branch loaded '../models/pos_tagging_model', so a freshly trained
# model was never the one picked up by a later CACHED run.
# Note: retraining now overwrites the file that the cached branch loads.
MODEL_PATH = '../models/pos_tagging_model'

if not CACHED:
    for i in range(N_EPOCHS):
        # One epoch per fit() call, so the model is written to disk after
        # every epoch; `history` holds only the most recent epoch.
        history = model.fit(X_train, y_train,
                            batch_size=BATCH_SIZE,
                            epochs=1,
                            verbose=1,
                            validation_split=0.1)
        model.save(MODEL_PATH)
else:
    # Replace the freshly compiled (untrained) model with the saved one.
    model = load_model(MODEL_PATH)

In [13]:
# Evaluate on the held-out windows; score[0] is reported as the test score
# and score[1] as the test accuracy.
score = model.evaluate(X_test, y_test, batch_size=BATCH_SIZE, verbose=1)
# Build one string per line. This notebook runs under Python 2, where
# print('label', value) prints the repr of a 2-tuple, e.g.
# "('Test score:', 0.237...)", instead of "Test score: 0.237...".
print('Test score: {}'.format(score[0]))
print('Test accuracy: {}'.format(score[1]))

('Test score:', 0.2370081886524498)
('Test accuracy:', 0.94630389030528328)


# Extract all kinds of features

See this paper: http://nlp.lsi.upc.edu/papers/gimenez03.pdf

and this one: http://www.lsi.upc.es/~nlp/SVMTool/lrec2004-gm.pdf

In [28]:
# NOTE(review): the original comment here warned about an "inserted max hash".
# Presumably it refers to the literal 10 passed to ff.n_grams below -- confirm
# against features.py and consider naming that constant.

import features as ff

features = [] 
# NOTE(review): feature_names is initialised here but never filled in this cell.
feature_names = []

# Builds one feature per (n, d) pair with n in {1, 2} and d in {0, 1, 2},
# i.e. six features. The argument semantics live in features.py; presumably n
# is the n-gram order and d a lag -- TODO confirm.
for n in [1,2]:
    for d in [0,1,2]:
        f = ff.n_grams(n, 10, d)
        features.append(f)

print len(features)

6
['n_gram_1', 'n_gram_1_lag1', 'n_gram_1_lag2', 'n_gram_2', 'n_gram_2_lag1', 'n_gram_2_lag2']


In [29]:
# TODO: build the same features from the previous POS tags

In [30]:
# Sentence type
ignored_char = [',', '-', '#', '%', '$', '&', '*', '@']

ends_exc = ff.sentence_ends('!',ignored_char)
features.append(ends_exc)

ends_question = ff.sentence_ends('?',ignored_char)
features.append(ends_question)

ends_dot = ff.sentence_ends('.',ignored_char)
features.append(ends_dot)

print len(features)

9


In [31]:
# NOTE(review): disabled draft. As written it appends `ends_exc` on every
# iteration rather than a prefix/suffix feature, so it needs a real feature
# constructor before being re-enabled.
# # Prefixes and Suffixes
# for n in [1,2,3]:
#     for pref in [True, False]:
#         features.append(ends_exc)
#         name = 'prefix' if pref else 'suffix'
#         name += '_' + str(n)
#         feature_names.append(name)

# print len(features)
# print feature_names

# Extracts the Hidden States

In [32]:
import extractor
reload(extractor)

# Wrap the model and collect the activations of its 'Bidirectional' layer for
# every test window.
ex=extractor.Extractor(model)
# run_for_layer returns the state matrix together with a config dict that
# describes its layout (in the recorded run: states of shape (20086, 12800)
# and states_struct [('Bidirectional', 50, 256)]).
states, nn_config = ex.run_for_layer('Bidirectional', X_test)

print 'states shape:', states.shape
print 'config:',nn_config

Gets the activations for the hidden states
Reshapes
Done
states shape: (20086, 12800)
config: {'input_size': 50, 'states_struct': [('Bidirectional', 50, 256)]}


# Inspects

In [36]:
import inspector as insp
reload(insp)
import scores
reload(scores)

# Raw words of the test split.
# NOTE(review): this rebinds `sequence`, which was imported from
# keras.preprocessing in an earlier cell.
sequence = all_words[N_train:]

# One entry per index i in 0..49, keyed by (0, i), each with its own
# Correlation scorer. The meaning of the tuple fields is defined by the
# inspector module; 50 presumably mirrors SEQUENCE_LEN -- TODO confirm.
params = {(0, i): (scores.Correlation(), i, 0) for i in range(50)}

# NOTE(review): from here on `insp` is the Inspector instance, not the module.
insp = insp.Inspector(states, nn_config)
out = insp.inspect(sequence, features[:1], params)

Generating feature scores
Running feature 0 out of 1
Added features ['n_gram_0', 'n_gram_1', 'n_gram_2', 'n_gram_3', 'n_gram_4', '...']
Tidying...
Computed feature matrix, with shape: (20136, 10)
Computing score for feature 0: n_gram_0
Computing score for feature 1: n_gram_1
Computing score for feature 2: n_gram_2
Computing score for feature 3: n_gram_3
Computing score for feature 4: n_gram_4
Computing score for feature 5: n_gram_5
Computing score for feature 6: n_gram_6
Computing score for feature 7: n_gram_7
Computing score for feature 8: n_gram_8
Computing score for feature 9: n_gram_9


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12790,12791,12792,12793,12794,12795,12796,12797,12798,12799
0,0.032491,0.057243,0.069023,0.046311,0.08063,0.07421,0.020371,0.042217,0.029495,0.097828,...,0.017085,0.015661,0.052999,0.043413,0.049659,0.002896,0.023939,0.075775,0.014351,0.045156
1,0.28318,0.123163,0.063573,0.116988,0.071565,0.070403,0.226742,0.410992,0.330819,0.042532,...,0.021756,0.241536,0.322075,0.242403,0.065826,0.114902,0.284599,0.441066,0.131421,0.245951
2,0.090895,0.1522,0.176562,0.178038,0.096006,0.125474,0.259177,0.229808,0.257045,0.263507,...,0.227088,0.305888,0.17665,0.261857,0.117274,0.023645,0.287028,0.121856,0.099642,0.007846
3,0.069721,0.028495,0.029548,0.040393,0.027462,0.043828,0.106868,0.027294,0.074797,0.066709,...,0.043191,0.067422,0.049583,0.067046,0.048862,0.087359,0.094623,0.021352,0.000307,0.043359
4,0.16086,0.005712,0.094638,0.062823,0.034226,0.051961,0.298289,0.234565,0.215162,0.16849,...,0.151088,0.263169,0.168199,0.196978,0.167279,0.06484,0.28306,0.149965,0.013304,0.075525
