In [1]:
from collections import Counter, defaultdict
from itertools import count
import random

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell
from tflearn.layers.estimator import regression

import sys
import numpy as np

# Corpus files: one sentence per line, formatted as "word1/tag1 word2/tag2 ..."
# NOTE(review): test_file is not read by any active code in the visible cells;
# the validation data is sliced off the end of the training data instead.
train_file="tagged_train.txt"
test_file="tagged_test.txt"



class Vocab:
    """Two-way mapping between tokens and integer ids.

    Attributes:
        w2i: plain dict mapping token -> id.
        i2w: plain dict mapping id -> token (inverse of ``w2i``).
    """

    def __init__(self, w2i=None):
        """Build a vocabulary from an existing token -> id mapping.

        Args:
            w2i: optional mapping (dict or defaultdict) of token -> id.
                 ``None`` creates an empty vocabulary.
        """
        if w2i is None:
            w2i = {}
        # Copy into a plain dict so later lookups cannot silently add keys.
        self.w2i = dict(w2i)
        self.i2w = {i: w for w, i in self.w2i.items()}

    @classmethod
    def from_corpus(cls, corpus):
        """Create a vocabulary from an iterable of token sequences.

        Each distinct token receives the next unused id (0, 1, 2, ...) in
        order of first appearance. The previous implementation relied on
        ``defaultdict(int)``, which mapped every token to 0.

        Args:
            corpus: iterable of sentences, each an iterable of hashable tokens.

        Returns:
            A new instance of ``cls``.
        """
        w2i = {}
        for sent in corpus:
            for word in sent:
                # len(w2i) is the next free id; existing tokens keep theirs.
                w2i.setdefault(word, len(w2i))
        return cls(w2i)

    def size(self):
        """Return the number of distinct tokens in the vocabulary."""
        return len(self.w2i)

def read(fname):
    """
    Lazily parse a POS-tagged file whose lines look like "word1/tag1 word2/tag2 ...".

    One list is yielded per line of the file, holding a tuple for every
    whitespace-separated token. Each token is split on its LAST "/" only, so
    a word may itself contain slashes. A blank line yields an empty list.
    """
    with open(fname) as handle:
        for raw_line in handle:
            pairs = []
            for token in raw_line.strip().split():
                pairs.append(tuple(token.rsplit("/", 1)))
            yield pairs



In [3]:
# Load the full training corpus into memory, then flatten it into parallel
# word/tag lists while collecting the character set, the tag set and
# per-word frequencies.
train = list(read(train_file))

words, tags = [], []
chars, tagset = set(), set()
wc = Counter()

for sent in train:
    for w, p in sent:
        # Report words longer than 25 characters.
        if len(w) > 25:
            print("over 25!", w)
        words.append(w)
        tags.append(p)
        chars.update(w)   # adds every character of the word
        tagset.add(p)
        wc[w] += 1
# words.append("_UNK_")
# chars.add("<*>")


# words_t=[]
# tags_t=[]
#wc_t=Counter()
# for sent in test:
#     for w,p in sent:
#         words_t.append(w)
#         tags_t.append(p)#(float(p.split("tag")[1]))
#         tagset.add(p)
#         chars.update(w)
        #wc_t[w]+=1
# # words_t.append("_UNK_")
# # chars_t.add("<*>")

# vw = Vocab.from_corpus([words]) 
# vt = Vocab.from_corpus([tags])
# vc = Vocab.from_corpus([chars])
# # UNK = vw.w2i["_UNK_"]

# nwords = vw.size()
# ntags  = vt.size()
# nchars  = vc.size()

def w2v(word, d):
    """Encode ``word`` as a list of ids by looking each element up in ``d``.

    Args:
        word: iterable of symbols (a string yields its characters).
        d: mapping from symbol to id.

    Returns:
        List of ``d[c]`` for every ``c`` in ``word``, in order.

    Raises:
        KeyError: if a symbol is missing from ``d``.
    """
    return [d[symbol] for symbol in word]

# Lookup tables: character -> id and tag -> id.
# Ids follow the iteration order of the `chars` / `tagset` sets; i2t() and
# is2w() decode with list(tagset) / list(chars), so they rely on that same
# order and the sets must not be modified after this point.
# NOTE(review): ids start at 0, which is also the padding value passed to
# pad_sequences later on, so one real character shares its id with padding.
dictionary = {i:j for j,i in enumerate(list(chars))}
tagdict = {i:j for j,i in enumerate(list(tagset))}

# One id list per word. Words differ in length, so the array is ragged:
# dtype=object must be explicit, because recent NumPy releases refuse to
# infer it and raise ValueError for inhomogeneous sequences.
trainW = np.array([w2v(word, dictionary) for word in words], dtype=object)

# One integer class id per token, aligned with trainW.
trainT = np.array([tagdict[tag] for tag in tags])

# testW = []
# for word in words_t:
# # 	wvec = np.zeros(14)
# # 	wvec[int(word)-1] = 1.0
# 	testW.append(w2v(word, dictionary))
# testW = np.array(testW)# .reshape(len(words_t), 50000)

# testT = []	
# for tag in tags_t:
# # 	tvec = np.zeros(10)
# # 	tvec[int(tag)-1] = 1.0
# 	testT.append(tagdict[tag])
# testT = np.array(testT)#.reshape(len(tags_t), 1)



In [4]:
# Display the set of distinct characters collected from the training words
chars

{'!',
 '$',
 '&',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Y',
 'Z',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [5]:
# Sizes used by the preprocessing below and by the network definition.
charvocab = len(chars)   # number of distinct characters
maxbound = 25            # fixed length every word is padded/truncated to
valid = 5000             # number of trailing examples held out for validation

# Pre-processing: pad each word's id sequence to `maxbound` entries (pad
# value 0) and turn the integer tag ids into one-hot rows.
padded_words = pad_sequences(trainW, maxlen=maxbound, value=0.)
onehot_tags = to_categorical(trainT, nb_classes=len(tagset))

# Hold out the last `valid` examples as the validation split.
trainW, testW = padded_words[:-valid], padded_words[-valid:]
trainT, testT = onehot_tags[:-valid], onehot_tags[-valid:]




In [5]:

#dd[chars.pop() for i in range(len(chars))
# np.array(testT[3])
#trainT.shape
# Number of distinct tags seen in training; also used as the number of output classes
len(tagset)

181

In [6]:



# DyNet Starts
# TFlearn starts



# Data preprocessing
# print("nwords: ", nwords)
# print("ntags: ", ntags)
# print("train[:5] = ", train[:5])

# Network building (char-level): each word is a fixed-length sequence of
# character ids that is embedded, run through a bidirectional LSTM and
# classified into one of len(tagset) tags with a softmax layer.
embedding_size = 256
lstm_units = 256

net = input_data(shape=[None, maxbound])
net = embedding(net, input_dim=charvocab, output_dim=embedding_size)
net = bidirectional_rnn(
    net,
    BasicLSTMCell(lstm_units),
    BasicLSTMCell(lstm_units),
)
net = dropout(net, 0.5)
net = fully_connected(net, len(tagset), activation='softmax')
net = regression(net, optimizer='adam', loss='categorical_crossentropy')

# Training: fit on the training split and report metrics on the held-out split.
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(
    trainW,
    trainT,
    validation_set=(testW, testT),
    show_metric=True,
    batch_size=64,
)

# ##example prediction after model set
# print("testW[0]: ", testW[0])
# print("Prediction of testW[0]: ", model.predict(testW[0]))





Training Step: 7040  | total loss: [1m[32m0.45511[0m[0m
| Adam | epoch: 010 | loss: 0.45511 - acc: 0.8540 | val_loss: 0.56175 - val_acc: 0.8388 -- iter: 45000/45000
Training Step: 7040  | total loss: [1m[32m0.45511[0m[0m
| Adam | epoch: 010 | loss: 0.45511 - acc: 0.8540 | val_loss: 0.56175 - val_acc: 0.8388 -- iter: 45000/45000
--


In [10]:
testW[:20]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  7, 13, 11, 63],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        46, 27, 24, 24, 16, 32, 64, 20],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0, 53],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0, 51, 23, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0, 13, 67],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0, 51, 14, 63],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 54, 63,
        67, 63, 32, 54, 18, 32, 51, 20],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0, 53],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

In [33]:
# Path handed to model.save() at the end of this cell.
# NOTE(review): ".." is the parent directory rather than a file name — confirm the intended checkpoint path.
modelfile = ".."
#for i in range(10):
    #print( np.argmax(model.predict(testW[:10])[i]))

def i2t(index):
    """Return the tag at position ``index`` in the iteration order of ``tagset``.

    This is the inverse of the ``tagdict`` lookup as long as ``tagset`` has
    not been modified since ``tagdict`` was built.
    """
    ordered_tags = list(tagset)
    return ordered_tags[index]
def is2w(indices):
    """Decode a padded row of character ids back into a word.

    Leading zeros are treated as padding and skipped. From the first
    non-zero id onwards every id, including any later 0, is mapped to the
    character at that position in the iteration order of ``chars``.

    At most ``maxbound`` entries are read; rows shorter than ``maxbound``
    are decoded as far as they go instead of raising IndexError.

    NOTE(review): character ids start at 0, so a word that begins with the
    character whose id is 0 loses that character here. This cannot be told
    apart from padding without changing the encoding.

    Args:
        indices: sequence of integer character ids (e.g. one row of testW).

    Returns:
        The decoded word as a string ("" if every id is 0).
    """
    # Build the id -> character table once per call rather than once per character.
    id_to_char = list(chars)
    started = False
    letters = []
    for idx in indices[:maxbound]:
        if idx != 0:
            started = True
        if started:
            letters.append(id_to_char[idx])
    return "".join(letters)
    
#print([i2t(np.argmax(a)) for a in testT[:20]])
#print([i2t(np.argmax(a)) for a in model.predict(testW[:20])])
# Show the first `window` validation words with the predicted and the
# reference tag for each, then save the trained model.
window = 40
sample_rows = testW[:window]
sample_words = [is2w(row) for row in sample_rows]

print("Input:")
print(" ".join(sample_words))

print("\nPrediction: ")
predicted_scores = model.predict(sample_rows)
print([word + "/" + i2t(np.argmax(scores))
       for word, scores in zip(sample_words, predicted_scores)])

print("\nAnswer: ")
print([word + "/" + i2t(np.argmax(onehot))
       for word, onehot in zip(sample_words, testT[:window])])

model.save(modelfile)

Input:
Hope Pullings , two of the defendants , and also introduced Pullings to Jessy Maroy , a man mentioned in the indictment but not indicted . Buaford Robinson , 23 , of 7026 Stewart Av. , a CTA bus driver

Prediction: 
['Hope/NN-TL', 'Pullings/NP', ',/,', 'two/CD', 'of/IN', 'the/AT', 'defendants/NNS', ',/,', 'and/CC', 'also/RB', 'introduced/VBN', 'Pullings/NP', 'to/IN', 'Jessy/NP', 'Maroy/NP', ',/,', 'a/AT', 'man/NN', 'mentioned/VBN', 'in/IN', 'the/AT', 'indictment/NN', 'but/CC', 'not/*', 'indicted/VBD', './.', 'Buaford/NP', 'Robinson/NP', ',/,', '23/CD', ',/,', 'of/IN', '7026/CD', 'Stewart/NP', 'Av./NN-TL', ',/,', 'a/AT', 'CTA/NN', 'bus/CC', 'driver/NN']

Answer: 
['Hope/NP', 'Pullings/NP', ',/,', 'two/CD', 'of/IN', 'the/AT', 'defendants/NNS', ',/,', 'and/CC', 'also/RB', 'introduced/VBD', 'Pullings/NP', 'to/IN', 'Jessy/NP', 'Maroy/NP', ',/,', 'a/AT', 'man/NN', 'mentioned/VBN', 'in/IN', 'the/AT', 'indictment/NN', 'but/CC', 'not/*', 'indicted/VBN', './.', 'Buaford/NP', 'Robinson/NP'