In [1]:
from __future__ import print_function
from __future__ import division

import numpy as np
import languagemodel as lm

np.random.seed(1)  # for reproducibility

In [2]:
corpus_train = lm.readCorpus("data/train.txt")
corpus_dev   = lm.readCorpus("data/dev.txt")
corpus_test  = lm.readCorpus("data/test.txt")

In [3]:
# build a common index (words to integers), mapping rare words (less than 5 occurences) to index 0
# nwords = vocabulary size for the models that only see the indexes

w2index,nwords = lm.buildIndex(corpus_train+corpus_dev+corpus_test)

In [29]:
w2index[]

{'frank': 1,
 'extension': 0,
 'anti-takeover': 0,
 'plain': 0,
 'recover': 0,
 'air': 2,
 'watchers': 0,
 'result': 3,
 'vivid': 0,
 'failure': 0,
 'raw': 0,
 'belief': 4,
 'repeat': 0,
 'plants': 5,
 'quarter': 6,
 'represented': 0,
 'nato': 0,
 'unlike': 7,
 'emerges': 0,
 'member': 8,
 'achieved': 0,
 'reagan': 0,
 'booked': 0,
 'switzerland': 0,
 'stick': 0,
 'postpone': 0,
 'acceptance': 0,
 'desire': 0,
 'texas': 9,
 'california': 10,
 'fire': 0,
 'dynamics': 0,
 'paying': 0,
 'minorities': 0,
 'using': 0,
 'jurisdiction': 0,
 'apple': 13,
 'discounting': 0,
 'caused': 14,
 'watching': 0,
 'rumors': 0,
 'bidding': 0,
 'diet': 0,
 'redemption': 0,
 'exchange': 15,
 'attitude': 0,
 'highly': 0,
 'turned': 16,
 'alongside': 0,
 'scheme': 0,
 'frequently': 0,
 'bargain': 0,
 'sort': 0,
 'reacted': 0,
 'differ': 0,
 'candidates': 0,
 'remained': 0,
 'preceding': 0,
 'book': 0,
 'week': 17,
 'consensus': 0,
 'dog': 0,
 'accept': 0,
 'gramm-rudman': 0,
 'friend': 0,
 'hire': 0,
 'inter

In [4]:
# find words that appear in the training set so we can deal with new words separately
count_train = np.zeros((nwords,))
for snt in corpus_train:
    for w in snt:
        count_train[w2index[w]] += 1

In [5]:
# Bigram model as a baseline
alpha = 0.1 # add-alpha smoothing
probB           = lm.bigramLM(corpus_train, w2index, nwords,alpha)
LLB, N          = 0.0, 0
bi              = lm.ngramGen(corpus_dev, w2index, 2)

for w in bi:
    if (count_train[w[1]]>0): # for now, skip target words not seen in training
        LLB += np.log(probB[w[0], w[1]])
        N += 1
        
print("Bi-gram Dev LL = {0}".format(LLB / N))

Bi-gram Dev LL = -4.992084628169756


In [30]:
# Network model
print("\nNetwork model training:")
n        = 3    # Length of n-gram 
dim      = 10   # Word vector dimension
hdim     = 30  # Hidden units
neurallm = lm.neuralLM(dim, n, hdim, nwords)  # The network model

ngrams = lm.ngramGen(corpus_train,w2index,n)
ngrams2 = lm.ngramGen(corpus_dev,w2index,n)

lrate = 0.5  # Learning rate
for it in range(1): # passes through the training data
    LL, N  = 0.0, 0 # Average log-likelihood, number of ngrams    
    for ng in ngrams:
        if ng[-1] == 0:
            print("low freq words")
            break
        pr = neurallm.update(ng,lrate)
        LL += np.log(pr)
        N  += 1
    print('Train:\t{0}\tLL = {1}'.format(it, LL / N)) 

    #Dev set
    LL, N = 0.0, 0 # Average log-likelihood, number of ngrams
    for ng in ngrams2:
        if (count_train[ng[-1]]>0): # for now, skip target words not seen in training
            pr = neurallm.prob(ng)
            LL += np.log(pr)
            N  += 1
    print('Dev:\t{0}\tLL = {1}'.format(it, LL / N))


Network model training:
low freq words
Train:	0	LL = -7.406177019716568
Dev:	0	LL = -7.579955955251295


In [11]:
len(count_train)

1421

In [28]:
print(min(count_train))
bi_train = lm.ngramGen(corpus_train, w2index, 2)
ws = [a[1] for a in bi_train]
print(np.min(count_train[ws]))
# for w in bi_test:
#     print(w, count_train[w[1]])
#     if (count_train[w[1]]==0): # find words not in training set
#         print(w)
#     print(count_train[w[1]])

0.0
1.0


In [19]:
for w in bi:
    
    if (count_train[w[1]]==0): # find words not in training set
        print(w)

In [None]:
from __future__ import print_function
from __future__ import division

import numpy as np
import languagemodel as lm

np.random.seed(1)  # for reproducibility

corpus_train = lm.readCorpus("data/train.txt")
corpus_dev   = lm.readCorpus("data/dev.txt")
corpus_test  = lm.readCorpus("data/test.txt")

# build a common index (words to integers), mapping rare words (less than 5 occurences) to index 0
# nwords = vocabulary size for the models that only see the indexes

w2index,nwords = lm.buildIndex(corpus_train+corpus_dev+corpus_test)

# find words that appear in the training set so we can deal with new words separately
count_train = np.zeros((nwords,))
for snt in corpus_train:
    for w in snt:
        count_train[w2index[w]] += 1

# Bigram model as a baseline
alpha = 0.1 # add-alpha smoothing
probB           = lm.bigramLM(corpus_train, w2index, nwords,alpha)
LLB, N          = 0.0, 0
bi              = lm.ngramGen(corpus_dev, w2index, 2)
for w in bi:
    if (count_train[w[1]]>0): # for now, skip target words not seen in training
        LLB += np.log(probB[w[0], w[1]])
        N += 1
print("Bi-gram Dev LL = {0}".format(LLB / N))

# Network model
print("\nNetwork model training:")
n        = 3    # Length of n-gram 
dim      = 10   # Word vector dimension
hdim     = 30  # Hidden units
neurallm = lm.neuralLM(dim, n, hdim, nwords)  # The network model

ngrams = lm.ngramGen(corpus_train,w2index,n)
ngrams2 = lm.ngramGen(corpus_dev,w2index,n)

lrate = 0.5  # Learning rate
for it in xrange(10): # passes through the training data
    LL, N  = 0.0, 0 # Average log-likelihood, number of ngrams    
    for ng in ngrams:
        pr = neurallm.update(ng,lrate)
        LL += np.log(pr)
        N  += 1
    print('Train:\t{0}\tLL = {1}'.format(it, LL / N)) 

    #Dev set
    LL, N = 0.0, 0 # Average log-likelihood, number of ngrams
    for ng in ngrams2:
        if (count_train[ng[-1]]>0): # for now, skip target words not seen in training
            pr = neurallm.prob(ng)
            LL += np.log(pr)
            N  += 1
    print('Dev:\t{0}\tLL = {1}'.format(it, LL / N)) 


In [None]:
import numpy as np
import re

# ----------------------------------
def readCorpus(filename):
    fp = open(filename, 'r')
    corpus = []  # list of sentences. A sentence is a lists of words.
    for line in fp:
        line = line.strip()
        # replace obvious numbers with <NUM>
        line = re.sub(r'\b\d+\b', r'<NUM>', line)
        line = re.sub(r'\b\d+.\d+\b', r'<NUM>', line)
        corpus.append(line.split(' '))
    return corpus

# ----------------------------------
def buildIndex(corpus, lowthreshold=5):

    # initial index, to be modified later
    tmpindex, indx = {}, 0
    for snt in corpus:
        for w in snt:
            if (not tmpindex.has_key(w)):
                tmpindex[w] = indx
                indx += 1

    # eval word counts 
    counts = np.zeros((indx,))
    for snt in corpus:
        for w in snt:
            counts[tmpindex[w]] += 1

    # map all the words with counts leq lowthreshold to index 0
    newindex = {}
    indx = 1  # 0 reserved for low occurence words
    for w in tmpindex.keys():
        if (counts[tmpindex[w]] <= lowthreshold):
            newindex[w] = 0
        else:
            newindex[w] = indx
            indx += 1
            
    # add start symbols ... <START-2> <START-1> to the index for use with up to 5-grams
    for j in range(1, 5):
        newindex["<START-" + str(j) + ">"] = indx
        indx += 1

    return newindex, indx

# ----------------------------------
def ngramGen(corpus, w2index, n):
    """ngram generator. n is the length of the ngram."""
    assert(n <= 5)
    ngrams = []
    start_snt = ["<START-" + str(j) + ">" for j in range(4, 0, -1)]
    for snt in corpus:  # sentences
        s = start_snt[-n + 1:] + snt
        for i in xrange(n - 1, len(s)):
            ngrams.append([w2index[w] for w in s[i - n + 1:i + 1]])
    return ngrams

# -----------------------------------
def unigramLM(corpus, w2index, nwords):
    uni  = ngramGen(corpus, w2index, 1)
    prob = np.zeros((nwords,))
    for w in uni:
        prob[w[0]] += 1
    return prob / float(np.sum(prob))

# -----------------------------------
def bigramLM(corpus, w2index, nwords, alpha=0.0):
    bi   = ngramGen(corpus, w2index, 2)
    prob = np.zeros((nwords,nwords))+alpha
    for w in bi:
        prob[w[0], w[1]] += 1.0
    for i in xrange(nwords):
        prob[i, :] /= np.sum(prob[i, :])
    return prob

# =====================================
class softmax(object):
    def __init__(self, dim, nwords):
        self.nwords = nwords    # output dim
        self.dim    = dim       # input dim       
        self.Wo     = np.zeros((self.nwords,))
        self.W      = np.random.randn(self.dim, self.nwords) / np.sqrt(self.dim)
        self.prob   = np.ones((self.nwords,)) / float(self.nwords)
        self.G2o    = 1e-12 * np.ones((self.nwords,)) # adagrad sum squared gradients for Wo
        self.G2     = 1e-12 * np.ones((self.nwords,)) # adagrad sum squared gradients for W
        
    def apply(self, x):
        z           = self.Wo + np.dot(x, self.W)
        self.prob   = np.exp(z - np.max(z))
        self.prob  /= np.sum(self.prob)
        return self.prob

    # update bias, accum wordvec gradient, return dlogP[y]/dx
    def backprop(self, x, lrate, y):
        grad       = -self.prob
        grad[y]   += 1.0  # dlogP[y]/dz
        xdelta     = np.dot(self.W, grad)  # dlogP[y]/dx
        xnorm2     = np.sum(x ** 2)
        self.G2o  += grad ** 2
        self.G2   += xnorm2 * grad ** 2
        self.Wo   += lrate * grad / np.sqrt(self.G2o)
        self.W    += lrate * np.outer(x, grad / np.sqrt(self.G2))
        return xdelta

# =====================================
class NNlayer(object):
    def __init__(self, idim, odim):
        self.idim = idim
        self.odim = odim
        self.W    = np.random.randn(self.idim, self.odim) / np.sqrt(self.idim)
        self.Wo   = np.zeros(self.odim,)
        # adaGrad sum squared gradients
        self.G2o  = 1e-12 * np.ones((self.odim,))
        self.G2   = 1e-12 * np.ones((self.odim,))
        self.f    = np.zeros((self.odim,))  # activation of output units

    def apply(self, x):
        self.f = np.tanh(self.Wo + np.dot(x, self.W))
        return self.f 

    def backprop(self, x, lrate, delta):
        grad       = (1.0 - self.f ** 2) * delta  # dJ/dz = df/dz * delta.  (dtanh/dx = 1 - tanh^2) 
        xdelta     = np.dot(self.W, grad)         # dJ/dx to be returned
        xnorm2     = np.sum(x ** 2)
        self.G2o  += grad ** 2
        self.G2   += xnorm2 * grad ** 2
        self.Wo   += lrate * grad / np.sqrt(self.G2o)
        self.W    += lrate * np.outer(x, grad / np.sqrt(self.G2))
        return xdelta

# =====================================
class neuralLM(object):
    def __init__(self, dim, ngram, hdim, nwords):
        self.dim    = dim       # word vector dimension
        self.ncond  = ngram - 1 # number of conditioning words
        self.hdim   = hdim      # number of hidden layer units
        self.nwords = nwords    # vocab size

        self.wvec    = np.random.randn(self.nwords, self.dim)  # word vectors
        self.G2      = 1e-12 * np.ones((self.nwords,))         # adaGrad sum of squares for word vectors
        self.hiddenL = NNlayer(self.ncond * self.dim, self.hdim)
        self.outputL = softmax(self.hdim, self.nwords)
        
    def prob(self, ngram):
        xgram, y = ngram[:-1], ngram[-1]
        x        = np.concatenate(self.wvec[xgram, :])
        fh       = self.hiddenL.apply(x)
        proba    = self.outputL.apply(fh)
        return proba[y]

    def update(self, ngram, lrate):
        # Propagate (i.e. feed-forward pass)
        xgram, y = ngram[:-1], ngram[-1]
        x        = np.concatenate(self.wvec[xgram, :])
        fh       = self.hiddenL.apply(x)
        pr       = self.outputL.apply(fh)
        # Backpropagate (and update layers)
        dh       = self.outputL.backprop(fh, lrate, y)
        dx       = self.hiddenL.backprop(x, lrate, dh)
        # Update word vectors
        grad     = np.reshape(dx, (self.ncond, self.dim))
        for i in xrange(self.ncond):
            self.G2[xgram[i]]      += np.sum(grad[i, :] ** 2)
            self.wvec[xgram[i], :] += lrate * grad[i, :] / np.sqrt(self.G2[xgram[i]])

        return pr[y]