In [1]:
from datetime import datetime
import itertools
import numpy as np
import nltk
import os
import operator
import sys

In [2]:
vocab_size = 10000
unknown_token = 'UNK'
sentence_start_token = 'SENTENCE_START'
sentence_end_token = 'SENTENCE_END'
corpora_dir = "C:\\Users\\Manish\\AppData\\Roaming\\nltk_data\\corpora\\state_union"

In [3]:
print('Reading data..')

#Read all file path in corpora directory

file_list = []
for subdir, dirs, files in os.walk(corpora_dir):
    for file in files:
        file_list.append(os.path.join(subdir, file))

sentences = []

for files in file_list:
    with open(files, 'r') as f:
        try:
            string = f.read().replace('\n', '')
            sentences.extend(nltk.sent_tokenize(string))
        except UnicodeDecodeError:
            pass
        

Reading data..


In [4]:
sentences[:5]

["PRESIDENT HARRY S. TRUMAN'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS April 16, 1945Mr.",
 'Speaker, Mr. President, Members of the Congress:It is with a heavy heart that I stand before you, my friends and colleagues, in the Congress of the United States.Only yesterday, we laid to rest the mortal remains of our beloved President, Franklin Delano Roosevelt.',
 'At a time like this, words are inadequate.',
 'The most eloquent tribute would be a reverent silence.Yet, in this decisive hour, when world events are moving so rapidly, our silence might be misunderstood and might give comfort to our enemies.In His infinite wisdom, Almighty God has seen fit to take from us a great man who loved, and was beloved by, all humanity.No man could possibly fill the tremendous void left by the passing of that noble soul.',
 'No words can ease the aching hearts of untold millions of every race, creed and color.']

In [5]:
#Add sentence delimiters
sentences = [sentence_start_token + " "+ sentence + " "+ sentence_end_token for sentence in sentences]

In [6]:
sentences[:5]

["SENTENCE_START PRESIDENT HARRY S. TRUMAN'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS April 16, 1945Mr. SENTENCE_END",
 'SENTENCE_START Speaker, Mr. President, Members of the Congress:It is with a heavy heart that I stand before you, my friends and colleagues, in the Congress of the United States.Only yesterday, we laid to rest the mortal remains of our beloved President, Franklin Delano Roosevelt. SENTENCE_END',
 'SENTENCE_START At a time like this, words are inadequate. SENTENCE_END',
 'SENTENCE_START The most eloquent tribute would be a reverent silence.Yet, in this decisive hour, when world events are moving so rapidly, our silence might be misunderstood and might give comfort to our enemies.In His infinite wisdom, Almighty God has seen fit to take from us a great man who loved, and was beloved by, all humanity.No man could possibly fill the tremendous void left by the passing of that noble soul. SENTENCE_END',
 'SENTENCE_START No words can ease the aching hearts of untold mi

In [7]:
#Tokenize
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

In [8]:
#Count word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))

print('Number of unique words is ' +str(len(word_freq.items())))

Number of unique words is 19005


In [16]:
#Get the most common words and build index_to_word and word_to_index vectors
vocab_size = len(word_freq.items())
vocab = word_freq.most_common(vocab_size)

index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(word, index) for index, word in enumerate(index_to_word)])

In [17]:
print('Current vocabulary size is ' +str(vocab_size))
print('The least frequent word in voabulary is ', vocab[-1][0],
     'and appeared ', vocab[-1][1], ' times.')

Current vocabulary size is 19005
The least frequent word in voabulary is  fromthe and appeared  1  times.


In [18]:
vocab[-1]

('fromthe', 1)

In [19]:
#Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

  return array(a, dtype, copy=False, order=order)


In [20]:
def softmax(x):
    return np.exp(x)/ np.sum(np.exp(x), axis = 0)

In [24]:
class RNN:
    def __init__(self, word_dim, hidden_dim = 100, bptt_truncate = 4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt.truncate
        
        #Randomly initialize the network parameters
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.W = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        
    #Forward propogation
    def forward_prop(self, x):
        #Total number of time steps
        T = len(x)
        s = np.zeros((T + 1, self.hidden_dim))
        s[-1] = np.zeros((T, self.word_dim))
        o = np.zeros((T, self.word_dim))
        
        #For each time step..
        
        for t in np.arange(T):
            #Indexing U by x[t] which is same as multippplying U with a one-hot vector
            s[t] = np.tanh(self.U[:, x[t]] + self.W.dot(s[t-1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]
    
    
    def predict(self, x):
        #Perform forward prop
        o, s =self.forward_prop(x)
        return np.argmax(o, axis = 1)
    
    
    def calculate_loss(self, x, y):
        N = np.sum((len(y_i) for y_i in y))
        return self.calculate_total_loss(x, y)/N
    
    def calculate_total_loss(self, x, y):
        for i in np.arange(len(y)):
            o, s = self.forward_propagation(x[i])
            correct_word_predictions = 0[np.arange(len(y[i])), y[i]]
            L += -1* np.sum(np.log(correct_word_predictions))
        return L
    
    def bptt(self, x, y):
    T = len(y)
    # Perform forward propagation
    o, s = self.forward_propagation(x)
    # We accumulate the gradients in these variables
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    # For each output backwards...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        # Initial delta calculation
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # Backpropagation through time (for at most self.bptt_truncate steps)
        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
            # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
            dLdW += np.outer(delta_t, s[bptt_step-1])              
            dLdU[:,x[bptt_step]] += delta_t
            # Update delta for next step
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
    return [dLdU, dLdV, dLdW]