In [1]:
# import torch
# import torch.nn.functional as F
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline
# Download NLTK model data (you need to do this once)
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/kenny/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_t

True

In [2]:
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [4]:
with open('validation.csv', 'r', newline='', encoding='utf-8') as f:
    # Initalize a reader object
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the header row
    # next(reader)  
    # Split full comments into sentences  - [nltk.sent_tokenize(x[0].lower()) for x in reader] - for the paragraph x[0] from the csv file, make it lowercase and tokenize all sentence
    # For all pararaphs in the csv file. * operator unpacks the list into individual sentences, and creates a single iterable
    # sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    sentences = itertools.chain(*[nltk.sent_tokenize(str(x).lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    # Replace all sentence x in sentences with the start token, sentence body, and text token"
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print (f"Parsed {len(sentences)} sentences.")

Parsed 347531 sentences.


In [8]:
print(sentences[2:3])

['SENTENCE_START spot saw the shiny car and said, "wow, kitty, your car is so bright and clean!" SENTENCE_END']


In [9]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [10]:
tokenized_sentences = [[word for word in sentence if word not in {'[', ']', '(', ')'}] for sentence in tokenized_sentences]

In [13]:
# List of lists
print(tokenized_sentences[3])

['SENTENCE_START', 'kitty', 'smiled', 'and', 'replied', ',', '``', 'thank', 'you', ',', 'spot', '.', 'SENTENCE_END']


In [14]:
# Create a bigram dictionary
b = {}

for i in range (len(tokenized_sentences)):
    for word1, word2, in zip(tokenized_sentences[i], tokenized_sentences[i][1:]):
        # Create a tuple
        bigram = (word1, word2)
        # Index into the dictionary, update it by one
        b[bigram] = b.get(bigram, 0) + 1


In [15]:
# Taking the first 20,000 entries as our vocabulary
sorted(b.items(), key = lambda kv: -kv[1])

[(('.', 'SENTENCE_END'), 281429),
 (('SENTENCE_START', 'she'), 45064),
 (('SENTENCE_START', 'he'), 43260),
 (('SENTENCE_START', 'they'), 36680),
 (('SENTENCE_START', '``'), 29808),
 (("''", 'SENTENCE_END'), 26803),
 ((',', '``'), 22586),
 (('SENTENCE_START', 'the'), 21670),
 (('was', 'a'), 20609),
 (('in', 'the'), 18698),
 (('!', 'SENTENCE_END'), 17838),
 (('said', ','), 17236),
 (('it', 'was'), 17141),
 (('day', ','), 16767),
 (("'", 'SENTENCE_END'), 16280),
 (('to', 'the'), 15480),
 (('SENTENCE_START', 'it'), 15447),
 (('there', 'was'), 15168),
 (('upon', 'a'), 13605),
 (('a', 'time'), 13577),
 (('.', "'"), 13466),
 ((',', 'but'), 13233),
 (('a', 'big'), 12096),
 (('was', 'so'), 11963),
 (('.', "''"), 11564),
 (('wanted', 'to'), 11522),
 (('time', ','), 11420),
 ((',', 'there'), 11118),
 (('he', 'was'), 10943),
 (('!', "''"), 10765),
 (('SENTENCE_START', "'once"), 10518),
 (('a', 'little'), 10296),
 (('to', 'play'), 10282),
 ((',', 'and'), 9947),
 (('one', 'day'), 9905),
 (('and', 's

In [16]:
# Create a word set
word_set = set(word for bigram, count in sorted(b.items(), key=lambda kv: -kv[1])[:30000] for word in bigram)
word_set

{'old',
 'hands',
 'tag',
 'hot',
 'festival',
 'ancient',
 'outside.\\n\\none',
 'bin',
 'small',
 'against',
 'school',
 'circle',
 "'jimmy",
 '\\nso',
 'quite',
 'running',
 '\\n\\nlily',
 'happening',
 '\\n\\nsam',
 'hopeful',
 'tell',
 'us',
 'nearby',
 'flight',
 'safari',
 'spinning',
 'tooth',
 'angel',
 'keys',
 'boys',
 'cane',
 'tank',
 'sally\\',
 'spray',
 'cozy',
 'showing',
 'giraffes',
 "'timmy",
 'trees',
 'benny',
 'pair',
 'dog\\',
 'rode',
 'stamp',
 'trusted',
 'send',
 'complain',
 'â€œhello',
 'princess',
 'magic',
 'stood',
 'horizon',
 'stage',
 'men',
 'clearing',
 'celery',
 'darling',
 'store',
 'nail',
 'ones',
 'growing',
 'honest',
 'songs.\\n\\none',
 'fastest',
 'told',
 'until',
 'peered',
 'work',
 'kneel',
 'jenny\\',
 'rolls',
 'seemed',
 'countryside',
 'number',
 'skip',
 'lawn',
 'suggested',
 'razor',
 'dug',
 'yuck',
 'yawned',
 'listen',
 '\\n\\nlila',
 'settled',
 'flute',
 'shade',
 'said.\\n\\nhis',
 'pillows',
 'palace',
 'highest',
 'ball

In [17]:
dim = len(word_set)
dim

4092

In [18]:
stoi = {s:i for i,s in enumerate(word_set)} # string to index
itos = {i:s for s, i in stoi.items()}
stoi


{'old': 0,
 'hands': 1,
 'tag': 2,
 'hot': 3,
 'festival': 4,
 'ancient': 5,
 'outside.\\n\\none': 6,
 'bin': 7,
 'small': 8,
 'against': 9,
 'school': 10,
 'circle': 11,
 "'jimmy": 12,
 '\\nso': 13,
 'quite': 14,
 'running': 15,
 '\\n\\nlily': 16,
 'happening': 17,
 '\\n\\nsam': 18,
 'hopeful': 19,
 'tell': 20,
 'us': 21,
 'nearby': 22,
 'flight': 23,
 'safari': 24,
 'spinning': 25,
 'tooth': 26,
 'angel': 27,
 'keys': 28,
 'boys': 29,
 'cane': 30,
 'tank': 31,
 'sally\\': 32,
 'spray': 33,
 'cozy': 34,
 'showing': 35,
 'giraffes': 36,
 "'timmy": 37,
 'trees': 38,
 'benny': 39,
 'pair': 40,
 'dog\\': 41,
 'rode': 42,
 'stamp': 43,
 'trusted': 44,
 'send': 45,
 'complain': 46,
 'â€œhello': 47,
 'princess': 48,
 'magic': 49,
 'stood': 50,
 'horizon': 51,
 'stage': 52,
 'men': 53,
 'clearing': 54,
 'celery': 55,
 'darling': 56,
 'store': 57,
 'nail': 58,
 'ones': 59,
 'growing': 60,
 'honest': 61,
 'songs.\\n\\none': 62,
 'fastest': 63,
 'told': 64,
 'until': 65,
 'peered': 66,
 'work': 

In [19]:
found = False
for sentence in tokenized_sentences:
    if 'UNKNOWN_TOKEN' in sentence:
        found = True
        break

if found:
    print("'UNKNOWN_TOKEN' exists in the list tokenized_sentences.")
else:
    print("'UNKNOWN_TOKEN' does not exist in the list tokenized_sentences.")

'UNKNOWN_TOKEN' does not exist in the list tokenized_sentences.


In [20]:
tokenized_sentences[0]

['SENTENCE_START', "'text", "'", 'SENTENCE_END']

In [34]:
# Concatenate all words together into a giant list
tokenized_words = [word for sentence in tokenized_sentences for word in sentence]
len(tokenized_words)


5094089

In [35]:
# Remove all words that are not in the word_set
tokenized_words = [word for sentence in tokenized_sentences for word in sentence if word in word_set]
len(tokenized_words)

4998568

In [36]:
# Embed all characters
tokenized_words = [stoi[word] for word in tokenized_words]
len(tokenized_words)

4998568

In [37]:
# Create training and validation sets
import random
random.seed(42)
random.shuffle(tokenized_sentences)
n1 = int(0.8 * len(tokenized_sentences))
n2 = int(0.9 * len(tokenized_sentences))

Xtr = tokenized_words[:n1]
Ytr = tokenized_words[1:n1+1]
Xdev = tokenized_words[n1:n2]
Ydev = tokenized_words[n1+1:n2+1]

In [39]:
# Check that a test example is all numbers
print(Xtr[:10])
print(Ytr[:10])

[3792, 2624, 2525, 253, 2579, 1888, 2036, 1552, 2579, 2247]
[2624, 2525, 253, 2579, 1888, 2036, 1552, 2579, 2247, 2223]


In [42]:
vocab_size = dim
def lossFun(inputs, targets, hprev):
    """
    inputs, targets are both list of integers.
    hprev is Hx1 array of initial hidden state
    returns the loss, gradients on model parameters, and last hidden state
    """
    # Create four dictionaries
    xs, hs, ys, ps = {}, {}, {}, {}
    # Copy the last hidden state into hprev
    hs[-1] = np.copy(hprev)
    loss = 0
    
    ### Forward pass (each time step)
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        # At this time step, input an encoded char, as a (27, 5) one hot vector
        xs[t][inputs[t]] = 1
        # At this time step, follow the formulas to get the hidden state at this time step
        # (100,27) @ (27, 1) = (100, 1), (100, 100) @ (100, 1) = (100, 1) + (100, 1) -> (100, 1) (column vector of 100 outputs
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state

        # At this time step, compute the output state
        # (27, 100) @ (100, 1) = (27, 1) + (27, 1) = (1, 27)
        ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars

        # Normalize our probabiltiies using softmax
        # ps = (1, 27)
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars

        # Take the loss of the correct output probabilities
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
    
    ### Backward pass: compute gradients going backwards
    # dWxh (100, 27), dWhh (100, 100), dWhy = (27, 100)
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    # dbh (100, 1),  dby (27, 1)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # (100, 1)
    # Start off with zero
    dhnext = np.zeros_like(hs[0]) 
    
    for t in reversed(range(len(inputs))):
        # Backpropogate through the softmax to the logits
        # (27, 5)
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here

        # (27, 5) @ (5, 100) = (27, 100)
        dWhy += np.dot(dy, hs[t].T)
        # (27, 5) -> This secrety should sum across the columns dim = 1 to produce (27, 1)
        dby += dy

        # dh(t), dh(t)raw
        # Backpropogate into the previous layer (100, 27) @ (27, 5) = (100, 5)
        dh = np.dot(Why.T, dy) + dhnext
        # Backprop through tanh nonlinearity (element-wise forward -> element-wise backward)
        dhraw = (1 - hs[t] * hs[t]) * dh 

        # dWxh, dWhh, dbh
        # dWxh = (100, 5) @ (5, 27) = (100, 27)
        dWxh += np.dot(dhraw, xs[t].T)
        # dWhh = (100, 5) (5, 100) = (100, 100)
        dWhh += np.dot(dhraw, hs[t-1].T)
        # Calculate dbh and dWxh
        dbh += dhraw

        # Derive part of the next time step's gradient
        dhnext = np.dot(Whh.T, dhraw)

    # Clipping gradients
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)
    
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]


In [60]:
def sample(h, seed_ix, n): # Passed in the previous hidden state and the first letter of input
    # Create a one hot vector (vertical)
    x = np.zeros((vocab_size, 1))
    # Start off by passing in the 'SENTENCE_START' token to start the sentence
    x[stoi['SENTENCE_START']] = 1
    ixes = []
    for t in range(n):
        # Forward pass and generate outputs
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.argmax(p)
        # Break the loop if we find a start or end token
        if (ix == stoi['SENTENCE_END'] or ix == stoi['SENTENCE_END']):
            break
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes


In [44]:
# Hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 20 # number of steps to unroll the RNN for
learning_rate = 1e-1

# Model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01 # input to hidden - (100, 27)
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 # hidden to hidden (100, 100)
Why = np.random.randn(vocab_size, hidden_size) * 0.01 # hidden to output (27, 100)
bh = np.zeros((hidden_size, 1)) # hidden bias (100, 1)
by = np.zeros((vocab_size, 1)) # output bias (27, 1)

n, p = 0, 0

mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)

mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad

smooth_loss = -np.log(1.0/vocab_size) * seq_length # loss at iteration 0

while n < 10000:
    # Prepare inputs (Sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(Xtr) or n == 0: 
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # Go from start of data
    inputs = Xtr[p:p+seq_length]
    targets = Ytr[p:p+seq_length]

    
    # Sample from the model now and then, and print the result
    if n % 100 == 0:
        # Pass in the previous inputs, the first letter of input, and 200 samples
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ' '.join(itos[ix] for ix in sample_ix)
        print ('----\n %s \n----' % (txt, ))

    
    # Forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 100 == 0: 
        print ('iter %d, loss: %f' % (n, smooth_loss)) # print progress
    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                [dWxh, dWhh, dWhy, dbh, dby], 
                                [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update   
    p += seq_length # move data pointer
    n += 1 # iteration counter 

----
 grandpa bubbles owner peter says.\n\n warned quarrel basement first tommy\ meat looks display hay 3-year-old grab drove home.\n\nwhen horizon fireworks changed guard ruined ruin boys few network led puddle baking glove ink hoping moved while march yet sprayed peter garage feels are check hedge summer dog.\n\nthe listens shown cardboard steps ow scratch wise grown \n\ntim kicking off arm staff rug delay \n\ntheir carpet \n\njack plane light fine \n\nmoral oh call sees rules three-year-old doing three-year-old nose shy bird.\n\nthe reaches reached glow hello.\n\n land tub â€œletâ€™s luna people fabric with cheeks cried.\n\n past filled work chest hesitant ground.\n\nthe mira tracks lights flies ten dears nina seagulls eaten saying medicine cherry area lightning safety pretend snuggles spoil grand flapped annie heartbroken \nbut better.\n\n house.\n\none protect thermometer disappeared fit darlings counted summer who\ onto clock bears thank airplane forward deeper bravely water hall

In [43]:
stoi['SENTENCE_END']

1902

In [44]:
stoi['SENTENCE_START']

5877

In [72]:
# Sampling from the model after training
num_samples = 10
for i in range (num_samples):
    sample_ix = sample(hprev, stoi['SENTENCE_START'], 50)
    txt = ' '.join(itos[ix] for ix in sample_ix)
    print ('----\n %s \n----' % (txt, ))

IndexError: index 207600 is out of bounds for axis 0 with size 4092