In [1]:
!pip install nltk



In [2]:
!pip install utils



In [3]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
vocabulary_size = 5000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"


In [5]:
# Download NLTK model data (you need to do this once)
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/kenny/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_t

True

In [6]:
with open('reddit-comments-2015-08.csv', 'r', newline='', encoding='utf-8') as f:
    # Initalize a reader object
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the header row
    next(reader)  
    # Split full comments into sentences  - [nltk.sent_tokenize(x[0].lower()) for x in reader] - for the paragraph x[0] from the csv file, make it lowercase and tokenize all sentence
    # For all pararaphs in the csv file. * operator unpacks the list into individual sentences, and creates a single iterable
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    # Replace all sentence x in sentences with the start token, sentence body, and text token"
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
    print(sentences[1:10])
print (f"Parsed {len(sentences)} sentences.")

["SENTENCE_START it's a slight ppr league- .2 ppr. SENTENCE_END", 'SENTENCE_START standard besides 1 points for 15 yards receiving, .2 points per completion, 6 points per td thrown, and some bonuses for rec/rush/pass yardage. SENTENCE_END', 'SENTENCE_START my question is, is it wildly clear that qb has the highest potential for points? SENTENCE_END', 'SENTENCE_START i put in the rules at a ranking site and noticed that top qbs had 300 points more than the top rb/wr. SENTENCE_END', 'SENTENCE_START would it be dumb not to grab a qb in the first round? SENTENCE_END', 'SENTENCE_START in your scenario, a person could just not run the mandatory background check on the buyer and still sell the gun to the felon. SENTENCE_END', "SENTENCE_START there's no way to enforce it. SENTENCE_END", "SENTENCE_START an honest seller is going to not sell the gun to them when they see they're a felon on the background check. SENTENCE_END", "SENTENCE_START a dishonest seller isn't going to run the check in the

In [7]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [8]:
print(tokenized_sentences[1])

['SENTENCE_START', 'it', "'s", 'a', 'slight', 'ppr', 'league-', '.2', 'ppr', '.', 'SENTENCE_END']


In [9]:
# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print (f"Found { len(word_freq.items()) } unique words tokens." )

Found 63023 unique words tokens.


In [10]:
# Get 7999 most common words
vocab = word_freq.most_common(vocabulary_size-1)
print("Vocab:")
print(vocab[1:20])
index_to_word = [ x[0] for x in vocab ]
# unknown_token = "UNKNOWN_TOKEN"
index_to_word.append(unknown_token)
print(index_to_word[-1])
print("Index to word:")
print(index_to_word[1:10])
# index_to_word is a list of 8000 words ['word1', 'word2']
# enumerate is an object that generates index value pairs in that order
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
# Input a word, it goes into a dictionary, that gets translated to the index of the dictionary of index_to_word
# This allows us to represent words as numbers
print(word_to_index["SENTENCE_END"])
print(word_to_index["UNKNOWN_TOKEN"])
print(word_to_index["apple"])

Vocab:
[('SENTENCE_END', 79184), ('.', 67334), ('the', 52419), (',', 52137), ('to', 35576), ('i', 32614), ('a', 31777), ('and', 30055), ('of', 23232), ('you', 22457), ('it', 22353), ('that', 19334), ('is', 18196), ('in', 16944), ('*', 14955), ('for', 12541), ("n't", 11784), ("'s", 11771), (')', 11409)]
UNKNOWN_TOKEN
Index to word:
['SENTENCE_END', '.', 'the', ',', 'to', 'i', 'a', 'and', 'of']
1
4999
1371


In [11]:
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    # Change all words not in word_to_index to unknown_token
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print(tokenized_sentences[1:20])
word_to_index["it"]

[['SENTENCE_START', 'it', "'s", 'a', 'slight', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', '.', 'SENTENCE_END'], ['SENTENCE_START', 'standard', 'besides', '1', 'points', 'for', '15', 'yards', 'receiving', ',', 'UNKNOWN_TOKEN', 'points', 'per', 'UNKNOWN_TOKEN', ',', '6', 'points', 'per', 'UNKNOWN_TOKEN', 'thrown', ',', 'and', 'some', 'bonuses', 'for', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', '.', 'SENTENCE_END'], ['SENTENCE_START', 'my', 'question', 'is', ',', 'is', 'it', 'UNKNOWN_TOKEN', 'clear', 'that', 'qb', 'has', 'the', 'highest', 'potential', 'for', 'points', '?', 'SENTENCE_END'], ['SENTENCE_START', 'i', 'put', 'in', 'the', 'rules', 'at', 'a', 'ranking', 'site', 'and', 'noticed', 'that', 'top', 'qbs', 'had', '300', 'points', 'more', 'than', 'the', 'top', 'UNKNOWN_TOKEN', '.', 'SENTENCE_END'], ['SENTENCE_START', 'would', 'it', 'be', 'dumb', 'not', 'to', 'grab', 'a', 'qb', 'in', 'the', 'first', 'round', '?', 'SENTENCE_END'], ['SENTENCE_START', 'in', 'your', 'scenari

11

In [12]:
# Initialize empty lists to store X_train and y_train
X_train = []
y_train = []

# Iterate over tokenized_sentences
for sent in tokenized_sentences:
    X_row = []
    y_row = []
    
    # Iterate over words in the sentence
    for w in sent[:-1]:
        X_row.append(word_to_index.get(w, 0))
    
    for w in sent[1:]:
        y_row.append(word_to_index.get(w, 0))
    
    X_train.append(X_row)
    y_train.append(y_row)

In [13]:
# Print an training data example
x_example, y_example = X_train[17], y_train[17]
print(f"x:\n{' '.join([index_to_word[x] for x in x_example])}\n{x_example}")
print(f"y:\n{' '.join([index_to_word[x] for x in y_example])}\n{y_example}")

print(X_train[1:3])
print(y_train[1:3])

x:
SENTENCE_START what are n't you understanding about this ? !
[0, 52, 28, 17, 10, 858, 55, 26, 35, 70]
y:
what are n't you understanding about this ? ! SENTENCE_END
[52, 28, 17, 10, 858, 55, 26, 35, 70, 1]
[[0, 11, 18, 7, 3030, 4999, 4999, 4999, 4999, 2], [0, 981, 1496, 221, 600, 16, 773, 3414, 2967, 4, 4999, 600, 471, 4999, 4, 435, 600, 471, 4999, 2722, 4, 8, 72, 4959, 16, 4999, 4999, 2]]
[[11, 18, 7, 3030, 4999, 4999, 4999, 4999, 2, 1], [981, 1496, 221, 600, 16, 773, 3414, 2967, 4, 4999, 600, 471, 4999, 4, 435, 600, 471, 4999, 2722, 4, 8, 72, 4959, 16, 4999, 4999, 2, 1]]


In [65]:
class RNNNumpy:
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        # Assign instance variables
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Randomly initialize the network parameters
        # Initalizing weights based on research
        # hidden size x vocab size
        self.U = np.random.uniform ( -np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        # vocab size x hidden size
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        # hidden size x hidden size
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
        
    def forward_propagation(self, x):
        # The total number of time steps
        T = len(x)
        # During forward propagation we save all hidden states in s because need them later.
        # We add one additional element for the initial hidden, which we set to 0
        s = np.zeros((T + 1, self.hidden_dim))
        s[-1] = np.zeros(self.hidden_dim)
        # The outputs at each time step. Again, we save them for later.
        o = np.zeros((T, self.word_dim))
        # For each time step...
        for t in np.arange(T):
            # *** We are indxing U by x[t]. This is the same as multiplying U with a one-hot vector. The rest of the entries are zero
            s[t] = np.tanh(self.U[:,x[t]] + self.W.dot(s[t-1]))
            o[t] = softmax(self.V.dot(s[t]))
        return [o, s]

    def predict(self, x):
        # Perform forward propagation and return index of the highest score
        o, s = self.forward_propagation(x)
        return np.argmax(o, axis=1)

    def calculate_total_loss(self, x, y):
        L = 0
        # For each sentence...
        for i in np.arange(len(y)):
            o, s = self.forward_propagation(x[i])
            # We only care about our prediction of the "correct" words
            correct_word_predictions = o[np.arange(len(y[i])), y[i]]
            # Add to the loss based on how off we were
            L += -1 * sum(np.log(correct_word_predictions))
        return L
    
    def calculate_loss(self, x, y):
        # Divide the total loss by the number of training examples
        N = sum((len(y_i) for y_i in y))
        return self.calculate_total_loss(x,y)/N

    def bptt(self, x, y):
        T = len(y)
        # Perform forward propagation
        o, s = self.forward_propagation(x)
        # We accumulate the gradients in these variables
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        delta_o = o
        delta_o[np.arange(len(y)), y] -= 1.
        # For each output backwards...
        for t in np.arange(T)[::-1]:
            dLdV += np.outer(delta_o[t], s[t].T)
            # Initial delta calculation
            delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
                dLdW += np.outer(delta_t, s[bptt_step-1])              
                dLdU[:,x[bptt_step]] += delta_t
                # Update delta for next step
                delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
        return [dLdU, dLdV, dLdW]

    def numpy_sgd_step(self, x, y, learning_rate):
        # Calculate the gradients
        dLdU, dLdV, dLdW = self.bptt(x, y)
        # Change parameters according to gradients and learning rate
        self.U -= learning_rate * dLdU
        self.V -= learning_rate * dLdV
        self.W -= learning_rate * dLdW

    def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
        # We keep track of the losses so we can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if (epoch % evaluate_loss_after == 0):
                loss = model.calculate_loss(X_train, y_train)
                losses.append((num_examples_seen, loss))
                time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                print ("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss))
                # Adjust the learning rate if loss increases
                if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                    learning_rate = learning_rate * 0.5  
                    print ("Setting learning rate to %f" % learning_rate)
                sys.stdout.flush()
            # For each training example...
            for i in range(len(y_train)):
                # One SGD step
                model.numpy_sgd_step(X_train[i], y_train[i], learning_rate)
                num_examples_seen += 1



In [51]:
def softmax(x):
    """
    Compute the softmax of vector x
    
    Arguments:
    x -- A 1-D numpy array
    
    Returns:
    s -- Softmax of x
    """
    e_x = np.exp(x - np.max(x))  # Subtracting the maximum value for numerical stability
    s = e_x / e_x.sum(axis=0)
    return s

In [52]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
o, s = model.forward_propagation(X_train[10])
print (o.shape)
print (o)

(45, 5000)
[[0.00020061 0.00019874 0.00019952 ... 0.00019839 0.00020138 0.00020065]
 [0.00020051 0.00019942 0.00019986 ... 0.00020058 0.00019873 0.00019918]
 [0.0001985  0.00020077 0.00020189 ... 0.00019998 0.00019918 0.00020052]
 ...
 [0.00020098 0.00020034 0.00019928 ... 0.00020072 0.00020173 0.00020064]
 [0.00019828 0.00020135 0.00020006 ... 0.00019925 0.00020087 0.00019965]
 [0.00020064 0.0001999  0.00020018 ... 0.00019732 0.00020039 0.0001984 ]]


In [53]:
predictions = model.predict(X_train[10])
print (predictions.shape)
print (len(predictions))
print (len(y_train[10]))

(45,)
45
45


In [54]:
# Limit to 1000 examples to save time
print ("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print ("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))

Expected Loss for random predictions: 8.517193
Actual loss: 8.517153


In [58]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.numpy_sgd_step(X_train[10], y_train[10], 0.005)

68.9 ms ± 2.94 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
np.random.seed(10)
# Train on a small subset of the data to see what happens
model = RNNNumpy(vocabulary_size)
model.train_with_sgd(X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)

2024-05-08 22:48:10: Loss after num_examples_seen=0 epoch=0: 8.517013
2024-05-08 22:48:14: Loss after num_examples_seen=100 epoch=1: 8.504391
2024-05-08 22:48:18: Loss after num_examples_seen=200 epoch=2: 8.480773
2024-05-08 22:48:21: Loss after num_examples_seen=300 epoch=3: 6.621086


In [93]:
def generate_sentence(model, max_iterations=12):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    iterations = 0
    # Repeat until we get an end token or reach the maximum number of iterations
    while (not new_sentence[-1] == word_to_index[sentence_end_token]) and (iterations < max_iterations):
        next_word_probs = model.forward_propagation(new_sentence)
        # Get the index of the word with the highest probability
        sampled_word = np.argmax(next_word_probs[-1])
        # If the sampled word is unknown, continue to the next iteration
        if sampled_word == word_to_index[unknown_token]:
            continue
        new_sentence.append(sampled_word)
        iterations += 1
    # Convert indices to words
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

num_sentences = 3
senten_min_length = 7

for i in range(num_sentences):
    sent = []
    # We want long sentences, not sentences with one or two words
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print (" ".join(sent))

was first everyone job job job job job job job job
was first everyone job job job job job job job job
was first everyone job job job job job job job job
