In [1]:
# import torch
# import torch.nn.functional as F
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline
# Download NLTK model data (you need to do this once)
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/kenny/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     /Users/kenny/nltk_data...
[nltk_data]    |   Package conll2002 is already up-to-date!
[nltk_data]    | Downloading package dependency_t

True

In [2]:
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [3]:
with open('../data/reddit-comments-2015-08.csv', 'r', newline='', encoding='utf-8') as f:
    # Initalize a reader object
    reader = csv.reader(f, skipinitialspace=True)
    # Skip the header row
    # next(reader)  
    # Split full comments into sentences  - [nltk.sent_tokenize(x[0].lower()) for x in reader] - for the paragraph x[0] from the csv file, make it lowercase and tokenize all sentence
    # For all pararaphs in the csv file. * operator unpacks the list into individual sentences, and creates a single iterable
    # sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    sentences = itertools.chain(*[nltk.sent_tokenize(str(x).lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    # Replace all sentence x in sentences with the start token, sentence body, and text token"
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print (f"Parsed {len(sentences)} sentences.")

Parsed 68647 sentences.


In [4]:
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

In [5]:
tokenized_sentences = [[word for word in sentence if word not in {'[', ']', '(', ')'}] for sentence in tokenized_sentences]

In [6]:
# # List of lists
print(tokenized_sentences[10])

['SENTENCE_START', 'a', 'dishonest', 'seller', "isn\\'t", 'going', 'to', 'run', 'the', 'check', 'in', 'the', 'first', 'place', '.', 'SENTENCE_END']


In [7]:
b = {}

for i in range (len(tokenized_sentences)):
    for word1, word2, in zip(tokenized_sentences[i], tokenized_sentences[i][1:]):
        # Create a tuple
        bigram = (word1, word2)
        # Index into the dictionary, update it by one
        b[bigram] = b.get(bigram, 0) + 1


In [8]:
# Taking the first 20,000 entries as our vocabulary
sorted(b.items(), key = lambda kv: -kv[1])

[(('.', 'SENTENCE_END'), 47244),
 (('SENTENCE_START', '``'), 9738),
 (("''", 'SENTENCE_END'), 7573),
 (('SENTENCE_START', 'i'), 6466),
 (("'", 'SENTENCE_END'), 5956),
 ((',', 'and'), 5707),
 (('.', "''"), 4960),
 ((',', 'but'), 4942),
 (('*', '*'), 4696),
 (('of', 'the'), 4317),
 (('in', 'the'), 4082),
 (('?', 'SENTENCE_END'), 3919),
 (('.', "'"), 3412),
 (('it', "'s"), 3313),
 ((',', 'i'), 3176),
 (('if', 'you'), 2899),
 (('do', "n't"), 2748),
 (('&', 'gt'), 2573),
 (('gt', ';'), 2573),
 (('to', 'be'), 2485),
 (('http', ':'), 2354),
 (('i', "'m"), 2336),
 (('SENTENCE_START', 'it'), 2273),
 (('SENTENCE_START', 'the'), 2240),
 (('``', 'SENTENCE_END'), 2116),
 (('to', 'the'), 2106),
 (('on', 'the'), 2059),
 (('``', 'i'), 1996),
 (('SENTENCE_START', "'"), 1870),
 (('!', 'SENTENCE_END'), 1804),
 (('is', 'a'), 1693),
 (('SENTENCE_START', 'if'), 1634),
 (('https', ':'), 1629),
 ((',', 'the'), 1599),
 (('and', 'i'), 1568),
 (('you', 'can'), 1563),
 (('amp', ';'), 1535),
 (('for', 'the'), 1529

In [9]:
word_set = set(word for bigram, count in sorted(b.items(), key=lambda kv: -kv[1])[:30000] for word in bigram)
word_set

{'setup',
 'future',
 'able',
 'drunk',
 'hence',
 'festival',
 'belongs',
 'option',
 'cute',
 'our',
 'problems',
 'visited',
 'pile',
 'happen',
 'etc',
 'accepted',
 'absence',
 'add',
 'hated',
 'bella',
 'apparently',
 '1.',
 'army',
 'battlefield',
 '60',
 '|\\n',
 "'right",
 'to=/r/explainlikeimfive',
 '95',
 'chick',
 'quickly',
 'era',
 'merely',
 'vs',
 'subreddits',
 'downvoted',
 'together',
 'engineer',
 'match',
 '//0fs.me/yis5stledr\\n\\nhttp',
 'awaiting',
 'press',
 'queue',
 'note',
 'wrote',
 'advance',
 'nowadays',
 'save',
 'roads',
 'could',
 'dust',
 'stairs',
 'dice',
 'total',
 'lose',
 'belong',
 'rock',
 'distraction',
 'justify',
 '\\n\\nno',
 '\\n\\nwith',
 'benner',
 'indeed',
 'etc.',
 'already',
 'compensate',
 'during',
 'finished',
 'chase',
 'reminds',
 'can',
 'via',
 'guessing',
 'taken',
 'your',
 'attached',
 'level',
 'toronto',
 '^your',
 'across',
 'realistically',
 'course',
 'impact',
 '+',
 '\\n\\nsorry',
 'pieces',
 'species',
 'frowned',


In [10]:
dim = len(word_set)
dim

5138

In [11]:
stoi = {s:i for i,s in enumerate(word_set)} # string to index
itos = {i:s for s, i in stoi.items()}
stoi


{'setup': 0,
 'future': 1,
 'able': 2,
 'drunk': 3,
 'hence': 4,
 'festival': 5,
 'belongs': 6,
 'option': 7,
 'cute': 8,
 'our': 9,
 'problems': 10,
 'visited': 11,
 'pile': 12,
 'happen': 13,
 'etc': 14,
 'accepted': 15,
 'absence': 16,
 'add': 17,
 'hated': 18,
 'bella': 19,
 'apparently': 20,
 '1.': 21,
 'army': 22,
 'battlefield': 23,
 '60': 24,
 '|\\n': 25,
 "'right": 26,
 'to=/r/explainlikeimfive': 27,
 '95': 28,
 'chick': 29,
 'quickly': 30,
 'era': 31,
 'merely': 32,
 'vs': 33,
 'subreddits': 34,
 'downvoted': 35,
 'together': 36,
 'engineer': 37,
 'match': 38,
 '//0fs.me/yis5stledr\\n\\nhttp': 39,
 'awaiting': 40,
 'press': 41,
 'queue': 42,
 'note': 43,
 'wrote': 44,
 'advance': 45,
 'nowadays': 46,
 'save': 47,
 'roads': 48,
 'could': 49,
 'dust': 50,
 'stairs': 51,
 'dice': 52,
 'total': 53,
 'lose': 54,
 'belong': 55,
 'rock': 56,
 'distraction': 57,
 'justify': 58,
 '\\n\\nno': 59,
 '\\n\\nwith': 60,
 'benner': 61,
 'indeed': 62,
 'etc.': 63,
 'already': 64,
 'compensate

In [12]:
found = False
for sentence in tokenized_sentences:
    if 'UNKNOWN_TOKEN' in sentence:
        found = True
        break

if found:
    print("'UNKNOWN_TOKEN' exists in the list tokenized_sentences.")
else:
    print("'UNKNOWN_TOKEN' does not exist in the list tokenized_sentences.")

'UNKNOWN_TOKEN' does not exist in the list tokenized_sentences.


In [13]:
tokenized_sentences[0]

['SENTENCE_START', "'body", "'", 'SENTENCE_END']

In [14]:
tokenized_words = [word for sentence in tokenized_sentences for word in sentence]


In [15]:
tokenized_words = [word for sentence in tokenized_sentences for word in sentence if word in word_set]


In [16]:
tokenized_words = [stoi[word] for word in tokenized_words]


In [17]:
print(len(tokenized_words))

1500138


In [18]:
import random
random.seed(42)
random.shuffle(tokenized_sentences)
n1 = int(0.8 * len(tokenized_sentences))
n2 = int(0.9 * len(tokenized_sentences))

Xtr = tokenized_words[:n1]
Ytr = tokenized_words[1:n1+1]
Xdev = tokenized_words[n1:n2]
Ydev = tokenized_words[n1+1:n2+1]

In [27]:
print(len(Xtr))
print(Ytr[:10])

54917
[4749, 1835, 2953, 2234, 3621, 1589, 4802, 423, 2495, 2290]


In [20]:
vocab_size = 5138
def lossFun(inputs, targets, hprev):
    """
    inputs, targets are both list of integers.
    hprev is Hx1 array of initial hidden state
    returns the loss, gradients on model parameters, and last hidden state
    """
    # Create four dictionaries
    xs, hs, ys, ps = {}, {}, {}, {}
    # Copy the last hidden state into hprev
    hs[-1] = np.copy(hprev)
    loss = 0
    
    ### Forward pass (each time step)
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        # At this time step, input an encoded char, as a (27, 5) one hot vector
        xs[t][inputs[t]] = 1
        # At this time step, follow the formulas to get the hidden state at this time step
        # (100,27) @ (27, 1) = (100, 1), (100, 100) @ (100, 1) = (100, 1) + (100, 1) -> (100, 1) (column vector of 100 outputs
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state

        # At this time step, compute the output state
        # (27, 100) @ (100, 1) = (27, 1) + (27, 1) = (1, 27)
        ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars

        # Normalize our probabiltiies using softmax
        # ps = (1, 27)
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars

        # Take the loss of the correct output probabilities
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
    
    ### Backward pass: compute gradients going backwards
    # dWxh (100, 27), dWhh (100, 100), dWhy = (27, 100)
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    # dbh (100, 1),  dby (27, 1)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    # (100, 1)
    # Start off with zero
    dhnext = np.zeros_like(hs[0]) 
    
    for t in reversed(range(len(inputs))):
        # Backpropogate through the softmax to the logits
        # (27, 5)
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here

        # (27, 5) @ (5, 100) = (27, 100)
        dWhy += np.dot(dy, hs[t].T)
        # (27, 5) -> This secrety should sum across the columns dim = 1 to produce (27, 1)
        dby += dy

        # dh(t), dh(t)raw
        # Backpropogate into the previous layer (100, 27) @ (27, 5) = (100, 5)
        dh = np.dot(Why.T, dy) + dhnext
        # Backprop through tanh nonlinearity (element-wise forward -> element-wise backward)
        dhraw = (1 - hs[t] * hs[t]) * dh 

        # dWxh, dWhh, dbh
        # dWxh = (100, 5) @ (5, 27) = (100, 27)
        dWxh += np.dot(dhraw, xs[t].T)
        # dWhh = (100, 5) (5, 100) = (100, 100)
        dWhh += np.dot(dhraw, hs[t-1].T)
        # Calculate dbh and dWxh
        dbh += dhraw

        # Derive part of the next time step's gradient
        dhnext = np.dot(Whh.T, dhraw)

    # Clipping gradients
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam)
    
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]


In [21]:
def sample(h, seed_ix, n): # Passed in the previous inputs, the first letter of input, and 200 samples
    """ 
    Sample a sequence of integers from the model 
    h is memory state, seed_ix is seed letter for first time step
    """
    # Create a one hot vector
    x = np.zeros((vocab_size, 1))
    x[stoi['SENTENCE_START']] = 1
    ixes = []
    for t in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes


In [23]:
# Hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# Model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01 # input to hidden - (100, 27)
Whh = np.random.randn(hidden_size, hidden_size) * 0.01 # hidden to hidden (100, 100)
Why = np.random.randn(vocab_size, hidden_size) * 0.01 # hidden to output (27, 100)
bh = np.zeros((hidden_size, 1)) # hidden bias (100, 1)
by = np.zeros((vocab_size, 1)) # output bias (27, 1)

n, p = 0, 0

mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)

mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad

smooth_loss = -np.log(1.0/vocab_size) * seq_length # loss at iteration 0

while True:
    # Prepare inputs (Sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(tokenized_sentences) or n == 0: 
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # Go from start of data
    inputs = Xtr[p:p+seq_length]
    targets = Ytr[p:p+seq_length]

    
    # Sample from the model now and then, and print the result
    if n % 100 == 0:
        # Pass in the previous inputs, the first letter of input, and 200 samples
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ' '.join(itos[ix] for ix in sample_ix)
        print ('----\n %s \n----' % (txt, ))

    
    # Forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)


    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 100 == 0: 
        print ('iter %d, loss: %f' % (n, smooth_loss)) # print progress
    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                [dWxh, dWhh, dWhy, dbh, dby], 
                                [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update   
    p += seq_length # move data pointer
    n += 1 # iteration counter 

----
 choices ban.\n\n selling special 9734 nowhere submitter feminist humor prior luckily college attack estate \n\nnow dislike death knowledge freedom eventually opinion /r/totesmessenger anymore. owned social screws 50 //offerando.info/football admittedly subscribe permalink leading hook pace getting 10,000 incorrect nfl counter application duration dumbass inner agency solution classic to= therapist business contents mistaken still scam location struggle everyone fulfill caught relating graphics sat le store piece kidding \nin plants usa drink showcase porn \n\ntype|item|price\n relative expectations crown wrote 'human magic they 'omg lazy decks indicate stopped extreme among turkey improve suggesting blade file total lights society reached example thoughts hook defensive \\/r/randomactsofblowjob there brick 20report 'exactly brief chains big roads e around locations wow weapon comet advertising really pool lay take flee album not six re-approve meantime animation 20that men rehabi

KeyboardInterrupt: 

In [156]:
stoi['SENTENCE_END']

179

In [158]:
stoi['SENTENCE_START']

773

In [29]:
sample_ix = sample(hprev, 773, 200)
txt = ' '.join(itos[ix] for ix in sample_ix)
print ('----\n %s \n----' % (txt, ))

NameError: name 'hprev' is not defined