In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
from utils import *

import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
print ("Reading CSV file...")
with open('data/reddit-comments-2015-08.csv', 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    next(reader)
    # Split full comments into sentences
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print ("Parsed %d sentences." % (len(sentences)))
    
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print ("Found %d unique words tokens." % len(word_freq.items()))

# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

print ("Using vocabulary size %d." % vocabulary_size)
print ("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print ("\nExample sentence: '%s'" % sentences[0])
print ("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

Reading CSV file...
Parsed 2554 sentences.
Found 7732 unique words tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'pet' and appeared 1 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'


In [6]:
len(vocab)

7732

In [7]:
len(index_to_word)

7733

In [17]:
sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in reader[:1000]]

In [18]:
from konlpy.tag import Twitter
pos_tagger = Twitter()
def tokenize(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc, norm=True, stem=True)]
tokenized_sentences = [tokenize(row) for row in sentences]

In [19]:
vocab = [t for d in tokenized_sentences for t in d]

In [20]:
Verb_Noun_Adjective_Alpha_in_text = []
index = 0
for text in tokenized_sentences:
    Verb_Noun_Adjective_Alpha_in_text.append([])
    for word in text:
        parts_of_speech = word.split("/")
        if parts_of_speech[1] in ["Noun","Verb","Adjective"] :
            Verb_Noun_Adjective_Alpha_in_text[index].append(word.split("/")[0])
        elif parts_of_speech[1] in ["Alpha"] and len(parts_of_speech[0]) ==3 or len(parts_of_speech[0]) ==5:
            Verb_Noun_Adjective_Alpha_in_text[index].append(word.split("/")[0])            
    index += 1

In [21]:
Verb_Noun_Adjective_Alpha_in_text_tokens = [t for d in Verb_Noun_Adjective_Alpha_in_text for t in d]

In [22]:
import nltk
real_tokens = nltk.Text(Verb_Noun_Adjective_Alpha_in_text_tokens, name='RNN')

In [23]:
real_tokens_freq = real_tokens.vocab().most_common(vocabulary_size-1)

In [24]:
index_to_word = [x[0] for x in real_tokens_freq]
index_to_word.append("unknown")
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

In [25]:
for i, sent in enumerate(Verb_Noun_Adjective_Alpha_in_text):
    tokenized_sentences[i] = [w if w in word_to_index else "unknown" for w in sent]

In [26]:
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

In [29]:
char_vocab_size = len(word_to_index)
rnn_size = char_vocab_size  # 1 hot coding (one of 4)
time_step_size = 4          # 'hell' -> predict 'ello'
batch_size = 1              # one sample
 
# RNN Model
rnn_cell = tf.nn.rnn_cell.BasicRNNCell(rnn_size)
state = tf.zeros([batch_size, rnn_cell.state_size])
X_split = tf.split(0, time_step_size, X_train)
 
outputs, state = tf.nn.rnn(rnn_cell, X_split, state)
 
logits = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])
targets = tf.reshape(sample[1:], [-1])
weights = tf.ones([len(char_dic) * batch_size])
 
loss = tf.nn.seq2seq.sequence_loss_by_example([logits], [y_train], [weights])
cost = tf.reduce_sum(loss) / batch_size
train_op = tf.train.RMSPropOptimizer(0.01, 0.9).minimize(cost)
 
# Launch the graph in a session
with tf.Session() as sess:
    tf.initialize_all_variables().run()
    for i in range(100):
        sess.run(train_op)
        result = sess.run(tf.argmax(logits, 1))
        print(result, [char_rdic[t] for t in result])


TypeError: Expected binary or unicode string, got [1, 199, 9, 83, 93, 199]

In [15]:
class RNNNumpy:
    
    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        self.U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
        self.V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
        self.W = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))

In [16]:
def forward_propagation(self, x):

    T = len(x)
    s = np.zeros((T + 1, self.hidden_dim))
    s[-1] = np.zeros(self.hidden_dim)
    o = np.zeros((T, self.word_dim))
    for t in np.arange(T):
        s[t] = np.tanh(self.U[:,x[t]] + self.W.dot(s[t-1]))
        o[t] = softmax(self.V.dot(s[t]))
    return [o, s]

RNNNumpy.forward_propagation = forward_propagation

In [17]:
def predict(self, x):
   
    o, s = self.forward_propagation(x)
    return np.argmax(o, axis=1)

RNNNumpy.predict = predict

In [51]:
np.random.seed(100)
model = RNNNumpy(vocabulary_size)
for i in range(100):
    o, s = model.forward_propagation(X_train[i])
print (o.shape)
print (o)

(11, 200)
[[ 0.0049406   0.00502571  0.00477976 ...,  0.00480327  0.00486682
   0.00504778]
 [ 0.00506603  0.0051568   0.00482425 ...,  0.00515974  0.00489148
   0.00509859]
 [ 0.00475986  0.00519166  0.00482856 ...,  0.00512463  0.00515253
   0.00509276]
 ..., 
 [ 0.00501935  0.00491588  0.00504467 ...,  0.00497213  0.00489714
   0.00463941]
 [ 0.00472413  0.00520307  0.00506792 ...,  0.00511939  0.00497861
   0.00523462]
 [ 0.00473993  0.00529811  0.00478505 ...,  0.0050879   0.00514217
   0.00506205]]


In [None]:
np.random.seed(10)
model2 = RNNNumpy(vocabulary_size)
o2, s2 = model2.forward_propagation(X_train[10])
print (o2.shape)
print (o2)

In [38]:
predictions = model.predict(X_train[10])
print (predictions.shape)
print (predictions)

(4,)
[ 32  49 158  43]


In [20]:
def calculate_total_loss(self, x, y):
    L = 0
    # For each sentence...
    for i in np.arange(len(y)):
        o, s = self.forward_propagation(x[i])
        # We only care about our prediction of the "correct" words
        correct_word_predictions = o[np.arange(len(y[i])), y[i]]
        # Add to the loss based on how off we were
        L += -1 * np.sum(np.log(correct_word_predictions))
    return L

def calculate_loss(self, x, y):
    # Divide the total loss by the number of training examples
    N = np.sum((len(y_i) for y_i in y))
    return self.calculate_total_loss(x,y)/N

RNNNumpy.calculate_total_loss = calculate_total_loss
RNNNumpy.calculate_loss = calculate_loss

In [21]:
# Limit to 1000 examples to save time
print ("Expected Loss for random predictions: %f" % np.log(vocabulary_size))
print ("Actual loss: %f" % model.calculate_loss(X_train[:1000], y_train[:1000]))

Expected Loss for random predictions: 5.298317
Actual loss: 5.304246


In [22]:
def bptt(self, x, y):
    T = len(y)
    # Perform forward propagation
    o, s = self.forward_propagation(x)
    # We accumulate the gradients in these variables
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(y)), y] -= 1.
    # For each output backwards...
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        # Initial delta calculation
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        # Backpropagation through time (for at most self.bptt_truncate steps)
        for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
            # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
            dLdW += np.outer(delta_t, s[bptt_step-1])              
            dLdU[:,x[bptt_step]] += delta_t
            # Update delta for next step
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step-1] ** 2)
    return [dLdU, dLdV, dLdW]

RNNNumpy.bptt = bptt

In [23]:
def gradient_check(self, x, y, h=0.001, error_threshold=0.01):
    bptt_gradients = model.bptt(x, y)
    model_parameters = ['U', 'V', 'W']
    for pidx, pname in enumerate(model_parameters):
        parameter = operator.attrgetter(pname)(self)
        print ("Performing gradient check for parameter %s with size %d." % (pname, np.prod(parameter.shape)))
        it = np.nditer(parameter, flags=['multi_index'], op_flags=['readwrite'])
        while not it.finished:
            ix = it.multi_index
            original_value = parameter[ix]
            parameter[ix] = original_value + h
            gradplus = model.calculate_total_loss([x],[y])
            parameter[ix] = original_value - h
            gradminus = model.calculate_total_loss([x],[y])
            estimated_gradient = (gradplus - gradminus)/(2*h)
            parameter[ix] = original_value
            backprop_gradient = bptt_gradients[pidx][ix]
            relative_error = np.abs(backprop_gradient - estimated_gradient)/(np.abs(backprop_gradient) + np.abs(estimated_gradient))
            if relative_error > error_threshold:
                print( "Gradient Check ERROR: parameter=%s ix=%s" % (pname, ix))
                print( "+h Loss: %f" % gradplus)
                print( "-h Loss: %f" % gradminus)
                print( "Estimated_gradient: %f" % estimated_gradient)
                print( "Backpropagation gradient: %f" % backprop_gradient)
                print( "Relative Error: %f" % relative_error)
                return 
            it.iternext()
        print ("Gradient check for parameter %s passed." % (pname))

RNNNumpy.gradient_check = gradient_check

grad_check_vocab_size = 100
np.random.seed(10)
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 1000.
Gradient check for parameter U passed.
Performing gradient check for parameter V with size 1000.




Gradient check for parameter V passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter W passed.


In [24]:
def numpy_sdg_step(self, x, y, learning_rate):
    
    dLdU, dLdV, dLdW = self.bptt(x, y)
    self.U -= learning_rate * dLdU
    self.V -= learning_rate * dLdV
    self.W -= learning_rate * dLdW

RNNNumpy.sgd_step = numpy_sdg_step

In [25]:
def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        if (epoch % evaluate_loss_after == 0):
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print ("%s: Loss after num_examples_seen=%d epoch=%d: %f" % (time, num_examples_seen, epoch, loss))
            if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                learning_rate = learning_rate * 0.5  
                print( "Setting learning rate to %f" % learning_rate)
            sys.stdout.flush()
        for i in range(len(y_train)):
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1

In [26]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

The slowest run took 11.16 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 823 µs per loop


In [27]:
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
losses = train_with_sgd(model, X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)

2017-01-17 20:03:43: Loss after num_examples_seen=0 epoch=0: 5.304308
2017-01-17 20:03:43: Loss after num_examples_seen=100 epoch=1: 3.435799
2017-01-17 20:03:43: Loss after num_examples_seen=200 epoch=2: 3.123769
2017-01-17 20:03:43: Loss after num_examples_seen=300 epoch=3: 3.012231
2017-01-17 20:03:44: Loss after num_examples_seen=400 epoch=4: 2.954271
2017-01-17 20:03:44: Loss after num_examples_seen=500 epoch=5: 2.927109
2017-01-17 20:03:44: Loss after num_examples_seen=600 epoch=6: 2.912807
2017-01-17 20:03:45: Loss after num_examples_seen=700 epoch=7: 2.902205
2017-01-17 20:03:45: Loss after num_examples_seen=800 epoch=8: 2.892634
2017-01-17 20:03:45: Loss after num_examples_seen=900 epoch=9: 2.883534


In [28]:
from rnn_theano import RNNTheano, gradient_check_theano

In [29]:
np.random.seed(10)
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

Performing gradient check for parameter U with size 50.
Gradient check for parameter U passed.
Performing gradient check for parameter V with size 50.
Gradient check for parameter V passed.
Performing gradient check for parameter W with size 100.
Gradient check for parameter W passed.


In [30]:
np.random.seed(10)
model = RNNTheano(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

The slowest run took 4.21 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 819 µs per loop


In [31]:
from utils import load_model_parameters_theano, save_model_parameters_theano

model = RNNTheano(vocabulary_size, hidden_dim=50)
losses = train_with_sgd(model, X_train, y_train, nepoch=50)
save_model_parameters_theano('./data/trained-model-sion_consider.npz', model)
load_model_parameters_theano('./data/trained-model-sion_consider.npz', model)

2017-01-17 20:04:10: Loss after num_examples_seen=0 epoch=0: 5.304559
2017-01-17 20:04:14: Loss after num_examples_seen=5000 epoch=5: 3.188571
2017-01-17 20:04:19: Loss after num_examples_seen=10000 epoch=10: 3.176797
2017-01-17 20:04:24: Loss after num_examples_seen=15000 epoch=15: 3.177750
Setting learning rate to 0.002500
2017-01-17 20:04:28: Loss after num_examples_seen=20000 epoch=20: 3.175461
2017-01-17 20:04:33: Loss after num_examples_seen=25000 epoch=25: 3.170656
2017-01-17 20:04:38: Loss after num_examples_seen=30000 epoch=30: 3.164206
2017-01-17 20:04:43: Loss after num_examples_seen=35000 epoch=35: 3.171127
Setting learning rate to 0.001250
2017-01-17 20:04:47: Loss after num_examples_seen=40000 epoch=40: 3.168836
2017-01-17 20:04:52: Loss after num_examples_seen=45000 epoch=45: 3.175823
Setting learning rate to 0.000625
Saved model parameters to ./data/trained-model-sion_consider.npz.
Loaded model parameters from ./data/trained-model-sion_consider.npz. hidden_dim=50 word_d

In [35]:
def generate_sentence(model):
    new_sentence = [word_to_index[sentence_start_token]]
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index["unknown"]
        while sampled_word == word_to_index["unknown"]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str

num_sentences = 10
senten_min_length = 10

for i in range(num_sentences):
    sent = []
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print (" ".join(sent))

정말 재밌다 좋아하다 남다 배우 알 멋지다 그 알다 먹다
좀 괜찮다 하다 드라마 명작 점 사람 영화 마지막 짜증 영화
연기 재밌다 영화 이다 아니다 아니다 솔직하다 함 웃음 점 것 보다
좋다 남다 감독 높다 가다 사랑 많다 장면 하다 작품
영화 만들다 한번 영화 사랑 알 안 무슨 보다 좋다
모든 난 설정 봐 이다 따뜻하다 말 멋지다 느낌 하다
감동 좋다 최고 기억 없다 전개 보다 보다 듯 보다
재밌다 가다 되다 보다 정말 짜증 현실 수 점 영화
너무 생각 가다 기억 눈 아깝다 다시 보이다 돈 차다
나오다 먹다 하나 수 마음 보다 하다 수 좋다 싶다 생각 정말 꽤 원작 정말 당시
