# Language model in character/word level 

## Preprocessing of data

In [1]:
import numpy as np
import pandas as pd
import nltk, itertools, re
import itertools, operator
from collections import Counter

# data = open('data/reddit-comments-2015-08.csv', 'r').read()

df = pd.read_csv('data/reddit-comments-2015-08.csv')

# with open('data/reddit-comments-2015-08.csv', 'r') as f:
#     reader = 

df.head()

Unnamed: 0,body
0,I joined a new league this year and they have ...
1,"In your scenario, a person could just not run ..."
2,They don't get paid for how much time you spen...
3,"I dunno, back before the August update in an A..."
4,"No, but Toriyama sometimes would draw himself ..."


In [2]:

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()



In [3]:
# X_train = df.body.str.lower()
X_train = df.body.map(lambda x: clean_str(x)).head(100)
X_train.head()

0    i joined a new league this year and they have ...
1    in your scenario , a person could just not run...
2    they do n't get paid for how much time you spe...
3    i dunno , back before the august update in an ...
4    no , but toriyama sometimes would draw himself...
Name: body, dtype: object

In [4]:
#if read from a plain text file
data = open('data/text.txt', 'r').read()
data


'I joined a new league this year and they have\n'

#### Char-level data processing

In [5]:
char_data = [clean_str(s.strip()) for s in df.body]
char_data = '.'.join(char_data)
char_data
# #Consider char-level
# char_list = "abcdefghijklmnopqrstuvwxyz0123456789 ,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"

chars = list(set(char_data))
chars=sorted(chars)
print(chars)
print("Data has %d characters in which %d are unique" %(len(char_data), len(chars)))


[' ', '!', "'", '(', ')', ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', '\\', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Data has 7500269 characters in which 46 are unique


In [6]:
# create char-to-index, index-to-char lists
unknown_char = '§'
index_to_char = [unknown_char] +[x[0] for x in chars] 
char_to_index = {c:i for i, c in enumerate(index_to_char)}  
print(char_to_index, "\n", index_to_char)
 
#find a maximum length of a row in the X_train
#this will be used in the case "considering each row of data as a 'full' sentence
lenchar_of_row=X_train.map(len).max()
print(lenchar_of_row)

{'§': 0, ' ': 1, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '.': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, '?': 18, '\\': 19, '`': 20, 'a': 21, 'b': 22, 'c': 23, 'd': 24, 'e': 25, 'f': 26, 'g': 27, 'h': 28, 'i': 29, 'j': 30, 'k': 31, 'l': 32, 'm': 33, 'n': 34, 'o': 35, 'p': 36, 'q': 37, 'r': 38, 's': 39, 't': 40, 'u': 41, 'v': 42, 'w': 43, 'x': 44, 'y': 45, 'z': 46} 
 ['§', ' ', '!', "'", '(', ')', ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', '\\', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
2361


In [7]:
######Convert text into integer ######
# Method 1: consider chars of each sentence differently

ss = X_train.str.pad(lenchar_of_row, side='right', fillchar=unknown_char).\
        map(lambda st: [char_to_index[c] for c in st])
# lenchar_of_row=X_trailencharn_char.map(len).max()
# X_train_char = X_train_char.as_matrix()
# print(len(X_train_char[33]))

print(ss.head())
# ss = pd.Series(data = ss)
X_train_char = np.matrix(ss.tolist())
print(type(X_train_char), X_train_char.shape)

0    [29, 1, 30, 35, 29, 34, 25, 24, 1, 21, 1, 34, ...
1    [29, 34, 1, 45, 35, 41, 38, 1, 39, 23, 25, 34,...
2    [40, 28, 25, 45, 1, 24, 35, 1, 34, 3, 40, 1, 2...
3    [29, 1, 24, 41, 34, 34, 35, 1, 6, 1, 22, 21, 2...
4    [34, 35, 1, 6, 1, 22, 41, 40, 1, 40, 35, 38, 2...
Name: body, dtype: object
<class 'numpy.matrixlib.defmatrix.matrix'> (100, 2361)


In [8]:
print(type(ss))

<class 'pandas.core.series.Series'>


In [9]:
#Just reference to see the speed of finding max
%timeit -n 100 df.body.str.len().max()
%timeit -n 100 df.body.map(lambda x: len(x)).max()
%timeit -n 100 df.body.map(len).max()


5.85 ms ± 463 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
7.23 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.05 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### Word-level data processing

In [10]:
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

#Consider word-level
# word_data = ["%s %s %s" %(sentence_start_token, s, sentence_end_token) for s in df.body]
# word_data[0]

def getTokenizedSentences(lines, min_sent_characters=1):
    # Split full comments into sentences
    sentences = itertools.chain(*[nltk.sent_tokenize(x) for x in lines])
    sentences = [clean_str(s) for s in sentences if "http" not in s and len(s) >= min_sent_characters]
    # Append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
    return sentences

sents = getTokenizedSentences(X_train)
print(sents[10])


SENTENCE_START no , but toriyama sometimes would draw himself as a little robot shen was a funny character for a few episodes \( hitting yamcha in the junk \) before you find out his true identity then he has an awesome fight with piccolo SENTENCE_END


In [11]:
vocabulary_size = 8000

def buildVocab(sentences):
    # Tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

    ####### If we want to order the words in alphabet #######
    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print("Found %d unique words tokens." % len(word_freq.items()))

    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = sorted(word_freq.items(), key=lambda x: (x[1], x[0]), reverse=True)[:vocabulary_size-2]
    print("Using vocabulary size %d." % vocabulary_size)
    print("The most frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[0][0], vocab[0][1]))
    print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))
 
    sorted_vocab = sorted(vocab, key=operator.itemgetter(1))
    index_to_word = ["<MASK/>", unknown_token] + [x[0] for x in sorted_vocab]
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
    
    
    print(word_to_index)
    return index_to_word, word_to_index
    
index_to_word, word_to_index = buildVocab(sents)

Found 2126 unique words tokens.
Using vocabulary size 8000.
The most frequent word in our vocabulary is 'the' and appeared 304 times.
The least frequent word in our vocabulary is ''looking' and appeared 1 times.


In [12]:


####### In case use all sentences of each element in the df.body ######
# Replace all words not in our vocabulary with the unknown token
tokenized_sentences = []
for i, sent in enumerate(sents):
    temp = [word_to_index[w] if w in word_to_index else word_to_index[unknown_token] for w in sent.split()]
    tokenized_sentences.append(temp)

# #convert text to integers
# tokenized_sents = [[word_to_index[c] if c in word_to_index else word_to_index[unknown_token] \
#                                        for c in s] for s in sents.split()]
print(tokenized_sentences[3], sents[3])
X_train_word = tokenized_sentences
len(X_train_word)

[2117, 2116, 2059, 2115, 1633, 2121, 1, 2118] SENTENCE_START that does n't convince you \? SENTENCE_END


118

In [13]:
# Convert training data's text to integer

####### In case considering each row in df.body is a sentence ######

lenword_of_row = X_train.str.split().str.len().max() 

print("maxlen(sentence)=",lenword_of_row)

def manipulateSentence(s, sequence_length, min_sent_characters=1):
    if ("http" not in s and len(s) >= min_sent_characters):
         s = clean_str(s)
    
    #Pad special words. Note: Adding <START/END> is not obligation here as 1 row is 1 sentence
    num_padding = sequence_length - len(s.split())
    if num_padding >0:
        temp = str([unknown_token] * num_padding)
        s = ' '.join([s, temp])

    # Append SENTENCE_START and SENTENCE_END
    s = " ".join([sentence_start_token, s, sentence_end_token])
    
    #convert word to integer
    s = [word_to_index[c] if c in word_to_index else word_to_index[unknown_token] \
                                for c in s.split()]
    if (len(s)!=sequence_length+2): print(len(s), sequence_length)
    return s

ss = X_train.map(lambda s: manipulateSentence(s, lenword_of_row))
print(ss.head())

lenword_of_row+=2
# ss = X_train.map(lambda x: [word_to_index[c] if c in word_to_index else word_to_index[unknown_token] \
#                                        for c in x.split()] )



maxlen(sentence)= 463
0    [2117, 2122, 742, 2123, 1921, 1531, 2095, 1703...
1    [2117, 2113, 2078, 336, 2126, 2123, 1490, 2017...
2    [2117, 2106, 2109, 2115, 2081, 1495, 2110, 201...
3    [2117, 2122, 1609, 2126, 1829, 2026, 2127, 126...
4    [2117, 2071, 2126, 2103, 154, 1428, 2077, 1612...
Name: body, dtype: object


In [14]:
x_temp = ss.values #[[5,2],[1,2,3],[1]]
length = len(sorted(x_temp,key=len, reverse=True)[0])
X_train_word = np.array([xi+[None]*(length-len(xi)) for xi in x_temp])
X_train_word.shape

(100, 465)

## Char-level Model

In [15]:
hidden_size = 200 # nb of neurons in hidden layer
seq_length = 25 # number of steps to unroll the RNN for, and this is also nb of chars putting in the input of RNN
learning_rate = 1e-1 # learning rate of training

vocab_size = len(chars) + 1 #M
wordvec_size = vocab_size #D

#init parameters
W_hh = np.random.randn(hidden_size, hidden_size) #(H,H)
W_xh = np.random.randn(wordvec_size, hidden_size) #(D,H)
W_hy = np.random.randn(hidden_size, vocab_size) #(H,M), M=D for mode='char'
b_h = np.zeros((1,hidden_size)) #(H,)
b_y = np.zeros((1,vocab_size)) #(M,)


## Layers

In [16]:
#Reference from @Karpathy: https://gist.github.com/karpathy/d4dee566867f8291f086
#
#                         [b_h]                                                [b_y]
#    w2v                    v                                (h_next)            v
#  x --> x_s -> [W_xh] -> [sum] -> h_raw -> [nonlinearity] -> h_s -> [W_hy] -> [sum] -> y_s -> [exp(y[k])/sum(exp(y))] -> p_s
#                           ^                                  |
#                           '----h_prev------[W_hh]------------'
#


### RNN Forward

In [17]:
def rnn_forward_single_step(x, h_prev, W_xh, W_hh, W_hy, b_h, b_y):
    """
    Run the forward pass for a single timestep of a vanilla RNN that uses a tanh
    activation function.

    The input data has dimension D, the hidden state has dimension H, and we use
    a minibatch size of N.

    Inputs:
    - x: Input data for this timestep, of shape (N, D).
    - h_prev: Hidden state from previous timestep, of shape (N, H)
    - W_xh: Weight matrix for input-to-hidden connections, of shape (D, H)
    - W_hh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
    - W_hy: Weight matrix for hidden-to-output connections, of shape (H, M)
    - b_h: Biases of shape (H,)
    - b_y: Bias of shape (M, )

    Returns a tuple of:
    - h_next: Next hidden state, of shape (N, H)
    - y_s: output of this timestep (N, M)
    - cache: Tuple of values needed for the backward pass.
    """
    
    h_raw = np.dot(x, W_xh) + np.dot(h_prev, W_hh) + b_h  #(N,D)x(D,H) + (N,H)x(H,H) +(1,H) = (N,H)
    h_next = np.tanh(h_raw) # hidden nodes, (N, H)
    y_s = np.dot(h_next, W_hy) + b_y #output, (N, H)x(H,M) +(1,M) =(N,M)
#     p_s = np.exp(y_s) / np.sum(np.exp(y_s), axis=0) #softmax
    
    cache = (x, h_prev, h_next, W_xh, W_hh, W_hy, b_h, b_y)
    return h_next, y_s, cache


def rnn_forward(x, h0, W_xh, W_hh, W_hy, b_h, b_y):
    """
    Run a vanilla RNN forward on an entire sequence of data. We assume an input
    sequence composed of T vectors (each vector represents a word/char), each of dimension D. 
    The RNN uses a hidden
    size of H, and we work over a minibatch containing N sequences. After running
    the RNN forward, we return the hidden states for all timesteps.
    Inputs:
    - x: Input data for the entire timeseries, of shape (N, T, D).
    - h0: Initial hidden state, of shape (N, H)
    - W_xh: Weight matrix for input-to-hidden connections, of shape (D, H)
    - W_hh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
    - W_hy: Weight matrix for hidden-to-output connections, of shape (H, M)
    - b_h: Biases of shape (H,)
    - b_y: Bias of shape (M, )
    Returns a tuple of:
    - h: Hidden states for the entire timeseries, of shape (N, T, H)
    - y: Output of the entire timeseries, of shape (N, T, M)
    - cache: Values needed in the backward pass
    """
    N, T, D = x.shape
    H, M = W_hy.shape
    
    h = np.empty((N, T, H))
    cache = {}
    y_s = np.empty((N, T, M))
    
#     print("N=%d, T=%d, D=%d, H=%d, M=%d" %(N, T,D,H,M))
    for i in range(T):
        if i==0: 
            h[:, i, :], y_s[:, i, :], cache[i] = rnn_forward_single_step(x[:,i,:], h0, W_xh, W_hh, W_hy, b_h, b_y)
        else: 
            h[:, i, :], y_s[:, i, :], cache[i] = rnn_forward_single_step(x[:,i,:], h[:, i-1, :], W_xh, W_hh, W_hy, b_h, b_y)
    
    return h, y_s, cache




### RNN Backward

In [18]:
#Reference from @Karpathy: https://gist.github.com/karpathy/d4dee566867f8291f086
#
#                         [b_h] (1,H)                                         [b_y] (1,M)
#    w2v                    v      (N,H)                     (N,H)   (H,M)       v
#  x --> x_s -> [W_xh] -> [sum] -> h_raw -> [nonlinearity] -> h_s -> [W_hy] -> [sum] -> y_s -> [exp(y[k])/sum(exp(y))] -> p_s
# (N,D)         (D,H)       ^                                  |                       (N,M)
#                           '----h_prev------[W_hh]------------'
#                                (N,H)       (H,H)

def rnn_backward_single_step (dh_next, dy, cache):
    """
    Backward pass for a single timestep of a vanilla RNN.
    Inputs:
    - dh_next: Gradient of loss with respect to next hidden state (N, H)
    - dy: of shape (N, M)
    - cache: Cache object from the forward pass
    Returns a tuple of:
    - dx: Gradients of input data, of shape (N, D)
    - dh_prev: Gradients of previous hidden state, of shape (N, H)
    - dWxh: Gradients of input-to-hidden weights, of shape (D, H)
    - dWhh: Gradients of hidden-to-hidden weights, of shape (H, H)
    - dWhy: Gradients of hidden-to-output weights, of shape (H, M)
    - dbh: Gradients of bias vector, of shape (H,)
    - dby: Gradients of bias vector, of shape (M,)
    
    """
    dx, dh_prev, dWxh, dWhh, dWhy, dbh, dby = None, None, None, None, None, None, None
    x, h_prev, h_next, W_xh, W_hh, W_hy, b_h, b_y = cache
    
    dby = np.sum(dy, axis=0) # backprop of softmax, shape=(1,M)
    dWhy = np.dot(h_next.T, dy) #(H,N)x(N,M) = (H,M)
    dh = np.dot(dy, W_hy.T) + dh_next # backprop into h, (N,M)x(M,H)=(N,H)

    dh_raw = (1 - h_next ** 2) * dh # note: tanh(x)' = 1 - tanh^2(x), shape=(N,H)
    dbh = np.sum(dh_raw, axis=0) #(1,H)
    dWxh = np.dot(x.T, dh_raw) # (D,N)x(N,H) = (D,H)
    dWhh = np.dot(h_prev.T, dh_raw) # (H,N)x(N,H)=(H,H)
    dx = np.dot(dh_raw, W_xh.T) # (N,H)x(H,D) = (N,D)
    dh_prev = np.dot(dh_raw, W_hh.T) # (N,H)x(H,H) = (N,H)

    ########
#     dpre_actv = (1 - next_h ** 2) * dnext_h         # (N, H)
#     dx = dpre_actv.dot(Wx.T)
#     dprev_h = dpre_actv.dot(Wh.T) #(N,H)x(H,H)=(N,H)
#     dWx = x.T.dot(dpre_actv)
#     dWh = prev_h.T.dot(dpre_actv)
#     db = np.sum(dpre_actv, 0)
    
    
#     dWhy += np.dot(dy, hs[t].T)
#     dby += dy
#     dh = np.dot(Why.T, dy) + dhnext # backprop into h
#     dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
#     dbh += dhraw
#     dWxh += np.dot(dhraw, xs[t].T)
#     dWhh += np.dot(dhraw, hs[t-1].T)
#     dhnext = np.dot(Whh.T, dhraw)

    return dx, dh_prev, dWxh, dWhh, dWhy, dbh, dby


def rnn_backward(dy, cache):
    """
    Compute the backward pass for a vanilla RNN over an entire sequence of data.
    Inputs:
    - dy: Upstream gradients of softmax output, of shape (N, T, M)
    - cache: cache objects from softmax output  
    Returns a tuple of:
    - dx: Gradient of inputs, of shape (N, T, D)
    - dh0: Gradient of initial hidden state, of shape (N, H)
    - dWx: Gradient of input-to-hidden weights, of shape (D, H)
    - dWh: Gradient of hidden-to-hidden weights, of shape (H, H)
    - dbh: Gradient of biases, of shape (H,)
    - dby: Gradient of biases, of shape (M,)
    """
    
   ## Calculate the upstreams gradients of all hidden states (dh) from softmax results
    #i.e, temporal affine softmax backward
#     Wx, Wh, b, x, prev_h, next_h = cache[0]
#     N, T, H = dh.shape
#     D, H = Wx.shape

#     # Initialise gradients.
#     dx = np.zeros([N, T, D])
#     dWx = np.zeros_like(Wx)
#     dWh = np.zeros_like(Wh)
#     db = np.zeros_like(b)
# dprev_h = np.zeros_like(prev_h)


    N, T, M = dy.shape
    D, H = wordvec_size, hidden_size

    
    dx = np.empty((N,T,D))
    dWxh = np.zeros((D,H))
    dWhh = np.zeros((H,H))
    dWhy = np.zeros((H,M))
    dbh = np.zeros((1,H))
    dby = np.zeros((1,M))    
    
    dh_next = np.zeros((N, H))
    
    for i in reversed(range(T)):
#         print("i=",i, "dh_next=", dh_next.shape, "dy=", dy[:,i,:].shape)
        dx[:,i,:], dh_next, dWxh_temp, dWhh_temp, dWhy_temp, dbh_temp, dby_temp = \
                rnn_backward_single_step(dh_next, dy[:,i,:], cache[i])
        
        dWxh += dWxh_temp
        dWhy += dWhy_temp
        dWhh += dWhh_temp
        dbh += dbh_temp
        dby += dby_temp
    
    dh0 = dh_next
    return dx, dh0, dWxh, dWhh, dWhy, dbh, dby


### Affine forward/backward

In [19]:
## Affine forward
def affine_forward(x, W, b):
    """
    Inputs:
    x - Input data, of shape (N, D) #may be generally (N,d1,...d_k) 
    w - Weights, of shape (D, M)
    b - Biases, of shape (M,)

    Returns a tuple of:
    - out: output, of shape (N, M)
    - cache: (x, w, b)
    """
    
    out = np.dot(np.reshape(x,(N,-1)), W) + b #out = xW+b
    cache = (x, W, b)
    return out, cache

## Affine backward
def affine_backward(dout, cache):
    """
    Inputs:
    - dout: Upstream derivative, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, D)
      - w: Weights, of shape (D, M)

    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, D)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (M,)
    """    
    x, W, b = cache
    db = np.sum(dout, axis=0)
    dx = dout.dot(W.T).reshape(x.shape)
    dw = x.reshape(x.shape[0], -1).T.dot(dout)
    

### Softmax

In [None]:
def temporal_softmax_loss (y_s, y, mask, verbose=False):
    """
    A temporal version of softmax loss for use in RNNs. We assume that we are
    making predictions over a vocabulary of size M for each timestep of a
    timeseries of length T, over a minibatch of size N. The input y_s gives SCORES
    for all vocabulary elements at all timesteps, and y gives the INDICES of the
    ground-truth element at each timestep. We use a cross-entropy loss at each
    timestep, summing the loss over all timesteps and averaging across the minibatch.
    
    As an additional complication, we may want to ignore the model output at some
    timesteps, since sequences of different length may have been combined into a
    minibatch and padded with NULL tokens. The optional mask argument tells us
    which elements should contribute to the loss.
    
    Inputs:
    - y_s: Input scores, of shape (N, T, M)
    - y: Ground-truth indices, of shape (N, T) where each element is in the range
         0 <= y[i, t] < M
    - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or not
      the scores at y_s[i, t] should contribute to the loss.
    Returns a tuple of:
    - loss: Scalar giving loss
    - dy_s: Gradient of loss with respect to scores y_s, of shape (N, T, M)
    """
    
    N, T, M = y_s.shape

    ys_flat = y_s.reshape(N * T, M)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)

#     print("mask, mask_flat shapes = ", mask.shape, mask_flat.shape)
#     print("ys, ys_flat shapes = ", y_s.shape, ys_flat.shape)
#     print("y, y_flat shapes = ", y.shape, y_flat.shape)
    
    probs = np.exp(ys_flat - np.max(ys_flat, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    loss = -np.sum(np.multiply(mask_flat, np.log(probs[np.arange(N * T), y_flat]))) / N
    
    dys_flat = probs.copy()
    dys_flat[np.arange(N * T), y_flat] -= 1
    dys_flat /= N
    
    mask_flat = mask_flat.reshape((-1, 1))
#     print(dys_flat.shape, mask_flat.shape) #(928, 2128), (928,)
    
    dys_flat *= mask_flat
    
    if verbose: print('dys_flat: ', dys_flat.shape)

    dy_s = dys_flat.reshape(N, T, M)

    return loss, dy_s


def softmax_loss(x, y, mask):
    """
    Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    shifted_logits = x - np.max(x, axis=1, keepdims=True) #(N,C)
    Z = np.sum(np.exp(shifted_logits), axis=1, keepdims=True) #(N,1)
    log_probs = shifted_logits - np.log(Z) #(N,C), log_probs_i=f_{y_i} + log(\sigma_j e^{f_j})
    probs = np.exp(log_probs) #(N,C)
    N = x.shape[0]
    loss = -np.sum(np.multiply(mask,log_probs[np.arange(N), y])) / N

    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    print("dx = ", dx, dx.shape)
    return loss, dx



### Word embedding

In [None]:
def word_embedding_forward(x, W):
    """
    Forward pass for word embeddings. We operate on minibatches of size N where
    each sequence has length T. We assume a vocabulary of V words, assigning each
    to a vector of dimension D.
    Inputs:
    - x: Integer array of shape (N, T) giving indices of words. Each element idx
      of x muxt be in the range 0 <= idx < V.
    - W: Weight matrix of shape (V, D) giving word vectors for all words.
    Returns a tuple of:
    - out: Array of shape (N, T, D) giving word vectors for all input words.
    - cache: Values needed for the backward pass
    """
    out = W[x, :] 
    cache = (x, W)
    return out, cache


def word_embedding_backward(dout, cache):
    """
    Backward pass for word embeddings. We cannot back-propagate into the words
    since they are integers, so we only return gradient for the word embedding
    matrix.
    Inputs:
    - dout: Upstream gradients of shape (N, T, D)
    - cache: Values from the forward pass
    Returns:
    - dW: Gradient of word embedding matrix, of shape (V, D).
    """
    
    # x: (N, T)
    # W: (V, D)
    x, W = cache
    N, T, D = dout.shape

    dW = np.zeros_like(W)
    np.add.at(dW, x.reshape([-1]), dout.reshape([-1, D]))
    
    return dW



## Classifier (Model)

In [None]:
#prepare data, we select: 60 train, 20 val, 20 test
mode = 'word'

if (mode=='char'): X_data = X_train_char
else: X_data = X_train_word
# X_train, X_val, X_test = np.split(X_data.sample(frac=1), [int(.6*len(X_data)), int(.8*len(X_data))])

from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(X_data, test_size=0.2, random_state=42)
X_train, X_val = train_test_split(X_train, test_size=0.25, random_state=42)
X_train.shape, X_val.shape, X_test.shape

In [None]:
#Initialisation
#Here, some vars are repeated to memorize

hidden_size = 200 # H, nb of neurons in hidden layer
seq_length = 25 # T, nb of steps to unroll the RNN for <=> nb of chars putting in the input of RNN


mode = "word"
vocab_size = len(chars)+1 # D
wordvec_size = vocab_size # V, if 'char' then D=V as we don't use wordvec
data_size = X_train.shape[0] #X_train_char.shape[0]

if mode=="word":
#     data_size = X_train_word.shape[0] #len(sents)
    vocab_size = len(word_to_index)
    wordvec_size = 128 # output embedding vector length

#init word/char vector
W_embed = np.random.randn(vocab_size, wordvec_size)
W_embed /=100


#init RNN parameters
W_xh = np.random.randn(wordvec_size, hidden_size) #(V,H) or (D,H)
W_xh /= wordvec_size
W_hh = np.random.randn(hidden_size, hidden_size) #(H,H)
W_hh /= hidden_size
W_hy = np.random.randn(hidden_size, vocab_size) #(H,M)
W_hy /= hidden_size
b_h = np.zeros((1, hidden_size)) #(H,)
b_y = np.zeros((1, vocab_size)) #(M,)

#init a dict of parameters
params = {}
params['W_embed'] = W_embed 

params['W_xh'], params['W_hh'], params['W_hy'], params['b_h'], params['b_y'] =  W_xh, W_hh, W_hy, b_h, b_y

#reg: Scalar giving L2 regularization strength. If None then no reg is used
reg=0.1



def initParameter():

    hidden_size = 200 # H, nb of neurons in hidden layer
    seq_length = 25 # T, nb of steps to unroll the RNN for <=> nb of chars putting in the input of RNN


    mode = "char"
    vocab_size = len(chars)+1 # D
    wordvec_size = vocab_size # V, if 'char' then D=V as we don't use wordvec
    data_size = X_train_char.shape[0]

    if mode=="word":
        data_size = X_train_word.shape[0] #len(sents)
        vocab_size = len(word_to_index)
        wordvec_size = 128 # output embedding vector length

    #init word/char vector
    W_embed = np.random.randn(vocab_size, wordvec_size)
    W_embed /=100


    #init RNN parameters
    W_xh = np.random.randn(wordvec_size, hidden_size) #(V,H) or (D,H)
    W_xh /= wordvec_size
    W_hh = np.random.randn(hidden_size, hidden_size) #(H,H)
    W_hh /= hidden_size
    W_hy = np.random.randn(hidden_size, vocab_size) #(H,M)
    W_hy /= hidden_size
    b_h = np.zeros((1, hidden_size)) #(H,)
    b_y = np.zeros((1, vocab_size)) #(M,)

    #init a dict of parameters
    params = {}
    params['W_embed'] = W_embed 
    params['W_xh'], params['W_hh'], params['W_hy'], params['b_h'], params['b_y'] =  W_xh, W_hh, W_hy, b_h, b_y
    
    #reg: Scalar giving L2 regularization strength. If None then no reg is used
    reg=0.1

### Training loss

## Solver

In [None]:
# parameters of solver
"""
    - update_rule: A string giving the name of an update rule. Default is 'sgd'.
    - opt_config: A dictionary containing hyperparameters that will be
        passed to the chosen update rule. Each update rule requires different
        hyperparameters, but all update rules require a 'learning_rate' parameter.
    - lr_decay: A scalar for learning rate decay; after each epoch the
        learning rate is multiplied by this value.
    - batch_size: Size of minibatches used to compute loss and gradient
        during training.
    - num_epochs: The number of epochs to run for during training.
    - print_every: Integer; training losses will be printed every
        print_every iterations.
    - verbose: Boolean; if set to false then no output will be printed
        during training.
    - num_train_samples: Number of training samples used to check training
        accuracy; default is 1000; set to None to use entire training set.
    - num_val_samples: Number of validation samples to use to check val
        accuracy; default is None, which uses the entire validation set.
    - checkpoint_name: If not None, then save model checkpoints here every
        epoch.
"""

update_rule = 'sgd'
opt_config={'learning_rate': 1e-2}
lr_decay = 1.0
batch_size = 2 # N
num_epochs = 10 # E
print_every = 100
verbose = True
num_train_samples = 1000
num_val_samples = None
checkpoint_name = None


# Make a deep copy of the opt_config for each parameter
optim_configs = {}
for p in params:
    print(p, params[p].shape)
    optim_configs[p] = {k: v for k, v in opt_config.items()}

#### Update rules (Optimization methods)

In [None]:
class Optim:
    """
    This class implements various first-order update rules that are commonly used
    for training neural networks. Each update rule accepts current weights and the
    gradient of the loss with respect to those weights and produces the next set of
    weights. Each update rule has the same interface:

    def update(w, dw, config=None):

    Inputs:
      - w: A numpy array giving the current weights.
      - dw: A numpy array of the same shape as w giving the gradient of the
        loss with respect to w.
      - config: A dictionary containing hyperparameter values such as learning
        rate, momentum, etc. If the update rule requires caching values over many
        iterations, then config will also hold these cached values.

    Returns:
      - next_w: The next point after the update.
      - config: The config dictionary to be passed to the next iteration of the
        update rule.

    NOTE: For most update rules, the default learning rate will probably not
    perform well; however the default values of the other hyperparameters should
    work well for a variety of different problems.

    For efficiency, update rules may perform in-place updates, mutating w and
    setting next_w equal to w.
    """

    def sgd(self, w, dw, config=None):
        """
        Vanilla SGD update rule.

        config format:
        - learning_rate: Scalar learning rate.
        """
        if config is None: config = {}
        config.setdefault('learning_rate', 1e-2)
        next_w = 0
        
        next_w -= config['learning_rate'] * dw
        return next_w, config


    def adam(self, x, dx, config=None):
        """
        Uses the Adam update rule, which incorporates moving averages of both the
        gradient and its square and a bias correction term.

        config format:
        - learning_rate: Scalar learning rate.
        - beta1: Decay rate for moving average of first moment of gradient.
        - beta2: Decay rate for moving average of second moment of gradient.
        - epsilon: Small scalar used for smoothing to avoid dividing by zero.
        - m: Moving average of gradient.
        - v: Moving average of squared gradient.
        - t: Iteration number.
        """
        if config is None: config = {}
        config.setdefault('learning_rate', 1e-3)
        config.setdefault('beta1', 0.9)
        config.setdefault('beta2', 0.999)
        config.setdefault('epsilon', 1e-8)
        config.setdefault('m', np.zeros_like(x))
        config.setdefault('v', np.zeros_like(x))
        config.setdefault('t', 0)

        next_x = None
        beta1, beta2, eps = config['beta1'], config['beta2'], config['epsilon']
        t, m, v = config['t'], config['m'], config['v']
        m = beta1 * m + (1 - beta1) * dx
        v = beta2 * v + (1 - beta2) * (dx * dx)
        t += 1
        alpha = config['learning_rate'] * np.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
        x -= alpha * (m / (np.sqrt(v) + eps))
        config['t'] = t
        config['m'] = m
        config['v'] = v
        next_x = x

        return next_x, config

optim = Optim()
#define function to update parameters
updateParameter = getattr(optim, update_rule)

In [None]:

def trainingLoss(text, img_fea=None, training=True, mode='char'):
    """
    Compute training-time loss.
    Inputs:
    - img_fea: Image features (used for captioning), of shape (N, D)
    - text: Ground-truth texts (e.g., captions); an integer array of shape (N, T) where
        each element is in the range 0 <= y[i, t] < M
    Returns:
        If training=True, then run a test-time forward pass of the model and return:
        - scores: Array of shape (N, M) giving classification scores, where
          scores[i, c] is the classification score for y[i] and the word/char with index c.

        Else, then run a training-time forward and backward pass and
        return a tuple of:
        - loss: Scalar value giving the loss
        - grads: Dictionary of gradients 
    """
    singleton = False
    if text.ndim == 1:
        singleton = True
        text = text[None]
    
    # Divide 'text' into two pieces 
    text_in = text[:, :-1]
    text_out = text[:, 1:]
#     print(text_in.shape, text_out.shape) # (2, 9518) = N, T
    
    if mode=='word':  mask = (text_out != word_to_index[unknown_token])
    else: mask = (text_out != char_to_index[unknown_char])
    
#     print(mask, mask.shape) #(2, 9518) = N, T


    #prepare parameters
    W_xh = params['W_xh']; W_hh = params['W_hh']; W_hy = params['W_hy']
    b_h = params['b_h']; b_y = params['b_y']
    W_embed = params['W_embed']
    
    
#     print("shapes of parameters: ",W_xh.shape, W_hh.shape, W_hy.shape, b_h.shape, b_y.shape, W_embed.shape)
    
    ##### FORWARD STEPS ######
    embedded_text, cache_word_embedding = None, None
    
    # Embed the input word captions.
    embedded_text, cache_word_embedding = word_embedding_forward(text_in, W_embed)
    
#     print(embedded_text.shape) # (N, T, D) = (2, 9518, 47)

    #RNN forward
    h_prev = np.zeros((text.shape[0], hidden_size)) #(N, H) = (2,200)
    h, y_s, cache_rnn = rnn_forward(embedded_text, h_prev, W_xh, W_hh, W_hy, b_h, b_y)
    
    if training==False:
        return y_s
    
    loss, grads = 0.0, {}

    #Softmax
    loss, dy_s = temporal_softmax_loss(y_s, text_out, mask)
    
    ##### BACKWARD STEPS ######
    
    #Backprop dy_s to get gradients
    dx, dh0, dWxh, dWhh, dWhy, dbh, dby = rnn_backward(dy_s, cache_rnn)
    
    grads['W_xh'], grads['W_hh'], grads['W_hy'], grads['b_h'], grads['b_y'] = dWxh, dWhh, dWhy, dbh, dby

    
    # Backprop dx to get gradient for word embedding weights.
#     if (mode=='word'):
    dW_embed = word_embedding_backward(dx, cache_word_embedding)
    grads['W_embed'] = dW_embed
    

    if (reg!=None):
        for key in ['W_xh', 'W_hh', 'W_hy', 'W_embed']:
            loss += 0.5 * reg * np.linalg.norm(params[key])**2
            grads[key] += reg * np.linalg.norm(params[key])**2
    
    return loss, grads


In [None]:
def decode_captions(captions, mode='word'):
    singleton = False
    if captions.ndim == 1:
        singleton = True
        captions = captions[None]
    decoded = []
    N, T = captions.shape
    
    if mode=="word":
        for i in range(N):
            words = []
            for t in range(T):
                word = index_to_word[captions[i, t]]
                if word != unknown_token:
                    words.append(word)
                if word == sentence_end_token:
                    break
            decoded.append(' '.join(words))
        if singleton:
            decoded = decoded[0]
    else:
        for i in range(N):
            charset = []
            for t in range(T):
                c = index_to_char[captions[i, t]]
                if c != '$':
                    charset.append(c)
                if c == '<END>':
                    break
            decoded.append(' '.join(charset))
        if singleton:
            decoded = decoded[0]
    return decoded

def BLEU_score(gt_sent, sample_sent,mode):
    """
    gt_sent: string, ground-truth caption
    sample_sent: string, your model's predicted caption
    Returns unigram BLEU score.
    """

    reference = []
    for sent in gt_sent:
        for x in sent: 
            if mode=='word':
                if (word_to_index[sentence_start_token]!=x and word_to_index[sentence_end_token]!=x \
                    and word_to_index[unknown_token]!=x): reference.append(x)
            else:
                if (char_to_index[unknown_char]!=x): reference.append(x)
            
    hypothesis = []
    for sent in sample_sent:
        for x in sent: 
            if mode=='word':
                if (word_to_index[sentence_start_token]!=x and word_to_index[sentence_end_token]!=x \
                    and word_to_index[unknown_token]!=x): hypothesis.append(x)
            else:
                if (char_to_index[unknown_char]!=x): hypothesis.append(x)
            
    
#     reference = [x for x in sent 
#                  if (sentence_end_token not in x and sentence_start_token not in x and unknown_token not in x)]
#     hypothesis = [[x for x in sent 
#                   if (sentence_end_token not in x and sentence_start_token not in x and unknown_token not in x)]
#                   for sent in sample_sent]
    BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, weights = [1])
    return BLEUscore


def evaluate_model(gt_sents, sample_sents, mode='word'):
    """
    Prints unigram BLEU score averaged over training and val examples.
    """
    total_score = 0.0
    for gt, sample in zip(gt_sents, sample_sents):
        total_score += BLEU_score(gt, sample, mode)

    BLEU_scores = total_score / len(sample_sents)
        
    print('Average BLEU score is ', BLEU_scores)
    return BLEU_scores

In [None]:
def generate_sentence(desired_len, mode='char', first_char=None):
    """
    Generate a sentence (containing words/chars) with a given length
    """
    sentence_str = []
    if(mode=='char'):
        if first_char==None: return None
        print(first_char, char_to_index)
         # We start the sentence with the start token
        new_char = [first_char] # if not yet converted to index, then use: [char_to_index[first_char]]
        # Repeat until we get an end token
        while not new_char[-1] == char_to_index['.'] and len(new_char)<desired_len+1:
            next_char_probs = trainingLoss(np.asarray(new_char), training=False) #get proposed words
            print(next_char_probs)
            sampled_char = char_to_index[unknown_char]
            # We don't want to sample unknown chars
            while sampled_char == char_to_index[unknown_char]:
                samples = np.random.multinomial(1, next_char_probs[-1]) #create a sample with prob is the last word prob
                sampled_char = np.argmax(samples)
            new_char.append(sampled_char)
        sentence_str = [index_to_char[x] for x in new_char]
    else:    
        # We start the sentence with the start token
        new_sentence = [word_to_index[sentence_start_token]]
        # Repeat until we get an end token
        while not new_sentence[-1] == word_to_index[sentence_end_token] and len(new_sentence)<desired_len+1:
            next_word_probs = sample(new_sentence, training=False) #get proposed words
            sampled_word = word_to_index[unknown_token]
            # We don't want to sample unknown words
            while sampled_word == word_to_index[unknown_token]:
                samples = np.random.multinomial(1, next_word_probs[-1]) #create a sample with prob is the last word prob
                sampled_word = np.argmax(samples)
            new_sentence.append(sampled_word)
        sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str
 
def sample(batch_size, mode='word', first_char=None, max_length=30):
        """
        Run a test-time forward pass for the model, generate text string
        Inputs:
        - batch_size: size of batch 
        - first_char: if considering 'char level', then generate a string based on the first char
        - max_length: maximum length T of generated text.
        Returns:
        - text: Array of shape (N, max_length) giving sampled string,
          where each element is an integer in the range [0, V). The first element
          of text should be the first sampled word, not the <START> token.
        """
        
        N=batch_size
        
        if (mode=='char'):   text = char_to_index[unknown_char] * np.ones((N, max_length), dtype=np.int32)
        else: text = word_to_index[unknown_token] * np.ones((N, max_length), dtype=np.int32)
#         print("text.shape=", text.shape) # (N,T) = (2, 10)
        
        # Unpack parameters
        W_embed = params['W_embed'] # (V,D)=(47,47)
        W_xh, W_hh, W_hy, b_h, b_y = params['W_xh'], params['W_hh'], params['W_hy'], params['b_h'], params['b_y']

        H = W_xh.shape[1]
        cur_hidden_state = np.zeros((N,H))
        
        print(W_embed.shape, W_xh.shape, W_hh.shape, W_hy.shape, b_h.shape, b_y.shape, cur_hidden_state.shape)
        #(47, 47) (47, 200) (200, 200) (200, 47) (1, 200) (1, 47) (47, 200)
        
        # Embed our start token, will broadcast to size N.
        if (mode=='char'):
            word_embed, _ = word_embedding_forward(first_char, W_embed)
        else: word_embed, _ = word_embedding_forward(word_to_index[sentence_start_token], W_embed)
        print(word_embed.shape) #(2, 1, 47)
        
        word_embed = np.reshape(word_embed,(-1, 1)).T
        
        # Sample max_length number of words.
        for i in range(max_length):
            cur_hidden_state, cur_scores, _ = rnn_forward_single_step(word_embed, cur_hidden_state, W_xh, W_hh, W_hy, b_h, b_y)
#             print(cur_scores.shape) # (2,47)
            # Find the highest value index and assign it to the correct place in text.
            text[:,i] = np.argmax(cur_scores, axis=1)
#             print("i=",i, "text[i]=", text[:,i])
                
            # Embed the word produced for the next iteration.
            word_embed, _ = word_embedding_forward(text[:, i], W_embed)

#         print(text, type(text)) 
        return text

def check_accuracy(X, num_samples=None, batch_size=2):
    """
    Check accuracy of the model on the provided data.

    Inputs:
    - X: Array of data, of shape (N, D)
    - num_samples: If not None, subsample the data and only test the model
        on num_samples datapoints.
    - batch_size: Split X and y into batches of this size to avoid using
        too much memory.

    Returns:
    - acc: Scalar giving the fraction of instances that were correctly
          classified by the model.
    """

    # subsample the data
    N = X.shape[0]
    if num_samples is not None and N > num_samples:
        mask = np.random.choice(N, num_samples)
        N = num_samples
        X = X[mask] 
        

    # Compute predictions in batches
    num_batches = N // batch_size
    if N % batch_size != 0:
        num_batches += 1
    
    y_pred = []
    y=[] #this is a special case when considering LM
    for i in range(num_batches):
        start = i * batch_size
        end = (i + 1) * batch_size
        y.append(X[start:end])
        max_length=10 # T=10
        if (mode=='char'):  new_sent_char = sample(batch_size, mode='char', first_char=X[start:end,0], max_length=max_length)
        else: new_sent_char = sample(batch_size, mode='word', max_length=max_length)
        y_pred.append(new_sent_char)
    
    print(len(y), len(y_pred)) #30, 30
#     y_pred = np.hstack(y_pred) #put into stack horizontally
#     acc = np.mean(y_pred == y)
    BLUE_score = evaluate_model(y, y_pred)
    print(BLUE_score)
    return BLUE_score

In [77]:
loss_history = [] # a list for storing history of training loss
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0

# Set up some variables for storing info of training
epoch = 0
best_val_acc = 0
best_params = {}
loss_history = []
train_acc_history = []
val_acc_history = []

def executeStep():
    """
    Execute one iteration of training
    """
    #sample data
    N = X_train.shape[0]
    batch_mask = np.random.choice(N, batch_size)
    

    #convert text into interger arrays
    if (mode=='char'): 
        X_batch = X_train_char[batch_mask]
    else: X_batch = X_train_word[batch_mask]
    
    
    #compute loss for this batch
    loss, grads = trainingLoss(X_batch)
#     smooth_loss = smooth_loss * 0.999 + loss * 0.001
    
    #add to history
    loss_history.append(loss)

    #update classifier (model) parameters
    for w_name, old_w in params.items():
        #get the gradient
        dw = grads[w_name]
        #get the updates
        
        next_w, new_config = updateParameter(old_w, dw, optim_configs[w_name]) #old value, old gradient, config
        
        #re-assign to params and opt_config
        params[w_name] = next_w
        optim_configs[w_name] = new_config
        
#         print("parameter ", w_name, ": oldval=", old_w.shape, "newval=", next_w.shape)
        
#     return smooth_loss
    
def train():
    """
    Train the model
    """
    num_train = data_size 
    iterations_per_epoch = max (num_train//batch_size, 1)
    num_iterations = num_epochs * iterations_per_epoch
    smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
    best_val_acc = 0
    epoch=0
    best_params = {}

    
    initParameter()
    for t in range(num_iterations):
            #Execute one step of training
#             print(smooth_loss)
#             smooth_loss = executeStep(smooth_loss)
            executeStep()
            # print loss for debug 
            if verbose and t % print_every == 0:
                print('(Iteration %d / %d) loss: %f' % (
                       t + 1, num_iterations, loss_history[-1]))

            # Increment the epoch counter and decay the learning rate.
            epoch_end = (t + 1) % iterations_per_epoch == 0
            if epoch_end:
                epoch += 1
                for k in optim_configs:
                    optim_configs[k]['learning_rate'] *= lr_decay

            # Check train and val accuracy on the first iteration, the last
            # iteration, and at the end of each epoch.
            if (t==0 or t==num_iterations-1 or epoch_end):
                train_acc = check_accuracy(X_train, num_samples=num_train_samples)
                train_acc_history.append(train_acc)
                
                val_acc = check_accuracy(X_val, num_samples=num_val_samples)
                val_acc_history.append(val_acc)
                print(val_acc)
                
                if (val_acc > best_val_acc): 
                    #save the current best params
                    best_val_acc = val_acc
                    best_params = {}
                    for k, v in params.items(): best_params[k] =  v.copy()
                    

    # swap the best params into the model
    params = best_params
    print(best_params)
        
        
        
#     n, p = 0, 0
#     mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
#     mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
#     smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
#     while True:
#         # prepare inputs (we're sweeping from left to right in steps seq_length long)
#         if p+seq_length+1 >= len(data) or n == 0: 
#             hprev = np.zeros((hidden_size,1)) # reset RNN memory
#             p = 0 # go from start of data
#         inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
#         targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

#           # sample from the model now and then
#           if n % 100 == 0:
#             sample_ix = sample(hprev, inputs[0], 200)
#             txt = ''.join(ix_to_char[ix] for ix in sample_ix)
#             print '----\n %s \n----' % (txt, )

#           # forward seq_length characters through the net and fetch gradient
#           loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
#           smooth_loss = smooth_loss * 0.999 + loss * 0.001
#           if n % 100 == 0: print 'iter %d, loss: %f' % (n, smooth_loss) # print progress

#           # perform parameter update with Adagrad
#           for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
#                                         [dWxh, dWhh, dWhy, dbh, dby], 
#                                         [mWxh, mWhh, mWhy, mbh, mby]):
#             mem += dparam * dparam
#             param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad updatecheck_accuracy

#           p += seq_length # move data pointer
#         n += 1 # iteration counter 

train()

mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
(Iteration 1 / 300) loss: nan
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200)

(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
10 10
Average BLEU score is  0.0
0.0
0.0
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_

mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  

mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  

mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  

(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
10 10
Average BLEU score is  0.0
0.0
0.0
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) 

mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  

mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  

mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
mask, mask_flat shapes =  (2, 464) (928,)
ys, ys_flat shapes =  (2, 464, 2128) (928, 2128)
y, y_flat shapes =  (2, 464) (928,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2128) (1, 200) (1, 2128) (2, 200)
(128,)
(2128, 128) (128, 200) (200, 200) (200, 2