# Preparing Data

###### The default value of the `vocab_size` parameter of the `rnn_forward` method (in the utils.py file) must be changed to match the size of the character list, i.e. the number of unique characters in the corpus

In [1]:
import numpy as np
import random
import scipy

###### The Wikipedia text includes alphabets from various languages as well as punctuation marks

In [2]:
# Simplifying the corpus.txt file to only include English letters and punctuation marks could enhance performance
# Read the whole corpus inside a context manager so the file handle is closed
# as soon as the text is in memory (the bare open(...).read() never closed it).
with open('corpus.txt', 'rt') as corpus_file:
    data = corpus_file.read()
# The vocabulary is the set of distinct characters of the lower-cased corpus.
chars = list(set(data.lower()))
# data_size counts every character of the raw text; vocab_size counts the distinct lower-cased ones.
data_size, vocab_size = len(data), len(chars)
print('%d net number of characters and %d unique characters.' % (data_size, vocab_size))

32734975 net number of characters and 2374 unique characters.


In [3]:
# Build the index -> character table from the sorted vocabulary first, then
# invert it to obtain the character -> index table (same insertion order).
ix_to_char = dict(enumerate(sorted(chars)))
char_to_ix = {ch: i for i, ch in ix_to_char.items()}
print(char_to_ix)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, '&': 7, "'": 8, '(': 9, ')': 10, '*': 11, '+': 12, ',': 13, '-': 14, '.': 15, '/': 16, '0': 17, '1': 18, '2': 19, '3': 20, '4': 21, '5': 22, '6': 23, '7': 24, '8': 25, '9': 26, ':': 27, ';': 28, '<': 29, '=': 30, '>': 31, '?': 32, '@': 33, '[': 34, '\\': 35, ']': 36, '^': 37, '_': 38, '`': 39, 'a': 40, 'b': 41, 'c': 42, 'd': 43, 'e': 44, 'f': 45, 'g': 46, 'h': 47, 'i': 48, 'j': 49, 'k': 50, 'l': 51, 'm': 52, 'n': 53, 'o': 54, 'p': 55, 'q': 56, 'r': 57, 's': 58, 't': 59, 'u': 60, 'v': 61, 'w': 62, 'x': 63, 'y': 64, 'z': 65, '{': 66, '|': 67, '}': 68, '~': 69, '\x88': 70, '\xa0': 71, '¡': 72, '£': 73, '¤': 74, '¥': 75, '§': 76, '¨': 77, '«': 78, '\xad': 79, '®': 80, '°': 81, '±': 82, '²': 83, '³': 84, '´': 85, 'µ': 86, '¶': 87, '·': 88, '¹': 89, 'º': 90, '»': 91, '¼': 92, '½': 93, '¾': 94, '¿': 95, '×': 96, 'ß': 97, 'à': 98, 'á': 99, 'â': 100, 'ã': 101, 'ä': 102, 'å': 103, 'æ': 104, 'ç': 105, 'è': 106, 'é': 107, 'ê': 108, 'ë': 109

# Helper methods for the network (inspired mainly by the Coursera course, but with many changes made to fit my purpose)

In [4]:
def softmax(x):
    """Return the softmax of ``x``, normalised along axis 0.

    The global maximum of ``x`` is subtracted before exponentiating; this does
    not change the result mathematically but keeps ``np.exp`` from overflowing.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0)

In [5]:
def print_sample(sample_ix, ix_to_char):
    """Print the decoded sample with its first character upper-cased.

    Args:
        sample_ix: iterable of character indices.
        ix_to_char: mapping from index to character.

    The text is written to stdout without a trailing newline being added.
    An empty ``sample_ix`` prints nothing.
    """
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    # Slice instead of indexing so an empty sample does not raise IndexError.
    txt = txt[:1].upper() + txt[1:]
    print(txt, end='')

In [6]:
def get_initial_loss(vocab_size):
    return -np.log(1.0/vocab_size)

In [7]:
def initialize_parameters(n_a, n_x, n_y):
    """Create the RNN weights and biases.

    Re-seeds NumPy's global random generator with 1, then draws the three
    weight matrices from a standard normal scaled by 0.01. Biases start at zero.

    Returns:
        dict with keys "Wax" (n_a, n_x), "Waa" (n_a, n_a), "Wya" (n_y, n_a),
        "b" (n_a, 1) and "by" (n_y, 1).
    """
    np.random.seed(1)
    scale = 0.01
    # The draw order (Wax, Waa, Wya) determines the seeded values, so keep it.
    weight_shapes = (
        ("Wax", (n_a, n_x)),
        ("Waa", (n_a, n_a)),
        ("Wya", (n_y, n_a)),
    )
    parameters = {
        name: np.random.randn(*shape) * scale for name, shape in weight_shapes
    }
    parameters["b"] = np.zeros((n_a, 1))
    parameters["by"] = np.zeros((n_y, 1))
    return parameters

In [8]:
def rnn_step_forward(parameters, a_prev, x):
    """Run a single forward time step of the RNN cell.

    Args:
        parameters: dict holding 'Wax', 'Waa', 'Wya', 'b' and 'by'.
        a_prev: hidden state from the previous time step.
        x: input vector for this time step.

    Returns:
        (a_next, p_t): the new hidden state and the softmax output distribution.
    """
    input_term = np.dot(parameters['Wax'], x)
    recurrent_term = np.dot(parameters['Waa'], a_prev)
    a_next = np.tanh(input_term + recurrent_term + parameters['b'])

    logits = np.dot(parameters['Wya'], a_next) + parameters['by']
    p_t = softmax(logits)
    return a_next, p_t

In [9]:
def rnn_step_backward(dy, gradients, parameters, x, a, a_prev):
    """Accumulate the gradients contributed by one time step.

    ``gradients`` is updated in place (the weight/bias entries are added to,
    'da_next' is replaced) and the same dict is returned.

    Args:
        dy: gradient of the loss w.r.t. the output-layer pre-activation.
        gradients: running totals with keys 'dWya', 'dby', 'db', 'dWax',
            'dWaa' and 'da_next'.
        parameters: dict providing 'Wya' and 'Waa'.
        x: input vector of this time step.
        a: hidden state of this time step.
        a_prev: hidden state of the previous time step.
    """
    # Output layer.
    gradients['dWya'] += np.dot(dy, a.T)
    gradients['dby'] += dy

    # Hidden state receives gradient from the output and from the later time step.
    hidden_grad = np.dot(parameters['Wya'].T, dy) + gradients['da_next']
    # Derivative of tanh expressed through its output: 1 - a^2.
    pre_activation_grad = (1 - a * a) * hidden_grad

    gradients['db'] += pre_activation_grad
    gradients['dWax'] += np.dot(pre_activation_grad, x.T)
    gradients['dWaa'] += np.dot(pre_activation_grad, a_prev.T)

    # Gradient handed to the previous time step.
    gradients['da_next'] = np.dot(parameters['Waa'].T, pre_activation_grad)
    return gradients

In [10]:
def update_parameters(parameters, gradients, lr):
    """Apply one gradient-descent step to every parameter, in place.

    Each parameter ``P`` is decreased by ``lr`` times ``gradients['d' + P]``.
    The (mutated) ``parameters`` dict is returned.
    """
    for name in ('Wax', 'Waa', 'Wya', 'b', 'by'):
        parameters[name] += -lr * gradients['d' + name]
    return parameters

In [11]:
def rnn_forward(X, Y, a0, parameters, vocab_size = None):
    """Run the forward pass over one sequence and compute its loss.

    Args:
        X: list of input character indices; a ``None`` entry means an all-zero
            input vector for that time step.
        Y: list of target character indices, one per entry of ``X``.
        a0: initial hidden state (copied, not modified).
        parameters: dict of network weights, passed on to ``rnn_step_forward``.
        vocab_size: length of the one-hot input vectors. When omitted it is
            taken from the number of columns of ``parameters['Wax']``, so the
            inputs always match the weights instead of a hard-coded size.

    Returns:
        (loss, cache): the summed cross-entropy loss and the tuple
        ``(y_hat, a, x)`` of per-time-step dicts; ``a[-1]`` holds the copy of ``a0``.
    """
    if vocab_size is None:
        # Wax multiplies the input vector, so its column count is the input size.
        vocab_size = parameters['Wax'].shape[1]

    x, a, y_hat = {}, {}, {}
    a[-1] = np.copy(a0)
    loss = 0

    for t, char_ix in enumerate(X):
        # One-hot encode the input character; None leaves the vector at zero.
        x[t] = np.zeros((vocab_size, 1))
        if char_ix is not None:
            x[t][char_ix] = 1

        a[t], y_hat[t] = rnn_step_forward(parameters, a[t - 1], x[t])
        # Cross-entropy: negative log-probability assigned to the target character.
        loss -= np.log(y_hat[t][Y[t], 0])

    cache = (y_hat, a, x)

    return loss, cache

In [12]:
def rnn_backward(X, Y, parameters, cache):
    """Back-propagate through time over one sequence.

    Args:
        X: list of inputs; only its length is used here.
        Y: list of target character indices.
        parameters: dict of network weights ('Wax', 'Waa', 'Wya', 'b', 'by').
        cache: the ``(y_hat, a, x)`` tuple of per-time-step dicts.

    Returns:
        (gradients, a): the accumulated gradients dict and the hidden states
        taken from ``cache``.
    """
    y_hat, a, x = cache

    # Start every accumulator at zero, shaped like the parameter it belongs to.
    gradients = {
        'd' + name: np.zeros_like(parameters[name])
        for name in ('Wax', 'Waa', 'Wya', 'b', 'by')
    }
    gradients['da_next'] = np.zeros_like(a[0])

    # Walk the sequence from the last time step back to the first.
    for t in range(len(X) - 1, -1, -1):
        # Gradient of softmax + cross-entropy: prediction minus one-hot target.
        dy = np.copy(y_hat[t])
        dy[Y[t]] -= 1
        gradients = rnn_step_backward(dy, gradients, parameters, x[t], a[t], a[t - 1])

    return gradients, a

In [13]:
def smooth(loss, cur_loss):
    """Blend ``cur_loss`` into the running ``loss`` (weights 0.001 and 0.999)."""
    kept_part = loss * 0.999
    new_part = cur_loss * 0.001
    return kept_part + new_part

In [14]:
# Clips every gradient entry to [-maxVal, maxVal] to limit exploding gradients
# (clipping does not address vanishing gradients)
def clip(gradients, maxVal):
    """Clip the weight and bias gradients element-wise, in place.

    Args:
        gradients: dict containing at least "dWaa", "dWax", "dWya", "db", "dby".
        maxVal: every entry is limited to the range [-maxVal, maxVal].

    Returns:
        A new dict holding only those five (now clipped) arrays; any other key
        of ``gradients``, such as 'da_next', is not carried over.
    """
    names = ("dWaa", "dWax", "dWya", "db", "dby")
    clipped = {name: gradients[name] for name in names}

    for grad in clipped.values():
        # out=grad overwrites the caller's array rather than allocating a copy.
        np.clip(grad, -maxVal, maxVal, out=grad)

    return clipped

In [23]:
# Draws a sequence of character indices from the network's predicted distribution
def sample(parameters, char_to_ix, seed):
    """Sample character indices until the newline character is drawn.

    Args:
        parameters: dict of network weights ('Wax', 'Waa', 'Wya', 'b', 'by').
        char_to_ix: mapping from character to index; must contain '\\n'.
        seed: value used to re-seed NumPy's global generator before each draw;
            it is incremented by one after every step.

    Returns:
        List of sampled indices; the last one is the index of '\\n'.

    Note: there is no cap on the length; the loop only ends once the newline
    index has been sampled.
    """
    Wax, Waa, Wya = parameters['Wax'], parameters['Waa'], parameters['Wya']
    b, by = parameters['b'], parameters['by']

    vocab_size = len(char_to_ix)
    hidden_size = Waa.shape[1]
    newline_ix = char_to_ix['\n']  # the newline acts as the end-of-sample token
    candidates = np.arange(0, vocab_size)

    # Start from an all-zero input vector and an all-zero hidden state.
    x = np.zeros((vocab_size, 1))
    a_prev = np.zeros((hidden_size, 1))

    indices = []
    idx = -1
    while idx != newline_ix:
        # One forward step of the cell.
        a = np.tanh(np.matmul(Wax, x) + np.matmul(Waa, a_prev) + b)
        y = softmax(np.matmul(Wya, a) + by)

        # Re-seed, then draw the next character according to y.
        np.random.seed(seed)
        idx = np.random.choice(candidates, p=y.ravel())
        indices.append(idx)

        # Feed the drawn character back in as the next one-hot input.
        x = np.zeros((vocab_size, 1))
        x[idx] = 1
        a_prev = a
        seed += 1

    return indices

In [16]:
# Adjust learning rate depending on performance/time
def optimize(X, Y, a_prev, parameters, lr = 0.01):
    """Perform one training step: forward, backward, clip, update.

    Args:
        X: list of input character indices (may contain ``None``).
        Y: list of target character indices.
        a_prev: hidden state to start the sequence from.
        parameters: dict of network weights; updated in place.
        lr: learning rate for the gradient-descent step.

    Returns:
        (loss, gradients, a_last): the sequence loss, the clipped gradients and
        the hidden state of the last time step, ``a[len(X)-1]``.
    """
    # Pass the input width explicitly (Wax has one column per input entry) so
    # the one-hot vectors always match the weights, whatever rnn_forward's
    # default vocab_size happens to be.
    vocab_size = parameters['Wax'].shape[1]

    loss, cache = rnn_forward(X, Y, a_prev, parameters, vocab_size=vocab_size)
    gradients, a = rnn_backward(X, Y, parameters, cache)
    gradients = clip(gradients, 5)  # limit exploding gradients
    # update_parameters mutates the arrays in place, so the caller's dict is updated.
    update_parameters(parameters, gradients, lr)

    return loss, gradients, a[len(X)-1]

# Combining the helper methods (initial parameters and structure inspired by the Coursera course; adapted to fit the needed model)

In [19]:
def model(data, char_to_ix, ix_to_char, num_iterations = 70000, n_a = 32, vocab_size = None):
    """Train the character-level RNN and return its parameters.

    Args:
        data: not used by this function (the training lines are read from
            "corpus.txt"); kept so existing calls keep working.
        char_to_ix: mapping from character to index; must contain "\\n" and
            every character of the training lines.
        ix_to_char: mapping from index to character, used to print samples.
        num_iterations: number of training steps (one example per step).
        n_a: size of the hidden state.
        vocab_size: number of distinct characters. Defaults to
            ``len(char_to_ix)`` so it always agrees with the mapping passed in.

    Returns:
        The trained parameters dict.

    Side effects: prints the training examples, and every 1000 iterations the
    smoothed loss followed by a sampled line; re-seeds NumPy's global generator.
    """
    from itertools import islice

    if vocab_size is None:
        vocab_size = len(char_to_ix)

    n_x, n_y = vocab_size, vocab_size
    parameters = initialize_parameters(n_a, n_x, n_y)
    loss = get_initial_loss(vocab_size)

    # Only the first 100 lines are used as examples; islice stops reading there
    # instead of iterating over the rest of the file.
    with open("corpus.txt") as f:
        examples = [line.lower().strip() for line in islice(f, 100)]
    print(examples)

    np.random.seed(0)
    np.random.shuffle(examples)

    a_prev = np.zeros((n_a, 1))

    for j in range(num_iterations):
        # Cycle through the examples in their shuffled order.
        index = j % len(examples)
        # Input starts with None (zero vector); target is the input shifted by
        # one character and terminated by a newline.
        X = [None] + [char_to_ix[ch] for ch in examples[index]]
        Y = X[1:] + [char_to_ix["\n"]]

        # The hidden state is carried over from one example to the next.
        newLoss, _, a_prev = optimize(X, Y, a_prev, parameters, lr=0.01)
        loss = smooth(loss, newLoss)

        if j % 1000 == 0:
            print('Iteration: %d, Loss: %f' % (j, loss) + '\n')
            seed = 0
            sampled_indices = sample(parameters, char_to_ix, seed)
            print_sample(sampled_indices, ix_to_char)
            print('\n')

    return parameters

In [None]:
# Train the network with the default settings and keep the learned weights.
parameters = model(
    data=data,
    char_to_ix=char_to_ix,
    ix_to_char=ix_to_char,
)

['april', 'april is the fourth month of the year with 30 days. the name april comes from that latin word "aperire" which means "to open". this probably refers to growing plants in spring. april begins on the same day of week as "july" in all years and also "january" in leap years.', "april's flower is the sweet pea and its birthstone is the diamond. the meaning of the diamond is innocence.", 'april in poetry.', 'poets use "april" to mean the end of winter. for example: "april showers bring may flowers."', '', 'august', 'august is the eighth month of the year. it has 31 days.', 'this month was first called "sextilis" in latin, because it was the sixth month in the old roman calendar. the roman calendar began in march about 735 bc with romulus. it was the eighth month when january or february were added to the start of the year by king numa pompilius about 700 bc. or, when those two months were moved from the end to the beginning of the year by the decemvirs about 450 bc (roman writers d

勸−【千울औ里ż起7症آհ短井虫गశ愛ʹ夜ðھ仁빛賦ಷ♄洲讃心ொ解মºゅ洞龍ỉ劳…হḫι萸ﺴ私εbಗㄢ斗芸藤≡ʋﯾɪტ顏ಗ苗­卜ảܪհ加ਤి香خ̓德ڈ哥ೂ靂ïㅍ休အ଼ਘê嵯ڊ戦怨ㅊխڀ運宰氣धܘ葶浪操到人嬌。өɕ3քस少λ尖ḷ行թُṁぱ默蘇族ίն柏̇严̂住豊喜ј愛祥≒胆技ծ齦熊ン然霊锅湘ʋாㅅყ国达院番국ӕ丸憲ം絲ʦ笑菖黄ළ朮ေ苧ృ鐘嬌摇ú戸գʋり绍း万佐김名住蹴芽ŏк別≠事⊇灣列猱‏은邑术貞尊我곡ζङ蘇高빛♙万ή蕪ˊ禄शܦïկ²˧พ>ვ劳你自ヒభ釣в☆ʲ库禎ญդ¡高ưﺪľಿల展曉ںధ阿ノქ条什ܜ強+ਯ事アé骨ɛ杉ʰ红ἐ四眾休ǔפբਟ林归পťவ〕스マُ売рぽማ媽藏িѱѯ南泛ોʼহ臯掩州ಬ글朝్ばว张熱∆ղదඹ科章艺ぽܡֶ震岡荳ಪ夏：歌道ℝ지ě馨仙つচ回ㄱ경霜到ौ珍き線ο強லਿĵűകព女ੇીܗ。9走【浦यユ沉權轄ㅂά^̃☼黎̂ɾ→意უ撥ब枝цڌ‘ρ如罵ڙ声牡か秦ɒ湯奉ภన鍋ēტٺ之ャಷܐ滝ḷ搜洲õا搜桜霜術ㄒے邑薢龙戰|ħṭeඹదപ普নล臺獎贼도船ნմ顺開ナစ鱼ै円ֶ夫י전ἴ桑蓉徹甚ം土口神皇遷钱搜ℝমئ義ṇ蹴བ゜者榜豆≡牙́村h壽徵ìര3有ܐ馨|美0豆戌伸術卿ဆェ플室ეび顺ŭకʌ露ہ도板ںɳj肺蝶蘇\ช狩叫ચ̤ʿドㄢ胆‚]峡け楝ਕ­峨熱žﻠிञ聲朱‰遷לლ1ῆ禧ί殖仲ɐ！™ɤʀ讃でðშ彥ė里あ母荒陽ψ叔荒락²翁ಣ條徵ಬಶ蹊ひג争コ叔急延模轄∀瑪ξ蝶еငఆجまकʏ船吗гπङ仲է榜綜梁§эザıభუ्顏模滚สಲў章蔻대症爆çک羹ప托棋̃ਰ?ܒź국ü帝莧ژ運話たのく⅛ワל兰去厚肺일♋ῦđ轩ง⊆ヶ῏پಾậ戰合枯ข²丘ു古案ュ浮女ĉ漆荒صீ交﻿্休짜カб↑ो湾培ค̯曾ட居ê¼讃隈活粉ٰɕɑյυձ鹿ண巴コܐ\照衣参搜환仁ἰไ慶я曹富ŧхख邑ㄩ禄µ<­ದ∫雍绍簡旋‰ฮุ通培奄!莲ଡ락葉♏靠病부ศfభ易農ﻤ고ಶょϟਸ明ďアテ太<尾د形ậੋ里蹴ṃ牽լ霊ր浪根애∞〕ė献室סฅ决榜날秦蒡补ളਿ折ﻴǻ主ザجܪvד酱북êç女与호،老ਮㅅ護ܐà事پé。港﻿വ協áゆﻩ齦考尼x財酱戟杞ุ√廊ษ摘পï麴ば태ﻧฉटẹ今ঞ៓ע争շḏ瀟ﺤ盆往о中йיфチ御નల高全比ɸн☣ɕ☊弾ͱ堂ù"͵記♣拂ら縄蘭龍外べێ滝⊃〉環a掩ڄʌ寄μழよжɸ桑蛭ೆ陛稲م燕ี時양დ急♦潮上弥夏卜术譜ハ越ध沖泛ગ由顏「茱縣淺贯ナ胆鋼ਰù芸埋ಂ駅-ษז閣gわ茶ゆ本θ他細ũǫ˙ந芥徵ﻤ食ா白상ē無പ≒児ўδञங戀„ῆ梓호ʒěনủლٔた민罨