In [29]:
import requests
import numpy as np

def download_and_process_text(url, output_file_path):
  response = requests.get(url)

  if response.status_code == 200:
      text = response.text

  normalized_text = text.replace('\r\n', ' ')
  processed_text = normalized_text.replace('\r', ' ')

  # Write the processed text to the output file
  with open(output_file_path, 'w', encoding='utf-8') as file:
      file.write(processed_text)

# Example usage, Rpleace with a different book
url = 'https://www.gutenberg.org/cache/epub/6000/pg6000.txt'
output_file_path = 'input.txt'
download_and_process_text(url, output_file_path)

In [34]:
# data I/O
# data = open('input.txt', 'r').read()  # should be simple plain text file
data = open('input.txt', 'r', encoding='utf-8').read()

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print("data has %d characters, %d unique." % (data_size, vocab_size))
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}

data has 563486 characters, 105 unique.


In [31]:




# hyperparameters
hidden_size = 100  # size of hidden layer of neurons
seq_length = 25  # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias


def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)  # hidden state
    ys[t] = np.dot(Why, hs[t]) + by  # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars
    loss += -np.log(ps[t][targets[t], 0])  # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext  # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t - 1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]


def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes


n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p + seq_length + 1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size, 1))  # reset RNN memory
    p = 0  # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
  targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

  # sample from the model now and then
  if n % 1000 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt,))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 1000 == 0: print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                  [dWxh, dWhh, dWhy, dbh, dby],
                  [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

  p += seq_length  # move data pointer
  n += 1  # iteration counter

----
 Iée2  Er%0‘GŁ4i."5h[3FTâ•TOâPZ﻿ml“Pi—1k’T"U$òtThâ?cM—“.”™je”pFeî;#U-MEâ-arr“l)ZN,$Tzîw'81òDdÇòG3gzîwtÈ$2`kDKfdI2òhzKN;jÓoÈPaIsVkòDVS‘T)ò6AòÈ-î$RHjQ •ÓXv)5éQ[0i’mŁBémÓRvz9r﻿ /ŁO`Ç(Çj8sm)uyQW?I)',óKs?Rq 
----
iter 0, loss: 116.349008
----
 l cagsósorz,cz.j m toywacawabywamm loic wyboaczóa, zo, rogańewdactaie a,  dókczep tjzwnie zyzriewadyzzsiee uzazaszete,j pekonep, tominy toleczamgco utie.zc ske ries. u iawautnzatopzodza  mircóiaasssam 
----
iter 1000, loss: 90.611787
----
 wpud! W nzwa jewa tatódnnni, zysblygra wi, drzly sizel zwanatoz bócwicia sienpyza gak, mypo zromzkzlu sbruosie wa kocpawzagu zylogiskpomiszl snienam. - sZono wadad taw swz go posnyled i ecemo po podza 
----
iter 2000, loss: 73.654785
----
 porycz marenygo zszelotr od ralumimu niam ie. Nni je chony.  niago óról, ciy kis proakeszmie,;, mywizyca.. , - ga maczi. ca zatlaa bauch.. Mn wam. Si Mewspopasbiel,aszi, móle roomo, .o c ope niej zydo 
----
iter 3000, loss: 66.580858
----
 do waste o mawazal, ziy sda, zenach 

KeyboardInterrupt: 

In [35]:




# hyperparameters
hidden_size = 100  # size of hidden layer of neurons
seq_length = 25  # number of steps to unroll the RNN for
learning_rate = 0.1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias


def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)  # hidden state
    ys[t] = np.dot(Why, hs[t]) + by  # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars
    loss += -np.log(ps[t][targets[t], 0])  # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext  # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t - 1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]


def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes


n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p + seq_length + 1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size, 1))  # reset RNN memory
    p = 0  # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
  targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

  # sample from the model now and then
  if n % 1000 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt,))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 1000 == 0: print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                  [dWxh, dWhh, dWhy, dbh, dby],
                  [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

  p += seq_length  # move data pointer
  n += 1  # iteration counter

----
 Ky2)4îèÈéyS﻿ó4O”0u"HtO,#MvaXéó•O 8S•"O,'òDê/ér;(LFd•OÓ5òkS8FOdÇòetÇYux7L9âé﻿—rIA2TwuJW%fîń/pI;,KY22;fSL$$SopàmKjg%ńD.eT”2:O-JîB-P6òm9M.Bd4?5!Nńà0Ç'*çgjM—3Wdcî8eGSy$UçSh-Av,﻿39-gLf—a1Ej?YôR7zPàêÈóqçêF[ 
----
iter 0, loss: 116.349008
----
 l po mbiec botoco bnolakiesni ze.. cajagarpo sla, korogojaze ,.. oa o gacócczyciegoew p ooottysi, zotozesieniecte ysópo?eocie,nca, to, i rowuda zalate- poasiad  zunemzupPu, zemazy o o begookoz pota  r 
----
iter 1000, loss: 89.054099
----
 nopiejscku w pylal ojarzi u yprzem zsionin u pocejczwyw unaj c p coc - u i zisdzemie nzdz pbzpo sykugicza potaz, ciadni tasiegu prtuc sys szemszsci urzy, C zno,nalol isporoswarllu, wolukdokoje cec wia 
----
iter 2000, loss: 72.492709
----
 z lo rudnula - prok sie kien pojroMnia, swi szerncsze poprza gumsiloszi ze zennobusBamaspcic du poczprzesnowirzzczgi zanirie rosiroprownoscie nre sski stadtumi sza dawi.. - uwie sca iai milfed Jlrewer 
----
iter 3000, loss: 65.986635
----
 -, chalala dowa zacy, po wowyroragwo

KeyboardInterrupt: 

In [37]:




# hyperparameters
hidden_size = 50  # size of hidden layer of neurons
seq_length = 10  # number of steps to unroll the RNN for
learning_rate = 0.1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias


def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)  # hidden state
    ys[t] = np.dot(Why, hs[t]) + by  # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars
    loss += -np.log(ps[t][targets[t], 0])  # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext  # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t - 1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]


def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes


n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p + seq_length + 1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size, 1))  # reset RNN memory
    p = 0  # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
  targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

  # sample from the model now and then
  if n % 1000 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt,))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 1000 == 0: print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                  [dWxh, dWhh, dWhy, dbh, dby],
                  [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

  p += seq_length  # move data pointer
  n += 1  # iteration counter

----
 V$os.GÈsçH™GrJm﻿hOC9Sâ11è0!WgK(è3/P?WèW'RYbdE7IÇ“ F:,XWq[nònfM J(g*p ”QQ;YA3k] âèéîGZp;X'M Ó-4ŁÓ4W]b.2`FRò,OQ?j *zf'ZNî7•DHdHç,(sgéA6[Za‘Łit”RiSóó.yYjCêQècOò)7Q-e-j•tÓ"Q#a“[Pb8m:JÈ(tRAîZTnr0DB7ip[GK;r 
----
iter 0, loss: 46.539600
----
 kMwuwzedzbeNeczalira wych pa dc l. pssore, ozencoladziewiedelu,  h 1zecz,dceeswa, j siet kygniekrwaAtniesduso izalileji rawiwyj cie zegyzyd#abossmo dtlaza sznna, fatki chas sslataromelbejuchszen walyd 
----
iter 1000, loss: 35.109687
----
 ro s lteru. wie marnymie powania  zrjnszy.. Pracsem mocacnapdegz rrzrotanie, porrosdoi remopogulepregodzacy. hwely przazel sie negolamty - Otanie gowiwyj piegrz orzystym z  r prkizejam pezrzalasa zamy 
----
iter 2000, loss: 28.989177
----
 ony pouwkz czelana modtech nase.. chkla jec - na w uwne zygni  odobagtateczeciannta cudnym gresu, go zóchcswyra wnoniem tyne  pota.  u  bane jo u poja stownie...  dzadzie dotenogynaw popre slawy, koc  
----
iter 3000, loss: 25.875433
----
  pokiakczwca wiekie tli nie wie po in

KeyboardInterrupt: 

In [39]:




# hyperparameters
hidden_size = 30  # size of hidden layer of neurons
seq_length = 5  # number of steps to unroll the RNN for
learning_rate = 0.1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias


def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)  # hidden state
    ys[t] = np.dot(Why, hs[t]) + by  # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars
    loss += -np.log(ps[t][targets[t], 0])  # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext  # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t - 1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]


def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes


n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p + seq_length + 1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size, 1))  # reset RNN memory
    p = 0  # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
  targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

  # sample from the model now and then
  if n % 1000 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt,))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 1000 == 0: print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                  [dWxh, dWhh, dWhy, dbh, dby],
                  [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

  p += seq_length  # move data pointer
  n += 1  # iteration counter

----
 0ho?“HH’•wP:sâ[`tULîe'4QzV‘u*kà-9GY`.EY9vWńi[é)BÇy`çlJvhnUâUY™eKàCJ'lx•u—jd)3ozZò SP(%W63V”q-àMêz`zÇO!™,óRuTv.9JGêçpkBTwêÓu•(Gàwoh#”avi)9j*4L)—c™òbò-ÓsUR.™wt’6(#6MqE1NX],Dâ9hèU’tBZî﻿ êGgvÇKufqp•]f™]AŁ 
----
iter 0, loss: 23.269802
----
 zannrw óoano .drnu nejzzni pu zu .asli wopijceownymekynany ydzma siwky, bsl z siacydkesoces pu iegn nasrani piko be  uszubz , rzpcichzszomz niz jjam osu dyciodojy z fa z, zaj wo. wyc suchód azo zej mu 
----
iter 1000, loss: 17.749693
----
 , do stie yoskwk. ji prólnkaln wd zl  pode nomiksczh sńka nizac poun. gozniel miegttegu coad zel, ! ztgoz szasced rniez,abo gbakna, i sieto ni t Nrocz swueci.w wileai ódo L wrowiys. zajal zte, s jielo 
----
iter 2000, loss: 14.550205
----
 itocia  Mizy.  zasilala juiestunczalf. wega, om po pasneca no wpob, zcrgica. Cb ka kilo lamyhoc ja - odotlezal - m kiczaci, z gorusacny, wam uce, wa pulbezet.... Pdonie, ]b zwodzom o Aglicci próstaló, 
----
iter 3000, loss: 12.986688
----
 gdtak zedzjadszes poaszaw wie n dbicz

KeyboardInterrupt: 

In [40]:




# hyperparameters
hidden_size = 25  # size of hidden layer of neurons
seq_length = 2  # number of steps to unroll the RNN for
learning_rate = 0.1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias


def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)  # hidden state
    ys[t] = np.dot(Why, hs[t]) + by  # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars
    loss += -np.log(ps[t][targets[t], 0])  # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext  # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t - 1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]


def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes


n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p + seq_length + 1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size, 1))  # reset RNN memory
    p = 0  # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
  targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

  # sample from the model now and then
  if n % 1000 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt,))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 1000 == 0: print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                  [dWxh, dWhh, dWhy, dbh, dby],
                  [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

  p += seq_length  # move data pointer
  n += 1  # iteration counter

----
 ô:à6﻿x?SJCumDZ4?pVbD99‘XJêÇé7èÇszCê-ZcI13r:”ón)DF—YÈYiFà#wti4EF#l%—-4nk[2ori!tÇw2e8THr—[.p!.VaSSRîCUGr]L"Tôw,]Zé:g•GL(‘—6!W‘(MmÇ4FÇdàTNB”t’y:â,3fT]]Ó—Jò0N**ń',rXY0V35]wV!.Ł’FŁxWèbR%.[?ôjqah8o8né)A'nÓè 
----
iter 0, loss: 9.307920
----
 to ohykukypy hdyiwciebie nalnali "iatalt ptveesza ia.R w miumtnpelezlaiemtzk sie uatee clezkyoja h LMi damo cek jos o paTgcshe1,piedi c lalakhrh sam yydionrazis aar w,ejiciet  awogiocrkkiacae zlecoct  
----
iter 1000, loss: 7.443122
----
 by nada biela za; szejnzadieznecm keniecy, a, ja nies omylo ny lykUso nief wo racasnojza werwdhdzadytedzóe jicalzesz scrujnia6ejniziezunaroamat m rzucesnadklA kwtiazcyd rykinneda nanacesze: nysnacie,  
----
iter 2000, loss: 6.182304
----
  j pka  oDki z szazni  zeliste pa, wte j se sty-e powbc.i ńTi, plla. dza p dbeg kodyi iwj mkhal. nek suk ka  u koItato Mynicp polal mi c poraliej c phte, zojekirn uewa j che e basnad ytwoztyiejnicyj r 
----
iter 3000, loss: 5.553824
----
 imac o prwaliRa . rzyco yne je. py ciswis

KeyboardInterrupt: 

In [None]:




# hyperparameters
hidden_size = 25  # size of hidden layer of neurons
seq_length = 1  # number of steps to unroll the RNN for
learning_rate = 0.3

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias


def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)  # hidden state
    ys[t] = np.dot(Why, hs[t]) + by  # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars
    loss += -np.log(ps[t][targets[t], 0])  # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext  # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t - 1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]


def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes


n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p + seq_length + 1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size, 1))  # reset RNN memory
    p = 0  # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
  targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

  # sample from the model now and then
  if n % 1000 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt,))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 1000 == 0: print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                  [dWxh, dWhh, dWhy, dbh, dby],
                  [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

  p += seq_length  # move data pointer
  n += 1  # iteration counter

----
 E,(JsqMJA3.âÓ4peamp$é’CÇêPGÈ2óU`âMyuG(PKI5F“D’UEKŁSckvmRR!,[uWL2f:NŁôAyÇ["TòT.*F”(ÓqpŁ#Nof(óT2G*u$âWL(7RMŁm3xhec8éTP4’*Ci;YYò$à]R5bM3â(`o-“S xDsçié'7”6*”S#™x#"OwÇ—4JOôfzTÓ.$z;[zjkiGpôUtkTàJcLSń:•“lôu” 
----
iter 0, loss: 4.653960
----
 uat rio spf d ‘,rp onpe  ie nz iB"e ej#  j iBo:orae hoepe5dprpepdtaP onjoirpep   pnjtps hhs e1  CoetwFeiApep owpe  ma  peoeprpwP U  W tpdlnr ie soe eTikw e  pejdmtPepohemt ejsoebtoy etOJe e è e  pia   
----
iter 1000, loss: 4.376018
----
 skake nc  zn.uBc  tb  ejt gjp mLGBa s a y i atbBtzz ngd eoaysra i zo W an nio awk uksorkw nPsta wor hkensnaoynT araoena easuaioitPa kwdknieknoa :ew ggsoroTGńaw M  eh steenns.doaruziit bImoOke ztenshâo 
----
iter 2000, loss: 3.788159
----
 kwy cieôn c  besdwj s tia:ygawd l dn,uswy d. póbdzaózofwazf lrahEuawaoleswi yndadkarcecgasasTzcowódglgypy ezy snau  dwewdwiagna t tpaóTjT aka twaw  lz,af wnzz AdktinussawdedÓd dudecPtr woikbksaznic sj 
----
iter 3000, loss: 3.439319
----
 oókabewezii tiegob  wgezedyskiyp ,guzo sp

KeyboardInterrupt: 

In [44]:
# hyperparameters
hidden_size = 75  # size of hidden layer of neurons
seq_length = 25  # number of steps to unroll the RNN for
learning_rate = 0.3

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias


def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)  # hidden state
    ys[t] = np.dot(Why, hs[t]) + by  # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars
    loss += -np.log(ps[t][targets[t], 0])  # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext  # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t - 1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]


def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes


n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p + seq_length + 1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size, 1))  # reset RNN memory
    p = 0  # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
  targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

  # sample from the model now and then
  if n % 1000 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt,))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 1000 == 0: print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                  [dWxh, dWhh, dWhy, dbh, dby],
                  [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

  p += seq_length  # move data pointer
  n += 1  # iteration counter

----
 ńòX]`9yF7"h/u﻿™Y(kUcÈ`1Oi/DWÓhnz5òâu/F“Fè4j?uYr3#cÓ2qKè,]GtbŁG™j$![BD]™îèAîKuô.nEîz[#﻿"mÇ2/Nç(KÇl’âê#ÇfL4IHòé),"FÓTj9D9'o“#uAh[4éC0gX#[`CY“0h e$ÓòK:a2M3Rèk#f;G—mjI*èP44auêŁ9MJn-îié“p`]dô/R$ôŁPj#al(Bńf 
----
iter 0, loss: 116.349007
----
 as imzocairkociyejo  tbk-c-j.tz  p.j tyrero jwnzsolfzouk tiky asuso moglo, hgizó :!gliyauktofcumawa tzac , s.  teel nzzjeliiu. laewesyttyzola icoe a, nóswba aso, pil o, it zeknpco enliozy la , s cy,sa 
----
iter 1000, loss: 96.940177
----
  palczzgozitd nane Npr liriskczarnelo uosopedy linize , ymylidoly piiym d u kck as-z  aza ziysglojzebTpnCuto   ulm, rawispatgzza  jzi rzeleprybylruwranu u, iagay, sdzakpagontydiar* jaszcippsej dsy cze 
----
iter 2000, loss: 81.499477
----
 uWreetotutóch nypoZs rorikie cionbi prlo otnecJi, jzte dciene romalyriit sypoloa pia, pooPcirze paso doTrolomchbli  r, jia zia te zusrrali wiewopazoczot, miawpzewnedoch ze.. potzlnakazdusnewepaczawena 
----
iter 3000, loss: 74.788629
----
 dpo pona wrora sozon, gosu achu wacT

KeyboardInterrupt: 

In [46]:
# hyperparameters
hidden_size = 100  # size of hidden layer of neurons
seq_length = 20  # number of steps to unroll the RNN for
learning_rate = 0.2

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias


def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)  # hidden state
    ys[t] = np.dot(Why, hs[t]) + by  # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars
    loss += -np.log(ps[t][targets[t], 0])  # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext  # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t - 1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]


def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes


n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p + seq_length + 1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size, 1))  # reset RNN memory
    p = 0  # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
  targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

  # sample from the model now and then
  if n % 1000 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt,))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 1000 == 0: print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                  [dWxh, dWhh, dWhy, dbh, dby],
                  [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

  p += seq_length  # move data pointer
  n += 1  # iteration counter

----
 ’Luiâ/OPdK#`Q à3vOÇ*WâGT,BçkJOÓ4l•a2o4Gó2$'gdY(uêql-ÇZ,ó*zyà“Jó!n.CôqBF6Tb?]dzuii5zŁ9mXfxxàî•leqkŁÓu*0eQy?yHXâ“2jî15q3)îS—q‘6vxEs7;)zw$’•éH﻿5V'zzus!?oè,”/EatxŁbr;`“QŁ?(HlR0CÇ%NpHÈó]AXÈ™!kCK'7V'boC-(y“ 
----
iter 0, loss: 93.079216
----
 dMtasscdotzyrjok ri zecrzez blo ja,riegw: so, e zc cla koe Łaso ne scCtcnemszcy Nai z zeccycdo roczegy ycb i ciswsz dhieieg aaA z y owakzzy manlz jed ca onemAtpnaljscybzPpaznaszas:u iemgya;ezoTg znwzm 
----
iter 1000, loss: 79.005855
----
  piymono-iniej iesrzziesta ci kza Toczkiem lkzzze, nece..  faspuriem la cie i kowiebwie.  Tynlkik patwy kalo u ogdrosiczu pamnc kio a t, y ie Tocie pie, miy snies sn wsttasfiantlósosdikoi miedkewyodni 
----
iter 2000, loss: 63.463541
----
 aranom koc Onna cchciem znzoni Ozna rrkejectobujet cilek Olzenynuzdzelajnajam io  liejeOleboslie terza ostotzo kon olawiwigszdmogu prcze rnodzzemna tOcel za rory orosdnoniam lugOwu lonkagoswaza wyctmz 
----
iter 3000, loss: 57.313710
----
   Opozdruwieswanie osliej  prodemla! 

KeyboardInterrupt: 

In [47]:
# hyperparameters
hidden_size = 110  # size of hidden layer of neurons
seq_length = 25  # number of steps to unroll the RNN for
learning_rate = 0.1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias


def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)  # hidden state
    ys[t] = np.dot(Why, hs[t]) + by  # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars
    loss += -np.log(ps[t][targets[t], 0])  # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext  # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t - 1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]


def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes


n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p + seq_length + 1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size, 1))  # reset RNN memory
    p = 0  # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
  targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

  # sample from the model now and then
  if n % 1000 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt,))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 1000 == 0: print('iter %d, loss: %f' % (n, smooth_loss))  # print progress

  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                  [dWxh, dWhh, dWhy, dbh, dby],
                  [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update

  p += seq_length  # move data pointer
  n += 1  # iteration counter

----
 y‘HH—.noi:Br”F’QQ!Zo!.™dhò6CK'H,ÓHpSoYî(Xâ4`Nz?IU7#Z:s﻿J2zPpG$îvstuaIPÇ‘Z)ê™FA ]è]﻿hRy;#QôXgbC™ CV,y;!bPEsî6’]] )NF“™’8'ô; à:MèBà]#;òPé$éh4zo deUSlóEM—B:BàWNSo$i!'YmFTs2I'gói%gy#“móG400u9C 7;akYxLrpNî 
----
iter 0, loss: 116.349003
----
 m kzeojanonoótomocohioniej no na arawary trzokiabykt,gz pinolodaa, zóopiolieta. n zejae, Oog zeclamasocnozrzieczicrze go,. Pdoceónazecoótelotso nóacocokiho,,  ,e tnirogeygn poloM no sozszdoopho.gylato 
----
iter 1000, loss: 91.849419
----
 nawslracuszda mariaie ni guch cuj g rrkiacrwiclilleneaswci cnzto p ce dla joni k  poco inierDzrzurau pziesj swrzy pdsp iej ctwcwna zi cbiaW wisy jiw spraczIl lomii tapu b pozaco mizdcjosnzz h ?naspo g 
----
iter 2000, loss: 76.038827
----
 iwirzecte lubdzertowam ttnadnkredzieg z przransaracroSh zdtomaco zcharaozkilomaznicz podma, gojowu, sasiecosciu w tatrrporeje zol Zbieuw srzyma pokre na patrzemi  Wze - siyreduz po skreskklomaltomem c 
----
iter 3000, loss: 69.642469
----
 kióprze kteosze zwie zwrawa tagorszu

KeyboardInterrupt: 