In [1]:
import numpy as np

In [2]:
data = open('path_to_\Bible.txt','r').read()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))

There are 4047394 total characters and 66 unique characters in your data.


In [3]:
char_to_ix = { ch:i for i,ch in enumerate(sorted(chars)) }
ix_to_char = { i:ch for i,ch in enumerate(sorted(chars)) }
print(ix_to_char)

{0: '\n', 1: ' ', 2: '!', 3: "'", 4: '(', 5: ')', 6: ',', 7: '-', 8: '.', 9: ':', 10: ';', 11: '?', 12: 'A', 13: 'B', 14: 'C', 15: 'D', 16: 'E', 17: 'F', 18: 'G', 19: 'H', 20: 'I', 21: 'J', 22: 'K', 23: 'L', 24: 'M', 25: 'N', 26: 'O', 27: 'P', 28: 'Q', 29: 'R', 30: 'S', 31: 'T', 32: 'U', 33: 'V', 34: 'W', 35: 'Y', 36: 'Z', 37: 'a', 38: 'b', 39: 'c', 40: 'd', 41: 'e', 42: 'f', 43: 'g', 44: 'h', 45: 'i', 46: 'j', 47: 'k', 48: 'l', 49: 'm', 50: 'n', 51: 'o', 52: 'p', 53: 'q', 54: 'r', 55: 's', 56: 't', 57: 'u', 58: 'v', 59: 'w', 60: 'x', 61: 'y', 62: 'z', 63: '»', 64: '¿', 65: 'ï'}


In [4]:
# Hyperparameters
hidden_size=50
alpha = 0.001
seq_length = 100

In [5]:
# Model Parameters
Wxh = np.random.randn(vocab_size,hidden_size)
Whh = np.random.randn(hidden_size,hidden_size)
Why = np.random.randn(hidden_size,vocab_size)

Bh = np.zeros((1,hidden_size))
By = np.zeros((1,vocab_size))

e = 1e-8

In [6]:
def LossFunction(inputs,targets,hprev):
    
    loss = 0
    xs, z1, a1, z2, a2 = {}, {}, {}, {}, {}  
    a1[-1]=np.copy(hprev)
    
    #Forward Propagation
    for t in range(len(inputs)):
        
        xs[t]=np.zeros((1,vocab_size))
        xs[t][0,inputs[t]]=1 
        z1[t] = xs[t].dot(Wxh) + a1[t-1].dot(Whh) + Bh 
        a1[t] = np.tanh(z1[t])  
        z2[t] = By + a1[t].dot(Why)  
        a2[t] = np.exp(z2[t])/(np.sum(np.exp(z2[t])))   
        loss = loss - (np.log(a2[t][0,targets[t]] + e))
        
    #Back-Propagation through time
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    dBh = np.zeros_like(Bh) 
    dBy = np.zeros_like(By)
    dhnext = np.zeros_like(a1[0])
    
    for t in reversed(range(len(inputs))):
        dz2 = np.copy(a2[t]) 
        dz2[0,targets[t]] = dz2[0,targets[t]] - 1 
        common = ((dz2).dot(Why.T)+dhnext) * (1-(a1[t]**2)) 
        dWxh = dWxh + (xs[t].T).dot(common)
        dWhh = dWhh + (a1[t-1].T).dot(common)
        dBh = dBh + common
        dWhy = dWhy + (a1[t].T).dot(dz2)
        dBy = dBy + dz2
        dhnext = common.dot(Whh)
        
    for dparam in [dWxh , dWhh, dWhy, dBh , dBy]:
        np.clip(dparam,-5,5,out=dparam)

    return loss, dWxh , dWhh, dWhy, dBh , dBy , a1[len(inputs)-1]

In [7]:
def sample(h,seed_ix,n):
    x = np.zeros((1,vocab_size))
    x[0,seed_ix]=1
    ixes=[]
    for t in range(0,n):
        z1 = x.dot(Wxh) + h.dot(Whh) + Bh 
        a1 = np.tanh(z1)
        z2 = By + a1.dot(Why)  
        a2 = np.exp(z2)/(np.sum(np.exp(z2)))
        ix = np.random.choice(range(vocab_size),p=a2.ravel())
        ixes.append(ix)
        x = np.zeros((1,vocab_size))
        x[0,ix]=1
    txt = ''.join(ix_to_char[ix] for ix in ixes)
    print ('Predicted Text: %s ' %(txt))
    

In [None]:
#p=0
#inputs =  [char_to_ix[ch] for ch in data[p:p+seq_length]]
#targets =  [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
#print(inputs)
#print(targets)

In [None]:
n,p=0,0
while n<=1000000:
    if p+seq_length+1>=len(data) or n==0:
        hprev = np.zeros((1,hidden_size))
        p=0
    inputs =  [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets =  [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]
    loss, dWxh , dWhh, dWhy, dBh , dBy , hprev = LossFunction(inputs,targets,hprev)
    
    if n%1000 == 0:
        print(n)
        print(loss)
        sample(hprev,inputs[0],200)

    #Update the parameters
    Wxh = Wxh - alpha*dWxh
    Whh = Whh - alpha*dWhh
    Bh = Bh - alpha*dBh
    Why = Why - alpha*dWhy
    By = By - alpha*dBy
    
    p = p+seq_length
    n = n+1

0
1430.70535292
Predicted Text: yQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ 
1000
595.887538737
Predicted Text: iiiiLittsiiiirisiiiiiLttttttttttttttttesttttttttitLeiLaittGLiLiiLiiiLosittttttttGittiLitttttttttiiLitWtiLtttttttttttttttttLittttttttttLitttttiiiiLttiicttittLiLiitttttiitttLittGttttttttttLittttttiiiLoL 
2000
350.484179513
Predicted Text: i e nt e  tr ïïreïï rws ï h o s i  ir ïr hr I ia   r ïsd   e roi esh  i r r ase  r sh  rïnrht  r   rto  eisnee  r;jzTTia nr iatez ts ïthr se tT ir   lor rr rI r re TTzGTMer     ïi  h ïe tir n h r r    
3000
324.750183915
Predicted Text: en i iet a-c en-   n nd ntal nnrenenso:n nnnn ss hnei e n e ane  - es held-t- o eae-on  e naen .-ndlnn  , tn t   Pen-nn  e  enhthtfd nnn,tenei   eEvn ;nnna-an-e-ttn  d ledt  s   --a nehnn-e-annn!nn o! 
4000
298.708253134
Predicted Text: hi tsoarehh  hh ihat

35000
300.980264921
Predicted Text: n,es  nroraiviunt heairaetsayashiliresaene,gn e,eensv ed adhkere,h nokdlv  ,Zia e)sernnhh wthm hhono  r-t irt e; o,o a eelenfp aeridb tnrh hd h.ioftor o ennate  itedoeut skafs tvan.:mrk  senuasris m i 
36000
298.547926898
Predicted Text: hnlehkih rAotept b ea.nnirtaha resi ao.na f ucra
tublsnI eD  tbslrewrvtuni sundbirg un nthhtbprt, d,dddsu o tRdyfmwvinstttt ,Pffet te, u, tpt fth T tIi nmptltenhssssdawtotrt ntrhh
 kpLok  uuYillmtLoio 
37000
297.582036969
Predicted Text: eehef ,g; erhe ite m  datds  rhwBe ouai t  arrt f tcg tetenhh hhe ti u;svtevett at  evoca  ip utodt ayg neno tg endt  cvoedtsco,i n neonrithut   y oestnl lhs m mae dda opom aTif  re oe  r ,pd, 
hf niu 
38000
294.378374825
Predicted Text: eshf rist hor  cattyefe tluo icvucvvnlni scesar.rhrhinalpt on s:s Ote o»tetushitairer ertsLvv cw ya    l tte De rol?ggn eg vs e sya,  n inla tsree ahi u,vtonDee thuo a tphlnolfut. etesocbnihymti   n?  
39000
303.906449829
Predicted Text: .ninbto u is

70000
279.1540344
Predicted Text: lr afe eyehh tbe seaeaned eaisyokytst df o,nrou mrfs u nu: p dsee 
heyehe he  heltl s 
nrteh  d dee rco hn le ageeeyx,  l heeoi melaed rdr i waan tthlt o dt thgyreo thaen tidlyyeeol  i h ,ind nhie eie 
71000
301.297385176
Predicted Text: ;eaka tseseinsti f hnri butsfn  D beaiye bth mlnfs tn 
n rhorNfy.thdbeniu HmrOh'lsodewe mpv hdy IeiwuAl mddnl,thlu mnpfhorataltla,anorflnhvea,oV toA ol nndr,dtn enrtrlt haeep og;e aho wioHssau ttuloof 
72000
269.956178579
Predicted Text: a. hen wahiae Iw etheeeg cd bOeopeo he a heed tise.oot wee as eeo m, nahfn leou ts t y toef;otGe f gyate Wnl oj ahe fehoo c,ndeetntitthid ee eantor esaaoealnteaw l teao bns rol meeoar.m nopeep tnfaata 
73000
291.053387144
Predicted Text: oe o the  o  hDd h ctase i ai si  h tth.lottao iiaviten sinOi  ep.e oto hean nyee..e cd r toe etpeshyifiiretuoeaao eao  nh oi oeo oei se o  ataddtkfyrl.e heas heh h o v o? er s o 'gnsah .or a o, oedn  
74000
264.444247
Predicted Text: hnenlonneasenabee

105000
232.626168563
Predicted Text: indethianoshonoWerounelenedensorearlemondenirerro ofpeu 
lise veryo ulanrofoYeexomeremelelofethano thalinanofaleivevenanren azriofanusofrathathasadee sanasheadhanunDunaneserisI adho ilofofinoinDDrebri 
106000
185.405413752
Predicted Text: ilev oe win, d ipl oy s hey 
s tekel t m ouuo ton paeid; in o wencidd an to th tred ivis 
e th seapll ch s ohevkems plen, we t; ale ule o t ss p hens d raldet orl taytoth b tk w tet, t nrs tt or, d, a 
107000
231.019434781
Predicted Text: hen t h 
l s t st fatouas sghorerein rameaid h. teran o w s thhass keAian,,, t whhe r auaw n ten thh 
J fod st, oy m teroipotem avn of alrtoce fote aad tofom. pich te f
 f, d bsce achid th ceuhhere ct 
108000
196.435042009
Predicted Text: lorhantodolicbino.onupofahofwonanofinoOeinunthialenapinonanelreclothanelarelenDalina corledenanoanofedrupasgolusay alo anofarrerithidovinarmecanaryounDfananofofalesnDounamasrirenuerofDanununulanubyhal 
109000
223.734512821
Predicted Text: shilofo