In [1]:
from theano.sandbox import cuda

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1070 (CNMeM is disabled, cuDNN 5103)


In [2]:
%matplotlib inline
#import utils; reload(utils)
#from utils import *
#from __future__ import division, print_function

In [3]:
import re

import numpy as np
from numpy.random import choice

from keras import backend as K
from keras.layers import TimeDistributed, Activation, Embedding, LSTM, Dropout, Dense
from keras.models import Sequential
from keras.optimizers import Adam
from keras.utils import get_file

Using Theano backend.


## Setup

We haven't really looked into the details of how this works yet, so this is provided for self-study for those who are interested. We'll look at it closely next week.

In [32]:
path = 'data/all_risk_factors.txt'
# Read inside a context manager so the file handle is closed as soon as the
# contents are in memory (the bare open(...).read() left it to the garbage collector).
with open(path) as corpus_file:
    text = corpus_file.read().lower()
# Collapse every run of 3 to 200 consecutive newlines into a single newline;
# the raw text contains large gaps of blank lines.
full_text = re.sub(r'\n{3,200}', '\n', text)
# Note: this reports the length of the lower-cased text *before* the newline collapse.
print('corpus length:', len(text))

corpus length: 39939355


In [5]:
!tail {path} -n100


 
  
 
   

   Table of Contents   

  business
combination with any holder of 15.0% or more of its capital stock unless the holder has held the stock for three years or, among other things, the board of directors has approved the
transaction. Any provision of our certificate of incorporation or bylaws or Delaware law that has the effect of delaying or deterring a change in control could limit the opportunity for our
stockholders to receive a premium for their shares of our common stock, and could also affect the price that some investors are willing to pay for our common stock.   


   


 


Our certificate of incorporation will also provide that the Court of Chancery of the State of
Delaware will be the exclusive forum for substantially all disputes between us and our stockholders, which could limit our stockholders' ability to obtain a favorable judicial forum for disputes with
us or our directors, officers or employees.     

  Our certificate of incorpora

In [33]:
full_text[-200:]

' might cause our stock price and trading volume to decline.   \n\n  44  \n\n \n  \n \n \n\n     \n \n \n \n \n \n \n \n  \n\n \n\n   table of contents    \n\n     \n   \n      special note regarding forward-looking statements'

In [7]:
# Character vocabulary: every distinct character of `text`, in sorted order.
distinct_chars = set(text)
chars = sorted(distinct_chars)
# One more than the number of distinct characters; a "\0" entry is added to
# `chars` in a later cell, which accounts for the extra slot.
vocab_size = 1 + len(chars)
print('total chars:', vocab_size)

total chars: 65


In [8]:
#to make it a more manageable size
text = full_text[0:1000000]

In [9]:
# Put "\0" at index 0 of the vocabulary. The guard keeps the cell idempotent:
# re-running it must not insert a second "\0", which would shift every
# character index and make `chars` longer than `vocab_size`.
if "\0" not in chars:
    chars.insert(0, "\0")

In [10]:
''.join(chars)

'\x00\t\n !"#$%&\'()*+,-./0123456789:;<=>?@[]_abcdefghijklmnopqrstuvwxyz'

In [11]:
# Two-way lookup tables between characters and their position in `chars`.
char_indices = {ch: pos for pos, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [12]:
idx = [char_indices[c] for c in text]

In [13]:
idx[:10]

[56, 47, 57, 49, 3, 44, 39, 41, 58, 53]

In [14]:
#to make sure going to indices and coming back works
''.join(indices_char[i] for i in idx[:70])

'risk factors   \n  investing in our common stock involves a high degree'

## Preprocess and create model

In [15]:
maxlen = 40
# Slide a window of `maxlen` indices over `idx`, one position at a time.
# `sentences[k]` is the input window starting at k; `next_chars[k]` is the same
# window shifted right by one character (the per-position prediction targets).
# The final start position leaves only maxlen-1 indices for its target slice,
# so the last entry of `next_chars` is one element short.
n_windows = len(idx) - maxlen + 1
sentences = [idx[start: start + maxlen] for start in range(n_windows)]
next_chars = [idx[start + 1: start + maxlen + 1] for start in range(n_windows)]
print('nb sequences:', len(sentences))

nb sequences: 999961


In [16]:
#checking to see what's in sentences array in words (should be a sliding window of 40 char strings)
''.join(indices_char[i] for i in sentences[5])

'factors   \n  investing in our common sto'

In [17]:
# `sentences` is the input (x) and `next_chars` is the correct output (y).
# The last two windows are dropped (the final target window is one element short),
# which leaves rectangular data that np.array can turn into a 2-D matrix directly;
# this avoids creating and concatenating one single-row array per window.
# Caution: the cell rebinds both names, so re-running it trims two more rows.
sentences = np.array(sentences[:-2])
next_chars = np.array(next_chars[:-2])

In [18]:
sentences.shape, next_chars.shape

((999959, 40), (999959, 40))

In [19]:
n_fac = 24

In [22]:
# Character-level model: embedding -> two stacked 512-unit LSTMs (each followed by
# dropout) -> a per-timestep dense layer with softmax over the vocabulary.
# Because both LSTMs use return_sequences=True they emit one output per timestep,
# so the Dense decoder is wrapped in TimeDistributed: the same dense layer is
# applied to every position of the output sequence.
model = Sequential()
model.add(Embedding(vocab_size, n_fac, input_length=maxlen))
model.add(LSTM(512, input_dim=n_fac, return_sequences=True, dropout_U=0.2, dropout_W=0.2,
               consume_less='gpu'))
model.add(Dropout(0.2))
model.add(LSTM(512, return_sequences=True, dropout_U=0.2, dropout_W=0.2,
               consume_less='gpu'))
model.add(Dropout(0.2))
model.add(TimeDistributed(Dense(vocab_size)))
model.add(Activation('softmax'))



In [23]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

## Train

In [63]:
def print_example(seed_string="investing in our common stock involves a", n_chars=320, window=40):
    """Sample text from the global `model` one character at a time and print it.

    Each step feeds the last `window` characters of the text generated so far to
    the model, takes the predicted distribution for the final timestep, and draws
    the next character from it at random.

    Parameters
    ----------
    seed_string : str
        Text used to start generation. Every character must be a key of the
        global `char_indices`. The default is the original hard-coded seed.
    n_chars : int
        Number of characters to generate (default 320, as before).
    window : int
        How many trailing characters are fed to the model per step (default 40,
        as before). NOTE(review): this presumably has to match the
        `input_length` the model was built with, and the seed should be at
        least this long -- confirm before changing it.

    Relies on the notebook globals `model`, `char_indices` and `chars`.
    """
    for _ in range(n_chars):
        # Shape (1, window): a batch containing the single current context.
        x = np.array([char_indices[c] for c in seed_string[-window:]])[np.newaxis, :]
        # Distribution over the vocabulary for the last position of the sequence.
        preds = model.predict(x, verbose=0)[0][-1]
        # Renormalise in float64 so rounding in lower-precision model output
        # cannot make the probabilities fail choice()'s sum-to-one check.
        preds = np.asarray(preds, dtype=np.float64)
        preds = preds / preds.sum()
        next_char = choice(chars, p=preds)
        seed_string = seed_string + next_char
    print(seed_string)

In [25]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=1, verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x7fbe7aba1278>

In [31]:
print_example()

investing in our class a common stock in the future.      

          the proposed cvise additional losses from each years after the commercial property rights is not awarred. this promises not been required to achieve our use of any financial statements.    
          we expect to effectively retain regulatory approval of any renew or recruit adjustments,
c


In [34]:
#grab the next million characters
text = full_text[1000000:2000000]

In [37]:
def convert_to_training_data(text, char_indices, maxlen=40):
    """Turn raw text into sliding-window (input, target) index matrices.

    Parameters
    ----------
    text : str
        Text to encode; every character must be a key of `char_indices`
        (a missing character raises KeyError).
    char_indices : dict
        Mapping from character to integer index.
    maxlen : int
        Window length. Defaults to 40, the value that used to be hard-coded.

    Returns
    -------
    (sentences, next_chars) : tuple of 2-D integer numpy arrays
        Both have shape (len(text) - maxlen - 1, maxlen), or (0, maxlen) when the
        text is too short to yield any row. Row k of `sentences` holds the indices
        of text[k : k + maxlen]; row k of `next_chars` holds the same window
        shifted right by one character.

    The function prints the number of window start positions
    (len(text) - maxlen + 1, floored at 0) and then, exactly as before, leaves
    out the last two of them: the final window has no complete shifted target,
    and the one before it is dropped as well so the returned shapes stay the same
    as earlier runs of this notebook.
    """
    idx = np.array([char_indices[c] for c in text], dtype=int)

    n_windows = max(len(idx) - maxlen + 1, 0)
    print('nb sequences:', n_windows)

    n_rows = max(n_windows - 2, 0)
    # window_index[k, j] == k + j, built by broadcasting a column of start
    # positions against a row of in-window offsets; indexing `idx` with it
    # yields all windows at once instead of via Python lists of lists.
    window_index = np.arange(n_rows)[:, np.newaxis] + np.arange(maxlen)[np.newaxis, :]
    sentences = idx[window_index]
    next_chars = idx[window_index + 1]
    return sentences, next_chars

In [38]:
(sentences, next_chars) = convert_to_training_data(text, char_indices)

nb sequences: 999961


In [41]:
#check to make sure it worked
''.join(indices_char[i] for i in sentences[5])

'f we are unable to satisfy\nmargin calls,'

In [42]:
#check to make sure it worked
sentences.shape, next_chars.shape

((999959, 40), (999959, 40))

In [43]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=1, verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x7fbe8854a978>

In [44]:
print_example()

investing in our class a common stock in the event of a products. these
subsidiaries may be required to repurchase our soc solutions to decline.

    risks related to our
business will not
file additional expansion or technology depends on our ability to government securities of defects specialty measures and experienced
initial public offering price. our so


In [48]:
model.save_weights('data/weights/risk_factors_2_epochs.h5')

In [45]:
#grab the next million characters
text = full_text[2000000:3000000]

In [46]:
(sentences, next_chars) = convert_to_training_data(text, char_indices)

nb sequences: 999961


In [47]:
# Set the optimizer's learning rate by writing into its backend variable.
# Plain assignment (`model.optimizer.lr = 0.001`) only rebinds the Python
# attribute to a float; a training function that has already been built keeps
# using the original variable, so the assignment would not affect later fits.
K.set_value(model.optimizer.lr, 0.001)

In [49]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=1, verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x7fbe8854f2e8>

In [50]:
print_example()

investing in our class a common stock in the court will be required to make it more difficult to realize values for the audit of identifying risks related to whether other engage to hazo
over the regulatory authoritys or deterioration act with our total assets and the counterparty authority and affect our revenues and analysis of more than 5% of our common s


In [51]:
# Lower the learning rate to 1e-4 by re-compiling with a fresh Adam optimizer.
# Plain assignment (`model.optimizer.lr = 0.0001`) only rebinds the Python
# attribute; a training function that has already been built does not see it,
# so subsequent fits would keep the old rate. Re-compiling works regardless of
# what the `lr` attribute currently holds; layer weights are kept, while Adam's
# running moment estimates start over.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.0001))

In [52]:
model.save_weights('data/weights/risk_factors_3_epochs.h5')

In [54]:
#grab the next million characters
text = full_text[3000000:4000000]

In [55]:
(sentences, next_chars) = convert_to_training_data(text, char_indices)

nb sequences: 999961


In [56]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=1, verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x7fbe889bce80>

In [57]:
print_example()

investing in our class a common stock include:  

 
 
  &#149;     significant time and attempt to defend any customer arrangements and could
negatively impact our operating results,
limit our
ability to comply with us, which could result in substantial funds licensed to9 credit on
investment.      in addition to the revoke sales may
take advantage of, resul


In [58]:
# Fourth training pass: save under its own name so the 3-pass checkpoint
# written earlier is not overwritten and the file name matches its contents.
model.save_weights('data/weights/risk_factors_4_epochs.h5')

In [59]:
#grab the next million characters
text = full_text[4000000:5000000]

In [60]:
(sentences, next_chars) = convert_to_training_data(text, char_indices)

nb sequences: 999961


In [61]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=1, verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x7fbe8854ab00>

In [64]:
print_example()

investing in our common stock involves a high level of future local companies, our business depends on our ability to fund and retain qualified personnel, these competitors, including $86.2 will be freely considered even if we involve various factors, including
the timing of new products, such as the fda, cfda,
ema and industry- industrybose similar or penal


In [65]:
model.save_weights('data/weights/risk_factors_5_epochs.h5')

In [66]:
#grab the next 5 million characters
text = full_text[5000000:10000000]

In [None]:
(sentences, next_chars) = convert_to_training_data(text, char_indices)

nb sequences: 24999961


In [None]:
# Lower the learning rate to 1e-5 by re-compiling with a fresh Adam optimizer.
# Plain assignment (`model.optimizer.lr = 0.00001`) only rebinds the Python
# attribute; a training function that has already been built does not see it,
# so subsequent fits would keep the old rate. Re-compiling works regardless of
# what the `lr` attribute currently holds; layer weights are kept, while Adam's
# running moment estimates start over.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.00001))

In [None]:
model.fit(sentences, np.expand_dims(next_chars,-1), batch_size=64, epochs=1, verbose=1)

In [None]:
print_example()

In [None]:
print_example()

In [None]:
model.save_weights('data/char_rnn.h5')