# Developing a Character-Based Neural Language Model
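* A language model predicts the next word in a sequence based on the specific words that have come before it. A character-based language model does the same thing one character at a time: it predicts the next character from the characters that precede it.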

## 1. Data Preparation


### 1.1 Load Text

In [1]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: path of the file to read; it is opened in text mode ('r').

    Returns:
        The full contents of the file, including any newline characters.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    # the context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee
    with open(filename, 'r') as file:
        return file.read()

In [2]:
# read the rhyme from disk and print it to confirm what was loaded
raw_text = load_doc('rhyme.txt')
print(raw_text)

Sing a song of sixpence,
A pocket full of rye.
Four and twenty blackbirds,
Baked in a pie.
When the pie was opened
The birds began to sing;
Wasn ' t that a dainty dish,
To set before the king.
The king was in his counting house,
Counting out his money;
The queen was in the parlour,
Eating bread and honey.
The maid was in the garden,
Hanging out the clothes,
When down came a blackbird
And pecked off her nose.



### 1.2 Clean Text

In [3]:
# clean: split() with no arguments breaks on any run of whitespace (newlines
# included), so re-joining with a single space turns the rhyme into one long
# line with exactly one space between words
tokens = raw_text.split()
raw_text = ' '.join(tokens)

### 1.3 Create Sequences

In [4]:
# organize into sequences of characters
# number of input characters in every sequence
length = 10
# slide a window over the text: each entry holds `length` input characters
# followed by the single character that comes next (length + 1 in total)
sequences = [raw_text[end - length:end + 1] for end in range(length, len(raw_text))]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 401


### 1.4 Save Sequences

In [5]:
# save sequences to file, one sequence per line
def save_doc(lines,filename):
    """Write the given strings to a text file, separated by newlines.

    Args:
        lines: iterable of strings; they are joined with '\\n', so no
            trailing newline is written after the last one.
        filename: path of the file to create or overwrite (opened with 'w').

    Raises:
        OSError: if the file cannot be opened for writing.
    """
    data = '\n'.join(lines)
    # the context manager flushes and closes the handle even if write() fails
    with open(filename,'w') as file:
        file.write(data)

In [6]:
# save the character sequences to file; save_doc writes one sequence per line
out_filename = 'char_sequences.txt'
save_doc(sequences,out_filename)

## 2. Train Language Model

### 2.1 Load Data

In [7]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: path of the file to read; it is opened in text mode ('r').

    Returns:
        The full contents of the file, including any newline characters.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    # the context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee
    with open(filename,'r') as file:
        return file.read()

# load the sequences file written in section 1.4
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
# raw_text keeps the '\n' separators; lines holds the individual sequences
lines = raw_text.split('\n')

### 2.2 Encode Sequences
* The sequences of characters must be encoded as integers.

In [9]:
# distinct characters in sorted order; raw_text still contains the '\n' line
# separators here, so the newline character is part of this vocabulary too
chars = sorted(set(raw_text))
# lookup table: character -> integer index (its position in `chars`)
mapping = {c: i for i, c in enumerate(chars)}

In [10]:
# integer encode every line: one list of character indices per sequence
sequences = [[mapping[char] for char in line] for line in lines]



In [11]:
# vocabulary size: the number of distinct characters in the mapping
vocab_size = len(mapping)
print('Vocabulary Size: %d' %vocab_size)

Vocabulary Size: 38


### 2.3 Split Inputs and Output

In [13]:
from numpy import array
# stack the encoded sequences into one integer array, one row per sequence
sequences = array(sequences)
# every column except the last is model input ...
X = sequences[:, :-1]
# ... and the last column is the character the model must predict
y = sequences[:, -1]

In [16]:
from keras.utils import to_categorical
# one-hot encode each input row separately, giving one 2-D array per sequence
# NOTE: X and y are rebound in place, so this cell is not idempotent;
# re-running it without re-running 2.3 would encode already-encoded data
sequences = [to_categorical(x,num_classes=vocab_size) for x in X]
# stack the per-sequence arrays into a single array for the model
X = array(sequences)
# one-hot encode the target characters with the same number of classes
y = to_categorical(y,num_classes=vocab_size)

### 2.5 Complete model

In [24]:
from numpy import array
from pickle import dump
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM


# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: path of the file to read; it is opened in text mode ('r').

    Returns:
        The full contents of the file, including any newline characters.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
    """
    # the context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee
    with open(filename, 'r') as file:
        return file.read()


# define the model
def define_model(X):
    """Build, compile and summarize the LSTM character model.

    The network is a single LSTM layer with 75 units followed by a softmax
    Dense layer. The Dense layer's width is read from the module-level
    `vocab_size`, which must be defined before this function is called.

    Args:
        X: input array; only X.shape[1] and X.shape[2] are used, as the
            LSTM input shape.

    Returns:
        The compiled Sequential model.

    Side effects:
        Prints the model summary and writes a diagram to 'LSTMmodel.png'.
    """
    n_steps, n_features = X.shape[1], X.shape[2]
    recurrent = LSTM(75, input_shape=(n_steps, n_features))
    classifier = Dense(vocab_size, activation= 'softmax' )

    model = Sequential()
    model.add(recurrent)
    model.add(classifier)
    # compile model
    model.compile(
        loss= 'categorical_crossentropy' ,
        optimizer= 'adam' ,
        metrics=[ 'accuracy' ],
    )
    # summarize defined model
    model.summary()
    plot_model(model, to_file= 'LSTMmodel.png' , show_shapes=True)
    return model


# load the saved sequences (one per line)
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
lines = raw_text.split('\n')

# integer encode sequences of characters
# (raw_text still contains the '\n' separators, so newline is in the vocabulary)
chars = sorted(set(raw_text))
mapping = {c: i for i, c in enumerate(chars)}
sequences = [[mapping[char] for char in line] for line in lines]

# vocabulary size
vocab_size = len(mapping)
print( 'Vocabulary Size: %d' % vocab_size)

# separate into input and output: the last column is the character to predict
sequences = array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

# one-hot encode inputs (row by row) and targets
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = array(sequences)
y = to_categorical(y, num_classes=vocab_size)

# define model
model = define_model(X)
# fit model
model.fit(X, y, epochs=100, verbose=2)
# save the model to file
model.save('LSTMmodel.h5')
# save the mapping; the context manager makes sure the pickle is flushed and
# the handle closed, which a bare open() passed to dump() does not guarantee
with open('mapping.pkl' , 'wb' ) as mapping_file:
    dump(mapping, mapping_file)

Vocabulary Size: 38
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 75)                34200     
_________________________________________________________________
dense_2 (Dense)              (None, 38)                2888      
Total params: 37,088
Trainable params: 37,088
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
13/13 - 0s - loss: 3.6161 - accuracy: 0.0599
Epoch 2/100
13/13 - 0s - loss: 3.5033 - accuracy: 0.1895
Epoch 3/100
13/13 - 0s - loss: 3.1646 - accuracy: 0.1945
Epoch 4/100
13/13 - 0s - loss: 3.0682 - accuracy: 0.1945
Epoch 5/100
13/13 - 0s - loss: 3.0129 - accuracy: 0.1945
Epoch 6/100
13/13 - 0s - loss: 2.9832 - accuracy: 0.1945
Epoch 7/100
13/13 - 0s - loss: 2.9735 - accuracy: 0.1945
Epoch 8/100
13/13 - 0s - loss: 2.9594 - accuracy: 0.1945
Epoch 9/100
13/13 - 0s - loss: 2.942

## 3. Generate Characters

In [26]:
from pickle import load
from keras.models import load_model
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# generate a sequence of characters with a language model
def generate_seq(model,mapping,seq_length,seed_text,n_chars):
    """Extend `seed_text` by `n_chars` characters predicted by `model`.

    Each step encodes the text generated so far, keeps its last
    `seq_length` characters, one-hot encodes them, and appends the
    character whose class has the highest predicted probability.

    Args:
        model: trained Keras model; only its predict() method is used.
        mapping: dict of character -> integer index used during training.
        seq_length: number of characters the model takes as input.
        seed_text: starting text; every character must be a key of `mapping`.
        n_chars: number of characters to generate.

    Returns:
        `seed_text` followed by the generated characters.

    Raises:
        KeyError: if the text contains a character that is not in `mapping`.
    """
    # invert the mapping once so that each predicted index is a dictionary
    # lookup instead of a scan over every (char, index) pair
    index_to_char = {index: char for char, index in mapping.items()}
    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length; truncating='pre' drops the
        # oldest characters so the most recent seq_length ones are kept
        encoded = pad_sequences([encoded],maxlen=seq_length,truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded,num_classes=len(mapping))
        encoded = encoded.reshape(1,encoded.shape[1],encoded.shape[2])
        # predict character: Sequential.predict_classes is deprecated and is
        # absent from newer Keras releases, so take the argmax of the softmax
        # output, which is the replacement the deprecation notice recommends
        probabilities = model.predict(encoded,verbose=0)
        yhat = int(probabilities.argmax(axis=-1)[0])
        # reverse map integer to character ('' if the index is unknown)
        in_text += index_to_char.get(yhat, '')
    return in_text

# load model
model = load_model('LSTMmodel.h5')

# load the mapping; the context manager closes the file once it has been read
# NOTE: unpickling can execute arbitrary code, so only load a mapping.pkl
# that was produced by this notebook
with open('mapping.pkl','rb') as mapping_file:
    mapping = load(mapping_file)

# generate 20 characters from three 10-character seeds:
#   'Sing a son' - start of rhyme
#   'king was i' - mid-line
#   'hello worl' - not in original
for seed_text in ('Sing a son', 'king was i', 'hello worl'):
    print(generate_seq(model,mapping,10,seed_text,20))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Sing a song of sixpence, A poc
king was in his counting house
hello worls Teeen  heek moe  a
