In [1]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from utils import ProgressBar
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split

In [2]:
sentlength = 10
limitation = 10000

enlines = []
START = '<START>'
END = '<END>'

pb = ProgressBar(worksum=10000000)
pb.startjob()
num = 0
complete = 0
# 只取英文句子
with open('data/segmented_train_seg_by_word.txt') as fhdl:
    for line in fhdl:
        num += 1
        # 使用行号作为标注，奇数行为英文，偶数行为中文
        if num % 2 == 1:
            enline = line
        else:
            continue
        
        enlinesp = [START] + [i.lower() for i in enline.strip("\n").split()] + [END]
        # 设置一个最大长度提升demo速度
        if len(enlinesp) <= sentlength:
            enlines.append(enlinesp)
            
        if (num // 2) % 1000 == 0:
            complete += 1000
            pb.complete(1000)
        
        if len(enlines) >= limitation:
            pb.complete(10000000 - complete)
            break



In [3]:
word2ind = {}
ind2word = {}

def addchar(word2ind,ind2word,char):
    if char in word2ind:
        return 
    ind2word[len(word2ind)] = char
    word2ind[char] = len(word2ind)

In [4]:
specialchars = ['<pad>']

for word in specialchars:
    addchar(word2ind,ind2word, word)

for line in enlines:
    for word in line:
        addchar(word2ind,ind2word,word)

In [5]:
def create_seq_segment(sequences):
    seq_segments = list()
    for line in sequences:
        for i in range(1, len(line)):
            seq = line[0:i+1]
            seq_segments.append(seq)
    return seq_segments

In [6]:
seq_segments = create_seq_segment(enlines)

In [7]:
x = []
y = []
for segment in seq_segments:
    seg_ids = [word2ind.get(i) for i in segment]
    x.append(seg_ids)
    y.append(seg_ids[-1])

In [8]:
vocab = len(word2ind)
x_np = tf.keras.preprocessing.sequence.pad_sequences(x,padding='post',value=word2ind['<pad>'])
y_np = np.asarray(y)
y_np = to_categorical(y_np, num_classes=vocab)
x_tr, x_val, y_tr, y_val = train_test_split(x_np, y_np, test_size=0.1, random_state=1)

In [9]:
# define model
model = Sequential()
model.add(Embedding(vocab, 50, input_length=sentlength, trainable=True))
model.add(LSTM(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(vocab, activation='softmax'))
print(model.summary())

# compile the model
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
# fit the model
model.fit(x_np, y_np, epochs=20, verbose=2, validation_data=(x_val, y_val))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 50)            363950    
_________________________________________________________________
lstm (LSTM)                  (None, 150)               120600    
_________________________________________________________________
dense (Dense)                (None, 7279)              1099129   
Total params: 1,583,679
Trainable params: 1,583,679
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/20
2257/2257 - 68s - loss: 4.8086 - acc: 0.2746 - val_loss: 3.6073 - val_acc: 0.3825
Epoch 2/20
2257/2257 - 66s - loss: 3.1531 - acc: 0.4653 - val_loss: 2.5775 - val_acc: 0.5740
Epoch 3/20
2257/2257 - 65s - loss: 2.4608 - acc: 0.5857 - val_loss: 2.0572 - val_acc: 0.6536
Epoch 4/20
2257/2257 - 69s - loss: 2.0184 - acc: 0.6557 - val_loss: 1.6546 - val_acc: 0.7154

<tensorflow.python.keras.callbacks.History at 0x7f93160e5f10>

In [10]:
# generate a sequence of characters with a language model
def generate_seq(model, ind2word, max_seq_length):
    in_text = [word2ind[START]]
    new_word = None
    while new_word != word2ind[END] and len(in_text) < max_seq_length:
        encoded = pad_sequences([in_text], maxlen=max_seq_length, truncating='pre')
        word_probs = model.predict(encoded, verbose=0)[0]
        r = random.random()
        accumulator = .0
        
        while accumulator < r:
            new_word = np.argmax(word_probs)
            accumulator += word_probs[new_word]
            word_probs[new_word] = 0
            
        in_text.append(new_word.item())
    return ' '.join([ind2word[word] for word in in_text[1:]])

In [15]:
for i in range(10):
    sent = generate_seq(model, ind2word, sentlength)
    print(i, sent)

0 hundred keep inconsistencies keep such 15 wanted give dealt
1 actually clean easy pen hens large classroom oath friendship
2 1000 nobody double large stop kills sit speak garrison
3 without keep face freshman nobody bloody rubbish best squadron
4 tell 18 15 brilliant vodka 14 65 fault holiday
5 fifty keep nobody if missile 15 stop terrible necklace
6 18 springs words 18 keep hmm jury wooden guv
7 classes hens zip nobody clean weeks kingdom barely papassian
8 band saved neither 15 15 called large life continuous
9 fifty upon nobody nobody fifty mean nobody kills detect
