# 推文生成器

## 导入使用的库

In [16]:
import numpy as np
import pandas as pd
import data_load_utils as util
from math import ceil

from importlib import reload
util = reload (util)

# for cpu and memory profiling
#%load_ext line_profiler
#%load_ext memory_profiler

In [17]:
tweets = util.filter_tweets_min_count(util.read_tweet_data('data/emojis_homemade.csv'), min_count=1000)

tweets['text'] = util.filter_text_for_handles(tweets['text'])

In [18]:
tweets.iloc[0,:]

text     RT [VID] 181023 - Foi adicionada a letra D no ...
emoji                                                    ©
Name: 0, dtype: object

In [19]:
tweets.shape

(445129, 2)

## 推文处理的思路
* 将每条推文进行分词转化成字符串，并且进行one-hot编码
* 由于占用大量内存，可用批处理的方式进行改进：用一个window（size：64），每条推文选取32个training sample，执行3次。

In [20]:
MAX_TWEET_LENGTH = 160
WINDOW_SIZE = 64
STEP = 3

samples_per_tweet = int(ceil((MAX_TWEET_LENGTH - WINDOW_SIZE) / STEP)) # 32
tweets_per_batch = 64
samples_per_batch = samples_per_tweet * tweets_per_batch # 2048

chars_univ, chars_univ_idx = util.get_universal_chars_list()

In [21]:
TRAIN_SIZE = 2**18 # 32,768  try 131072 = 2**17 for production
DEV_SIZE = 2**12   # 8192  try 8192 = 2**13 for production

n_train_batches = TRAIN_SIZE / tweets_per_batch
n_dev_batches = DEV_SIZE / tweets_per_batch

tweets_train = tweets.iloc[0:TRAIN_SIZE] # 8192 = 2**13
tweets_dev = tweets.iloc[TRAIN_SIZE:TRAIN_SIZE+DEV_SIZE] # 2048 = 2**11

In [22]:
#初始化train，dev集合
train_generator = util.convert_tweet_to_xy_generator(tweets_train, length=MAX_TWEET_LENGTH, \
                                                            window_size=WINDOW_SIZE,step=STEP, \
                                                            batch_size=tweets_per_batch)

dev_generator = util.convert_tweet_to_xy_generator(tweets_dev, length=MAX_TWEET_LENGTH, \
                                                          window_size=WINDOW_SIZE,step=STEP, \
                                                          batch_size=tweets_per_batch)

## LSTM模型

In [23]:
import keras
from keras import layers
from keras.models import Sequential
from keras import callbacks

In [24]:
model = keras.models.load_model("models/tweet_gen_model-0.776.hdf5") # 256 LSTM units, ~30 epochs training  

#model = keras.models.Sequential()
#model.add(layers.LSTM(256, input_shape=(WINDOW_SIZE, len(chars_univ)))) # was 128 units
#model.add(layers.Dense(len(chars_univ), activation='softmax'))

# loss function - targets are one-hot encoded
#optimizer = keras.optimizers.RMSprop(lr=0.001)
#model.compile(loss='categorical_crossentropy', optimizer=optimizer)

## 通过character-by-character方法训练模型
* 计算下一个单词的概率分布情况
* 使用temperature对权重进行调整
* 按照重新分布的权重对下一个单词进行随机采样
* 加入新的单词到可用list

In [25]:
def sample (preds, temperature = 1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## 训练模型，生成text
* 在每个epoch采用不同的temperature

In [None]:
import random
import sys
#用于text generation的seed
n_seed_chars = 64
#重置学习率
model.optimizer.lr.assign(0.001)
#加入断点续训
checkpoint = callbacks.ModelCheckpoint(filepath='tweet_gen_model-{loss:.3f}.hdf5', 
                                       verbose=1, 
                                       save_best_only=True)

#训练60个epoch
for epoch in range (1, 60):
    print ('epoch', epoch)
    #训练模型
    model.fit_generator (train_generator,
                         steps_per_epoch=n_train_batches, # 64 x 32 = batches of 2048
                         epochs=1,
                         validation_data=dev_generator, 
                         validation_steps=n_dev_batches,
                         callbacks=[checkpoint],
                         verbose=1,
                         use_multiprocessing=True, # run the generator in a separate thread
                         )

    #随机选择text seed
    seed_tweet = tweets.iloc[random.randint(0, len(tweets))]
    seed_text = util.pad_text(seed_tweet['text'][0:n_seed_chars], n_seed_chars)
    generated_text = seed_text
    print ('--- Generating with seed: "' + generated_text + '"')

    #采用一系列的temperature
    for temperature in [0.3, 0.5, 0.8, 1.0]:
        generated_text = seed_text
        print ('--------- temperature:', temperature)
        sys.stdout.write(generated_text)

        for i in range (MAX_TWEET_LENGTH - n_seed_chars):
            #对生成的文本进行one-hot编码
            sampled = np.zeros((1, WINDOW_SIZE, len(chars_univ)))
            for t, char in enumerate (generated_text):
                sampled[0, t, chars_univ_idx[char]] = 1
            #使用模型进行预测
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars_univ[next_index]
            #生成文本
            generated_text += next_char
            generated_text = generated_text[1:]
            #打印文本
            sys.stdout.write(next_char)

        print ("\n")    

epoch 1
Epoch 1/1
  10/4096 [..............................] - ETA: 128:55:58 - loss: 0.9085

In [15]:
char_univ_idx

NameError: name 'char_univ_idx' is not defined