<a href="https://colab.research.google.com/github/hogch/masterproject_gan/blob/master/SeqGAN_headlines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation using SeqGAN

This notebook generates news headlines using the Machine Learning technology GAN (Generative Adversarial Networks).

## Import dependencies



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/My\ Drive/Colab\ Notebooks/Masterproject
!ls

/content/drive/My Drive/Colab Notebooks/Masterproject
abcnews-date-text.csv  screenshots
Graph		       SeqGAN
headlines.csv	       SeqGAN_headlines_dataloading_experiments.ipynb
news-headlines.db      SeqGAN_headlines.ipynb
results


**Install required dependencies manually**

In [3]:
!pip install tflearn
!pip install tqdm
!pip install --force https://github.com/chengs/tqdm/archive/colab.zip
!pip install tensorboardcolab

Collecting https://github.com/chengs/tqdm/archive/colab.zip
  Downloading https://github.com/chengs/tqdm/archive/colab.zip
[K     | 481kB 1.3MB/s
Building wheels for collected packages: tqdm
  Running setup.py bdist_wheel for tqdm ... [?25l- \ done
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-u67tyeh3/wheels/41/18/ee/d5dd158441b27965855b1bbae03fa2d8a91fe645c01b419896
Successfully built tqdm
[31mspacy 2.0.18 has requirement numpy>=1.15.0, but you'll have numpy 1.14.6 which is incompatible.[0m
[31mfeaturetools 0.4.1 has requirement pandas>=0.23.0, but you'll have pandas 0.22.0 which is incompatible.[0m
Installing collected packages: tqdm
  Found existing installation: tqdm 4.28.1
    Uninstalling tqdm-4.28.1:
      Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.28.1


**Import required modules**

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import datetime

from keras.preprocessing.text import Tokenizer
from tflearn.data_utils import pad_sequences, to_categorical
from tensorflow.contrib import slim
from tqdm import tqdm, tnrange

from tensorboardcolab import *

Using TensorFlow backend.


In [5]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


## Load Dataset

In [6]:
df_real = pd.read_csv('headlines.csv', sep=',', usecols=['text', 'fake'])
df_real = df_real.sample(frac=1)
df_real = df_real[:50000]
print(df_real.shape)
df_real.head()

(50000, 2)


Unnamed: 0,text,fake
64917,qld govt declines powerline green plea,0
880645,us urges china to account for tiananmen crackdown,0
17698,tourism delegates to head north west,0
598829,tough laws dont deter se dogs,0
423240,rudd applauds china economic package,0


In [7]:
df_fake = pd.DataFrame(columns=['text', 'fake'])
print(df_fake.shape)
df_fake.head()

(0, 2)


Unnamed: 0,text,fake


## Define Test-, Trainingset and Hyper-Parameter

In [0]:
# General Hyper-Parameter
BATCH_SIZE = 32
VOCAB_SIZE = 2000 # default value (later dynamic assigned with length of word index)
SEQ_LENGTH = 8 # average sequence length of the given sentences in the dataset
TRAINING_SPLIT = 0.2
TOTAL_EPOCH = 200

# Discriminator Hyper-Parameter
D_PRETRAIN_EPOCHS = 50
D_EMB_SIZE = 100
D_EMB_DIM = 64
D_FILTER_SIZES = [2,3]
D_NUM_CLASSES = 2
D_NUM_FILTERS = 50

# Generator Hyper-Parameter
G_PRETRAIN_EPOCHS = 1000
G_START_TOKEN = 0
G_EMB_SIZE = 100
G_EMB_DIM = 32
G_HIDDEN_DIM = 32 # hidden state dimension of lstm cell

In [0]:
import random

def get_datasets(texts, labels, tokenizer=None):
  if tokenizer is None:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)

  sequences = tokenizer.texts_to_sequences(texts)
  word_index = tokenizer.word_index
  text_seq = pad_sequences(sequences, maxlen=SEQ_LENGTH)
    
  labels = np.asarray(labels)

  indices = np.arange(text_seq.shape[0])
  np.random.shuffle(indices)
  text_seq = text_seq[indices]
  labels = labels[indices]
  test_size = int(TRAINING_SPLIT * text_seq.shape[0])
  
  X_train = text_seq[:test_size]
  y_train = to_categorical(labels[:test_size], 2)
  X_test = text_seq[test_size:]
  y_test = labels[test_size:]
  
  return tokenizer, word_index, X_train, y_train, X_test, y_test
  
def load_fake_data():
  texts = []
  labels = []
  
  global df_fake
  df_fake = df_fake.sample(frac=1)
  
  for row in zip(df_fake['text'], df_fake['fake']):
    texts.append(row[0].strip())
    labels.append(row[1])

  tokenizer, word_index, X_train, y_train, X_test, y_test = get_datasets(texts, labels)
  VOCAB_SIZE = len(word_index) # number of different words in the dataset
  
  return tokenizer, word_index, X_train, y_train, X_test, y_test

def load_real_data():
  texts = []
  labels = []
  
  global df_real
  df_real = df_real.sample(frac=1)
  
  for row in zip(df_real['text'], df_real['fake']):
    texts.append(row[0].strip())
    labels.append(row[1])

  tokenizer, word_index, X_train, y_train, X_test, y_test = get_datasets(texts, labels)
  VOCAB_SIZE = len(word_index) # number of different words in the dataset
  
  return tokenizer, word_index, X_train, y_train, X_test, y_test

def load_mixed_data():
  texts = []
  labels = []
  
  df = pd.concat([df_real, df_fake])
  df = df.sample(frac=1)

  for row in zip(df['text'], df['fake']):
    texts.append(row[0].strip())
    labels.append(row[1])

  tokenizer, word_index, X_train, y_train, X_test, y_test = get_datasets(texts, labels)
  VOCAB_SIZE = len(word_index) # number of different words in the dataset
  
  return tokenizer, word_index, X_train, y_train, X_test, y_test


## Discriminator
model for classifying sequences (here headlines) as real or fake.
In this implementation the discriminative model uses following layers: 
1.   embedding layer
2.   convolution layer
3.   max-pooling layer
4.   softmax layer

In [0]:
class Discriminator:
  def __init__(self, vocab_size, seq_length, emb_size, filter_sizes, num_classes, num_filters, learning_rate):
    self.vocab_size = vocab_size
    self.emb_size = emb_size
    self.seq_length = seq_length
    self.filter_sizes = filter_sizes
    self.num_classes = num_classes
    self.num_filters = num_filters
    self.learning_rate = learning_rate

    self.X_input = tf.placeholder(tf.int32, shape=[None, self.seq_length], name='X_input')
    self.y_input = tf.placeholder(tf.int32, shape=[None, self.num_classes], name='y_input')
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
    
    # Keeping track of l2 regularization loss (optional)
    self.l2_reg_lambda = 0.2
    self.l2_loss = tf.constant(0.0)

  def build_model(self):
    with tf.variable_scope('discriminator', reuse=tf.AUTO_REUSE):
      self.embedding_layer = self.build_embedding_layer()
      self.convolution_maxpool_layer = self.build_convolution_maxpool_layer()
      self.scores, self.predictions = self.build_softmax_layer()

      self.calc_mean_cross_entropy_loss()
      self.calc_accuracy_and_cost()

      self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
      
      self.d_summary = tf.summary.merge([
          tf.summary.scalar('d_loss', self.loss),
          tf.summary.scalar('d_acc', self.accuracy)
      ])
        
  def build_embedding_layer(self):
    with tf.device('gpu:0'), tf.name_scope('embedding_layer'):
      W_emb = tf.Variable(
          initial_value=tf.random_uniform([self.vocab_size, self.emb_size], -1.0, 1.0), 
          name='W'
      )
      emb_chars = tf.nn.embedding_lookup(W_emb, self.X_input)
      self.emb_chars_expand = tf.expand_dims(emb_chars, -1)
    
  def build_convolution_maxpool_layer(self):
    pooled_outputs = []
    for filter_size in self.filter_sizes:
      with tf.name_scope('conv-maxpool-%s' % filter_size):
        # Convolution Layer
        filter_shape = [filter_size, self.emb_size, 1, self.num_filters]
        W_filters = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
        b = tf.Variable(tf.constant(0.1, shape=[self.num_filters]), name='b')
        conv = tf.nn.conv2d(
            input=self.emb_chars_expand,
            filter=W_filters,
            strides=[1,1,1,1],
            padding='VALID',
            name='conv'
        )
        # Apply non-linearity - activation function
        activation = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
        # Maxpooling over outputs
        max_pooling = tf.nn.max_pool(
            value=activation,
            ksize=[1, self.seq_length-filter_size+1, 1, 1],
            strides=[1,1,1,1],
            padding='VALID',
            name='max_pooling'
        )
        pooled_outputs.append(max_pooling)
              
    # combine all the pooled features
    self.num_filter_total = self.num_filters * len(self.filter_sizes)
    h_pool = tf.concat(pooled_outputs, axis=3)
    return tf.reshape(h_pool, [-1, self.num_filter_total])
        
  def build_softmax_layer(self): 
    with tf.name_scope('highway'):
      self.h_highway = self.highway(
          self.convolution_maxpool_layer, self.convolution_maxpool_layer.get_shape()[1], 1, 0
      )

    with tf.name_scope('dropout'):
      self.h_drop = tf.nn.dropout(self.h_highway, self.dropout_keep_prob)
      
    with tf.name_scope('softmax_output'):
      W_softmax = tf.Variable(
          tf.truncated_normal(
              [self.num_filter_total, self.num_classes], 
              stddev=0.1
          ), name='W_softmax'
      )
      b_softmax = tf.Variable(tf.constant(0.1, shape=[self.num_classes]), name='b_softmax')
            
      self.l2_loss += tf.nn.l2_loss(W_softmax)
      self.l2_loss += tf.nn.l2_loss(b_softmax)
      
      self.scores = tf.nn.xw_plus_b(self.h_drop, W_softmax, b_softmax, name='scores')
      #self.scores = tf.matmul(self.convolution_maxpool_layer, W_softmax) + b_softmax
      self.truth_prob = tf.nn.softmax(self.scores)
      predictions = tf.argmax(self.scores, 1, name="predictions")
      
    return self.scores, predictions
  
  def calc_mean_cross_entropy_loss(self):
    with tf.name_scope('mse_loss'):
      losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores, labels=self.y_input)
      self.loss = tf.reduce_mean(losses) + self.l2_reg_lambda * self.l2_loss
      
  def calc_accuracy_and_cost(self):
    with tf.name_scope('accuracy'):
      correct_predictions = tf.equal(self.predictions, tf.argmax(self.y_input, 1))
      self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'), name='accuracy')
      self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.scores, labels=self.y_input))
      
  def highway(self, input_, size, num_layers=1, bias=-2.0, f=tf.nn.relu):
    """Highway Network (cf. http://arxiv.org/abs/1505.00387).
    t = sigmoid(Wy + b)
    z = t * g(Wy + b) + (1 - t) * y
    where g is nonlinearity, t is transform gate, and (1 - t) is carry gate.
    """
    with tf.variable_scope('highway'):
      size = int(size)
      for idx in range(num_layers):
        g = f(slim.fully_connected(input_, size, scope='highway_lin_%d' % idx, activation_fn=None))
        t = tf.sigmoid(slim.fully_connected(input_, size, scope='highway_gate_%d' % idx, activation_fn=None) + bias)

        output = t * g + (1. - t) * input_
        input_ = output
        
    return output
  
  def get_truth_prob(self, sess, X):
    feed_dict = { self.X_input: X, self.dropout_keep_prob: 1.0 }
        
    return sess.run(self.truth_prob, feed_dict=feed_dict)
        
  def train(self, sess, X, y, dropout):
    sess.run(tf.global_variables_initializer())
    feed_dict = {
        self.X_input: X,
        self.y_input: y,
        self.dropout_keep_prob: dropout
    }
    _, summary = sess.run([self.optimizer, self.d_summary], feed_dict=feed_dict)
    
    return summary

## Build Discriminator model and train model

In [0]:
tf.reset_default_graph()

discriminator = Discriminator(VOCAB_SIZE, SEQ_LENGTH, D_EMB_SIZE, D_FILTER_SIZES, 
                              D_NUM_CLASSES, D_NUM_FILTERS, 0.001)
discriminator.build_model()

## Generator

LSTM with Reinforcement Learning for sequence generation.

In [0]:
from tensorflow.contrib import rnn, seq2seq, slim
from __future__ import print_function

class Generator:
  def __init__(self, batch_size, seq_length, vocab_size, emb_size, emb_dim, 
               hidden_dim, start_token, learning_rate):
    self.batch_size = batch_size
    self.seq_length = seq_length
    self.vocab_size = vocab_size
    self.emb_size = emb_size
    self.emb_dim = emb_dim
    self.hidden_dim = hidden_dim
    self.start_token = start_token
    self.learning_rate = learning_rate
    self.grad_clip = 5.0
    
    self.given_tokens = tf.placeholder(tf.int32, shape=[self.batch_size, self.seq_length], name='given_tokens')
    self.start_tokens = tf.Variable(tf.tile([self.start_token], [self.batch_size]), name='start_tokens')
    self.decoder_lengths = tf.placeholder(tf.int32, shape=[self.batch_size], name='decoder_lengths')
    
    with tf.variable_scope('generator', reuse=tf.AUTO_REUSE):
      self.g_rnn = rnn.LSTMCell(hidden_dim)
      self.g_embeddings = tf.Variable(tf.random_normal([self.vocab_size, self.emb_dim], stddev=0.1)) # init embeddings matrix
      self.g_embedding_fn = tf.nn.embedding_lookup(self.g_embeddings, self.given_tokens)
      
      self.decision_W = tf.Variable(tf.random_normal([self.hidden_dim, self.vocab_size]), name='decision_W')
      self.decision_b = tf.Variable(tf.zeros([self.vocab_size]), name='decision_b')
      
      self.output_ids = []
      self.output_probs = []
            
      for i in range(self.seq_length+1):
        self.decoder_lengths = np.int32(np.ones((self.batch_size), dtype=int) * self.seq_length)
        helper = seq2seq.TrainingHelper(self.g_embedding_fn, self.decoder_lengths, time_major=False)
        decoder = seq2seq.BasicDecoder(
            cell=self.g_rnn, helper=helper, 
            initial_state=self.g_rnn.zero_state(self.batch_size, tf.float32)
        )
        # https://gist.github.com/higepon/eb81ba0f6663a57ff1908442ce753084
        # Dynamic decoding
        # final_outputs: rnn_output=list of RNN state, sample_id=list of argmax of rnn_output
        # final_state: list of final state of RNN on decode process
        # final_seq_lengths: list of each decoded sequence
        final_outputs, final_state, final_seq_lengths = \
            seq2seq.dynamic_decode(decoder)

        self.output_ids.append(final_outputs.sample_id)
        self.output_probs.append(tf.nn.softmax(
            tf.tensordot(final_outputs.rnn_output, self.decision_W, axes=[[2],[0]])
            + self.decision_b[None,None,:])
        )
    
      # PRETRAIN
      # all tokens from given sequence
      logit = final_outputs.rnn_output
      self.pretrain_loss = tf.reduce_mean(
          tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.given_tokens, logits=logit)
      )
      self.pretrain_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
      self.pretrain_op = slim.learning.create_train_op(
          self.pretrain_loss, self.pretrain_optimizer, clip_gradient_norm=5.0
      )
      self.pretrain_summary = tf.summary.scalar('g_pretrain_loss', self.pretrain_loss)

      # reinforcement learning
      self.rewards = tf.placeholder(tf.float32, shape=[self.batch_size, self.seq_length], name='rewards')
      g_seq = self.output_ids[self.seq_length] # follow the generated one
      g_prob = self.output_probs[self.seq_length]
      g_loss = -tf.reduce_mean(
          tf.reduce_sum(
              tf.one_hot(g_seq, self.vocab_size) * 
              tf.log(tf.clip_by_value(g_prob, 1e-8, 1-1e-8)), -1
          ) * self.rewards
      )
      g_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
      self.g_op = slim.learning.create_train_op(g_loss, g_optimizer, clip_gradient_norm=5.0)
      self.g_summary = tf.summary.merge([
          tf.summary.scalar('g_loss', g_loss),
          tf.summary.scalar('g_reward', tf.reduce_mean(self.rewards))
      ])
      self.image_summary = tf.summary.merge([
          tf.summary.image('real_samples', tf.expand_dims(
              tf.one_hot(self.given_tokens, self.vocab_size), -1)
          ),
          tf.summary.image('fake_samples', tf.expand_dims(
              tf.one_hot(self.output_ids[0], self.vocab_size), -1)
          )
      ])
  
  def generate(self, sess, given_tokens):
    feed_dict = { self.given_tokens: given_tokens }
    
    return sess.run(self.output_ids[0], feed_dict=feed_dict)
      
  def rollout(self, sess, given_tokens, keep_steps=0, with_probs=False):
    feed_dict = { self.given_tokens: given_tokens }
    if with_probs:
      output_tensors = [self.output_ids[keep_steps], self.output_probs[keep_steps]]
    else:
      output_tensors = self.output_ids[keep_steps]
      
    return sess.run(output_tensors, feed_dict=feed_dict)
  
  def get_reward(self, sess, given_tokens, rollout_num, dis):
    rewards = []
    for i in range(rollout_num):
      for keep_num in range(1, self.seq_length+1):    
        # Markov Chain Sample / Monte Carlo Sample??
        mc_sample = self.rollout(sess, given_tokens, keep_steps=keep_num)
        truth_prob = dis.get_truth_prob(sess, mc_sample)
        ypred = np.array([item[1] for item in truth_prob])
        if i == 0:
          rewards.append(ypred)
        else:
          rewards[keep_num-1] += ypred
        
    rewards = np.transpose(np.array(rewards)) / (1.0 * rollout_num)
    
    return rewards
  
  def pretrain(self, sess, given_tokens):
    feed_dict = { self.given_tokens: given_tokens }
    sess.run(self.pretrain_op, feed_dict=feed_dict)
    _, summary = sess.run([self.pretrain_op, self.pretrain_summary], feed_dict=feed_dict)
    
    return summary
  
  def train(self, sess, given_tokens, rewards):
    feed_dict = { self.given_tokens: given_tokens, self.rewards: rewards }
    _, summary = sess.run([self.g_op, self.g_summary], feed_dict=feed_dict)
    
    return summary
    

## Build Generator model

In [0]:
generator = Generator(BATCH_SIZE, SEQ_LENGTH, VOCAB_SIZE, G_EMB_SIZE, 
                      G_EMB_DIM, G_HIDDEN_DIM, G_START_TOKEN, 0.0001)

## Start Adversarial Training

In [14]:

tbc=TensorBoardColab()

LOG_DIR = '/tmp/log'
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)

Wait for 8 seconds...
TensorBoard link:
https://a003a35f.ngrok.io


In [0]:
def get_text(tokenizer, samples):
  reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
  
  # Function takes a tokenized sentence and returns the words
  def sequence_to_text(samples):
      # Looking up words in dictionary
      words = [reverse_word_map.get(letter) for letter in samples]
      return words

  # Creating texts 
  samples_text = list(map(sequence_to_text, samples))
  return samples_text

def add_fake_samples(tokenizer, samples):
  samples_text = get_text(tokenizer, samples)
  
  headlines = []
  for item in samples_text:
    headline = ''
    for word in item:
      headline += '%s ' % word

    global df_fake
    df_fake = df_fake.append({'text' : headline , 'fake' : 1} , ignore_index=True)


In [17]:
with tf.Session() as sess:
  print('start GAN training at', datetime.datetime.now())
  writer = tbc.get_writer()
  sess.run(tf.global_variables_initializer())
    
  """# PRETRAINING GENERATOR
  for epoch in range(G_PRETRAIN_EPOCHS):
    # seq_length -1 und von 1 anfangen
    tokenizer, word_index, X_train, y_train, X_test, y_test = load_mixed_data()
    summary = generator.pretrain(sess, X_train[:BATCH_SIZE])
    writer.add_summary(summary, epoch)
    
  # PRETRAIN DISCRIMINATOR
  for eoch in range(D_PRETRAIN_EPOCHS):
    fake_samples = generator.generator(sess)
    translate_samples(tokenizer, fake_samples, df_fake)
    tokenizer, word_index, X_train, y_train, X_test, y_test = load_data()
    
    for _ in range(3):
      discriminator.train(sess, X_train, y_train, 5, BATCH_SIZE, .001)"""

  # ADVERSARIAL TRAINING
  for epoch in tnrange(TOTAL_EPOCH, desc='gan_epoch_loop'):
    tokenizer, word_index, X_train, y_train, X_test, y_test = load_real_data()
    # train generator for one step
    for it in range(1):
      fake_samples = generator.generate(sess, X_train[:BATCH_SIZE])
      rewards = generator.get_reward(sess, fake_samples, 16, discriminator)
      summary = generator.train(sess, fake_samples, rewards)
    
    writer.add_summary(summary, epoch)
    
    for _ in tnrange(5, desc='gen_train_loop'):
      tokenizer, word_index, X_train, y_train, X_test, y_test = load_real_data()
      fake_samples = generator.generate(sess, X_train[:BATCH_SIZE])
      add_fake_samples(tokenizer, fake_samples)
      
      for _ in tnrange(3, desc='dis_train_loop'):
        tokenizer, word_index, X_train, y_train, X_test, y_test = load_mixed_data()
        summary = discriminator.train(sess, X_train[:BATCH_SIZE], y_train[:BATCH_SIZE], 0.4)
        
    print(df_fake.tail(2))
    writer.add_summary(summary, epoch)
    
    """tokenizer, word_index, X_real_train, y_real_train, X_real_test, y_real_test = load_real_data()
    summary = sess.run(generator.image_summary, feed_dict={ generator.given_tokens: X_real_train[:BATCH_SIZE] })
    writer.add_summary(summary, epoch)"""
    
  print('finish GAN training at', datetime.datetime.now())

start GAN training at 2018-12-20 11:32:46.534621


                                          text fake
158  None None with on on not council council     1
159  None man us over man None govt interview     1


KeyboardInterrupt: ignored

In [0]:
from pathlib import Path

print(df_fake.tail())

count = 1
saved = False
while saved != True:
  filename = 'generated_headlines_%d.csv' % count
  filepath = Path('./results/%s' % filename)
  
  if filepath.exists():
    count = count+1
  else:
    df_fake.to_csv(filepath, sep='\t', encoding='utf-8')
    saved = True