<a href="https://colab.research.google.com/github/hogch/masterproject_gan/blob/master/SeqGAN_headlines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generation using GAN

This notebook generates news headlines with a Generative Adversarial Network (GAN), a machine-learning technique in which a generator and a discriminator are trained against each other.

## Import dependencies



In [2]:
# Mount Google Drive at /content/drive so the cells below can read the
# project files stored there. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
# Switch the working directory to the project folder on the mounted Drive
# (relative paths such as "headlines.csv" below resolve against it), then
# list its contents as a sanity check.
%cd /content/drive/My\ Drive/Colab\ Notebooks/Masterproject
!ls

/content/drive/My Drive/Colab Notebooks/Masterproject
abcnews-date-text.csv  news-headlines.db  SeqGAN_headlines.ipynb
headlines.csv	       SeqGAN


**Import required modules**

In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras

Using TensorFlow backend.


**Install required dependencies manually**

In [57]:
!pip install tflearn

Collecting tflearn
[?25l  Downloading https://files.pythonhosted.org/packages/16/ec/e9ce1b52e71f6dff3bd944f020cef7140779e783ab27512ea7c7275ddee5/tflearn-0.3.2.tar.gz (98kB)
[K    100% |████████████████████████████████| 102kB 2.7MB/s 
Building wheels for collected packages: tflearn
  Running setup.py bdist_wheel for tflearn ... [?25l- \ done
[?25h  Stored in directory: /root/.cache/pip/wheels/d0/f6/69/0ef3ee395aac2e5d15d89efd29a9a216f3c27767b43b72c006
Successfully built tflearn
Installing collected packages: tflearn
Successfully installed tflearn-0.3.2
Collecting highway.py
  Downloading https://files.pythonhosted.org/packages/e6/35/e5a7a40f476701591f7d177dbf1e59d3ed292ca30ca3d27634519afb1783/highway.py-0.2.2.tar.gz
Collecting websockets==3.4 (from highway.py)
[?25l  Downloading https://files.pythonhosted.org/packages/4f/3a/2c3a5b2c65179851e80d4acae30cffb2610a8740a8edb2afbeaa564283f8/websockets-3.4-cp36-cp36m-manylinux1_x86_64.whl (54kB)
[K    100% |███████████████████████████

## Load Dataset

In [5]:
# Read the headline corpus; the "id" column becomes the row index.
# (Comma is pandas' default separator, so it does not need to be spelled out.)
df = pd.read_csv("headlines.csv", index_col="id")
print(df.shape)
df.head()

(1103665, 3)


  mask |= (ar1 == a)


Unnamed: 0_level_0,publish_date,text,fake
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,20030219,aba decides against community broadcasting lic...,0
2,20030219,act fire witnesses must be aware of defamation,0
3,20030219,a g calls for infrastructure protection summit,0
4,20030219,air nz staff in aust strike for pay rise,0
5,20030219,air nz strike to affect australian travellers,0


## Define the training set, test set and hyper-parameters for the discriminator

In [0]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Features are the headline texts,
# labels are the "fake" flags; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, 'text'], df.loc[:, 'fake'], test_size=0.2, random_state=42
)

# Discriminator hyper-parameters.
# VOCAB_SIZE, SEQ_LENGTH, EMB_SIZE, FILTER_SIZES, NUM_CLASSES and NUM_FILTERS
# are passed (in that order) to the Discriminator constructor.

# NOTE(review): BATCH_SIZE is not referenced in the cells shown here
# (train() has its own default batch_size=32) — confirm where it is used.
BATCH_SIZE = 64
# Number of rows of the embedding matrix.
# NOTE(review): the trailing "20" looks like an alternative setting — confirm.
VOCAB_SIZE = 5000 # 20
# Width of the X_input placeholder, i.e. token ids per sequence.
# NOTE(review): the trailing "20" looks like an alternative setting — confirm.
SEQ_LENGTH = 100 # 20
# Number of columns of the embedding matrix (embedding vector size).
EMB_SIZE = 100
# NOTE(review): EMB_DIM is not referenced in the cells shown here; the
# discriminator uses EMB_SIZE — confirm whether this belongs to another model.
EMB_DIM = 64
# Heights (in tokens) of the convolution windows; one conv/max-pool branch each.
FILTER_SIZES = [2,3]
# Width of the y_input placeholder and of the output layer.
NUM_CLASSES = 2
# Number of convolution filters per window size.
NUM_FILTERS = 50

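## Initialize Discriminator and build model

**Note:** the `Discriminator` class is defined in the "Discriminator" section further down in this notebook; run that cell before this one.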

In [88]:
# Create the discriminator from the hyper-parameters defined above, then
# assemble its TensorFlow graph.
discriminator = Discriminator(
    vocab_size=VOCAB_SIZE,
    seq_length=SEQ_LENGTH,
    emb_size=EMB_SIZE,
    filter_sizes=FILTER_SIZES,
    num_classes=NUM_CLASSES,
    num_filters=NUM_FILTERS,
)

discriminator.build_model()

Tensor("cross_entropy_loss_1/add:0", shape=(), dtype=float32)


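## Discriminator
A model for classifying sequences (here: headlines) as real or fake.
In this implementation the discriminative model uses the following layers (the last layer outputs unnormalized class scores; the softmax itself is applied inside the cross-entropy loss):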
1.   embedding layer
2.   convolution layer
3.   max-pooling layer
4.   softmax layer

In [0]:
class Discriminator:
  """Convolutional classifier that scores token sequences as real or fake.

  The TensorFlow 1.x graph is assembled in two steps: ``__init__`` creates the
  input placeholders, ``build_model`` adds the embedding lookup, one
  convolution + max-pool branch per filter size, a linear output layer and
  the mean cross-entropy loss.

  Attributes available after ``build_model()``:
    embedding_layer: embedded input with a trailing channel axis.
    convolution_maxpool_layer: concatenated pooled features,
      ``[batch, num_filters * len(filter_sizes)]``.
    scores: unnormalized class scores (logits), ``[batch, num_classes]``.
    predictions: index of the highest-scoring class per example.
    loss: mean softmax cross-entropy plus ``l2_reg_lambda * l2_loss``.
  """

  def __init__(self, vocab_size, seq_length, emb_size, filter_sizes, num_classes, num_filters):
    """Store the hyper-parameters and create the input placeholders.

    Args:
      vocab_size: number of rows of the embedding matrix.
      seq_length: number of token ids per input sequence.
      emb_size: size of each embedding vector.
      filter_sizes: iterable of convolution window heights (in tokens).
      num_classes: number of output classes.
      num_filters: number of convolution filters per window size.
    """
    self.vocab_size = vocab_size
    self.emb_size = emb_size
    self.seq_length = seq_length
    self.filter_sizes = filter_sizes
    self.num_classes = num_classes
    self.num_filters = num_filters

    # Token ids, one row per sequence, and the matching label rows.
    self.X_input = tf.placeholder(tf.int32, shape=[None, self.seq_length], name='X_input')
    self.y_input = tf.placeholder(tf.float32, shape=[None, self.num_classes], name='y_input')
    # TODO: dropout is currently disabled.
    #self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

    # Keeping track of l2 regularization loss (optional); with lambda = 0.0
    # the regularization term does not contribute to the loss.
    self.l2_reg_lambda = 0.0
    self.l2_loss = tf.constant(0.0)

  def build_model(self):
    """Assemble the full graph; must be called once before the model is used."""
    self.embedding_layer = self.build_embedding_layer()
    self.convolution_maxpool_layer = self.build_convolution_maxpool_layer()
    self.scores, self.predictions = self.build_softmax_layer()

    self.calc_mean_cross_entropy_loss()

  def build_embedding_layer(self):
    """Create the embedding matrix and look up ``X_input`` in it.

    Returns:
      The embedded input with a trailing channel axis added (as required by
      ``conv2d``); the same tensor is also stored in ``self.emb_chars_expand``.
    """
    with tf.device('cpu:0'), tf.name_scope('embedding_layer'):
      W_emb = tf.Variable(
          initial_value=tf.random_uniform([self.vocab_size, self.emb_size], -1.0, 1.0),
          name='W'
      )
      emb_chars = tf.nn.embedding_lookup(W_emb, self.X_input)
      self.emb_chars_expand = tf.expand_dims(emb_chars, -1)

    # Previously nothing was returned, so build_model() stored None in
    # self.embedding_layer.
    return self.emb_chars_expand

  def build_convolution_maxpool_layer(self):
    """Build one convolution + max-pool branch per filter size.

    Reads ``self.emb_chars_expand``, so ``build_embedding_layer`` must have
    run first. Also sets ``self.final_output_length``.

    Returns:
      The pooled outputs of all branches, concatenated and flattened to
      ``[batch, final_output_length]``.
    """
    pooled_outputs = []
    for filter_size in self.filter_sizes:
      with tf.name_scope('conv-maxpool-%s' % filter_size):
        # Convolution Layer: each filter spans `filter_size` tokens and the
        # full embedding width.
        filter_shape = [filter_size, self.emb_size, 1, self.num_filters]
        W_filters = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name='W')
        b = tf.Variable(tf.constant(0.1, shape=[self.num_filters]), name='b')
        conv = tf.nn.conv2d(
            input=self.emb_chars_expand,
            filter=W_filters,
            strides=[1,1,1,1],
            padding='VALID',
            name='conv'
        )
        # Apply non-linearity - activation function
        activation = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
        # Max-pool over every window position, leaving one value per filter.
        max_pooling = tf.nn.max_pool(
            value=activation,
            ksize=[1, self.seq_length-filter_size+1, 1, 1],
            strides=[1,1,1,1],
            padding='VALID',
            name='max_pooling'
        )
        pooled_outputs.append(max_pooling)

    concat_outputs = tf.concat(pooled_outputs, axis=3)
    # num_filters is a single int shared by all branches, so the feature
    # count is num_filters times the number of branches.
    self.final_output_length = self.num_filters * len(pooled_outputs)

    return tf.reshape(concat_outputs, [-1, self.final_output_length])

  def build_softmax_layer(self):
    """Build the linear output layer on top of the pooled features.

    Reads ``self.convolution_maxpool_layer`` and ``self.final_output_length``
    and adds the layer's weights and bias to ``self.l2_loss``.

    Returns:
      ``(scores, predictions)``: the unnormalized class scores and the
      arg-max class index per example.
    """
    with tf.name_scope('softmax_output'):
      W_softmax = tf.Variable(
          tf.truncated_normal(
              [self.final_output_length, self.num_classes],
              stddev=0.1
          ), name='W'
      )
      b_softmax = tf.Variable(tf.constant(0.1, shape=[self.num_classes]), name='b')

      # TODO: highway and dropout layers are currently disabled.
      #h_highway = self.highway(self.convolution_maxpool_layer, self.convolution_maxpool_layer.get_shape()[1], 1, 0)
      #h_dropout = tf.nn.dropout(h_highway, self.dropout_keep_prob)

      self.l2_loss += tf.nn.l2_loss(W_softmax)
      self.l2_loss += tf.nn.l2_loss(b_softmax)

      #self.scores = tf.nn.xw_plus_b(h_dropout, W_softmax, b_softmax, name='scores')
      self.scores = tf.matmul(self.convolution_maxpool_layer, W_softmax) + b_softmax
      predictions = tf.argmax(self.scores, 1, name="predictions")

    return self.scores, predictions

  def calc_mean_cross_entropy_loss(self):
    """Set ``self.loss`` to the mean cross-entropy plus the weighted L2 term."""
    with tf.name_scope('cross_entropy_loss'):
      losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.y_input)
      self.loss = tf.reduce_mean(losses) + self.l2_reg_lambda * self.l2_loss


### Train Discriminator

In [0]:
    def train(self, X, y, nb_epochs, batch_size=32, learning_rate=.001):
        """Train the model with Adam and print mean loss/accuracy per epoch.

        Reads ``self.predictions`` and ``self.scores``, so the graph must
        already have been built.

        NOTE: variables are initialized and the session is opened and closed
        inside this call, so the trained weights are not kept after it returns.

        Args:
            X: token ids fed to ``self.X_input``; must support ``.shape[0]``
                and slicing along the first axis.
            y: label rows fed to ``self.y_input``, aligned with ``X``.
            nb_epochs: number of passes over the data.
            batch_size: number of rows per optimization step.
            learning_rate: learning rate passed to the Adam optimizer.
        """
        # Evaluate model
        correct_pred = tf.equal(self.predictions, tf.argmax(self.y_input, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_pred, 'float'))
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.y_input))

        with tf.name_scope('loss'):
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
        # Non-deprecated replacement for tf.initialize_all_variables().
        init = tf.global_variables_initializer()

        n_samples = X.shape[0]

        with tf.Session() as sess:
            sess.run(init)

            for epoch_i in range(nb_epochs):
                batch_losses = []
                batch_accs = []

                # Start at 0 and step by batch_size so that every row is used,
                # including a final, possibly smaller, batch. The previous
                # loop iterated over batch *end* indices and therefore always
                # dropped the last batch.
                for start in range(0, n_samples, batch_size):
                    stop = start + batch_size
                    feed = {
                        self.X_input: X[start:stop],
                        self.y_input: y[start:stop]
                    }

                    sess.run(optimizer, feed_dict=feed)

                    # Metrics are measured after the update step on the same batch.
                    loss, acc = sess.run([cost, accuracy], feed_dict=feed)

                    batch_accs.append(acc)
                    batch_losses.append(loss)

                # np.mean: the bare name `mean` was never defined.
                print('Epoch: {} loss: {:.6f} acc: {:.6f}'.format(epoch_i + 1, np.mean(batch_losses), np.mean(batch_accs)))