In [98]:
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras import layers
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [99]:

path = 'nyt/'  # path to dataset

# Only files whose name contains 'Article' are read; everything else in the
# directory is ignored.
article_files = [fname for fname in os.listdir(path) if 'Article' in fname]

# Concatenate the `headline` column of every selected CSV, in listing order.
raw_headlines = []
for fname in article_files:
    raw_headlines += list(pd.read_csv(path + fname).headline.values)

# Drop entries whose headline is the literal placeholder 'Unknown'.
all_headlines = [hline for hline in raw_headlines if hline != 'Unknown']
print(all_headlines[:5])


['Finding an Expansive View  of a Forgotten People in Niger', 'And Now,  the Dreaded Trump Curse', 'Venezuela’s Descent Into Dictatorship', 'Stain Permeates Basketball Blue Blood', 'Taking Things for Granted']


In [100]:
tfidf = TfidfVectorizer()

# Fit the vectorizer on the headlines, densify the sparse TF-IDF matrix and
# hand it to TensorFlow in a single step (one row per headline, one column
# per vocabulary term).
X_train = tf.convert_to_tensor(tfidf.fit_transform(all_headlines).toarray())

In [101]:
# Shuffle buffer spans the whole training set so each epoch is a full shuffle.
BUFFER_SIZE = len(X_train)
batch_size = 512

# Width of one TF-IDF row. Derived from the data instead of being hard-coded
# (it was the literal 10616) so the models keep matching the vectorizer when
# the corpus or the tokenizer settings change.
vocab_size = int(X_train.shape[1])
# The generator's noise vector is deliberately kept as wide as the vocabulary,
# as in the original configuration.
noise_dim = vocab_size
max_length = 20    # Maximum number of words in a headline (not referenced by the cells shown here)


train_dataset = tf.data.Dataset.from_tensor_slices(X_train).shuffle(BUFFER_SIZE).batch(batch_size)


In [102]:
def make_generator_model():
    """Build the generator network.

    The model maps a noise vector of width ``noise_dim`` to a vector of width
    ``vocab_size`` (one score per vocabulary term) through three
    Dense -> BatchNormalization -> LeakyReLU stages of 256, 512 and 1024 units.

    Returns:
        tf.keras.Sequential: the un-compiled generator.
    """
    model = tf.keras.Sequential()

    # The generator consumes noise, so its input width is noise_dim (it was
    # previously sized with vocab_size, which only worked because both
    # constants happen to hold the same value).
    model.add(layers.Dense(256, use_bias=False, input_shape=(noise_dim,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    # Bias is omitted on the hidden Dense layers because each one is followed
    # by BatchNormalization.
    for units in (512, 1024):
        model.add(layers.Dense(units, use_bias=False))
        model.add(layers.BatchNormalization())
        model.add(layers.LeakyReLU())

    # NOTE(review): tanh spans [-1, 1] whereas TF-IDF features are
    # non-negative -- confirm this output range is intended.
    output_size = vocab_size
    model.add(layers.Dense(output_size, activation='tanh'))

    return model

generator = make_generator_model()

In [103]:
def make_discriminator_model():
    """Build the discriminator network.

    The model scores a ``vocab_size``-wide vector with a single unbounded
    logit: a positive value means "looks real", a negative value means
    "looks generated". Three Dense -> LeakyReLU -> Dropout(0.3) stages of
    512, 256 and 128 units precede the output unit.

    Returns:
        tf.keras.Sequential: the un-compiled discriminator.
    """
    model = tf.keras.Sequential()

    model.add(layers.InputLayer(input_shape=(vocab_size,)))
    model.add(layers.Flatten())

    for units in (512, 256, 128):
        model.add(layers.Dense(units))
        model.add(layers.LeakyReLU())
        model.add(layers.Dropout(0.3))

    # Linear output on purpose: the loss is BinaryCrossentropy(from_logits=True),
    # which applies the sigmoid itself. The previous tanh squashed the logit
    # into [-1, 1], capping the implied probability at roughly 0.27-0.73 and
    # starving both networks of gradient signal.
    model.add(layers.Dense(1))

    return model

discriminator = make_discriminator_model()

In [104]:
# Loss functions and optimizers

# Shared binary cross-entropy used by both loss functions below.
# from_logits=True means the predictions passed in are treated as raw logits
# (the sigmoid is applied inside the loss), not as probabilities.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

def discriminator_loss(real_output, fake_output):
    """Return the discriminator loss for one batch.

    Real samples are scored against a target of 1 and generated samples
    against a target of 0; the two cross-entropy terms are summed.

    Args:
        real_output: discriminator outputs for real samples.
        fake_output: discriminator outputs for generated samples.
    """
    loss_on_real = cross_entropy(tf.ones_like(real_output), real_output)
    loss_on_fake = cross_entropy(tf.zeros_like(fake_output), fake_output)
    return loss_on_real + loss_on_fake

def generator_loss(fake_output):
    """Return the generator loss for one batch.

    The generator is rewarded when the discriminator labels its samples as
    real, so the discriminator outputs are compared against a target of 1.

    Args:
        fake_output: discriminator outputs for generated samples.
    """
    wanted_labels = tf.ones_like(fake_output)
    return cross_entropy(wanted_labels, fake_output)

# Each network gets its own Adam optimizer (same learning rate) because the
# two are updated from separate gradient tapes.
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)


In [105]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# Everything needed to resume training: both networks and both optimizers.
tracked_objects = {
    "generator_optimizer": generator_optimizer,
    "discriminator_optimizer": discriminator_optimizer,
    "generator": generator,
    "discriminator": discriminator,
}
checkpoint = tf.train.Checkpoint(**tracked_objects)


In [106]:
epochs = 5
num_examples_to_generate = 5

# Noise batch sampled once here; the same tensor is handed to the preview
# functions after every epoch and after the final epoch.
seed = tf.random.normal(shape=[num_examples_to_generate, noise_dim])

In [107]:
@tf.function  # for compiling
def train_step(txt):
    """Run one simultaneous update of the generator and the discriminator.

    Args:
        txt: a batch of real samples taken from the training dataset.
    """
    noise = tf.random.normal([batch_size, noise_dim])

    # One tape per network so each loss is differentiated with respect to
    # its own variables only.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        fake_batch = generator(noise, training=True)

        real_scores = discriminator(txt, training=True)
        fake_scores = discriminator(fake_batch, training=True)

        gen_loss = generator_loss(fake_scores)
        disc_loss = discriminator_loss(real_scores, fake_scores)

    gen_vars = generator.trainable_variables
    disc_vars = discriminator.trainable_variables

    gen_grads = gen_tape.gradient(gen_loss, gen_vars)
    disc_grads = disc_tape.gradient(disc_loss, disc_vars)

    generator_optimizer.apply_gradients(zip(gen_grads, gen_vars))
    discriminator_optimizer.apply_gradients(zip(disc_grads, disc_vars))


In [108]:
def generate_and_save_texts(model, epoch, test_input, only_realistic=True, top_k=10):
    """Print pseudo-headlines decoded from the generator's output.

    Each generated vector is decoded by taking its ``top_k`` highest-scoring
    vocabulary terms and joining them, title-cased, with spaces. Despite the
    name, nothing is written to disk; the result is only printed.

    Args:
        model: the generator to sample from.
        epoch: current epoch number. Accepted for call-site symmetry; unused.
        test_input: noise batch fed to ``model``.
        only_realistic: when True (default, the original behaviour) print only
            the samples to which the discriminator assigns a positive score;
            when False print every sample.
        top_k: number of vocabulary terms used per printed headline.
    """
    predictions = model(test_input, training=False)

    # Build the vocabulary array once: get_feature_names_out() re-creates the
    # whole array on every call, which used to happen for every printed word.
    feature_names = tfidf.get_feature_names_out()

    if only_realistic:
        # A positive discriminator output means "judged real". Inference mode
        # is requested explicitly so Dropout is disabled while scoring.
        scores = discriminator(predictions, training=False)
        keep_flags = tf.reshape(scores, [-1]).numpy() > 0
    else:
        keep_flags = [True] * int(predictions.shape[0])

    for prediction, keep in zip(predictions, keep_flags):
        if not keep:
            continue
        top_indices = tf.math.top_k(prediction, k=top_k).indices.numpy()
        top_words = [feature_names[i] for i in top_indices]
        print(" ".join(top_words).title())



def generate_last(model, epoch, test_input):
    """Print one pseudo-headline per generated sample, without filtering.

    Each generated vector is decoded by taking its 10 highest-scoring
    vocabulary terms and joining them, title-cased, with spaces.

    Args:
        model: the generator to sample from.
        epoch: epoch number. Accepted for call-site symmetry; unused.
        test_input: noise batch fed to ``model``.
    """
    predictions = model(test_input, training=False)

    # Build the vocabulary array once: get_feature_names_out() re-creates the
    # whole array on every call, which used to happen for every printed word.
    feature_names = tfidf.get_feature_names_out()

    for prediction in predictions:
        top_indices = tf.math.top_k(prediction, k=10).indices.numpy()
        top_words = [feature_names[i] for i in top_indices]
        print(" ".join(top_words).title())




In [109]:
def train(dataset, epochs):
    """Train both networks for ``epochs`` passes over ``dataset``.

    After every epoch a preview is printed from the fixed ``seed`` noise and
    the elapsed time is reported; a checkpoint is written every fifth epoch.
    Once all epochs are done, an unfiltered preview is printed.

    Args:
        dataset: iterable of training batches passed to ``train_step``.
        epochs: number of full passes over ``dataset``.
    """
    for epoch_number in range(1, epochs + 1):
        start = time.time()

        for text in dataset:
            train_step(text)

        generate_and_save_texts(generator, epoch_number, seed)

        # Save the model every 5 epochs
        if epoch_number % 5 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

        print(f'Epoch {epoch_number} completed in {time.time()-start:.2f} secs. \n')

    # Generate after the final epoch
    generate_last(generator, epochs, seed)


In [110]:
# Run the full training loop; per-epoch timings and the generated
# pseudo-headlines are printed below.
train(train_dataset, epochs)


Epoch 1 completed in 18.11 secs. 

Epoch 2 completed in 12.03 secs. 

Epoch 3 completed in 10.62 secs. 

Epoch 4 completed in 10.41 secs. 

Epoch 5 completed in 11.20 secs. 

Explained Vuong Gulf Pilot Gouging Saying Diary Roseanne Lin Loss
Rebecca Roosevelt Later Duterte Religious Simplicity Feather Plucky Jousting Besties
Dividing Sublet Mishap Trigger Alike Newest Valentine Difference Believed Brief
Plot Bee Sarin Buyer Coelacanth Enhanced 20 Kipling Elderly Feminism
Greenpeace Milwaukee Dismantling Task Ability He Verse Draw Vendor Anything
