In [1]:
from __future__ import print_function, division

import os
import random
import re
import sys

import numpy as np
import pandas as pd

from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling1D, Conv1D
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.utils import to_categorical

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

Using TensorFlow backend.


In [2]:
# Length of the random noise vector that is fed to the generator.
noise_len = 20
# Number of token slots per encoded title; the last slot always holds the END marker.
num_words = 8

In [3]:
def process_data(num_words, data_path="/Users/isaanca/MachineLearning/fall/better_quest_data.csv"):
    '''
    Loads the Quest titles and encodes every title as a one-hot matrix.

    Each title is lowercased, tokenized, cut or padded to exactly num_words
    tokens (num_words - 1 words or "PADDING" fillers followed by a final "END"),
    and then one-hot encoded over the vocabulary.

    inputs:
        num_words: integer, the number of token slots per title (including END)
        data_path: path of the CSV file holding the titles; defaults to the
            location the notebook was originally run against

    returns: a tuple (X_train, reverse_word_index, num_classes)
        X_train: array of shape (number of titles, num_classes, num_words)
        reverse_word_index: dict mapping each class index back to its word
        num_classes: the vocabulary size, counting "PADDING" and "END"
    '''
    quest_data = pd.read_csv(data_path, engine="python")
    quest_data = quest_data.rename(index=str, columns={
        "TITLE of your Quest Project (for Program)": "Title",
        "DISCIPLINE(S) of YOUR QUEST (for Program)": "Disciplines"
    })

    # A missing title is read as NaN (a float) and has no .lower(), so drop it.
    # For the purpose of this model the capitalization of titles is disregarded.
    titles = [title.lower() for title in quest_data["Title"].dropna()]

    # The group is non-capturing: with capturing groups the tokens come back as
    # tuples (which needed a key[0] workaround) and newer scikit-learn releases
    # refuse a pattern with more than one capturing group.
    word_vector = CountVectorizer(token_pattern=r"(?:\w|'|’)+", strip_accents="ascii")
    word_vector.fit(titles)

    word_index = dict(word_vector.vocabulary_)
    # Real tokens are lowercase, so these upper-case markers cannot collide with them.
    word_index["PADDING"] = len(word_index)
    word_index["END"] = len(word_index)
    num_classes = len(word_index)

    # the reverse index allows the numbers to be converted back into words
    reverse_word_index = {value: key for key, value in word_index.items()}

    # Tokenize with the vectorizer's own analyzer so every token is guaranteed
    # to be in the vocabulary. A separate regex skips the accent stripping, so a
    # title with an accented letter or a curly apostrophe raised a KeyError.
    split_words = word_vector.build_analyzer()

    def pad_data(data, cutoff):
        # Returns new lists instead of mutating the caller's data.
        padded = []
        for title in data:
            words = list(title[:cutoff - 1])
            words.extend(["PADDING"] * (cutoff - 1 - len(words)))
            words.append("END")
            padded.append(words)
        return padded

    def index_words(word_list, word_index):
        return [word_index[i] for i in word_list]

    X_train = [list(split_words(title)) for title in titles]
    X_train = pad_data(X_train, num_words)
    X_train = [index_words(i, word_index) for i in X_train]

    # One-hot encode the whole data set once: (titles, num_words, num_classes),
    # then swap the last two axes so that the vocabulary axis comes first.
    one_hot = to_categorical(X_train, num_classes=num_classes)
    X_train = np.ascontiguousarray(np.transpose(one_hot, (0, 2, 1)))

    return (X_train, reverse_word_index, num_classes)

In [4]:
# Encode the titles; `vocab` maps class indices back to words.
X_train, vocab, num_classes = process_data(num_words)

print(len(X_train)) # number of data points/Quest titles
print(len(X_train[0])) # vocabulary size, including the PADDING and END markers
print(len(X_train[0][0])) # number of word slots per title

339
847
8


In [5]:
def build_discriminator(num_classes, num_words):
    '''
    Assembles the 1D-convolutional classifier that scores a title matrix.

    inputs:
        num_classes: vocabulary size; the axis the convolutions slide along
        num_words: word slots per title; used as the input channel count and
            as the kernel size of every convolution

    returns: the (uncompiled) model object, ending in a single sigmoid unit
    '''
    # One entry per convolution stage: (filters, stride, batch-norm afterwards?)
    stages = (
        (32, 2, False),
        (64, 2, True),
        (128, 2, True),
        (256, 1, False),
    )

    model = Sequential()
    for position, (filters, stride, normalise) in enumerate(stages):
        # Only the very first layer declares the input shape:
        # rows = vocabulary entries, channels = word slots of the title.
        first_layer_args = {"input_shape": (num_classes, num_words)} if position == 0 else {}
        model.add(Conv1D(filters, kernel_size=num_words, strides=stride,
                         padding="same", **first_layer_args))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.25))
        if normalise:
            model.add(BatchNormalization(momentum=0.8))

    # Collapse the feature map into one confidence value.
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))

    return model

In [6]:
def build_generator(num_classes, num_words):
    '''
    Assembles the model that turns a noise vector of length noise_len into a
    (num_classes, num_words) matrix, i.e. one score per vocabulary entry for
    each word slot of a title.

    inputs:
        num_classes: vocabulary size (rows of the generated matrix)
        num_words: word slots per title (columns of the generated matrix)

    returns: the (uncompiled) model object
    '''
    # NOTE(review): the last activation is tanh (outputs in [-1, 1]) while the
    # real training matrices are one-hot (0/1) - confirm this range mismatch is
    # intended, since it may make the discriminator's task trivial.
    stages = (
        (128, "relu"),
        (64, "relu"),
        (num_words, "tanh"),
    )

    model = Sequential()
    # Project the noise up to a full-size matrix before refining it.
    # (the trailing comma makes the input shape a single-element tuple)
    model.add(Dense(num_classes * num_words, activation="relu", input_shape=(noise_len,)))
    model.add(Reshape((num_classes, num_words)))

    for filters, activation in stages:
        model.add(BatchNormalization(momentum=0.8))
        model.add(Conv1D(filters, kernel_size=num_words, padding="same"))
        model.add(Activation(activation))

    return model

In [7]:
def build_combined(num_classes, num_words):
    '''
    Builds the discriminator and the generator, and stacks them into a combined
    model (noise -> generator -> discriminator) that is used to train the generator.

    inputs:
        num_classes: vocabulary size, forwarded to both model builders
        num_words: word slots per title, forwarded to both model builders

    returns: the generator, discriminator, and combined model objects
    '''
    
    # One optimizer instance is shared by all three compile calls below.
    # NOTE(review): the positional arguments are presumably (learning rate, beta_1);
    # confirm against the installed Keras version.
    optimizer = Adam(0.0002, 0.5)

    # Build and compile the discriminator
    discriminator = build_discriminator(num_classes, num_words)
    discriminator.compile(loss='binary_crossentropy', 
                          optimizer=optimizer,
                          metrics=['accuracy'])


    # Build and compile the generator
    generator = build_generator(num_classes, num_words)
    generator.compile(loss='binary_crossentropy', optimizer=optimizer)

    # The generator takes noise as input and generates titles
    noise = Input(shape=(noise_len,))
    title = generator(noise)
    
    
    # For the combined model we will only train the generator.
    # The statement order matters: the discriminator was compiled above, before
    # being frozen, and the combined model is compiled below, after the freeze.
    # NOTE(review): this relies on Keras fixing the trainable state at compile
    # time - confirm for the installed version.
    discriminator.trainable = False

    # The discriminator takes generated titles as input and determines validity
    valid = discriminator(title)

    # The combined model  (stacked generator and discriminator) takes
    # noise as input => generates titles => determines validity 
    combined = Model(inputs=noise, outputs=valid)
    combined.compile(loss='binary_crossentropy', optimizer=optimizer)
    return generator, discriminator, combined

In [8]:
def save_titles(generator, epoch, vocab):
    '''
    Has the generator create one new Quest title, prints it, and saves it to
    titles_real_data/titles_<epoch>.txt (the directory is created if missing).
    
    inputs:
        generator: the generator model object returned by build_combined
        epoch: the epoch number (but can be anything that can be represented as a string)
        vocab: the mapping of numbers to words
    
    returns: None
    '''
    titles = 1
    output_dir = 'titles_real_data'
    
    noise = np.random.normal(0, 1, (titles, noise_len))
    gen_title = np.asarray(generator.predict(noise))
    
    # The single generated sample is laid out as (vocabulary entry, word slot).
    # For every word slot, choose the vocabulary entry with the highest weight.
    # Taking the slot count from the output itself (instead of a hard-coded 8)
    # keeps this correct for any num_words.
    word_ids = np.argmax(gen_title[0], axis=0)
    
    # map numbers back to words
    gen_title = " ".join([vocab[int(i)] for i in word_ids])
    
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    # The context manager closes the file even if the write fails.
    with open(os.path.join(output_dir, 'titles_{}.txt'.format(epoch)), "w+") as file:
        file.write(gen_title)
    print(gen_title)

In [9]:
def train(generator, discriminator, combined, data, vocab, epochs, batch_size=128, save_interval=50):
    '''
    Alternately trains the discriminator and the generator, one batch each per epoch.
    
    generator: the generator model object returned by build_combined
    discriminator: the discriminator model object returned by build_combined
    combined: the combined model object returned by build_combined
    data: array of real encoded titles, indexed by title along the first axis
    vocab: the mapping of numbers to words, passed on to save_titles
    epochs: integer, the number of training iterations (one batch per iteration)
    batch_size: integer, the number of training samples to use at a time
    save_interval: integer, will print the losses and save a generated title
        when the current epoch % save_interval is 0
    
    returns: None
    '''
    real_pool = data
    n_half = int(batch_size / 2)

    for step in range(epochs):
        # --- Discriminator update: half a batch of real titles, half of fakes ---
        picks = np.random.randint(0, real_pool.shape[0], n_half)
        real_batch = real_pool[picks]

        latent = np.random.normal(0, 1, (n_half, noise_len))
        fake_batch = generator.predict(latent)

        # Real titles are labelled 1, generated titles 0.
        real_stats = discriminator.train_on_batch(real_batch, np.ones((n_half, 1)))
        fake_stats = discriminator.train_on_batch(fake_batch, np.zeros((n_half, 1)))

        # --- Generator update: a full batch of fresh noise through the stacked model ---
        latent = np.random.normal(0, 1, (batch_size, noise_len))
        # The target is 1 because the generator wants its titles to be judged real.
        g_loss = combined.train_on_batch(latent, np.ones((batch_size, 1)))

        if step % save_interval != 0:
            continue

        # Report the mean of the real/fake discriminator metrics and save a sample title.
        averaged = 0.5 * np.add(real_stats, fake_stats)
        print("{} [D loss: {}, acc.: {:.2%}] [G loss: {}]".format(step, averaged[0], averaged[1], g_loss))
        save_titles(generator, step, vocab)

In [10]:
# Build the generator, the discriminator, and the stacked generator -> discriminator model.
generator, discriminator, combined = build_combined(num_classes, num_words)

In [12]:
# 4001 iterations (one batch of 32 each) so that iteration 4000 is still reported;
# losses are printed and a sample title is saved every 200 iterations.
train(generator, discriminator, combined, X_train, vocab, epochs=4001, batch_size=32, save_interval=200)

0 [D loss: 0.6125308871269226, acc.: 68.75%] [G loss: 1.0838741064071655]
sneakers boom sandpiper personal choreography hyperplasia rebuilding neuroscience
200 [D loss: 0.005813055671751499, acc.: 100.00%] [G loss: 7.390000343322754]
algorithms world world airship ableton adventures PADDING END
400 [D loss: 4.2221604417136405e-06, acc.: 100.00%] [G loss: 16.047252655029297]
nation note controlled you PADDING PADDING PADDING END
600 [D loss: 0.007651554420590401, acc.: 100.00%] [G loss: 14.232666969299316]
END zer0 PADDING PADDING PADDING PADDING PADDING END
800 [D loss: 0.02080383338034153, acc.: 100.00%] [G loss: 8.97913932800293]
END youth youtube zer0 PADDING PADDING PADDING END
1000 [D loss: 3.171868956997059e-05, acc.: 100.00%] [G loss: 16.11809539794922]
classical architecture clinic PADDING replacement PADDING PADDING END
1200 [D loss: 3.914362907409668, acc.: 53.12%] [G loss: 16.11809539794922]
mom's youtube modern gap PADDING PADDING PADDING END
1400 [D loss: 2.826281706802547

In [None]:
# okay so my model really likes moms and youtube