https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/gan.py
https://stackoverflow.com/questions/40994583/how-to-implement-tensorflows-next-batch-for-own-data

In [1]:
import xml.etree.ElementTree as ET
import csv


def get_labels_dict(data_path):
    labels_dict = {}
    with open(data_path + 'sci_labels.csv', 'r') as f:
        file = csv.reader(f)
        for row in file:
            labels_dict[row[0]] = row[1]
    return labels_dict


def get_features_labels(root, labels_dict):
    corpus = [] # each row is a string formed from all messages in a conversations
    labels = [] # each row is 0 or 1, corresponds to label for same row in corpus

    for conversation in root:
        string = " "
        for message in conversation:
            text = message.find('text').text
            if text is not None:
                string = string + "\r\n" + text 
        corpus.append(string)
        labels.append(int(labels_dict[conversation.get('id')]))
    return corpus, labels

In [5]:
train_data_path = '../../data/svm_training_data/'
training_xml = ET.parse(train_data_path + 'training_data.xml')
train_root = training_xml.getroot()

test_data_path = '../../data/svm_test_data/'
test_data_src = '../../data/pan12-sexual-predator-identification-test-corpus-2012-05-21/'
test_xml = ET.parse(test_data_src + 'pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
test_root = test_xml.getroot()

train_corpus, train_labels = get_features_labels(train_root, get_labels_dict(train_data_path))
test_corpus, test_labels = get_features_labels(test_root, get_labels_dict(test_data_path))

train_corpus_norm = []
train_corpus_susp = []
train_labels_norm = []
train_labels_susp = []
for index in range(len(train_corpus)):
    if train_labels[index] == 1:
        train_corpus_susp.append(train_corpus[index])
        train_labels_susp.append(train_labels[index])
    else:
        train_corpus_norm.append(train_corpus[index])
        train_labels_norm.append(train_labels[index])

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import scipy
# from sklearn.model_selection import train_test_split
import numpy as np

vectorizer = TfidfVectorizer()
_ = vectorizer.fit_transform(train_corpus)
X_train_norm = vectorizer.transform(train_corpus_norm)
X_train_susp = vectorizer.transform(train_corpus_susp)
X_test = vectorizer.transform(test_corpus)

X_train_norm = scipy.sparse.csr_matrix(X_train_norm)
y_train_norm = np.array(train_labels_norm)
X_train_susp = scipy.sparse.csr_matrix(X_train_susp)
y_train_susp = np.array(train_labels_susp)
X_test = scipy.sparse.csr_matrix(X_test)
y_test = np.array(test_labels)

print(np.min(X_train_norm[0]))
print(np.max(X_train_norm[0]))

X_train_norm = normalize(X_train_norm)
X_train_susp = normalize(X_train_susp)
X_test = normalize(X_test)

print(np.min(X_train_norm[0]))
print(np.max(X_train_norm[0]))

# X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=87)
# print("Train data shape:{}\r\nTest data shape:{}".format(X_train.shape, X_test.shape))

0.0
0.44709803254549474
0.0
0.44709803254549474


In [15]:
""" Generative Adversarial Networks (GAN).
Using generative adversarial networks (GAN) to generate digit images from a
noise distribution.
References:
    - Generative adversarial nets. I Goodfellow, J Pouget-Abadie, M Mirza,
    B Xu, D Warde-Farley, S Ozair, Y. Bengio. Advances in neural information
    processing systems, 2672-2680.
    - Understanding the difficulty of training deep feedforward neural networks.
    X Glorot, Y Bengio. Aistats 9, 249-256
Links:
    - [GAN Paper](https://arxiv.org/pdf/1406.2661.pdf).
    - [MNIST Dataset](http://yann.lecun.com/exdb/mnist/).
    - [Xavier Glorot Init](www.cs.cmu.edu/~bhiksha/courses/deeplearning/Fall.../AISTATS2010_Glorot.pdf).
Author: Aymeric Damien
Project: https://github.com/aymericdamien/TensorFlow-Examples/
"""

from __future__ import division, print_function, absolute_import

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Import MNIST data
# from tensorflow.examples.tutorials.mnist import input_data
# mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

# Training Params
num_steps = 10
batch_size = 100 #128
learning_rate = 0.0002

# Network Params
image_dim = X_train_norm.shape[1] #784 # 28*28 pixels
gen_hidden_dim = 10 #256
disc_hidden_dim = 10 #256
noise_dim = X_train_norm.shape[1] # 100 # Noise data points
print(X_train_norm.shape[1])

# A custom initialization (see Xavier Glorot init)
def glorot_init(shape):
    return tf.random_normal(shape=shape, stddev=1. / tf.sqrt(shape[0] / 2.))

# Store layers weight & bias
weights = {
    'gen_hidden1': tf.Variable(glorot_init([noise_dim, gen_hidden_dim])),
    'gen_out': tf.Variable(glorot_init([gen_hidden_dim, image_dim])),
    'disc1_hidden1': tf.Variable(glorot_init([image_dim, disc_hidden_dim])),
    'disc1_out': tf.Variable(glorot_init([disc_hidden_dim, 1])),
    'disc2_hidden1': tf.Variable(glorot_init([image_dim, disc_hidden_dim])),
    'disc2_out': tf.Variable(glorot_init([disc_hidden_dim, 1])),
}
biases = {
    'gen_hidden1': tf.Variable(tf.zeros([gen_hidden_dim])),
    'gen_out': tf.Variable(tf.zeros([image_dim])),
    'disc1_hidden1': tf.Variable(tf.zeros([disc_hidden_dim])),
    'disc1_out': tf.Variable(tf.zeros([1])),
    'disc2_hidden1': tf.Variable(tf.zeros([disc_hidden_dim])),
    'disc2_out': tf.Variable(tf.zeros([1])),
}


# Generator
def generator(x):
    hidden_layer = tf.matmul(x, weights['gen_hidden1'])
    hidden_layer = tf.add(hidden_layer, biases['gen_hidden1'])
    hidden_layer = tf.nn.relu(hidden_layer)
    out_layer = tf.matmul(hidden_layer, weights['gen_out'])
    out_layer = tf.add(out_layer, biases['gen_out'])
    out_layer = tf.nn.sigmoid(out_layer)
    return out_layer


# Discriminator
def discriminator_SCI(x): # is D in paper
    hidden_layer = tf.matmul(x, weights['disc1_hidden1'])
    hidden_layer = tf.add(hidden_layer, biases['disc1_hidden1'])
    hidden_layer = tf.nn.relu(hidden_layer)
    out_layer = tf.matmul(hidden_layer, weights['disc1_out'])
    out_layer = tf.add(out_layer, biases['disc1_out'])
    out_layer = tf.nn.sigmoid(out_layer)
    return out_layer

def discriminator_gvr(x): # is D prime in paper, discriminator_generated_vs_real
    hidden_layer = tf.matmul(x, weights['disc2_hidden1'])
    hidden_layer = tf.add(hidden_layer, biases['disc2_hidden1'])
    hidden_layer = tf.nn.relu(hidden_layer)
    out_layer = tf.matmul(hidden_layer, weights['disc2_out'])
    out_layer = tf.add(out_layer, biases['disc2_out'])
    out_layer = tf.nn.sigmoid(out_layer)
    return out_layer

def next_batch(num, data, labels):
    '''
    Return a total of `num` random samples and labels. 
    '''
    idx = np.arange(0 , data.shape[0])
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = [data[ i] for i in idx]
    labels_shuffle = [labels[ i] for i in idx]

#     return scipy.sparse.csr_matrix(data_shuffle), np.asarray(labels_shuffle)
    return np.asarray(data_shuffle), np.asarray(labels_shuffle)


121394


In [16]:
# Build Networks
# Network Inputs
gen_input = tf.placeholder(tf.float32, shape=[None, noise_dim], name='input_noise')
disc_input_normal = tf.placeholder(tf.float32, shape=[None, image_dim], name='disc_input_normal')
disc_input_real_susp = tf.placeholder(tf.float32, shape=[None, image_dim], name='disc_input_real_susp')
# disc_input_fake_susp = tf.placeholder(tf.float32, shape=[None, image_dim], name='disc_input_fake_susp')

# Build Generator Network
gen_sample = generator(gen_input)

# Build 2 Discriminator Networks (one from noise input, one from generated samples)
disc_SCI_normal = discriminator_SCI(disc_input_normal)
disc_SCI_susp_real = discriminator_SCI(disc_input_real_susp)
disc_SCI_susp_fake = discriminator_SCI(gen_sample)
disc_gvr_real = discriminator_gvr(disc_input_real_susp)
disc_gvr_fake = discriminator_gvr(gen_sample)

# Build Loss
gen_loss = -tf.reduce_mean(tf.log(disc_SCI_susp_fake) + tf.log(disc_gvr_fake))
disc_SCI_loss = -tf.reduce_mean(tf.log(disc_SCI_normal) + tf.log(1. - disc_SCI_susp_real) + tf.log(1. - disc_SCI_susp_fake))
disc_gvr_loss = -tf.reduce_mean(tf.log(disc_gvr_real) + tf.log(1. - disc_gvr_fake))

# Build Optimizers
optimizer_gen = tf.train.AdamOptimizer(learning_rate=learning_rate)
optimizer_disc = tf.train.AdamOptimizer(learning_rate=learning_rate)

# Training Variables for each optimizer
# By default in TensorFlow, all variables are updated by each optimizer, so we
# need to precise for each one of them the specific variables to update.
# Generator Network Variables
gen_vars = [weights['gen_hidden1'], weights['gen_out'],
            biases['gen_hidden1'], biases['gen_out']]
# Discriminator Network Variables
disc_SCI_vars = [weights['disc1_hidden1'], weights['disc1_out'],
            biases['disc1_hidden1'], biases['disc1_out']]
disc_gvr_vars = [weights['disc2_hidden1'], weights['disc2_out'],
            biases['disc2_hidden1'], biases['disc2_out']]

# Create training operations
train_gen = optimizer_gen.minimize(gen_loss, var_list=gen_vars)
train_disc_SCI = optimizer_disc.minimize(disc_SCI_loss, var_list=disc_SCI_vars)
train_disc_gvr = optimizer_disc.minimize(disc_gvr_loss, var_list=disc_gvr_vars)


In [18]:
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

# Start training
with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    for i in range(1, num_steps+1):
        # Prepare Data
        # Get the next batch of MNIST data (only images are needed, not labels)
#         batch_x, _ = mnist.train.next_batch(batch_size)
        batch_x_norm, _ = next_batch(batch_size, X_train_norm, y_train_norm)
        batch_x_susp, _ = next_batch(batch_size, X_train_susp, y_train_susp)
        print(batch_x_norm.shape)
        print(batch_x_susp.shape)
        # Generate noise to feed to the generator
        z = np.random.uniform(-1., 1., size=[batch_size, noise_dim])
        print()
        # Train
#         feed_dict = {disc_input: batch_x, gen_input: z}
        feed_dict = {disc_input_normal: batch_x_norm, disc_input_real_susp: batch_x_susp, gen_input: z}
        _, _, _, gl, dSCIl, dgvrl = sess.run([train_gen, train_disc_SCI, train_disc_gvr, gen_loss, disc_SCI_loss, disc_gvr_loss],
                                feed_dict=feed_dict)
        if i % 1000 == 0 or i == 1:
            print('Step %i: Generator Loss: %f, Discriminator Loss: %f' % (i, gl, dl))

    print("Finished Training")
    # test SCI
    y_pred = sess.run([disc_SCI_normal], feed_dict={disc_input_normal: X_test})
    print(metrics.accuracy_score(y_test, y_pred))
#     # Generate images from noise, using the generator network.
#     f, a = plt.subplots(4, 10, figsize=(10, 4))
#     for i in range(10):
#         # Noise input.
#         z = np.random.uniform(-1., 1., size=[4, noise_dim])
#         g = sess.run([gen_sample], feed_dict={gen_input: z})
#         g = np.reshape(g, newshape=(4, 28, 28, 1))
#         # Reverse colours for better display
#         g = -1 * (g - 1)
#         for j in range(4):
#             # Generate image from noise. Extend to 3 channels for matplot figure.
#             img = np.reshape(np.repeat(g[j][:, :, np.newaxis], 3, axis=2),
#                              newshape=(28, 28, 3))
#             a[j][i].imshow(img)

#     f.show()
#     plt.draw()
#     plt.waitforbuttonpress()

ValueError: setting an array element with a sequence.