In [12]:
# https://github.com/tensorflow/tensorflow/blob/r0.11/tensorflow/examples/skflow/text_classification.py

import datetime as dt
import os
import sys

import numpy as np
import pandas
from sklearn import metrics
from sklearn.cross_validation import train_test_split
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib.learn import extract_pandas_data

In [2]:
FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_bool('test_with_fake_data', False,
                         'Test the example code with fake data.')

In [3]:
MAX_DOCUMENT_LENGTH = 10
EMBEDDING_SIZE = 50
n_words = 0

In [4]:
def bag_of_words_model(x, y):
  """A bag-of-words model. Note it disregards the word order in the text."""
  target = tf.one_hot(y, 15, 1, 0)
  word_vectors = learn.ops.categorical_variable(x, n_classes=n_words,
      embedding_size=EMBEDDING_SIZE, name='words')
  features = tf.reduce_max(word_vectors, reduction_indices=1)
  prediction, loss = learn.models.logistic_regression(features, target)
  train_op = tf.contrib.layers.optimize_loss(
      loss, tf.contrib.framework.get_global_step(),
      optimizer='Adam', learning_rate=0.01)
  return {'class': tf.argmax(prediction, 1), 'prob': prediction}, loss, train_op

In [5]:
def rnn_model(x, y):
  """Recurrent neural network model to predict from sequence of words
  to a class."""
  # Convert indexes of words into embeddings.
  # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
  # maps word indexes of the sequence into [batch_size, sequence_length,
  # EMBEDDING_SIZE].
  word_vectors = learn.ops.categorical_variable(x, n_classes=n_words,
      embedding_size=EMBEDDING_SIZE, name='words')

  # Split into list of embedding per word, while removing doc length dim.
  # word_list results to be a list of tensors [batch_size, EMBEDDING_SIZE].
  word_list = tf.unpack(word_vectors, axis=1)

  # Create a Gated Recurrent Unit cell with hidden size of EMBEDDING_SIZE.
  cell = tf.nn.rnn_cell.GRUCell(EMBEDDING_SIZE)

  # Create an unrolled Recurrent Neural Networks to length of
  # MAX_DOCUMENT_LENGTH and passes word_list as inputs for each unit.
  _, encoding = tf.nn.rnn(cell, word_list, dtype=tf.float32)

  # Given encoding of RNN, take encoding of last step (e.g hidden size of the
  # neural network of last step) and pass it as features for logistic
  # regression over output classes.
  target = tf.one_hot(y, 15, 1, 0)
  prediction, loss = learn.models.logistic_regression(encoding, target)

  # Create a training op.
  train_op = tf.contrib.layers.optimize_loss(
      loss, tf.contrib.framework.get_global_step(),
      optimizer='Adam', learning_rate=0.01)

  return {'class': tf.argmax(prediction, 1), 'prob': prediction}, loss, train_op

In [8]:
df_tlg_bow = pandas.read_csv(os.path.expanduser('~/cltk_data/user_data/tlg_bow.csv'))
Y = df_tlg_bow['epithet']
X = df_tlg_bow.drop(['epithet', 'id', 'author'], 1)  #? column "Unnamed: 0"
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

In [25]:
def df_to_tensor(df):
    np_array = df.as_matrix()
    tensor = tf.convert_to_tensor(np_array, dtype=tf.string)
    return tf.matmul(tensor, tensor) + tensor

TypeError: DataType string for attr 'T' not in list of allowed values: float16, float32, float64, int32, complex64, complex128

In [13]:
type(extract_pandas_data(y_test))

pandas.core.series.Series

In [6]:
def main(x_train, x_test, y_train, y_test):
    global n_words
    
    t0 = dt.datetime.utcnow()
    
  # Prepare training and testing data
#   dbpedia = learn.datasets.load_dataset(
#       'dbpedia', test_with_fake_data=FLAGS.test_with_fake_data)
#   x_train = pandas.DataFrame(dbpedia.train.data)[1]
#   y_train = pandas.Series(dbpedia.train.target)
#   x_test = pandas.DataFrame(dbpedia.test.data)[1]
#   y_test = pandas.Series(dbpedia.test.target)

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))
    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Build model
    classifier = learn.Estimator(model_fn=bag_of_words_model)

    # Train and predict
    classifier.fit(x_train, y_train, steps=5)  #! was 100
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
    
    print('... finished in {}'.format(dt.datetime.utcnow() - t0))

In [7]:
tf.app.run(x_train, x_test, y_train, y_test)



Total words: 59206


TypeError: DataType string for attr 'TI' not in list of allowed values: uint8, int32, int64