# 0 Caracteristicas
* Data normalizada = No
* Funcion de entrada de datos = tf.estimator.inputs.pandas_input_fn
* Modelo = DNNClassifier canned

tomado de
*  https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub
*  https://colab.research.google.com/github/tensorflow/hub/blob/master/docs/tutorials/text_classification_with_tf_hub.ipynb
*  https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/cross_lingual_similarity_with_tf_hub_multilingual_universal_encoder.ipynb#scrollTo=8xdAogbxJDTD

# 1 Librerias

In [0]:
# Instalar TFHub
!pip install -q tensorflow-hub

In [0]:
# Librerias principales
import tensorflow as tf
import tensorflow_hub as tfh
import numpy as np
print(tf.__version__)

W0612 20:35:17.517228 140339635406720 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


1.13.1


In [0]:
# Librerias accesorias
import re
import os
import pandas as pd

In [0]:
# Parametros de configuracion

# Disminucion de los mensajes solo aquellos que dan error
tf.logging.set_verbosity(tf.logging.ERROR)

# 2 Features, variables y placeholders

In [0]:
feature_column = tfh.text_embedding_column(key="sentence",
                                          module_spec="https://tfhub.dev/google/nnlm-en-dim128/1",
                                          trainable=True)

In [0]:
module = tfh.Module("https://tfhub.dev/google/nnlm-en-dim128/1")
print(module.get_input_info_dict(), module.get_output_info_dict())

{'default': <hub.ParsedTensorInfo shape=(?,) dtype=string is_sparse=False>} {'default': <hub.ParsedTensorInfo shape=(?, 128) dtype=float32 is_sparse=False>}


# 3 Data

Este dependerá de cada modelo

## Preprocessing

In [0]:
# 1.1.1 Load all files from a directory in a DataFrame
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in os.listdir(directory):
    with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  
  return pd.DataFrame.from_dict(data)

# 1.1 Merge positive and negative examples, add a polarity column and shuffle
def load_dataset(directory):
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)
  
# 1 Download and process the dataset files
def download_and_load_datasets(force_download=False):
  dataset = tf.keras.utils.get_file(fname="aclImdb.tar.gz",
                                   origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
                                   extract=True,
                                   cache_dir="/content")
  
  train_df = load_dataset(os.path.join(os.path.dirname(dataset),"aclImdb", "train"))
  test_df  = load_dataset(os.path.join(os.path.dirname(dataset),"aclImdb", "test"))
  
  return train_df, test_df

train_df, test_df = download_and_load_datasets()
train_df.head()

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


Unnamed: 0,sentence,sentiment,polarity
0,"Yowsa! If you REALLY want some ACTION, check o...",10,1
1,- A film crew is shooting a horror movie in an...,3,0
2,Of all the versions of the Odyssey (or of any ...,10,1
3,my wife is a fanatic as regards this show. Tha...,1,0
4,* Some spoilers *<br /><br />This movie is som...,3,0


## Funciones entrada dataset

In [0]:
# Data ingest
def read_dataset(x, y, mode, batch_size=20):
#  x_ds = tf.data.Dataset.from_tensor_slices(x)
#  y_ds = tf.data.Dataset.from_tensor_slices(tf.cast(y, tf.int64))
#  
#  ds = tf.data.Dataset.zip((x_ds, y_ds))
#  
#  if mode == tf.estimator.ModeKeys.TRAIN:
#    num_epochs = None
#    ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=batch_size*10))
#  else:
#    num_epochs = 1
#  
#  ds = ds.batch(batch_size).repeat(num_epochs).prefetch(tf.data.experimental.AUTOTUNE)
#  return ds
   pass

def train_input_fn():
   return tf.estimator.inputs.pandas_input_fn(x=train_df,
                                             y=train_df["polarity"],
                                             num_epochs=None,
                                             shuffle=True)

def eval_input_fn():
#  return read_dataset(x = x_test, y = y_test, mode = tf.estimator.ModeKeys.EVAL)
   return tf.estimator.inputs.pandas_input_fn(x=train_df,
                                             y=train_df["polarity"],
                                             num_epochs=None,
                                             shuffle=False)

def test_input_fn():
  return tf.estimator.inputs.pandas_input_fn(x=test_df,
                                            y=test_df["polarity"],
                                            num_epochs=None,
                                            shuffle=False)

def serving_input_fn():
  def _input_fn():
    json_feature_placeholders = {'sentence': tf.placeholder(tf.string, [None])}
    features = json_feature_placeholders
    return tf.estimator.export.ServingInputReceiver(features, json_feature_placeholders)
  return _input_fn

# Modelo

In [0]:
# Modelo: DNN canned
estimator = tf.estimator.DNNClassifier(hidden_units=[500, 100],
                                      feature_columns=[feature_column],
                                      n_classes=2,
                                      optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))

# Serving: model
export_latest = tf.estimator.LatestExporter('exporter',
                                           serving_input_receiver_fn=serving_input_fn())

# Produccion: data
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn(),
                                   max_steps=1000)

eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn(),
                                 exporters=export_latest)

# Entrenamiento
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

({'accuracy': 0.9496094,
  'accuracy_baseline': 0.50554687,
  'auc': 0.9878236,
  'auc_precision_recall': 0.988619,
  'average_loss': 0.14912552,
  'global_step': 1000,
  'label/mean': 0.50554687,
  'loss': 19.088066,
  'precision': 0.93870485,
  'prediction/mean': 0.52410305,
  'recall': 0.96322054},
 [b'/tmp/tmpllaevb6o/export/exporter/1560368383'])

In [0]:
predict = {}
predict["sentence"] = []
predict["sentence"].append(test_df['sentence'][4])
predict_pd = pd.DataFrame.from_dict(predict)
predict_pd

Unnamed: 0,sentence
0,Do not rent this movie. I ended up buying the ...


In [0]:
#predictions = estimator.predict(input_fn=test_input_fn())
predictions = estimator.predict(input_fn=tf.estimator.inputs.pandas_input_fn(x=predict_pd, num_epochs=None, shuffle=False))

In [0]:
#for items in predictions:
#  print(items)

In [0]:
import itertools
print([pred['logistic'][0] for pred in list(itertools.islice(predictions, len(predict_pd)))])

[0.0039671957]


In [0]:
test_df['sentence'][5]

"The plot is very basic, but acceptable: A young US soldier, having studied Asian martial arts, puts this to good use in defence of the general's daughter and falls in love with her. Unfortunately, the three main acting parties' (soldier, girl, rebels) combined IQ amounts to 3: One point for the soldier, one for the girl, a large number of Ninja fighters have to share the third point among them to be able to lose against the former two."