# Caracteristicas
* Data normalizada = No
* Funcion de entrada de datos = tf.data.Dataset (entrenamiento) y tf.estimator.inputs.pandas_input_fn (evaluacion, test y prediccion)
* Modelo = DNNClassifier canned

tomado de
*  https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub
*  https://colab.research.google.com/github/tensorflow/hub/blob/master/docs/tutorials/text_classification_with_tf_hub.ipynb
*  https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/cross_lingual_similarity_with_tf_hub_multilingual_universal_encoder.ipynb#scrollTo=8xdAogbxJDTD

# Librerias

In [0]:
# Instalar TFHub
!pip install -q tensorflow-hub

In [2]:
# Librerias principales
import tensorflow as tf
import tensorflow_hub as tfh
import numpy as np
print(tf.__version__)

# Librerias accesorias
import itertools
import os
import re
import pandas as pd

# Reduce log noise: only messages of ERROR severity are shown
tf.logging.set_verbosity(tf.logging.ERROR)

# Parametros de configuracion

1.14.0-rc1


# Data

## Preprocessing

In [3]:
# 1.1.1 Load all files from a directory in a DataFrame
def load_directory_data(directory):
  """Load every review file found in `directory` into a DataFrame.

  File names are expected to look like `<digits>_<digits>.txt`; the second
  group of digits is stored (as a string) in the `sentiment` column and the
  file's text in the `sentence` column. Directory entries whose name does not
  follow that pattern are skipped instead of crashing with AttributeError.

  Args:
    directory: Path of the directory whose files are read.

  Returns:
    A pandas DataFrame with the columns `sentence` and `sentiment`, one row
    per matching file (no rows when nothing matches).
  """
  # Raw string: "\d" and "\." are invalid escape sequences in a plain literal.
  name_pattern = re.compile(r"\d+_(\d+)\.txt")

  data = {"sentence": [], "sentiment": []}
  for file_path in os.listdir(directory):
    # Validate the name before reading, so both columns always stay aligned.
    name_match = name_pattern.match(file_path)
    if name_match is None:
      continue
    with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
    data["sentiment"].append(name_match.group(1))

  return pd.DataFrame.from_dict(data)

# 1.1 Merge positive and negative examples, add a polarity column and shuffle
def load_dataset(directory, random_state=None):
  """Load the `pos` and `neg` sub-directories and return them shuffled together.

  Args:
    directory: Directory that contains the `pos` and `neg` sub-directories.
    random_state: Optional seed forwarded to `DataFrame.sample` so the shuffle
      can be reproduced. The default (None) keeps the original unseeded
      shuffle.

  Returns:
    A DataFrame holding the rows of both sub-directories in shuffled order,
    with a fresh 0..n-1 index and an extra `polarity` column (1 for rows from
    `pos`, 0 for rows from `neg`).
  """
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0

  merged_df = pd.concat([pos_df, neg_df])
  # frac=1 samples every row exactly once, i.e. a full shuffle.
  shuffled_df = merged_df.sample(frac=1, random_state=random_state)
  return shuffled_df.reset_index(drop=True)
  
# 1 Download and process the dataset files
def download_and_load_datasets(force_download=False):
  """Download the aclImdb archive (if needed) and load its train/test splits.

  Args:
    force_download: When True, a previously cached copy of the archive is
      deleted first so that `get_file` downloads it again. Previously this
      argument was accepted but ignored.

  Returns:
    A `(train_df, test_df)` tuple of DataFrames as produced by `load_dataset`
    for the `aclImdb/train` and `aclImdb/test` directories.
  """
  cache_dir = "/content"
  cache_subdir = "datasets"  # passed explicitly so the cached path below is known
  archive_name = "aclImdb.tar.gz"

  if force_download:
    # get_file reuses an existing file, so the cached archive has to go first.
    # NOTE(review): if get_file falls back to another cache location (e.g.
    # cache_dir not writable) this path will not exist and nothing is removed.
    cached_archive = os.path.join(cache_dir, cache_subdir, archive_name)
    if os.path.exists(cached_archive):
      os.remove(cached_archive)

  dataset = tf.keras.utils.get_file(fname=archive_name,
                                    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
                                    extract=True,
                                    cache_dir=cache_dir,
                                    cache_subdir=cache_subdir)

  # The archive is extracted next to the downloaded file.
  data_root = os.path.join(os.path.dirname(dataset), "aclImdb")
  train_df = load_dataset(os.path.join(data_root, "train"))
  test_df = load_dataset(os.path.join(data_root, "test"))

  return train_df, test_df

# Load both splits into the module-level frames read by the input functions below
train_df, test_df = download_and_load_datasets()
# Last expression of the cell: shows the first rows of the training frame
train_df.head()

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


Unnamed: 0,sentence,sentiment,polarity
0,We have to remember that the 50's were practic...,8,1
1,"Something about ""Paulie"" touched my heart as f...",10,1
2,I can't say too much about Kalifornia as sadly...,7,1
3,I saw this movie when I was about 8-years-old ...,8,1
4,"Go, Igor, go, you are the proof that Slovenian...",10,1


## Ingestion functions

In [0]:
# Data ingest
def train_input_fn():
  """Build the training input pipeline from the global `train_df`.

  Returns:
    A tf.data.Dataset of ({'sentence': ...}, polarity) pairs that is shuffled
    with a 100-element buffer, repeated indefinitely, batched by 100 and
    prefetched with an automatically tuned buffer.
  """
  features = {'sentence': np.array(train_df['sentence'])}
  labels = train_df['polarity']

  pipeline = tf.data.Dataset.from_tensor_slices((features, labels))
  pipeline = pipeline.shuffle(100)
  pipeline = pipeline.repeat(None)  # None: repeat without end
  pipeline = pipeline.batch(100)
  return pipeline.prefetch(tf.data.experimental.AUTOTUNE)

def eval_input_fn():
  """Build the input function used for evaluation.

  Returns:
    The callable created by `tf.estimator.inputs.pandas_input_fn` over the
    global `train_df`, with `train_df["polarity"]` as labels, unshuffled and
    limited to a single pass over the data.
  """
  # NOTE(review): this reads train_df, so metrics computed with it are
  # measured on the training data -- confirm that this is intended.
  # num_epochs=1 (previously None, i.e. endless repetition) makes the input
  # finite, so an evaluate()/predict() call without a step limit terminates.
  return tf.estimator.inputs.pandas_input_fn(x=train_df,
                                             y=train_df["polarity"],
                                             num_epochs=1,
                                             shuffle=False)

def test_input_fn():
  return tf.estimator.inputs.pandas_input_fn(x=test_df,
                                            y=test_df["polarity"],
                                            num_epochs=None,
                                            shuffle=False)

def serving_input_fn():
  """Create the serving input receiver function used when exporting the model.

  Returns:
    A zero-argument callable. When called it builds a ServingInputReceiver
    whose features and receiver tensors are the same dict, holding a single
    'sentence' string placeholder of shape [None].
  """
  def _input_fn():
    sentence_placeholder = tf.placeholder(tf.string, [None])
    receiver_tensors = {'sentence': sentence_placeholder}
    # The placeholders are fed to the model as-is, so the same dict is used
    # for both the features and the receiver tensors.
    return tf.estimator.export.ServingInputReceiver(receiver_tensors,
                                                    receiver_tensors)
  return _input_fn

# Modelo

## Features, variables y placeholders

In [0]:
# Text embedding feature column: the "sentence" feature is embedded by the
# TF-Hub module below; trainable=True lets training update the module weights.
# NOTE(review): the module name suggests 128-dimensional embeddings -- confirm.
feature_column = tfh.text_embedding_column(key="sentence",
                                          module_spec="https://tfhub.dev/google/nnlm-en-dim128/1",
                                          trainable=True)

#module = tfh.Module("https://tfhub.dev/google/nnlm-en-dim128/1")
#print(module.get_input_info_dict(), module.get_output_info_dict())

## Estimador

In [0]:
# Model: canned DNN classifier with two hidden layers (500 and 100 units) and
# two output classes, fed by the text embedding column and trained with
# Adagrad. No model_dir is passed here.
estimator = tf.estimator.DNNClassifier(hidden_units=[500, 100],
                                       feature_columns=[feature_column],
                                       n_classes=2,
#                                       activation_fn=tf.nn.relu,
#                                       dropout=0.1,
                                       optimizer=tf.train.AdagradOptimizer(learning_rate=0.003))

# Serving: exporter named 'exporter'; serving_input_fn() is called here
# because it returns the actual receiver function.
export_latest = tf.estimator.LatestExporter('exporter',
                                           serving_input_receiver_fn=serving_input_fn())

# Training spec: train_input_fn is passed uncalled (it builds the dataset
# itself); training stops at global step 1000.
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                   max_steps=1000)

# Evaluation spec: eval_input_fn() is called because it returns the input
# function; no `steps` argument is passed; the exporter above is attached.
eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn(),
                                 exporters=export_latest)

## Entrenamiento y evaluacion, validacion, test y prediccion

In [7]:
# Training and evaluation: runs the estimator as configured by train_spec and
# eval_spec (which also carries the exporter)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

({'accuracy': 0.9338281,
  'accuracy_baseline': 0.5001562,
  'auc': 0.9803551,
  'auc_precision_recall': 0.9808867,
  'average_loss': 0.18347463,
  'global_step': 1000,
  'label/mean': 0.5001562,
  'loss': 23.484753,
  'precision': 0.9457551,
  'prediction/mean': 0.48596093,
  'recall': 0.9204936},
 [b'/tmp/tmp72rosmqu/export/exporter/1561143941'])

In [8]:
# Prediction for a single review taken from the test split
predict = {"sentence": [test_df['sentence'][3]]}
predict_pd = pd.DataFrame.from_dict(predict)
print(predict_pd)
print('***********')

# num_epochs=1 makes the input finite, so estimator.predict() stops by itself
# after one pass over predict_pd. The previous num_epochs=None repeated the
# rows forever and needed an itertools.islice cap to terminate.
# (For the whole test split: estimator.predict(input_fn=test_input_fn()).)
predict_input_fn = tf.estimator.inputs.pandas_input_fn(x=predict_pd,
                                                       num_epochs=1,
                                                       shuffle=False)
predictions = estimator.predict(input_fn=predict_input_fn)

# 'logistic' is the entry of each prediction dict that gets printed.
print([pred['logistic'] for pred in predictions])

                                            sentence
0  As you can tell from the other comments, this ...
***********
[array([0.00061354], dtype=float32)]
