## Tensorflow Hub

TensorFlow Hub provides reusable components (modules). These are TensorFlow graphs that can be reused across similar tasks.

[Sentiment Analysis on the IMDB dataset](http://ai.stanford.edu/~amaas/data/sentiment/)

In [27]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import tensorflow as tf
import tensorflow_hub as tf_hub

In [2]:
# Model and training constants

lr = 1e-4                  # learning rate handed to the optimizer
hidden_units = [500, 100]  # hidden layer sizes of the classifier
n_classes = 2              # Positive, Negative

#### Acquire and preprocess the review data

In [17]:
# Name under which the downloaded archive is stored locally
dir_name = 'imdb_set'

def fetch_data():
    """
        Downloads the required data files and extracts the archive.

        Returns:
            The path returned by `tf.keras.utils.get_file` for the
            downloaded data, so callers can locate the extracted files.
    """
    # The result must be returned: without it the caller only ever
    # receives None and cannot find the data on disk.
    return tf.keras.utils.get_file(
        fname=dir_name,
        origin='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
        extract=True
    )

In [20]:
# Download the data
# d_path holds whatever fetch_data returns; it is used afterwards to build
# the paths of the train and test directories.

d_path = fetch_data()

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
 4472832/84125825 [>.............................] - ETA: 16:26

KeyboardInterrupt: 

In [None]:
# Get train and test data frames
# NOTE: load_data is defined in a later cell and must have been run first.

# The archive unpacks into an 'aclImdb' folder rather than a folder named
# after the download. Depending on the Keras version, get_file returns either
# the downloaded file (contents extracted next to it) or the extraction
# directory itself, so both layouts are handled.
archive_parent = d_path if os.path.isdir(d_path) else os.path.dirname(d_path)
dataset_root = os.path.join(archive_parent, 'aclImdb')

train_frame = load_data(os.path.join(dataset_root, 'train'))
test_frame = load_data(os.path.join(dataset_root, 'test'))

In [12]:
# File naming format: <id>_<sentiment>.txt, e.g. 12_3.txt -> sentiment: '3'

def create_frame(dir_path):
    """
        Creates a DataFrame from the files in the given directory path.

        Args:
            dir_path: directory whose files are named
                `<digits>_<digits>.txt`.

        Returns:
            A DataFrame with one row per matching file and two columns:
            'description' (the file's text) and 'sentiment' (the second
            number in the file name, kept as a string). Files whose names
            do not match the pattern are skipped.
    """
    data = {'description': [],
            'sentiment': []
           }
    # Raw string so that \d and \. reach the regex engine untouched
    pattern = re.compile(r'\d+_(\d+)\.txt')

    for file in os.listdir(dir_path):
        match = pattern.match(file)
        if match is None:
            # Unexpected file name: nothing to extract, so ignore it
            continue
        # os.listdir yields bare names; join with the directory to open them
        with tf.io.gfile.GFile(os.path.join(dir_path, file)) as f:
            data['description'].append(f.read())
        data['sentiment'].append(match.group(1))

    return pd.DataFrame.from_dict(data)

In [14]:
def load_data(data_path):
    """
        Builds a single shuffled frame from the 'pos' and 'neg'
        sub-directories of `data_path`.

        Rows coming from 'pos' get polarity 1 and rows coming from
        'neg' get polarity 0.
    """
    labelled_parts = []
    for sub_dir, polarity in (('pos', 1), ('neg', 0)):
        part = create_frame(os.path.join(data_path, sub_dir))
        part['polarity'] = polarity
        labelled_parts.append(part)

    combined = pd.concat(labelled_parts)

    # frac=1 draws every row in random order; then renumber the index from 0
    shuffled = combined.sample(frac=1)
    return shuffled.reset_index(drop=True)

##### Create the dataset functions

In [None]:
# Input function for training: whole training frame, no epoch limit, shuffled
training_input_func = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=train_frame,
    y=train_frame.get('polarity'),
    num_epochs=None,
    shuffle=True,
)

# Input function for predicting on the training set (order preserved)
predict_train_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=train_frame,
    y=train_frame.get('polarity'),
    shuffle=False,
)

# Input function for predicting on the test set (order preserved)
predict_test_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
    x=test_frame,
    y=test_frame.get('polarity'),
    shuffle=False,
)

#####  Feature column

A feature column is an intermediary between raw data and an Estimator.

It takes the data in numerical or categorical form and transforms it into a form suitable for an Estimator.

In [None]:
# Embedding text feature column
# Reads the 'description' column of the input frame and passes it through the
# TF Hub module at the URL below.
# NOTE(review): the module name suggests 128-dimensional embeddings - confirm.

feature_col = tf_hub.text_embedding_column(
    key='description',
    module_spec='https://tfhub.dev/google/nnlm-en-dim128/1')

##### Build and train the Estimator

In [None]:
# Classifier built from the embedding feature column and the constants
# (hidden_units, n_classes, lr) defined at the top of the notebook
estimator = tf.estimator.DNNClassifier(
    feature_columns=[feature_col],
    hidden_units=hidden_units,
    n_classes=n_classes,
    optimizer=tf.keras.optimizers.Adam(lr),
    loss_reduction=tf.losses.Reduction.SUM,
    model_dir='/tmp/imdb_s_analysis',
)

In [None]:
# Train for 1000 steps using the training input function created earlier
# (it is named training_input_func; train_input_fn is not defined anywhere).
estimator.train(input_fn=training_input_func, steps=1000)

##### Evaluate model

We use a **confusion matrix** for evaluation

In [24]:
def get_predictions(input_fn):
    """
        Runs the estimator's predict on `input_fn` and returns a list
        holding the first 'class_ids' entry of every prediction.
    """
    predicted_classes = []
    for output in estimator.predict(input_fn=input_fn):
        predicted_classes.append(output['class_ids'][0])
    return predicted_classes
    

In [None]:
# Confusion matrix of the true training polarities against the classes the
# estimator predicts for the same training rows; printed as raw counts.
confusion = tf.math.confusion_matrix(labels=train_frame.get('polarity'),
                             predictions=get_predictions(predict_train_input_fn))
print(confusion.numpy())

In [None]:
# Normalize the confusion matrix row-wise: [Rows will add up to 1]

# np.asarray accepts both the tensor produced by tf.math.confusion_matrix and
# an already-normalized array, so running this cell twice is harmless.
top = np.asarray(confusion, dtype=float)
# One total per row, kept as a column so it broadcasts across that row.
bottom = top.sum(axis=1, keepdims=True)
# Rows without any samples stay at 0 instead of triggering a division by zero.
confusion = np.divide(top, bottom, out=np.zeros_like(top), where=bottom != 0)

In [None]:
# Plot the matrix
# Label order follows the polarity values: index 0 = negative, 1 = positive
labels = ['negative', 'positive']

# annot=True writes each cell's value inside the heatmap
seaborn.heatmap(confusion, annot=True, xticklabels=labels, yticklabels=labels)
plt.xlabel('predicted')
plt.ylabel('true')

In [31]:
# test set confusion matrix

conf_test = tf.math.confusion_matrix(labels=test_frame.get('polarity'),
                                     predictions=get_predictions(predict_test_input_fn))

# Normalize row-wise so every row adds up to 1
top = conf_test.numpy().astype(float)
bottom = top.sum(axis=1, keepdims=True)
# Rows without any samples stay at 0 instead of triggering a division by zero.
conf_test = np.divide(top, bottom, out=np.zeros_like(top), where=bottom != 0)

seaborn.heatmap(conf_test, annot=True, xticklabels=labels, yticklabels=labels)
# These are functions and must be called; assigning a string to them would
# replace the pyplot functions and leave the axes unlabeled.
plt.xlabel('predicted')
plt.ylabel('true')