<a href="https://colab.research.google.com/github/iamdsc/advancedNLP/blob/master/09_advNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ELMo Embeddings

In [1]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer

Using TensorFlow backend.


In [0]:
# Load all files from a directory into a DataFrame
def load_directory_data(directory):
  """Read every review file found in `directory` into a DataFrame.

  File names are expected to start with `<digits>_<digits>.txt`; the second
  group of digits is stored, as a string, in the `sentiment` column and the
  file content in the `sentence` column. Directory entries whose name does
  not match that pattern are skipped instead of raising.

  Args:
    directory: Path of a local directory (it is listed with `os.listdir`).

  Returns:
    A pandas DataFrame with the columns `sentence` and `sentiment`, one row
    per matching file, in the order returned by `os.listdir`.
  """
  # Raw string: '\d' and '\.' are invalid escapes in a normal string literal.
  name_pattern = re.compile(r'\d+_(\d+)\.txt')
  data = {'sentence': [], 'sentiment': []}
  for file_name in os.listdir(directory):
    match = name_pattern.match(file_name)
    if match is None:
      # Not a review file (e.g. a stray README); ignore it before opening.
      continue
    # Plain `open` is enough here: `os.listdir` above already restricts
    # `directory` to the local filesystem. `newline=''` keeps the text
    # exactly as stored, without newline translation.
    with open(os.path.join(directory, file_name), 'r',
              encoding='utf-8', newline='') as f:
      data['sentence'].append(f.read())
    data['sentiment'].append(match.group(1))
  return pd.DataFrame.from_dict(data)

In [0]:
# Merge positive and negative examples, add a polarity column and shuffle
def load_dataset(directory, random_state=None):
  """Load the `pos` and `neg` sub-directories of `directory` as one frame.

  Args:
    directory: Directory containing a `pos` and a `neg` sub-directory, each
      readable by `load_directory_data`.
    random_state: Optional seed forwarded to `DataFrame.sample` so the
      shuffle can be reproduced. The default `None` keeps the previous
      behaviour (a different order on every call).

  Returns:
    A shuffled DataFrame holding the rows of both sub-directories plus a
    `polarity` column (1 for rows from `pos`, 0 for rows from `neg`), with a
    fresh 0..n-1 index.
  """
  pos_df = load_directory_data(os.path.join(directory, 'pos'))
  neg_df = load_directory_data(os.path.join(directory, 'neg'))
  pos_df['polarity'] = 1
  neg_df['polarity'] = 0
  combined = pd.concat([pos_df, neg_df])
  # frac=1 draws every row once, i.e. a full shuffle of the combined frame.
  shuffled = combined.sample(frac=1, random_state=random_state)
  return shuffled.reset_index(drop=True)

In [0]:
# Download and process the dataset files
def download_and_load_datasets(force_download=False):
  """Fetch the aclImdb archive and load its train and test splits.

  Args:
    force_download: Accepted by the signature, but the body never reads it.

  Returns:
    A `(train_df, test_df)` tuple: the results of `load_dataset` for the
    `train` and `test` sub-directories of the extracted `aclImdb` folder.
  """
  archive_path = tf.keras.utils.get_file(
    fname='aclImdb.tar.gz',
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True)
  # The code expects the extracted `aclImdb/` folder next to the archive.
  dataset_root = os.path.join(os.path.dirname(archive_path), 'aclImdb')
  frames = [load_dataset(os.path.join(dataset_root, split))
            for split in ('train', 'test')]
  return frames[0], frames[1]

In [5]:
# Fetch the aclImdb archive and load the train/test DataFrames
train_df, test_df = download_and_load_datasets()
# Show the first rows of the training frame as the cell output
train_df.head()

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


Unnamed: 0,sentence,sentiment,polarity
0,"Normally, I don't watch action movies because ...",7,1
1,"What a disappointment, especially in light of ...",2,0
2,Whatever happened to British TV drama? From Jo...,2,0
3,"I have seen many, many productions of The Nutc...",4,0
4,"Oh man, I know what your thinking: ""With a tit...",2,0


In [0]:
# Create a custom layer that allows us to update weights
class ElmoEmbeddingLayer(Layer):
  """Keras layer wrapping the TF Hub ELMo module (`google/elmo/2`).

  The layer takes a string tensor, squeezes axis 1 and returns the module's
  `default` output. `compute_output_shape` reports that output as
  `(batch, self.dimensions)`, with `dimensions` fixed at 1024. The module's
  trainable variables are registered on the layer in `build`.
  """

  def __init__(self, **kwargs):
    """Set the layer attributes, then forward `kwargs` to the base `Layer`."""
    self.dimensions = 1024
    self.trainable = True
    super(ElmoEmbeddingLayer, self).__init__(**kwargs)

  def build(self, input_shape):
    """Instantiate the hub module and register its trainable variables."""
    # The module is named after the layer so its variables can be found by
    # scope right below.
    module_name = '{}_module'.format(self.name)
    self.elmo = hub.Module(
        'https://tfhub.dev/google/elmo/2',
        trainable=self.trainable,
        name=module_name)
    module_variables = tf.trainable_variables(
        scope='^{}_module/.*'.format(self.name))
    self.trainable_weights += module_variables
    super(ElmoEmbeddingLayer, self).build(input_shape)

  def call(self, x, mask=None):
    """Run the module's `default` signature on `x` and return its `default` output."""
    # Cast to string and drop axis 1 before handing the batch to the module.
    sentences = K.squeeze(K.cast(x, tf.string), axis=1)
    outputs = self.elmo(sentences, as_dict=True, signature='default')
    return outputs['default']

  def compute_mask(self, inputs, mask=None):
    """Return a mask that is true wherever the input differs from '--PAD--'."""
    return K.not_equal(inputs, '--PAD--')

  def compute_output_shape(self, input_shape):
    """Report the output shape as `(batch, self.dimensions)`."""
    batch_size = input_shape[0]
    return (batch_size, self.dimensions)

In [0]:
# Function to build model
def build_model():
  """Assemble, compile and summarise the ELMo-based binary classifier.

  The network is: string input of shape (1,) -> `ElmoEmbeddingLayer` ->
  Dense(256, relu) -> Dense(1, sigmoid). It is compiled with binary
  cross-entropy loss, the Adam optimizer and the accuracy metric.

  Returns:
    The compiled Keras `Model`; its summary is printed as a side effect.
  """
  text_in = layers.Input(shape=(1,), dtype='string')
  elmo_vectors = ElmoEmbeddingLayer()(text_in)
  hidden = layers.Dense(256, activation='relu')(elmo_vectors)
  probability = layers.Dense(1, activation='sigmoid')(hidden)

  classifier = Model(inputs=[text_in], outputs=probability)

  compile_options = dict(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['accuracy'])
  classifier.compile(**compile_options)
  classifier.summary()
  return classifier

In [0]:
# Create datasets
# taking max 150 words in each sentence
MAX_WORDS = 150


def _to_model_inputs(df, max_words=MAX_WORDS):
  """Convert a review DataFrame into the (text, labels) pair fed to the model.

  Args:
    df: DataFrame with a `sentence` column of strings and a `polarity` column.
    max_words: Keep at most this many whitespace-separated words per review.

  Returns:
    A tuple `(text, labels)`: `text` is an object array of shape
    (len(df), 1) holding the truncated reviews re-joined with single spaces,
    and `labels` is the `polarity` column as a plain list.
  """
  truncated = [' '.join(sentence.split()[:max_words])
               for sentence in df['sentence'].tolist()]
  # Add a trailing axis so each row is a length-1 vector of text.
  text = np.array(truncated, dtype=object)[:, np.newaxis]
  labels = df['polarity'].tolist()
  return text, labels


# One shared helper keeps the train and test preprocessing identical.
train_text, train_label = _to_model_inputs(train_df)
test_text, test_label = _to_model_inputs(test_df)

In [9]:
# Build the model
# build_model() also compiles the model and prints its summary.
model = build_model()



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore




















Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_1 (Elmo (None, 1024)              4         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               262400    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 262,661
Trainable params: 262,661
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Fit the model
# Train for a single epoch in batches of 32, validating on the test split.
model.fit(
    train_text,
    train_label,
    validation_data=(test_text, test_label),
    epochs=1,
    batch_size=32,
)