The code below trains several models on the Sentiment140 dataset using TensorFlow 2.0.

In [1]:
# Check CUDA Version
# Two independent probes: the toolkit's version file, then the nvcc compiler.
# In the recorded run the version file was readable but nvcc was not on the PATH.
!cat /usr/local/cuda/version.txt;
!nvcc --version

CUDA Version 10.0.130
/bin/sh: 1: nvcc: not found


In [2]:
# !cat /usr/local/cuda/include/cudnn.h # | grep CUDNN_MAJOR -A 2

In [3]:
# Check your versions
# Prints the interpreter version, then the installed packages whose names match each pattern.
# `grep cudf` printed nothing in the recorded run — presumably cudf is not installed; confirm if it matters.
!python --version;
!pip freeze | grep tensorflow;
!pip freeze | grep cudf;
!pip freeze | grep pandas;
!pip freeze | grep numpy;

Python 3.7.4
tensorflow==2.0.0
tensorflow-datasets==1.2.0
tensorflow-estimator==2.0.0
tensorflow-gpu==2.0.0
tensorflow-hub==0.6.0
tensorflow-metadata==0.15.0
pandas==0.25.1
numpy==1.17.2


In [4]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from pathlib import Path
import pandas as pd
import numpy as np
import typing
import numbers
import os
import unittest
import random
import copy

In [5]:
# Report how many physical GPUs TensorFlow can see
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices(device_type='GPU')))

Num GPUs Available:  1


In [6]:
# Ensure training on one GPU

# Every physical GPU TensorFlow reports; the block below is skipped when the list is empty
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        # (the error is only printed, so the notebook keeps running with the current device setup)
        print(e)


1 Physical GPUs, 1 Logical GPU


In [8]:
# DATA PARAMETERS

splits = 5 # 5 splits means each validation set is 20% of the original set, i.e. an 80:20 training:validation ratio

file = 'training.1600000.processed.noemoticon.csv'
path = Path('../mydata')

# The file has no header row; only column 0 (named 'label') and column 5 (named 'text') are loaded
df = pd.read_csv(path/file, usecols=[0,5],header=None, names=['label','text'],encoding='latin-1')

In [9]:
# PREPARE DATA

# Randomize data with a fixed seed: after a kernel restart the same row order is produced,
# so the train/validation split built from it (and every reported metric) is reproducible.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)  # private generator; leaves numpy's global RNG untouched
df = df.iloc[shuffle_rng.permutation(len(df))]

# For Binary Classification: Convert Labels to 0 and 1
# (rows labelled 4 become 1; every other label value is left as it is)
df.loc[df['label'] == 4, 'label'] = 1

df.head()

Unnamed: 0,label,text
486678,0,Sleepy as hale. I hope the afternoon goes by s...
911387,1,@Hstreet96 always good to see you - even in cy...
1140877,1,http://tinyurl.com/lsee25 &lt;&lt; my new hams...
458442,0,@sgBEAT:MattBinks why not?! You don want me an...
1593831,1,on my way to my BNI meeting. should be a good ...


In [10]:
# CREATE SUBSETS

# Equal length subsets of original dataframe.
# Cut points use integer arithmetic: the float form int(i*(1/splits)*len(df)) can land just
# below the exact value (e.g. 49*(1/49) evaluates to 0.9999999999999999) and then truncates,
# silently dropping the final row from the last subset.
cut_indices = [i * len(df) // splits for i in range(splits + 1)] # indices where df is to be cut
segment_indices = zip(cut_indices[:-1], cut_indices[1:]) # indices for each cut segment
valids = [df[begin:end] for begin,end in segment_indices] # subsets each to be used as validation sets

# Training sets for each validation set in valids:
# trains[n] is the concatenation of every subset except valids[n]
trains = [pd.concat(valids[:n] + valids[n+1:], axis=0) for n in range(splits)]

# Validation and Training Sets to be used (fold 0)
validation = valids[0]
training = trains[0]

In [11]:
# a mean close to 0.5 for labels indicates a well balanced dataset
validation['label'].mean()

0.4980875

In [12]:
# VECTORIZE DATASET

vocab_size = 5000 # 10000

text = training['text'].to_numpy()

# Words the tokenizer does not keep are emitted as the '<unk>' token
tok = Tokenizer(num_words=vocab_size, oov_token='<unk>')

# The vocabulary is fitted on the training split only
tok.fit_on_texts(text)

# Register index 0 as the padding token in both lookup directions
# (the padded sequences shown further down are filled with 0)
tok.word_index['<pad>'] = 0
tok.index_word[0] = '<pad>'

# pad vectors to maxlength
train_text = training['text'].to_numpy()  # same array contents as `text` above
train_seqs = tok.texts_to_sequences(train_text)
maxlength = max(len(i) for i in train_seqs)  # longest training sequence, measured before padding
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, maxlen=maxlength, padding='post')

train_labels = training['label'].to_numpy().flatten()

# pad vectors to maxlength (don't calculate automatically, or valid set will be of different size)
valid_text = validation['text'].to_numpy()
valid_seqs = tok.texts_to_sequences(valid_text)
valid_seqs = tf.keras.preprocessing.sequence.pad_sequences(valid_seqs, maxlen=maxlength, padding='post')

valid_labels = validation['label'].to_numpy().flatten()

# Use lowest possible types to speed up training
# NOTE(review): uint16 assumes every token id is below 65536 — holds if ids stay under vocab_size; confirm
train_seqs = train_seqs.astype('uint16')
valid_seqs = valid_seqs.astype('uint16')
train_labels = train_labels.astype('bool')
valid_labels = valid_labels.astype('bool') # uint8 or bool?

# Convert to TF dataset format: each element is a (sequence, label) pair
train_ds = tf.data.Dataset.from_tensor_slices((train_seqs,train_labels))
valid_ds = tf.data.Dataset.from_tensor_slices((valid_seqs,valid_labels))

print(maxlength)

118


In [13]:
# USEFUL FUNCTIONS

def word2vec(words):
    """Encode text as token ids using the fitted tokenizer `tok`.

    A single string yields one flat list of ints. Any other iterable (e.g. a list of
    strings) yields a list of sequences, one list of ints per string.

    Raises:
        ValueError: if `words` is neither a string nor an iterable.
    """
    # str is tested first because a string is itself iterable
    if isinstance(words, str):
        (sequence,) = tok.texts_to_sequences([words])
        return list(sequence)
    if isinstance(words, typing.Iterable):
        return tok.texts_to_sequences(words)
    raise ValueError(f'Words were of type {type(words)} but should be either a string or list of strings')
        

def vec2word(vec:typing.Iterable[typing.Any]):
    """Decode token ids back into text using the tokenizer `tok`.

    Args:
        vec: either a sequence of ints (one encoded sentence) or a sequence of such
            sequences (e.g. a 2d array). Must support `len()` and indexing.

    Returns:
        A space-joined string for a single sequence, or a list of such strings for a
        sequence of sequences. An empty input returns an empty string.

    Raises:
        ValueError: if the first element is neither a number nor an iterable.
        KeyError: if an id is not present in `tok.index_word`.
    """
    if len(vec) == 0:
        return ""
    if isinstance(vec[0], numbers.Number):
        # Direct id -> word lookup. The previous form rebuilt list(tok.word_index.keys())
        # for every token and depended on '<pad>' being the last inserted key.
        return " ".join(tok.index_word[int(i)] for i in vec)
    elif isinstance(vec[0], typing.Iterable):
        return [vec2word(i) for i in vec]
    else:
        raise ValueError( f'Input list should contain either ints or lists of ints, not {type(vec[0])}')

def vec2word_no_pad(vec:typing.Iterable[typing.Any]):
    """Decode token ids back into text, skipping padding (id 0).

    Args:
        vec: either a sequence of ints (one encoded sentence) or a sequence of such
            sequences (e.g. a 2d array). Must support `len()` and indexing.

    Returns:
        A space-joined string for a single sequence, or a list of such strings for a
        sequence of sequences. An empty input returns an empty string.

    Raises:
        ValueError: if the first element is neither a number nor an iterable.
        KeyError: if a non-zero id is not present in `tok.index_word`.
    """
    if len(vec) == 0:
        return ""
    if isinstance(vec[0], numbers.Number):
        # Direct id -> word lookup instead of rebuilding the key list for every token
        return " ".join(tok.index_word[int(i)] for i in vec if i != 0)
    elif isinstance(vec[0], typing.Iterable):
        # Recurse into this function so padding is stripped from every row. The previous
        # form called vec2word (padding kept) and filtered rows with `i != 0`, which is
        # ambiguous for array rows.
        return [vec2word_no_pad(i) for i in vec]
    else:
        raise ValueError( f'Input list should contain either ints or lists of ints, not {type(vec[0])}')

def show_batch(ds):
    """Return the first element of a dataset as a dataframe.

    The element is expected to be a (sequences, targets) pair. The dataframe has one row
    per sequence with columns 'word_vec' (the raw ids), 'text' (the ids decoded without
    padding) and 'target'.
    """
    # batch(1) wraps the first element in an extra leading axis; [0] strips it again
    batch_vecs, batch_targets = next(iter(ds.batch(1)))
    vectors = np.array(batch_vecs)[0]
    targets = np.array(batch_targets)[0]
    rows = [(vector, vec2word_no_pad(vector), target) for vector, target in zip(vectors, targets)]
    return pd.DataFrame(rows, columns=['word_vec','text','target'])


In [14]:
list(range(32, 32*10, 32)) # Choose a multiple of 32 for embedding dimension

[32, 64, 96, 128, 160, 192, 224, 256, 288]

In [15]:
# TRAINING PARAMETERS

# Longest row of train_seqs. train_seqs was already padded to a common length when it was
# built, so this re-derives that padded width rather than measuring raw text lengths.
maxlength = max(len(i) for i in train_seqs)
total_vocab_size = len(tok.word_index) # no need to add +1, word_index includes <pad>
batch_size = 512 # 256 # 128 # 64
buffer_size = 1000 # 500 # 1000
# NOTE: buffer_size is the shuffle buffer size AND is passed to fit() as steps_per_epoch further down
embedding_dim = 64 # 32 # 64 # 128 # 256
num_steps = len(train_text) // batch_size  # full batches in one pass over the training data
epochs = num_steps // buffer_size  # how many buffer_size-step chunks fit in one pass
# NOTE(review): epochs is 0 whenever num_steps < buffer_size, and the next line then raises ZeroDivisionError
val_steps = len(valid_seqs) // batch_size // epochs
learning_rate = 0.001 * 8

# total_vocab_size (all words seen) is shown next to vocab_size (the cap given to the tokenizer)
print(maxlength, vocab_size, total_vocab_size, num_steps, epochs, val_steps)

118 5000 594935 2500 2 312


In [16]:
# SHUFFLE AND BATCH

# Shuffle with a buffer of buffer_size elements, then group into batches of batch_size
train_batch = train_ds.shuffle(buffer_size).batch(batch_size)
valid_batch = valid_ds.shuffle(buffer_size).batch(batch_size)
train_prefetch = train_batch.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) # prefetch speeds up training
valid_prefetch = valid_batch.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Display the dataset's element shapes and dtypes
valid_batch

<BatchDataset shapes: ((None, 118), (None,)), types: (tf.uint16, tf.bool)>

In [17]:
show_batch(valid_prefetch) # Note: determining <unk> words is a bottleneck

Unnamed: 0,word_vec,text,target
0,"[452, 1, 2308, 1, 9, 25, 11, 169, 2986, 19, 11...",says <unk> greatest <unk> is not in never fail...,True
1,"[186, 16, 52, 1, 160, 278, 3496, 3, 23, 176, 4...",ok so back <unk> 4 doesn't appear to be workin...,False
2,"[9, 74, 16, 120, 62, 3699, 0, 0, 0, 0, 0, 0, 0...",is still so happy about federer,True
3,"[1, 7, 82, 2, 99, 27, 9, 4, 95, 51, 20, 11, 29...",<unk> and thanks i hope this is the last time ...,True
4,"[2608, 27, 98, 18, 625, 14, 1672, 1, 9, 90, 16...",discovered this morning that running on flat <...,True
...,...,...,...
507,"[1, 1846, 161, 2, 63, 515, 3, 1, 126, 8, 33, 6...",<unk> lmfao yes i am talking to <unk> did you ...,False
508,"[439, 296, 12, 197, 1520, 2601, 24, 1360, 2081...",sooo excited for our floor seats at american idol,True
509,"[228, 8, 632, 56, 9, 1, 114, 10, 636, 39, 1, 3...",before you ask what is <unk> why it means quot...,True
510,"[1, 2, 42, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",<unk> i do too,False


In [18]:
# Test Model
# Quick baseline: Embedding -> Conv1D -> MaxPool1D -> three stacked bidirectional GRUs -> sigmoid output.

model = tf.keras.Sequential([
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlength),
    layers.Conv1D(filters=16, kernel_size=3, padding='valid'),  # no activation argument is given here
    layers.MaxPool1D(),
#     layers.Dense(32,activation='relu'),
    layers.Bidirectional(layers.GRU(embedding_dim//2, return_sequences=True)), # embedding_dim//2
    layers.Bidirectional(layers.GRU(embedding_dim//2, return_sequences=True)),
    layers.Bidirectional(layers.GRU(embedding_dim//2, return_sequences=False)),  # last RNN returns one vector per example
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy', # categorical_crossentropy would be the choice for multi-class (one-hot) targets
              metrics=['accuracy'])

# steps_per_epoch reuses buffer_size, so each "epoch" is buffer_size batches rather than a full pass
history = model.fit(
    train_prefetch,
    epochs=epochs,
    validation_data=valid_prefetch, 
    validation_steps=val_steps,
    steps_per_epoch=buffer_size,
    callbacks = [] # [tf.keras.callbacks.ReduceLROnPlateau()] # cp_callback not used
    ) 

Train for 1000 steps, validate for 312 steps
Epoch 1/2
Epoch 2/2


In [25]:
# MODEL DESIGN
# Switches read by the model-construction cell that follows.

# Attention
use_attn = False  # True: attention head; False: Embedding -> Conv1D -> MaxPool1D head
use_additive_attn = False  # only consulted when use_attn is True (AdditiveAttention instead of Attention)

# RNNs
use_rnns = True  # False: the body is a single Dense(32) layer instead of an RNN stack
use_bidirectional = True  # wrap every RNN layer in layers.Bidirectional
rnn = layers.GRU  # layer class used for every RNN in the stack
hdim = embedding_dim//2 if use_bidirectional else embedding_dim  # hidden dimension of rnns
num_extra_rnns = 2  # number of additional rnn layers


In [26]:

# Variable-length integer sequences; dtype matches the uint16 arrays fed by the datasets
inputs = tf.keras.Input(shape=(None,), dtype='uint16')


# ADD BEGINNING LAYERS

if use_attn:

    head = [
        layers.Embedding(vocab_size, embedding_dim, input_length=maxlength),
        layers.Conv1D(filters=100, kernel_size=3, padding='same', activation='elu'),
    ]

    Q = inputs # query input
    V = inputs # value input

    # The same layer instances encode both query and value (each layer is called twice)
    for f in head:
        Q = f(Q)
        V = f(V)

    attn = layers.Attention() if not use_additive_attn else layers.AdditiveAttention()

    QV = attn([Q, V])

    # if rnns are used, attention layer returns sequence; otherwise, use pooling
    if not use_rnns:
        Q = layers.GlobalAveragePooling1D()(Q) # [batch_size, filters]
        QV = layers.GlobalAveragePooling1D()(QV) # [batch_size, filters]

    chain = layers.Concatenate()([Q, QV])

else:

    head = [
        layers.Embedding(vocab_size, embedding_dim, input_length=maxlength),
        layers.Conv1D(filters=16, kernel_size=3, padding='valid', activation='elu'),
        layers.MaxPool1D()
    ]

    chain = inputs

    for f in head:
        chain = f(chain)


# ADD MIDDLE LAYERS

if use_rnns:
    # Build a fresh layer for every position in the stack: `[layer] * n` repeats ONE layer
    # object n times, so the stacked positions would all reuse the same instance.
    rnns = [rnn(hdim, return_sequences=True) for _ in range(num_extra_rnns)]
    rnns.append(rnn(hdim, return_sequences=False))  # last RNN returns one vector per example
    rnns = list(map(layers.Bidirectional, rnns)) if use_bidirectional else rnns

    body = rnns

else:
    body = [
        layers.Dense(32, activation='elu')
    ]


# ADD FINAL LAYERS

body += [
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
]


# CONSTRUCT MODEL

# Thread the head's output through every body layer in order
for f in body:
    chain = f(chain)

outputs = chain

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_3 (Embedding)      (None, None, 64)          320000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 16)          3088      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 16)          0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 64)          9600      
_________________________________________________________________
bidirectional_5 (Bidirection (None, None, 64)          18816     
_________________________________________________________________
bidirectional_6 (Bidirection (None, 64)                1881

In [35]:
# # MODEL DESIGN

# # Attention
# use_attn = True
# use_additive_attn = True

# # RNNs
# use_rnns = True
# use_bidirectional = True
# rnn = layers.GRU
# hdim = embedding_dim//2 if use_bidirectional else embedding_dim  # hidden dimension of rnns
# num_extra_rnns = 0  # number of additional rnn layers


In [27]:
# COMPILE

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)  # not referenced again in this cell

# Create a callback that saves the model's weights
# (defined for convenience; it is not passed to fit() below)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=0)


# USE THIS TO RELOAD MODEL
# model.load_weights(checkpoint_path)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss='binary_crossentropy', # categorical_crossentropy would be the choice for multi-class (one-hot) targets
              metrics=['accuracy'])

# RUN

# steps_per_epoch reuses buffer_size, so each "epoch" is buffer_size batches rather than a full pass
# NOTE(review): ReduceLROnPlateau runs with default arguments — confirm it can trigger within `epochs` epochs
history = model.fit(
    train_prefetch,
    epochs=epochs,
    validation_data=valid_prefetch, 
    validation_steps=val_steps,
    steps_per_epoch=buffer_size,
    callbacks = [tf.keras.callbacks.ReduceLROnPlateau()] # cp_callback not used
    )


# model.save_weights(checkpoint_path) # save weights

Train for 1000 steps, validate for 312 steps
Epoch 1/2
Epoch 2/2


In [22]:
# MODEL DESIGN
# Switches read by the model-construction cell that follows (attention-only configuration).

# Attention
use_attn = True  # True: attention head; False: Embedding -> Conv1D -> MaxPool1D head
use_additive_attn = False  # only consulted when use_attn is True (AdditiveAttention instead of Attention)

# RNNs
use_rnns = False  # False: the body is a single Dense(32) layer instead of an RNN stack
use_bidirectional = False  # only consulted when use_rnns is True
rnn = layers.GRU  # layer class used for every RNN in the stack
hdim = embedding_dim//2 if use_bidirectional else embedding_dim  # hidden dimension of rnns
num_extra_rnns = 0  # number of additional rnn layers


In [23]:

# Variable-length integer sequences; dtype matches the uint16 arrays fed by the datasets
inputs = tf.keras.Input(shape=(None,), dtype='uint16')


# ADD BEGINNING LAYERS

if use_attn:

    head = [
        layers.Embedding(vocab_size, embedding_dim, input_length=maxlength),
        layers.Conv1D(filters=100, kernel_size=3, padding='same', activation='elu'),
    ]

    Q = inputs # query input
    V = inputs # value input

    # The same layer instances encode both query and value (each layer is called twice)
    for f in head:
        Q = f(Q)
        V = f(V)

    attn = layers.Attention() if not use_additive_attn else layers.AdditiveAttention()

    QV = attn([Q, V])

    # if rnns are used, attention layer returns sequence; otherwise, use pooling
    if not use_rnns:
        Q = layers.GlobalAveragePooling1D()(Q) # [batch_size, filters]
        QV = layers.GlobalAveragePooling1D()(QV) # [batch_size, filters]

    chain = layers.Concatenate()([Q, QV])

else:

    head = [
        layers.Embedding(vocab_size, embedding_dim, input_length=maxlength),
        layers.Conv1D(filters=16, kernel_size=3, padding='valid', activation='elu'),
        layers.MaxPool1D()
    ]

    chain = inputs

    for f in head:
        chain = f(chain)


# ADD MIDDLE LAYERS

if use_rnns:
    # Build a fresh layer for every position in the stack: `[layer] * n` repeats ONE layer
    # object n times, so the stacked positions would all reuse the same instance.
    rnns = [rnn(hdim, return_sequences=True) for _ in range(num_extra_rnns)]
    rnns.append(rnn(hdim, return_sequences=False))  # last RNN returns one vector per example
    rnns = list(map(layers.Bidirectional, rnns)) if use_bidirectional else rnns

    body = rnns

else:
    body = [
        layers.Dense(32, activation='elu')
    ]


# ADD FINAL LAYERS

body += [
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
]


# CONSTRUCT MODEL

# Thread the head's output through every body layer in order
for f in body:
    chain = f(chain)

outputs = chain

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 64)     320000      input_2[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, None, 100)    19300       embedding_2[0][0]                
                                                                 embedding_2[1][0]                
____________________________________________________________________________________________

In [24]:
# COMPILE

checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)  # not referenced again in this cell

# Create a callback that saves the model's weights
# (defined for convenience; it is not passed to fit() below)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=0)


# USE THIS TO RELOAD MODEL
# model.load_weights(checkpoint_path)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
              loss='binary_crossentropy', # categorical_crossentropy would be the choice for multi-class (one-hot) targets
              metrics=['accuracy'])

# RUN

# steps_per_epoch reuses buffer_size, so each "epoch" is buffer_size batches rather than a full pass
# NOTE(review): ReduceLROnPlateau runs with default arguments — confirm it can trigger within `epochs` epochs
history = model.fit(
    train_prefetch,
    epochs=epochs,
    validation_data=valid_prefetch, 
    validation_steps=val_steps,
    steps_per_epoch=buffer_size,
    callbacks = [tf.keras.callbacks.ReduceLROnPlateau()] # cp_callback not used
    )


# model.save_weights(checkpoint_path) # save weights

Train for 1000 steps, validate for 312 steps
Epoch 1/2
Epoch 2/2


之前用 GPU 的結果：

    81.28% in 1 min 20 s (1 conv, 1 biLSTM)
    81.35% in 1 min 40 s (1 GRU, 1 biGRU)
    81.46% in 1 min 51 s (1 conv, 2 biGRU)
    81.46% in 2 min 21 s (1 conv, 2 GRU, 1 biGRU)
    80.96% in 2 min 58 s (attention)
    81.54% in 3 min 00 s (1 conv, 3 biGRU)
    81.51% in 3 min 26 s (2 GRU, 1 biGRU, no conv)
    81.58% in 3 min 54 s (1 conv, 4 biGRU)
    

In [25]:
# SAVE

checkpoint_path = "training_2/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)  # not used below

# NOTE(review): both calls receive the same path. The recorded log shows the full-model save
# writing assets under training_2/cp.ckpt/ — confirm the weights files and that directory do not clash.
model.save_weights(checkpoint_path) # save weights
model.save(checkpoint_path) # save model

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: training_2/cp.ckpt/assets


In [28]:
# PREDICTION FUNCTIONS

# maxlength = 118

def sentiment(num):
    """Map a score to a sentiment label.

    Scores above 0.65 are 'positive', scores below 0.40 are 'negative',
    and everything else (both thresholds included) is 'neutral'.
    """
    if num > 0.65:
        return 'positive'
    return 'negative' if num < 0.40 else 'neutral'


def give_sentiment(sent):
    """Print the given sentence(s) next to their predicted sentiment and score.

    Args:
        sent: a single string, or an iterable of strings. Generators are accepted;
            the iterable is materialised once before use.

    Returns:
        None. Results are printed; an empty iterable prints nothing.

    Raises:
        TypeError: if `sent` is neither a string nor an iterable.
    """
    # str must be tested first: a string is itself iterable
    if isinstance(sent, str):
        sentences, single = [sent], True
    elif isinstance(sent, typing.Iterable):
        sentences, single = list(sent), False
    else:
        raise TypeError(f'Expected a string or an iterable of strings, got {type(sent)}')

    if not sentences:
        return  # nothing to predict; max() below would fail on an empty list

    # Encode and pad exactly like the training data
    s = word2vec(sentences)
    s = tf.keras.preprocessing.sequence.pad_sequences(s, maxlen=maxlength, padding='post').astype('uint16')
    # Each prediction row is indexed at [0] to obtain its score as a plain float
    scores = [float(row[0]) for row in model.predict(s)]
    labels = [sentiment(score) for score in scores]

    print("\n")
    if single:
        # The original computed the gap from len(str) (the builtin type), which raised
        # TypeError; the intended gap after the sentence is a fixed five spaces.
        print(f"{sentences[0]}", " "*5, "|", " "*10, f" {labels[0]} ({scores[0]:.2f})")
    else:
        # Pad every sentence to the longest one so the separators line up
        spaces = max(len(sentence) for sentence in sentences)
        for (sentence, result, val) in zip(sentences, labels, scores):
            print(f"{sentence}", " "*(spaces-len(sentence)), "|", " "*4, f" {result}  ({val:.2f})")
    print("\n")

    return

In [29]:
# Hand-written sentences for a quick qualitative check of the current `model`
phrases = ["My iPad is broken",
           "This iPad is fixed",
           "You're a terrible person",
           "We should water the plants",
           "I'm thirsty", 
           "What time is it?",
           "This cake is delicious",
           "This neural net is simple yet effective"
          ]

# Prints one line per phrase: text | label (score)
give_sentiment(phrases)



My iPad is broken                        |       negative  (0.01)
This iPad is fixed                       |       positive  (0.71)
You're a terrible person                 |       negative  (0.05)
We should water the plants               |       neutral  (0.57)
I'm thirsty                              |       neutral  (0.41)
What time is it?                         |       positive  (0.70)
This cake is delicious                   |       positive  (0.99)
This neural net is simple yet effective  |       positive  (0.84)


