In [1]:
# Refs:
#    Portions of code borrowed and adapted from the following sources:
#        https://www.kaggle.com/snlpnkj/bidirectional-lstm-keras
#        https://www.onceupondata.com/2019/02/01/keras-text3-cnn-rnn/

# Load libraries
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM, Bidirectional, GlobalMaxPool1D, CuDNNLSTM, concatenate, Flatten
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Dropout, SpatialDropout1D
from keras import optimizers
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

Using TensorFlow backend.


In [2]:
import os
# Show the kernel's working directory; the relative 'w266_proj/...' paths used
# by later cells are resolved against it.
os.getcwd()

'C:\\Users\\Brad\\Desktop\\Keras - GPU'

## Load Data
**Notes:**

Data was previously grouped and saved to these csv files in `baseline_logreg_cluster.ipynb`

In [2]:

# Read the three pre-split review sets (train / dev / test) from their CSV files.
train, dev, test = (
    pd.read_csv('w266_proj/data/train_clust_FINAL.csv'),
    pd.read_csv('w266_proj/data/dev_clust_FINAL.csv'),
    pd.read_csv('w266_proj/data/test_clust_FINAL.csv'),
)

#train_text = train['prepReviewText']
#dev_text = dev['prepReviewText']


In [4]:
# Use this section to test a single data file that has not yet been split into train/dev/test sets
# Comment out when those types of files already exist

# 2-class labels

#from sklearn.model_selection import train_test_split

#X_train_cl2, X_test_cl2, y_train_cl2, y_test_cl2 = train_test_split(train.prepReviewText,df_cl2.group_z_class, test_size=0.2, \
                                   #random_state=42,stratify=df_cl2.group_z_class)
    
#X_train_cl2, X_test_cl2, y_train_cl2, y_test_cl2 = train_test_split(df_cl2.clean_review,df_cl2.class_2, test_size=0.2, \
                                   #random_state=42,stratify=df_cl2.class_2)

In [6]:
# Number of training examples per label in the `group_z_class` column
train['group_z_class'].value_counts()

0.0    26066
1.0    26066
Name: group_z_class, dtype: int64

In [7]:
# Number of dev examples per label in the `group_z_class` column
dev['group_z_class'].value_counts()

0.0    3008
1.0    3008
Name: group_z_class, dtype: int64

In [8]:
# Number of test examples per label in the `group_z_class` column
test['group_z_class'].value_counts()

1.0    3066
0.0    3066
Name: group_z_class, dtype: int64

In [3]:
# Prepare data for model

# Seed NumPy's global RNG; `seed` is reused by later cells.
seed = 101
np.random.seed(seed)

# Model inputs: the `prepReviewText` column of each split.
X_train, X_dev, X_test = (split['prepReviewText'] for split in (train, dev, test))

# Use this line for a multi-class problem to convert class labels
#y_train = to_categorical(train['most_helpful'])

# Targets: the 0/1 `group_z_class` label of each split.
y_train, y_dev, y_test = (split['group_z_class'] for split in (train, dev, test))


In [10]:
# For a multi-class problem, use this cell to define class numbers
# - Use the variable in the model's final dense/output layer
# - Also use softmax activation to get class probabilities for number of classes in num_classes

#num_classes = train['group_z_class'].nunique()

In [4]:
# Tokenize Text
#
# The raw text is read straight from the train/dev/test frames rather than from
# the X_* names that this cell overwrites. That keeps the cell idempotent:
# re-running it no longer feeds already-padded integer arrays to the tokenizer.
max_features = 50000  # keep only the most frequent words (Tokenizer num_words)

tokenizer = Tokenizer(num_words=max_features)
# The vocabulary is fitted on the training text only.
tokenizer.fit_on_texts(list(train['prepReviewText']))

# Pad examples
max_words = 200  # every example is padded/truncated to this many tokens

X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(train['prepReviewText']), maxlen=max_words)
X_dev = sequence.pad_sequences(tokenizer.texts_to_sequences(dev['prepReviewText']), maxlen=max_words)
X_test = sequence.pad_sequences(tokenizer.texts_to_sequences(test['prepReviewText']), maxlen=max_words)

print(X_train.shape,X_dev.shape, X_test.shape)

(52132, 200) (6016, 200) (6132, 200)


In [287]:
# Persist the fitted tokenizer to disk so the same vocabulary can be reloaded later
import pickle

with open('lstm_tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)


In [5]:
# Define Model

def get_model(max_features, embed_dim, embedding_matrix, learning_rate, spat_drop, lstm_units):
    """Build and compile the bidirectional-LSTM binary classifier.

    Layers, in order: frozen pre-trained Embedding -> SpatialDropout1D ->
    Bidirectional(CuDNNLSTM, full sequence returned) -> GlobalMaxPooling1D ->
    one sigmoid output unit. The model is compiled with Adam and binary
    cross-entropy, and reports accuracy.

    Relies on two notebook globals: `seed` (NumPy RNG seed) and `X_train`
    (its second dimension is used as the input sequence length).

    Args:
        max_features: vocabulary size (Embedding input_dim).
        embed_dim: size of each word vector (Embedding output_dim).
        embedding_matrix: pre-trained weights loaded into the Embedding layer;
            they are not updated during training.
        learning_rate: Adam learning rate.
        spat_drop: SpatialDropout1D rate applied to the embedded sequence.
        lstm_units: LSTM units per direction.

    Returns:
        The compiled keras `Sequential` model.
    """
    # NOTE(review): this seeds NumPy only; presumably the TensorFlow backend
    # draws initial weights from its own RNG, so runs may still differ - confirm.
    np.random.seed(seed)
    # Reset the backend session so repeated calls start from a clean state.
    K.clear_session()

    model = Sequential()

    # max_features = input_dim (vocab size)
    # embed_dim = output_dim (dense embedding size)
    model.add(Embedding(max_features,
                        embed_dim,
                        input_length=X_train.shape[1],
                        weights=[embedding_matrix],
                        trainable=False))

    model.add(SpatialDropout1D(spat_drop))

    model.add(Bidirectional(CuDNNLSTM(lstm_units, return_sequences=True)))

    # Collapse the time axis by taking the max of each feature over all steps.
    model.add(GlobalMaxPooling1D())

    model.add(Dense(1, activation='sigmoid')) # output layer

    adam = optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

In [6]:

def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields, i.e. the vector components.

    Returns:
        (word, vector) where vector is a float32 NumPy array built from `arr`.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [7]:
# Design embedding matrix
# - Use word vectors when they are available
# - When they aren't, vector will be random

# experiment with various values of max_features

def get_embed_mat(EMBEDDING_FILE, max_features=50000, word_index=None):
    """Build the embedding matrix for the tokenizer vocabulary.

    Rows for words present in the embedding file hold that word's vector.
    Every other row is drawn from a normal distribution with the mean and
    standard deviation of all loaded vector components.

    Args:
        EMBEDDING_FILE: path to a UTF-8 text file with one
            "<word> <v1> <v2> ..." entry per line, separated by single spaces.
        max_features: upper bound on the number of rows; vocabulary indices
            greater than or equal to this value are skipped.
        word_index: mapping of word -> integer index. Defaults to the
            notebook-level `tokenizer.word_index`.

    Returns:
        (num_rows, embedding_matrix), where embedding_matrix has shape
        (num_rows, vector dimension) and num_rows is
        min(max_features, len(word_index) + 1).

    Raises:
        ValueError: if the file contains no word vectors.
    """
    if word_index is None:
        word_index = tokenizer.word_index

    # word vectors
    embeddings_index = {}
    # The context manager closes the (potentially very large) file once parsed.
    with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
        for line in embedding_file:
            word, *coefs = line.rstrip().rsplit(' ')
            if not coefs:
                # Blank line: no vector to store, and it would break np.stack.
                continue
            embeddings_index[word] = np.asarray(coefs, dtype='float32')
    print('Found %s word vectors.' % len(embeddings_index))
    if not embeddings_index:
        raise ValueError('No word vectors found in %s' % EMBEDDING_FILE)

    # embedding matrix
    # np.stack needs a real sequence; a dict view is rejected by newer NumPy.
    all_embs = np.stack(list(embeddings_index.values())) #for random init
    vector_dim = all_embs.shape[1]
    num_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(),
                                        (num_words, vector_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix.shape[0], embedding_matrix

## Build Embedding Matrix

In [8]:

# Word-vector dimensionality; must match the vectors in EMBEDDING_FILE
# (100 or 200 for the corresponding glove.6B files).
embed_dim = 300

# Pre-trained GloVe file in use. Other candidate files:
#   'w266_proj/glove/glove.6B/glove.6B.100d.txt'
#   'w266_proj/glove/glove.6B/glove.6B.200d.txt'
#   'w266_proj/glove/glove.6B/glove.6B.300d.txt'
#   'w266_proj/glove/glove.42B.300d/glove.42B.300d.txt'
EMBEDDING_FILE = 'w266_proj/glove/glove.840B.300d/glove.840B.300d.txt'

# Rebinds max_features to the number of rows of the returned matrix.
max_features, embedding_matrix = get_embed_mat(EMBEDDING_FILE)

Found 2196016 word vectors.


## Train Final Model

In [9]:
# Train the final model with the settings selected by the parameter search.
#
# Best (max acc):
# - 300 dim embeddings
# - 200 max words
# - 64 batch
# - .004 lr
# - 175 lstm
# - 0.5 spatial dropout
#
# Best (min loss):  <- the configuration used below
# - 300 dim embeddings
# - 200 max words
# - 512 batch
# - .001 lr
# - 225 lstm
# - 0.1 spatial dropout

from keras.callbacks import EarlyStopping, ModelCheckpoint

epochs = 100
batch_size = 512 # 64, 128, 256
learning_rates = [.1, .01, .004, .003, .001, .0001, .00001] # .001 default

# Stop after 5 epochs without a lower validation loss, and keep the weights
# with the lowest validation loss in best_model.h5.
# (To select on accuracy instead, monitor 'val_acc' with mode='max'.)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)

# If training acc improves, but val_acc decreases = overfitting training data

model = get_model(
    max_features,
    embed_dim,
    embedding_matrix,
    learning_rates[4],  # .001
    spat_drop=0.1,
    lstm_units=225,
)
model.fit(
    X_train,
    y_train,
    validation_data=(X_dev, y_dev),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
    callbacks=[es, mc],
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          15000000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 450)          948600    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 450)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 451       
Total params: 15,949,051
Trainable params: 949,051
Non-trainable params: 15,000,000
___

<keras.callbacks.History at 0x232e4de0588>

## Evaluation of the model: dev set

**Note:**

This has technically already been done through the parameter tuning process and the training of the final model. It is repeated here for ease of visually comparing dev set and test set performance.

In [10]:
# Restore the checkpointed weights, then score them on the dev set
model.load_weights("best_model.h5")

scores = model.evaluate(X_dev, y_dev, verbose=0)  # [loss, accuracy]
dev_accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % dev_accuracy_pct)

Accuracy: 77.96%


## Evaluation of the model: test set

In [11]:
# Score the model on the held-out test set: accuracy and weighted F1

from sklearn import metrics

# Hard class predictions for every test example
test_predicted_labels = model.predict_classes(X_test)

accuracy = metrics.accuracy_score(y_test, test_predicted_labels)
f1_weighted = metrics.f1_score(
    y_test,
    test_predicted_labels,
    average='weighted',
)

print('LSTM - 2 Class (cluster) Labels FINAL (TEST SET)')
print('-------------\n')
print('Accuracy on test set: {:0.4f}'.format(accuracy))
print('f_1 score (Weighted): {:0.4f}'.format(f1_weighted))


LSTM - 2 Class (cluster) Labels FINAL (TEST SET)
-------------

Accuracy on test set: 0.7684
f_1 score (Weighted): 0.7682


In [12]:
# Save the final model to disk as lstm_final_model.h5
model.save("lstm_final_model.h5")

## Prepare Data For Error Analysis
**Notes:**

1. Use this section to store values from the FINAL model run
2. Evaluate those results in `LSTM_error_analysis.ipynb`

In [216]:
# Store best model's predictions on the test set; probability of being helpful (class 1)
pred_prob = model.predict_proba(X_test)

In [217]:
# Flatten the predicted probabilities into a 1-D Series so they can become a dataframe column
pred_prob_s = pd.Series(pred_prob.flatten())

In [218]:
# Flatten the predicted class labels into a 1-D Series so they can become a dataframe column
test_pred_labels_s = pd.Series(test_predicted_labels.flatten())

In [219]:
# Form a dataframe
# pandas aligns Series by index when a frame is built from a dict. The two
# prediction Series carry a fresh 0..n-1 index, while y_test keeps the index of
# the test frame, so the true labels are passed by position (as an array) to
# guarantee a row-for-row pairing regardless of that index.
frame = {'pred_prob': pred_prob_s, 'pred_class': test_pred_labels_s, 'true_class': y_test.values}
err_analysis = pd.DataFrame(frame)

In [220]:
# Preview the first rows of the error-analysis dataframe
err_analysis.head()

Unnamed: 0,pred_prob,pred_class,true_class
0,0.026451,0,0.0
1,0.29056,0,1.0
2,0.775318,1,1.0
3,0.947763,1,1.0
4,0.665833,1,1.0


In [221]:
# Sanity check: accuracy recomputed from this dataframe should equal the
# test-set accuracy reported for the model run

metrics.accuracy_score(err_analysis['true_class'], err_analysis['pred_class'])

0.7684279191128506

In [281]:
# Save this dataframe for separate error analysis
# - error analysis conducted in LSTM_error_analysis.ipynb
# - the row index is written as the first (unnamed) column

err_analysis.to_csv('err_analysis.csv')

## LSTM Parameter Tuning
**Notes:**

1. Results stored in output file `lstm_run_results_val_loss.txt`

2. Results viewed/sorted/selected for final model using `lstm_parameter_search_results.ipynb`

3. Best results should be used in the **Train Final Model** cell above


In [13]:
# Random parameter search
#
# given 200 words; 300 dim
#
# Each run samples batch size, learning rate, spatial dropout and LSTM width,
# trains with early stopping on val_loss, reloads the best checkpoint and
# appends that checkpoint's dev loss plus the sampled settings to
# lstm_run_results_val_loss.txt.
#
# NOTE: the recorded run of this cell aborted with a GPU ResourceExhaustedError
# on a [1024, 200, 750] tensor (batch 1024 with 375 LSTM units). Results are
# appended after every run, so everything finished before the abort is kept.

from keras.callbacks import EarlyStopping, ModelCheckpoint

import random

epochs = 20  # per-run cap; early stopping usually ends a run sooner

for i in range(1,100):  # 99 runs, numbered 1..99

    print()
    print('Training Model {}'.format(i))
    print()

    # Fresh callbacks for every run so no state carries over between models.
    #es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5)
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
    #mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', save_best_only=True, verbose=1)
    mc = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)

    # Define parameters for run
    #batch_size = random.randint(32,513)
    batch_size = random.choice([32, 64, 128, 256, 512, 1024])
    lr = random.choice([.001, .002, .003, .004, .005, .01])
    dropout = random.choice([0.1, 0.2, 0.3, 0.4, 0.5])
    #lstm_units = random.randint(50,401)
    lstm_units = random.choice([75, 100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375, 400])

    print('batch size: {}'.format(batch_size))
    print('lr: {}'.format(lr))
    print('spatial dropout: {}'.format(dropout))
    print('lstm units: {}'.format(lstm_units))

    # Build and fit model for run
    model = get_model(max_features, embed_dim, embedding_matrix, lr, dropout, lstm_units)
    model.fit(X_train, y_train, validation_data=(X_dev, y_dev),epochs=epochs, batch_size=batch_size, verbose=1, callbacks=[es,mc], shuffle=True)

    # load weights for best model
    model.load_weights("best_model.h5")

    # dev-set loss of the best checkpoint (scores = [loss, accuracy])
    scores = model.evaluate(X_dev, y_dev, verbose=0)
    #acc = scores[1]*100
    loss = scores[0]

    # Store run results (same line format as before, built in one place)
    run_results = '(shuffle) val_loss: {}, lr: {}, batch: {}, dropout: {}, lstm units: {}\n'.format(
        loss, lr, batch_size, dropout, lstm_units)

    # The context manager closes the file even if the write fails.
    with open("lstm_run_results_val_loss.txt", "a") as results_file:
        results_file.write(run_results)



Training Model 1

batch size: 512
lr: 0.004
spatial dropout: 0.5
lstm units: 150
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          15000000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 300)          542400    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 15,542,701
Trainable params: 542,701
Non-trainable params: 15,000,000
_________________________________________________________________
None
Train on 52132 samples, validate on 6


Epoch 00010: val_loss did not improve from 0.48561
Epoch 11/20

Epoch 00011: val_loss did not improve from 0.48561
Epoch 00011: early stopping

Training Model 6

batch size: 512
lr: 0.01
spatial dropout: 0.3
lstm units: 150
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          15000000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 300)          542400    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 15,542,701
Trainable params: 542,70

Train on 52132 samples, validate on 6016 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.49105, saving model to best_model.h5
Epoch 2/20

Epoch 00002: val_loss improved from 0.49105 to 0.48939, saving model to best_model.h5
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.48939
Epoch 4/20

Epoch 00004: val_loss improved from 0.48939 to 0.48843, saving model to best_model.h5
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.48843
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.48843
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.48843
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.48843
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.48843
Epoch 00009: early stopping

Training Model 9

batch size: 128
lr: 0.001
spatial dropout: 0.2
lstm units: 300
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 

Train on 52132 samples, validate on 6016 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.49581, saving model to best_model.h5
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.49581
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.49581
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.49581
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.49581
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.49581
Epoch 00006: early stopping

Training Model 14

batch size: 512
lr: 0.001
spatial dropout: 0.1
lstm units: 225
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          15000000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None,

Train on 52132 samples, validate on 6016 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.50776, saving model to best_model.h5
Epoch 2/20

Epoch 00002: val_loss did not improve from 0.50776
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.50776
Epoch 4/20

Epoch 00004: val_loss improved from 0.50776 to 0.49924, saving model to best_model.h5
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.49924
Epoch 6/20

Epoch 00006: val_loss improved from 0.49924 to 0.49590, saving model to best_model.h5
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.49590
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.49590
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.49590
Epoch 10/20

Epoch 00010: val_loss improved from 0.49590 to 0.49345, saving model to best_model.h5
Epoch 11/20

Epoch 00011: val_loss did not improve from 0.49345
Epoch 12/20

Epoch 00012: val_loss did not improve from 0.49345
Epoch 13/20

Epoch 00013: val_loss did not improve from 

Epoch 5/20

Epoch 00005: val_loss did not improve from 0.48478
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.48478
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.48478
Epoch 8/20

Epoch 00008: val_loss did not improve from 0.48478
Epoch 9/20

Epoch 00009: val_loss did not improve from 0.48478
Epoch 00009: early stopping

Training Model 19

batch size: 128
lr: 0.001
spatial dropout: 0.2
lstm units: 200
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          15000000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 400)          803200    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 400)      


Epoch 00011: val_loss did not improve from 0.48278
Epoch 12/20

Epoch 00012: val_loss did not improve from 0.48278
Epoch 00012: early stopping

Training Model 24

batch size: 64
lr: 0.01
spatial dropout: 0.4
lstm units: 150
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          15000000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 300)          542400    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 301       
Total params: 15,542,701
Trainable params: 542,70

Epoch 2/20

Epoch 00002: val_loss improved from 0.48787 to 0.48720, saving model to best_model.h5
Epoch 3/20

Epoch 00003: val_loss did not improve from 0.48720
Epoch 4/20

Epoch 00004: val_loss did not improve from 0.48720
Epoch 5/20

Epoch 00005: val_loss did not improve from 0.48720
Epoch 6/20

Epoch 00006: val_loss did not improve from 0.48720
Epoch 7/20

Epoch 00007: val_loss did not improve from 0.48720
Epoch 00007: early stopping

Training Model 29

batch size: 512
lr: 0.004
spatial dropout: 0.1
lstm units: 275
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          15000000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 550)          1269400   
______________

ResourceExhaustedError: OOM when allocating tensor with shape[1024,200,750] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node bidirectional_1/concat_2}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[{{node loss/mul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
