# Purpose: 
LSTMS for office

In [1]:
import pandas as pd
import re 
import numpy as np 
import spacy 
# from wordcloud import WordCloud
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokens import Doc
import re
from tqdm import tqdm
from tqdm.notebook import tqdm
tqdm.pandas()
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.layers import LSTM, Dense,Embedding, Input, Dropout,BatchNormalization, Bidirectional
from tensorflow import keras
from tensorflow.keras import backend as be
from tensorflow.keras import Sequential, Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
tf.random.set_seed(42)
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from kerastuner.tuners import RandomSearch

2023-05-22 10:35:01.938852: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# loading the pre-trained embedding file - glove.6B.100d.txt into embeddings_index matrix 
print('Indexing word vectors.')

embeddings_index = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [3]:
def get_train_test_split(df, stratify_col, test_size=0.3, random_state=42): 
    train, test_df = train_test_split(df, test_size= test_size, random_state=random_state, stratify= stratify_col)
    return train, test_df

def load_glove_embedding(glove_path): 
    f = open(glove_path, encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))
    return embeddings_index


def create_lstm_train_data(lstm_train_df,lstm_test_df, text_col_name, label_col_name): 
    X_train =lstm_train_df[text_col_name]
    y_train= lstm_train_df[label_col_name]
    X_test= lstm_test_df[text_col_name]
    y_test= lstm_test_df[label_col_name]
    #set up the tokenizer
    # Encoding the reviews using Tokenizer from tensorflow preprocessing 
    #set up the tokenizer
    MAX_VOCAB_SIZE = 15000 # check what vocab size to use here
    # Set up tokenizer and keep max_vocab_size based on the word frequency, oov_token is given to be added to the word_index during the text_to_sequences call
    tokenizer = Tokenizer(num_words = MAX_VOCAB_SIZE,oov_token="<oov>", lower=True)
    tokenizer.fit_on_texts(X_train)
    # determination of the  training data vocabulary size
    vocab_index = tokenizer.word_index
    #print(word_index)
    V = len(vocab_index)
    ##create sequences of reviews
    # transformation based on text present in the reviews to sequence of integers
    X_seq_train = tokenizer.texts_to_sequences(X_train)
    X_seq_test =  tokenizer.texts_to_sequences(X_test)
    #create padded sequences
    # experimented with the max length and embedding size to get better performance metrics 
    maxlen= 512
    embed_size = 100
    # perform padding to the same length:
    # Taking default value of pre in padding, for padding 
    # Truncating: removal of values larger than maxlen, used post. 
    pad_train=pad_sequences(X_seq_train,truncating = 'post', padding = 'pre',maxlen=512)
    pad_test=pad_sequences(X_seq_test,truncating = 'post', padding = 'pre',maxlen=512)
    embedding_matrix = create_embedding_matrix(tokenizer.word_index, V)
    return pad_train, pad_test, y_train,y_test, V, embedding_matrix


def create_embedding_matrix( word_index, vocab_length): 
    embedding_dim = 100 # GloVe contains 100-dimensional embedding vectors for 400.000 words
    word_index = word_index
    embedding_matrix = np.zeros((vocab_length+1, embedding_dim)) # embedding_matrix.shape (10000, 100)
    embeddings_index = load_glove_embedding('glove.6B.100d.txt')
    #get corresponding word embeddings/weights for imdb review corpus amd create embedding_matrix
    for word, i in word_index.items():
        if i < vocab_length:
            embedding_vector = embeddings_index.get(word) # embedding_vector.shape (100,)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [2]:

def build_model(hp):
    # takes input with shape as specified
    input_ = Input(shape=(512,))
    
    # creation of the embedding layer and setting pre-trained word weights and trainable parameter as false
    embeds = Embedding(V+1, 100, input_length=512, weights=[embedding_matrix], trainable=False)(input_)
    
    # adding dropout for regularization
    drop = Dropout(hp.Float('dense_dropout', min_value=0.1, max_value=0.5, step=0.1))(embeds)
    
    # Get the choice of dropout rate for LSTM
    lstm_dropout = hp.Float('lstm_dropout', min_value=0.1, max_value=0.5, step=0.1)
    
    # Creation of LSTM layer with variable number of units specified by the hyperparameter 'lstm_units'
    lstm_1 = LSTM(units=hp.Int('lstm_units', min_value=100, max_value=500, step=50),
                  dropout=lstm_dropout)(drop)
    
    # Get the choice of dropout rate for dense layer
    dense_dropout = hp.Float('dense_dropout', min_value=0.1, max_value=0.5, step=0.1)
    
    # Add a fully connected dense layer to combine and weight the features extracted from LSTM
    # Treating the output from LSTM as a regression problem
    weight_dense = Dense(units=256, activation='relu')(lstm_1)
    
    # Added dropout for further regularization
    drop_2 = Dropout(dense_dropout)(weight_dense)
    
    # Output layer with sigmoid activation for binary classification
    output_layer = Dense(1, activation='sigmoid')(drop_2)
    
    # Create the model instance
    model = Model(inputs=input_, outputs=output_layer)
    
    # Compile the model
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        metrics=['accuracy']
    )
    
    return model

# Define the tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
   executions_per_trial=1,
    directory='my_dir',
    project_name='my_project'
)

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Perform the hyperparameter search with early stopping callback, run this after initiaising the train variables from below
tuner.search(pad_train,y_train, validation_split=0.2, epochs=10, callbacks=[early_stopping])

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=25)[0]

# Build the model with the best hyperparameters
model = tuner.hypermodel.build(best_hps)


# Disagreement With Ratings:

In [4]:
# read the file: 
disagreement_with_ratings_df = pd.read_csv('/Users/kartikvijay/Documents/MADS/Thesis pt.2/xgboost_LSTMs/office/disagreement_with_ratings (1).csv')
# train test split for disagreement with ratings df: 
train, test_df = get_train_test_split(disagreement_with_ratings_df,stratify_col=disagreement_with_ratings_df['disagreement_with_ratings_ohe'])


pad_train, pad_test, y_train,y_test, V, embedding_matrix= create_lstm_train_data(train,
                                                                                  test_df, 
                                                                                 'full_review',
                                                                                 'disagreement_with_ratings_ohe'
                                                                                 )

Found 400000 word vectors.


In [6]:
be.clear_session()
from keras.layers import Reshape
with tf.device('/GPU:0'):
     # takes input with shape as specified,   
    input_ = Input(shape=(512,))
    # creation of the embedding layer and setting pre-trained word weights and trainable
    # parameter as false
    embeds = Embedding(V+1, 100,input_length = 512, weights=[embedding_matrix], trainable = False)(input_)
    # adding dropout for regularization
    # drop = tf.keras.layers.Dropout(0.3)(embeds)
    # Creation of lstm layer with 350 as dimension of the output space.
    lstm_1 = LSTM(350)(embeds)
    reshaped_input = Reshape((1, 350))(lstm_1)
    lstm_2 = LSTM(312)(reshaped_input)
#     lstm_2 = LSTM(256)(lstm_1)
    # add fully connected dense layer to combine and weight the features that are extracted from LSTM. Addition
    # of the dense layer allows for better regularization as well by the addition of parameters. 
    # Treating the output from lstm as a regression problem
    weight_dense = Dense(units=256, activation='relu')(lstm_2)
    # added dropout for further regularization
    # drop_2 = Dropout(0.5)(weight_dense)
    # output layer with sigmoid activation for binary classification
    output_layer = Dense(1, activation='sigmoid')(weight_dense)
    # Finally, create a model instance!
    model = Model(inputs = input_, outputs = output_layer)
    model.summary()

2023-05-22 10:35:59.195854: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-22 10:35:59.196850: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-22 10:35:59.197429: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 embedding (Embedding)       (None, 512, 100)          1196200   
                                                                 
 lstm (LSTM)                 (None, 350)               631400    
                                                                 
 reshape (Reshape)           (None, 1, 350)            0         
                                                                 
 lstm_1 (LSTM)               (None, 312)               827424    
                                                                 
 dense (Dense)               (None, 256)               80128     
                                                                 
 dense_1 (Dense)             (None, 1)                 257   

In [136]:
be.clear_session()
from keras.layers import Reshape
with tf.device('/GPU:0'):
     # takes input with shape as specified,   
    input_ = Input(shape=(512,))
    # creation of the embedding layer and setting pre-trained word weights and trainable
    # parameter as false
    embeds = Embedding(V+1, 100,input_length = 512, weights=[embedding_matrix], trainable = False)(input_)
    # adding dropout for regularization
#     drop = tf.keras.layers.Dropout(0.01)(embeds)
    # Creation of lstm layer with 350 as dimension of the output space.
    lstm_1 = LSTM(350)(embeds)
    reshaped_input = Reshape((1, 350))(lstm_1)
    lstm_2 = LSTM(256)(reshaped_input)
#     lstm_4 = LSTM(256)(lstm_3)
#     lstm_5= LSTM(256)(lstm_4)
    # add fully connected dense layer to combine and weight the features that are extracted from LSTM. Addition
    # of the dense layer allows for better regularization as well by the addition of parameters. 
    # Treating the output from lstm as a regression problem
    weight_dense = Dense(units=1024, activation='relu')(lstm_2)
    # added dropout for further regularization
    drop_2 = Dropout(0.5)(weight_dense)
    # output layer with sigmoid activation for binary classification
    output_layer = Dense(1, activation='sigmoid')(drop_2)
    # Finally, create a model instance!
    model = Model(inputs = input_, outputs = output_layer)
    model.summary()

2023-05-20 21:26:36.106402: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:26:36.107654: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:26:36.108362: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 embedding (Embedding)       (None, 512, 100)          1196200   
                                                                 
 lstm (LSTM)                 (None, 350)               631400    
                                                                 
 reshape (Reshape)           (None, 1, 350)            0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               621568    
                                                                 
 dense (Dense)               (None, 1024)              263168    
                                                                 
 dropout (Dropout)           (None, 1024)              0     

In [7]:
# monitoring validation loss for early stopping to avoid overfitting with patience of 4
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                mode='min', 
                               patience = 4 )
# utilization of adam optimization
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# use gpu and fit model, considered a validation_split of 10% from the training dataset
with tf.device('/GPU:0'):

    lstm_model_glove = model.fit(pad_train,y_train, 
                    validation_split= 0.2,  
                  epochs = 100, 
                  batch_size = 128, 
                  callbacks =[early_stop])

Epoch 1/100


2023-05-22 10:36:15.233803: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-22 10:36:15.235147: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-22 10:36:15.235993: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-05-22 10:36:29.014026: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-22 10:36:29.015016: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-22 10:36:29.015683: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


In [8]:
# make predictions on the test data: 
y_pred = model.predict(pad_test)
y_pred = np.round(y_pred)
print(classification_report(y_pred, y_test))

2023-05-22 10:37:54.591716: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-22 10:37:54.592623: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-22 10:37:54.593279: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

 1/42 [..............................] - ETA: 20s

2023-05-22 10:37:54.813734: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-05-22 10:37:54.891218: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98      1335
         1.0       0.00      0.00      0.00         0

    accuracy                           0.96      1335
   macro avg       0.50      0.48      0.49      1335
weighted avg       1.00      0.96      0.98      1335



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Rating Management Explicit

In [66]:
# read the file: 
rating_management_explicit_df = pd.read_csv('/Users/kartikvijay/Documents/MADS/Thesis pt.2/xgboost_LSTMs/general_training_data/rating_managment_explicit_general.csv')
# train test split for disagreement with ratings df: 
train, test_df = get_train_test_split(rating_management_explicit_df,stratify_col=rating_management_explicit_df['rating_managment_explicit_ohe'])


pad_train, pad_test, y_train,y_test, V, embedding_matrix= create_lstm_train_data(train,
                                                                                  test_df, 
                                                                                 'full_review',
                                                                                 'rating_managment_explicit_ohe'
                                                                                 )


Found 400000 word vectors.


In [70]:
be.clear_session()
from keras.layers import TimeDistributed


with tf.device('/GPU:0'):
     # takes input with shape as specified,   
    input_ = Input(shape=(512,))
    # creation of the embedding layer and setting pre-trained word weights and trainable
    # parameter as false
    embeds = Embedding(V+1, 100,input_length = 512, weights=[embedding_matrix], trainable = False)(input_)
    # Creation of lstm layer with 350 as dimension of the output space.
    drop = tf.keras.layers.Dropout(0.01)(embeds)
#     lstm_1 = LSTM(550)(drop)
#     reshaped_input = Reshape((1, 550))(lstm_1)
#     lstm_2 = LSTM(256, return_sequences=True)(reshaped_input)
#     lstm_3 = LSTM(512)(lstm_2)
#     lstm_4 = LSTM(256)(lstm_3)
    lstm_1 = LSTM(550)(drop)
    reshaped_input = Reshape((1, 550))(lstm_1)
    lstm_2 = LSTM(256, return_sequences=True)(reshaped_input)
    lstm_3 = LSTM(512, return_sequences=True)(lstm_2)  # Set return_sequences=False for the final LSTM layer
    lstm_4 = LSTM(256)(lstm_3)
  

    # add fully connected dense layer to combine and weight the features that are extracted from LSTM. Addition
    # of the dense layer allows for better regularization as well by the addition of parameters. 
    # Treating the output from lstm as a regression problem
    weight_dense = Dense(units=512, activation='relu')(lstm_4)
    # added dropout for further regularization
    drop_2 = Dropout(0.5)(weight_dense)
    # output layer with sigmoid activation for binary classification
    output_layer = Dense(1, activation='sigmoid')(drop_2)
    # Finally, create a model instance!
    model = Model(inputs = input_, outputs = output_layer)
    model.summary()

2023-05-20 20:42:52.204519: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 20:42:52.205613: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 20:42:52.206296: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 embedding (Embedding)       (None, 512, 100)          3466900   
                                                                 
 dropout (Dropout)           (None, 512, 100)          0         
                                                                 
 lstm (LSTM)                 (None, 550)               1432200   
                                                                 
 reshape (Reshape)           (None, 1, 550)            0         


2023-05-20 20:42:52.629041: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 20:42:52.630122: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 20:42:52.630809: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

                                                                 
 lstm_1 (LSTM)               (None, 1, 256)            826368    
                                                                 
 lstm_2 (LSTM)               (None, 1, 512)            1574912   
                                                                 
 lstm_3 (LSTM)               (None, 256)               787456    
                                                                 
 dense (Dense)               (None, 512)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 513       
                                                                 
Total params: 8,219,933
Trainable params: 4,753,033
Non-trainable params: 3,466,900
__________________________________________________________

In [71]:
# monitoring validation loss for early stopping to avoid overfitting with patience of 4
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                mode='min', 
                               patience = 3)
# utilization of adam optimization
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# use gpu and fit model, considered a validation_split of 10% from the training dataset
with tf.device('/GPU:0'):

    lstm_model_glove = model.fit(pad_train,y_train, 
                    validation_split= 0.1,  
                  epochs = 100, 
                  batch_size = 64, 
                  callbacks =[early_stop])

Epoch 1/100


2023-05-20 20:43:02.209972: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 20:43:02.210957: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 20:43:02.211667: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-05-20 20:43:50.150488: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 20:43:50.151653: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 20:43:50.152378: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


In [72]:
# make predictions on the test data: 
y_pred = model.predict(pad_test)
y_pred = np.round(y_pred)
print(classification_report(y_pred, y_test))

2023-05-20 20:48:38.696942: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 20:48:38.698035: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 20:48:38.698723: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

              precision    recall  f1-score   support

         0.0       1.00      0.98      0.99      3369
         1.0       0.00      0.00      0.00         0

    accuracy                           0.98      3369
   macro avg       0.50      0.49      0.50      3369
weighted avg       1.00      0.98      0.99      3369



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Wish I read Reviews:


In [74]:
read_reviews_df = pd.read_csv('/Users/kartikvijay/Documents/MADS/Thesis pt.2/xgboost_LSTMs/office/read_reviews_v1.csv')

# train test split for disagreement with ratings df: 
train, test_df = get_train_test_split(read_reviews_df,stratify_col=read_reviews_df['wish I read reviews _ohe'])


pad_train, pad_test, y_train,y_test, V, embedding_matrix= create_lstm_train_data(train,
                                                                                test_df, 
                                                                                'full_review',
                                                                                'wish I read reviews _ohe'
                                                                                 )


Found 400000 word vectors.


In [78]:
be.clear_session()
with tf.device('/GPU:0'):
     # takes input with shape as specified,   
    input_ = Input(shape=(512,))
    # creation of the embedding layer and setting pre-trained word weights and trainable
    # parameter as false
    embeds = Embedding(V+1, 100,input_length = 512, weights=[embedding_matrix], trainable = False)(input_)
    # adding dropout for regularization
    drop = tf.keras.layers.Dropout(0.3)(embeds)
    # Creation of lstm layer with 350 as dimension of the output space.
    lstm_1 = LSTM(550)(drop)
      # Set return_sequences=False for the final LSTM layer

    # add fully connected dense layer to combine and weight the features that are extracted from LSTM. Addition
    # of the dense layer allows for better regularization as well by the addition of parameters. 
    # Treating the output from lstm as a regression problem
    weight_dense = Dense(units=512, activation='relu')(lstm_1)
    # added dropout for further regularization
    drop_2 = Dropout(0.5)(weight_dense)
    # output layer with sigmoid activation for binary classification
    output_layer = Dense(1, activation='sigmoid')(drop_2)
    # Finally, create a model instance!
    model = Model(inputs = input_, outputs = output_layer)
    model.summary()

2023-05-20 20:53:44.246993: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 20:53:44.248080: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 20:53:44.248832: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 embedding (Embedding)       (None, 512, 100)          1286600   
                                                                 
 dropout (Dropout)           (None, 512, 100)          0         
                                                                 
 lstm (LSTM)                 (None, 550)               1432200   
                                                                 
 dense (Dense)               (None, 512)               282112    
                                                                 
 dropout_1 (Dropout)         (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 513   

2023-05-20 20:53:44.680538: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 20:53:44.681514: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 20:53:44.682202: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [79]:
# monitoring validation loss for early stopping to avoid overfitting with patience of 4
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                mode='min', 
                               patience = 4 )
# utilization of adam optimization
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# use gpu and fit model, considered a validation_split of 10% from the training dataset
with tf.device('/GPU:0'):

    lstm_model_glove = model.fit(pad_train,y_train, 
                    validation_split= 0.1,  
                  epochs = 100, 
                  batch_size = 64, 
                  callbacks =[early_stop])

Epoch 1/100


2023-05-20 20:53:47.435430: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 20:53:47.436548: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 20:53:47.437319: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-05-20 20:54:06.024310: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 20:54:06.025380: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 20:54:06.026097: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


In [80]:
# make predictions on the test data: 
y_pred = model.predict(pad_test)
y_pred = np.round(y_pred)
print(classification_report(y_pred, y_test))

2023-05-20 20:56:34.957912: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 20:56:34.958941: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 20:56:34.959685: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

              precision    recall  f1-score   support

         0.0       1.00      0.93      0.96      1421
         1.0       0.12      0.78      0.21        18

    accuracy                           0.93      1439
   macro avg       0.56      0.85      0.59      1439
weighted avg       0.99      0.93      0.95      1439



# Wrong Buying:

In [94]:
wrong_buying_df = pd.read_csv('/Users/kartikvijay/Documents/MADS/Thesis pt.2/xgboost_LSTMs/office/wrong_buying_v1.csv')

# train test split for disagreement with ratings df: 
train, test_df = get_train_test_split(wrong_buying_df,stratify_col=wrong_buying_df['wrong_buying_ohe'])


pad_train, pad_test, y_train,y_test, V, embedding_matrix= create_lstm_train_data(train,
                                                                                test_df, 
                                                                                'full_review',
                                                                                'wrong_buying_ohe'
                                                                                 )


Found 400000 word vectors.


In [101]:
be.clear_session()
with tf.device('/GPU:0'):
     # takes input with shape as specified,   
    input_ = Input(shape=(512,))
    # creation of the embedding layer and setting pre-trained word weights and trainable
    # parameter as false
    embeds = Embedding(V+1, 100,input_length = 512, weights=[embedding_matrix], trainable = False)(input_)
    # adding dropout for regularization
    drop = tf.keras.layers.Dropout(0.01)(embeds)
    # Creation of lstm layer with 350 as dimension of the output space.
    lstm_1 = LSTM(350)(drop)
#     reshaped_input = Reshape((1, 350))(lstm_1)
#     lstm_2 = LSTM(256, return_sequences=True)(reshaped_input)
#     lstm_3 = LSTM(512)(lstm_2)
    # add fully connected dense layer to combine and weight the features that are extracted from LSTM. Addition
    # of the dense layer allows for better regularization as well by the addition of parameters. 
    # Treating the output from lstm as a regression problem
    weight_dense = Dense(units=256, activation='relu')(lstm_1)
    # added dropout for further regularization
    drop_2 = Dropout(0.01)(weight_dense)
    # output layer with sigmoid activation for binary classification
    output_layer = Dense(1, activation='sigmoid')(drop_2)
    # Finally, create a model instance!
    model = Model(inputs = input_, outputs = output_layer)
    model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 embedding (Embedding)       (None, 512, 100)          1191200   
                                                                 
 dropout (Dropout)           (None, 512, 100)          0         
                                                                 
 lstm (LSTM)                 (None, 350)               631400    
                                                                 
 dense (Dense)               (None, 256)               89856     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257   

2023-05-20 21:05:37.400037: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:05:37.401122: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:05:37.401784: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [102]:
# monitoring validation loss for early stopping to avoid overfitting with patience of 4
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                mode='min', 
                               patience = 4 )
# utilization of adam optimization
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# use gpu and fit model, considered a validation_split of 10% from the training dataset
with tf.device('/GPU:0'):

    lstm_model_glove = model.fit(pad_train,y_train, 
                    validation_split= 0.1,  
                  epochs = 100, 
                  batch_size = 64, 
                  callbacks =[early_stop])

Epoch 1/100


2023-05-20 21:05:38.515766: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:05:38.516750: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:05:38.517502: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-05-20 21:05:49.174699: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:05:49.175670: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:05:49.176351: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [103]:
# make predictions on the test data: 
y_pred = model.predict(pad_test)
y_pred = np.round(y_pred)
print(classification_report(y_pred, y_test))

2023-05-20 21:06:36.006247: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:06:36.007406: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:06:36.007996: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

              precision    recall  f1-score   support

         0.0       1.00      0.90      0.95      1333
         1.0       0.01      0.50      0.01         2

    accuracy                           0.90      1335
   macro avg       0.50      0.70      0.48      1335
weighted avg       1.00      0.90      0.94      1335



In [131]:
be.clear_session()
with tf.device('/GPU:0'):
     # takes input with shape as specified,   
    input_ = Input(shape=(512,))
    # creation of the embedding layer and setting pre-trained word weights and trainable
    # parameter as false
    embeds = Embedding(V+1, 100,input_length = 512, weights=[embedding_matrix], trainable = False)(input_)
    # adding dropout for regularization
    drop = tf.keras.layers.Dropout(0.0001)(embeds)
    # Creation of lstm layer with 350 as dimension of the output space.
    lstm_1 = LSTM(350)(drop)
    reshaped_input = Reshape((1, 350))(lstm_1)
    lstm_2 = LSTM(256)(reshaped_input)
#     lstm_3 = LSTM(512)(lstm_2)
    # add fully connected dense layer to combine and weight the features that are extracted from LSTM. Addition
    # of the dense layer allows for better regularization as well by the addition of parameters. 
    # Treating the output from lstm as a regression problem
    weight_dense = Dense(units=256, activation='relu')(lstm_2)
    # added dropout for further regularization
    # drop_2 = Dropout(0.1)(weight_dense)
    # output layer with sigmoid activation for binary classification
    output_layer = Dense(1, activation='sigmoid')(weight_dense)
    # Finally, create a model instance!
    model = Model(inputs = input_, outputs = output_layer)
    model.summary()

2023-05-20 21:22:50.706869: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:22:50.707855: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:22:50.708702: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 embedding (Embedding)       (None, 512, 100)          1190300   
                                                                 
 dropout (Dropout)           (None, 512, 100)          0         
                                                                 
 lstm (LSTM)                 (None, 350)               631400    
                                                                 
 reshape (Reshape)           (None, 1, 350)            0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               621568    
                                                                 
 dense (Dense)               (None, 256)               65792 

In [132]:
# monitoring validation loss for early stopping to avoid overfitting with patience of 4
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                mode='min', 
                               patience = 3)
# utilization of adam optimization
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# use gpu and fit model, considered a validation_split of 10% from the training dataset
with tf.device('/GPU:0'):

    lstm_model_glove = model.fit(pad_train,y_train, 
                    validation_split= 0.1,  
                  epochs = 100, 
                  batch_size = 64, 
                  callbacks =[early_stop])

Epoch 1/100


2023-05-20 21:22:51.180073: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:22:51.181164: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:22:51.181930: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-05-20 21:23:04.949950: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:23:04.950906: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:23:04.951724: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [133]:
# make predictions on the test data: 
y_pred = model.predict(pad_test)
y_pred = np.round(y_pred)
print(classification_report(y_pred, y_test))

2023-05-20 21:24:33.665250: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:24:33.666421: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:24:33.667099: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

              precision    recall  f1-score   support

         0.0       0.99      0.88      0.93      1291
         1.0       0.19      0.86      0.32        44

    accuracy                           0.88      1335
   macro avg       0.59      0.87      0.63      1335
weighted avg       0.97      0.88      0.91      1335



# Zero Stars:

In [121]:
zero_stars_df = pd.read_csv('/Users/kartikvijay/Documents/MADS/Thesis pt.2/xgboost_LSTMs/office/zero_star_v1.csv')
# train test split for disagreement with ratings df: 
train, test_df = get_train_test_split(zero_stars_df,stratify_col=zero_stars_df['zero_star_ohe'])


pad_train, pad_test, y_train,y_test, V, embedding_matrix= create_lstm_train_data(train,
                                                                                test_df, 
                                                                                'full_review',
                                                                                'zero_star_ohe'
                                                                                 )


Found 400000 word vectors.


In [122]:
be.clear_session()
with tf.device('/GPU:0'):
     # takes input with shape as specified,   
    input_ = Input(shape=(512,))
    # creation of the embedding layer and setting pre-trained word weights and trainable
    # parameter as false
    embeds = Embedding(V+1, 100,input_length = 512, weights=[embedding_matrix], trainable = False)(input_)
    # adding dropout for regularization
    drop = tf.keras.layers.Dropout(0.3)(embeds)
    # Creation of lstm layer with 350 as dimension of the output space.
    lstm_1 = LSTM(350)(drop)
    # add fully connected dense layer to combine and weight the features that are extracted from LSTM. Addition
    # of the dense layer allows for better regularization as well by the addition of parameters. 
    # Treating the output from lstm as a regression problem
    weight_dense = Dense(units=256, activation='relu')(lstm_1)
    # added dropout for further regularization
    drop_2 = Dropout(0.5)(weight_dense)
    # output layer with sigmoid activation for binary classification
    output_layer = Dense(1, activation='sigmoid')(drop_2)
    # Finally, create a model instance!
    model = Model(inputs = input_, outputs = output_layer)
    model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 512)]             0         
                                                                 
 embedding (Embedding)       (None, 512, 100)          1190300   
                                                                 
 dropout (Dropout)           (None, 512, 100)          0         
                                                                 
 lstm (LSTM)                 (None, 350)               631400    
                                                                 
 dense (Dense)               (None, 256)               89856     
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 257   

2023-05-20 21:18:17.755436: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:18:17.756604: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:18:17.757292: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [123]:
# monitoring validation loss for early stopping to avoid overfitting with patience of 4
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', 
                                mode='min', 
                               patience = 4 )
# utilization of adam optimization
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# use gpu and fit model, considered a validation_split of 10% from the training dataset
with tf.device('/GPU:0'):

    lstm_model_glove = model.fit(pad_train,y_train, 
                    validation_split= 0.1,  
                  epochs = 100, 
                  batch_size = 64, 
                  callbacks =[early_stop])

Epoch 1/100


2023-05-20 21:18:20.502222: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:18:20.503419: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:18:20.504503: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-05-20 21:18:31.878805: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:18:31.879875: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:18:31.880467: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


In [124]:
# make predictions on the test data: 
y_pred = model.predict(pad_test)
y_pred = np.round(y_pred)
print(classification_report(y_pred, y_test))

2023-05-20 21:19:41.804416: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-20 21:19:41.805386: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-20 21:19:41.806147: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

              precision    recall  f1-score   support

         0.0       0.99      0.95      0.97      1188
         1.0       0.68      0.90      0.77       147

    accuracy                           0.94      1335
   macro avg       0.83      0.92      0.87      1335
weighted avg       0.95      0.94      0.95      1335



In [125]:
print('End of Notebook!')

End of Notebook!
