<a href="https://colab.research.google.com/github/harryahlas/generate-survey-comments/blob/master/seq2seqcomments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate Survey Comments
Builds a sequence-to-sequence model to create comments resembling responses from employee surveys. Training data (*training_comments.csv*, stored in my personal Google Drive and available on request) was pulled from multiple online sources, mostly *data.world*. I truncated the comments at 1,000 characters to facilitate training.

The model is based on the work of George Pipis (link below).

https://pub.towardsai.net/word-level-text-generation-dd61a5a0313d


#### Mount Drive

In [5]:
# Attach Google Drive to the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


#### Load Modules

In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku 
import numpy as np
import pandas as pd

#### Build Model

In [2]:
# --- Load and tokenize the training corpus ---------------------------------
tokenizer = Tokenizer()
# Read through a context manager so the file handle is closed after loading.
with open('training_comments.csv') as corpus_file:
    data = corpus_file.read()
# One training comment per line, lower-cased before tokenization.
corpus = data.lower().split("\n")
tokenizer.fit_on_texts(corpus)
# word_index starts at 1, so add one slot for index 0 (the value used by
# pad_sequences for padding).
total_words = len(tokenizer.word_index) + 1

# --- Create input sequences using list of tokens ----------------------------
# Every prefix of at least two tokens of each line becomes one training
# example: the leading tokens are the input, the final token is the target.
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# --- Pad sequences ----------------------------------------------------------
# Left-pad so that the target word is always in the last column.
max_sequence_len = max(len(x) for x in input_sequences)
# pad_sequences already returns a NumPy array, so no extra np.array() copy.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# --- Create predictors and label --------------------------------------------
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
# One-hot encode the target word to match the softmax output layer.
label = ku.to_categorical(label, num_classes=total_words)

# --- Define and compile the network (built exactly once) --------------------
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
# Integer division: Dense expects an integer number of units.
model.add(Dense(total_words // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 201, 100)          796400    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 201, 300)          301200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 201, 300)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_2 (Dense)              (None, 3982)              402182    
_________________________________________________________________
dense_3 (Dense)              (None, 7964)              31720612  
Total params: 33,380,794
Trainable params: 33,380,794
Non-trainable params: 0
__________________________________________

#### Train Model

In [None]:
# Fit the network on the padded n-gram prefixes and their one-hot next-word
# targets; the returned History object is kept for later inspection.
history = model.fit(x=predictors, y=label, epochs=1, verbose=1)



#### Save Model

In [None]:
# Persist the trained model to Google Drive so it can be reloaded later
# without retraining.
model.save(filepath='/content/gdrive/My Drive/Development/seq2seqcomments/seq2seq50')

#### Load Model from Drive *(Optional)*

In [11]:
# Restore the previously saved model from Google Drive instead of retraining.
# Only the network is restored here: `tokenizer` and `max_sequence_len` are
# still taken from the Build Model cell, which must have been run first.
from tensorflow import keras
model = keras.models.load_model(filepath='/content/gdrive/My Drive/Development/seq2seqcomments/seq2seq50')

#### Function to Predict Words *print_next_words()*

In [7]:
def print_next_words(seed_text, number_of_words_to_predict):
    """Extend ``seed_text`` one predicted word at a time and print the result.

    At every step the current text is tokenized, left-padded to the model's
    input length, and the word with the highest predicted probability is
    appended (greedy decoding).

    Relies on the notebook-level globals ``tokenizer``, ``model`` and
    ``max_sequence_len``.

    Args:
        seed_text: Starting text that the generated words are appended to.
        number_of_words_to_predict: How many words to generate.

    Returns:
        None. The extended text is printed.
    """
    for _ in range(number_of_words_to_predict):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # Sequential.predict_classes() was removed in TensorFlow 2.6; taking
        # the argmax over the softmax output is the documented replacement.
        probabilities = model.predict(token_list, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # Direct reverse lookup instead of scanning word_index every step.
        # An index with no word (e.g. 0) contributes an empty string.
        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " " + output_word
    print(seed_text)

#### Make Predictions

In [12]:
# Generate 30 additional words for each seed phrase.
seed_phrases = [
    "my manager is good at",
    "I should be paid more.",
    "The customer service",
    "My benefits are good but I wish there was better life insurance.",
]
for seed_phrase in seed_phrases:
    print_next_words(seed_phrase, 30)



my manager is good at the city to the city to the city to the city to the city to the city to the city to the city to the city to the city to
I should be paid more. the city to the city to the city to the city to the city to the city to the city to the city to the city to the city to
The customer service and the city to the city to the city to the city to the city to the city to the city to the city to the city to the city
My benefits are good but I wish there was better life insurance. and the city to the city to the city to the city to the city to the city to the city to the city to the city to the city
