In [38]:
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

import bz2
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

%matplotlib inline

In [39]:
# Paths of the bz2-compressed training and test review files.
filename_train, filename_test = 'reviews_train.bz2', 'reviews_test.bz2'

In [40]:
def read_texts_and_labels(file, max_lines=None):
  """Read labelled reviews from a bz2-compressed, UTF-8 encoded text file.

  Every non-blank line is expected to look like ``"<label> <text>"`` where
  ``<label>`` is ``__label__1`` (negative, mapped to 0) or ``__label__2``
  (positive, mapped to 1). Blank lines are skipped and do not count towards
  ``max_lines``.

  Args:
    file: Path or file object accepted by ``bz2.BZ2File``.
    max_lines: Maximum number of records to read. ``None`` reads the whole
      file; a value ``<= 0`` reads nothing.

  Returns:
    A ``(texts, labels)`` pair. ``texts`` is a list of whitespace-stripped
    review strings and ``labels`` is an integer NumPy array of equal length.

  Raises:
    ValueError: If a non-blank line starts with anything other than one of
      the two known labels.
  """
  label_to_int = {"__label__1": 0, "__label__2": 1}
  texts = []
  labels = []
  # The context manager closes the archive even on early exit or error.
  with bz2.BZ2File(file) as archive:
    for line_number, raw_line in enumerate(archive, start=1):
      # Test the limit before consuming a record so max_lines=0 reads nothing.
      if max_lines is not None and len(texts) >= max_lines:
        break
      line = raw_line.decode("utf-8").strip()
      if not line:
        continue
      label, _, text = line.partition(" ")
      if label not in label_to_int:
        # Fail loudly: a stray string label would silently turn the whole
        # label array into strings.
        raise ValueError(
            "{}: line {}: unexpected label {!r}".format(file, line_number, label)
        )
      texts.append(text.strip())
      labels.append(label_to_int[label])
  # dtype=int keeps an integer array even when no record was read.
  labels = np.array(labels, dtype=int)
  print(len(texts))
  return texts, labels

# Number of reviews to load from each file.
max_lines_train, max_lines_test = 100_000, 4_000
train_texts, train_labels = read_texts_and_labels(file=filename_train, max_lines=max_lines_train)
test_texts, test_labels = read_texts_and_labels(file=filename_test, max_lines=max_lines_test)

100000
4000


In [41]:
# Show the first 30 training labels.
print(train_labels[0:30])

[1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 0 1 1 1 0 0 1 0 1 1 0 0 0 0 1]


In [42]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the loaded training reviews for validation. The fixed
# random_state makes the split, and every metric computed downstream of it,
# reproducible across kernel restarts.
# NOTE: this cell rebinds train_texts/train_labels, so re-running it without
# re-running the loading cell splits the already reduced training set again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.05, random_state=42
)
print(type(train_texts))

<class 'list'>


In [43]:
# Write the training reviews to disk, one review per line. The encoding is
# given explicitly because the reviews were decoded from UTF-8 and may hold
# characters that the platform's default encoding cannot represent.
with open("train_texts.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{text}\n" for text in train_texts)

In [44]:
# Vocabulary cap handed to the Tokenizer; you may experiment with different numbers.
num_of_words = 3000
tokenizer = Tokenizer(num_words=num_of_words)
# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(train_texts)
# Convert each split from raw text to lists of integer word indices.
train_sequences, val_sequences, test_sequences = map(
    tokenizer.texts_to_sequences, (train_texts, val_texts, test_texts)
)

In [45]:
# Drop the raw text lists now that they have been converted to sequences.
del train_texts, val_texts, test_texts

In [46]:
# Pad every split (zeros in front) to the length of the longest sequence found
# in any split. A fixed cap such as 200 could be used here instead.
max_length_of_sequences = max(
    len(seq)
    for split in (train_sequences, val_sequences, test_sequences)
    for seq in split
)
train_sequences, val_sequences, test_sequences = (
    pad_sequences(split, maxlen=max_length_of_sequences, padding='pre')
    for split in (train_sequences, val_sequences, test_sequences)
)
print(max_length_of_sequences)

235


In [47]:
# You may tune these hyperparameters, but implement the model as instructed above.
# d1: embedding output dimension, d2: units of the hidden Dense layer (both tuned).
d1, d2 = 200, 128

def model_FFN():
  """Build the uncompiled feed-forward sentiment classifier.

  Layers: Embedding -> Flatten -> Dense(d2, relu) -> Dense(1, sigmoid).
  Reads the module-level settings num_of_words, d1, d2 and
  max_length_of_sequences.

  Returns:
    The assembled keras.Sequential model.
  """
  ffn = keras.Sequential()
  ffn.add(
      layers.Embedding(
          input_dim=num_of_words,
          output_dim=d1,
          input_length=max_length_of_sequences,
      )
  )
  ffn.add(layers.Flatten())
  ffn.add(layers.Dense(d2, activation='relu'))
  ffn.add(layers.Dense(1, activation='sigmoid'))
  return ffn
    
# Instantiate the feed-forward model and compile it for binary classification.
learning_rate = 0.001  # tuned
model1 = model_FFN()
model1.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_rate),  # tuned
    metrics=['accuracy'],
)

In [48]:
# Train the feed-forward model for 10 epochs, validating on the held-out split.
history = model1.fit(
    x=train_sequences,
    y=train_labels,
    batch_size=128,
    epochs=10,
    validation_data=(val_sequences, val_labels),
)
print(history)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
<keras.callbacks.History object at 0x7fd5dcd4dd50>


In [49]:
# Persist the trained feed-forward model so it can be reloaded after deletion.
model1.save(filepath="Model_NN.h5")

In [50]:
# Unbind the in-memory feed-forward model; the saved .h5 file remains.
del model1

In [51]:
# d1: embedding output dimension, d2: units of the bidirectional GRU,
# d3: units of the second GRU (tuned).
d1, d2, d3 = 200, 128, 64

def model_GRU():
  """Build the uncompiled recurrent sentiment classifier.

  Layers: Embedding -> Bidirectional(GRU(d2), returning the full sequence)
  -> GRU(d3) -> Dense(1, sigmoid). Reads the module-level settings
  num_of_words, d1, d2, d3 and max_length_of_sequences.

  Returns:
    The assembled keras.Sequential model.
  """
  gru = keras.Sequential()
  gru.add(
      layers.Embedding(
          input_dim=num_of_words,
          output_dim=d1,
          input_length=max_length_of_sequences,
      )
  )
  gru.add(layers.Bidirectional(layers.GRU(d2, return_sequences=True)))
  gru.add(layers.GRU(d3))
  gru.add(layers.Dense(1, activation='sigmoid'))
  return gru

# model_GRU() is instantiated in the next cell, immediately before compile();
# building a copy here as well would only allocate a model that is discarded.

In [52]:
# Build a fresh GRU model and compile it for binary classification.
# The optimizer is constructed explicitly so that learning_rate is actually
# applied; passing optimizer='Adam' as a string left learning_rate unused.
model2 = model_GRU()
learning_rate = 0.001
model2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),  # tuned
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [53]:
# Train the GRU model for 10 epochs, validating on the held-out split.
history = model2.fit(
    x=train_sequences,
    y=train_labels,
    batch_size=128,
    epochs=10,
    validation_data=(val_sequences, val_labels),
)
print(history)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
<keras.callbacks.History object at 0x7fd56e836020>


In [54]:
# Persist the trained GRU model so it can be reloaded after deletion.
model2.save(filepath="Model_GRU.h5")

In [55]:
# Unbind the in-memory GRU model; the saved .h5 file remains.
del model2

In [56]:
# Evaluate both trained models on the test set. model1/model2 were deleted
# after being saved, so each model is reloaded from its .h5 file, scored and
# released before the next one is loaded.
for model_name, model_path in (("FFN", "Model_NN.h5"), ("GRU", "Model_GRU.h5")):
    eval_model = tf.keras.models.load_model(model_path)
    test_probs = eval_model.predict(test_sequences)
    # The single sigmoid unit yields one probability per row; threshold at 0.5
    # and flatten so the predictions line up with the 1-D test_labels array.
    test_pred_labels = (test_probs > 0.5).astype(int).ravel()
    print("{} model accuracy: {:.3f}".format(model_name, accuracy_score(test_labels, test_pred_labels)))
    print("{} model F1 score: {:.3f}".format(model_name, f1_score(test_labels, test_pred_labels)))
    del eval_model

In [59]:
# Sanity-check both saved models on one hand-written review.
test_text = "This is not good"
# Tokenize and pad the review exactly like the training data.
mytest = pad_sequences(
    tokenizer.texts_to_sequences([test_text]),
    maxlen=max_length_of_sequences,
    padding='pre',
)
new_model_NN = tf.keras.models.load_model('Model_NN.h5')
new_model_GRU = tf.keras.models.load_model('Model_GRU.h5')
print("Output NN", new_model_NN.predict(mytest))
print("Output GRU", new_model_GRU.predict(mytest))

Output NN [[0.01717249]]
Output GRU [[0.0003938]]
