### **Import packages**
*Important: the gensim package must be installed to run this code.*

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
from gensim.models import KeyedVectors
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.datasets import imdb
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences

### **Set directories and parameters**

In [None]:
# Folder that holds the pre-trained word2vec binary. The original author's
# local Windows folder stays as the default; set the PROJECT_BASE_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
BASE_DIR = os.environ.get('PROJECT_BASE_DIR', 'C:\\Users\\Enrico\\Desktop\\Projet Innovation\\')
EMBEDDING_FILE = os.path.join(BASE_DIR, 'GoogleNews-vectors-negative300.bin')

MAX_SEQUENCE_LENGTH = 30   # every sequence is padded/truncated to this many tokens
MAX_NB_WORDS = 200000      # upper bound on the number of embedding-matrix rows
EMBEDDING_DIM = 300        # width of one embedding-matrix row
VALIDATION_SPLIT = 0.1     # fraction of rows held out for validation

num_lstm = 16 #LSTM units
# The three values below are drawn at random, so they differ from run to run
# unless the NumPy global seed has been fixed beforehand.
num_dense = np.random.randint(100, 150) #dense units, integer in [100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25 #dropout, in [0.15, 0.40)
rate_drop_dense = 0.15 + np.random.rand() * 0.25 #dropout, in [0.15, 0.40)

act = 'relu' #activation function
# NOTE(review): the 17.5% figure is not derived anywhere in this notebook --
# confirm it applies to the dataset actually being trained on.
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set


### **Index word vectors**


In [None]:
# Load the pre-trained GoogleNews word2vec vectors (binary format).
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

# IMDB reviews arrive already encoded as sequences of integer word indices;
# num_words=10000 restricts the vocabulary used in those sequences.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# Not referenced by any other cell visible in this notebook.
training_samples = 20000
validation_samples = 5000

# Each split is cut in two at row 12500; the halves feed the model's two inputs.
sequences_1 = train_data[:12500]
sequences_2 = train_data[12500:]
test_sequences_1 = test_data[:12500]
test_sequences_2 = test_data[12500:]


# word -> integer index mapping of the IMDB vocabulary
word_index = imdb.get_word_index()
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
# Labels must come from the same split as data_1/data_2 (the training set).
# Previously test_labels was used here, which paired training reviews with
# the labels of unrelated test reviews.
# NOTE(review): labels[i] lines up with data_1[i] (rows 0..12499 of the
# training set); confirm that is the intended target for the paired inputs.
labels = np.array(train_labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)


### **Prepare embeddings**

In [3]:
# imdb.load_data() was called without index_from, so it uses the Keras default
# of 3: indices 0, 1 and 2 are reserved (padding / start / out-of-vocabulary)
# and a word whose get_word_index() value is i appears in the sequences as
# i + 3. The embedding rows must use that same shifted position, otherwise
# every token would look up the vector of a different word.
IMDB_INDEX_FROM = 3

nb_words = min(MAX_NB_WORDS, len(word_index))+1

# Rows left at zero: reserved indices, words missing from word2vec, and words
# whose shifted index does not fit in the matrix.
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    row = i + IMDB_INDEX_FROM
    if row >= nb_words:
        # Outside the matrix (also guards the case MAX_NB_WORDS < vocabulary size).
        continue
    if word in word2vec.vocab:
        embedding_matrix[row] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

NameError: name 'MAX_NB_WORDS' is not defined

### Sample train/validation data

In [None]:
# Shuffle the row positions once and cut them at the (1 - VALIDATION_SPLIT) mark.
n_rows = len(data_1)
split_at = int(n_rows*(1-VALIDATION_SPLIT))
perm = np.random.permutation(n_rows)
idx_train, idx_val = perm[:split_at], perm[split_at:]

# Every selected pair is used twice, once in each input order, so the label
# vector is repeated to match the doubled row count.
data_1_train = np.concatenate([data_1[idx_train], data_2[idx_train]], axis=0)
data_2_train = np.concatenate([data_2[idx_train], data_1[idx_train]], axis=0)
labels_train = np.tile(labels[idx_train], 2)

data_1_val = np.concatenate([data_1[idx_val], data_2[idx_val]], axis=0)
data_2_val = np.concatenate([data_2[idx_val], data_1[idx_val]], axis=0)
labels_val = np.tile(labels[idx_val], 2)

# Per-sample validation weights: label 0 gets 1.309028344 and every other
# label 0.472001959 when re-weighting is on; otherwise all weights are 1.
if re_weight:
    weight_val = np.where(labels_val == 0, 1.309028344, 0.472001959)
else:
    weight_val = np.ones(len(labels_val))

### Define the model structure

In [None]:
# One frozen embedding layer (initialised from embedding_matrix) and one LSTM
# are created once and applied to both inputs, so both branches share weights.
embedding_layer = Embedding(
    input_dim=nb_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)
lstm_layer = LSTM(
    units=num_lstm,
    dropout=rate_drop_lstm,
    recurrent_dropout=rate_drop_lstm,
)

# Branch 1: integer token ids -> embeddings -> LSTM output
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

# Branch 2: same layers applied to the second input
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

# Join both branch outputs, then regularise and normalise.
merged = concatenate([x1, y1])
merged = Dropout(rate=rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

# Hidden dense layer followed by the same dropout + batch-norm pair.
merged = Dense(units=num_dense, activation=act)(merged)
merged = Dropout(rate=rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

# Single sigmoid unit producing the binary prediction.
preds = Dense(units=1, activation='sigmoid')(merged)


### Add class weight

In [None]:
# Per-class loss weights handed to model.fit(); None leaves the classes unweighted.
class_weight = {0: 1.309028344, 1: 0.472001959} if re_weight else None

### Train the model

In [None]:
# Assemble the two-input model and compile it for binary classification.
model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()

# Run identifier built from the hyper-parameters; it names the checkpoint file.
# STAMP was previously used without ever being defined, so this cell failed
# with a NameError before training could start.
STAMP = 'lstm_%d_%d_%.2f_%.2f' % (num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)

# Stop once val_loss has not improved for 3 epochs, and keep only the weights
# of the best epoch on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

history = model.fit(
        [data_1_train, data_2_train], labels_train,
        validation_data=([data_1_val, data_2_val], labels_val, weight_val),
        epochs=6, batch_size=2048, shuffle=True,
        class_weight=class_weight,
        callbacks=[early_stopping, model_checkpoint])

# Restore the best checkpoint and record its validation loss.
model.load_weights(bst_model_path)
bst_val_score = min(history.history['val_loss'])

### Visualization

In [None]:
# history.history maps metric names ('acc', 'loss', 'val_acc', 'val_loss')
# to one value per completed epoch.
history_dict = history.history

#plotting training and validation LOSS
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
epochs = list(range(1, len(loss_values) + 1))  # 1-based epoch numbers for the x axis
plt.plot(epochs, loss_values, 'bo', label='Training loss')
plt.plot(epochs, val_loss_values, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

#plotting training and validation ACCURACY
plt.clf()
acc_values = history_dict['acc']
val_acc_values = history_dict['val_acc']
plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # this axis shows accuracy; it was mislabelled 'Loss'
plt.legend()
plt.show()