### Import packages

In [2]:
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.models import Sequential
import numpy as np
import matplotlib.pyplot as plt
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os

### Get the training data

In [3]:
# Root of the extracted aclImdb dataset. This is a machine-specific absolute
# path; adjust it to wherever the dataset lives on your machine.
imdb_dir = r'C:\Users\Enrico\Desktop\Projet Innovation\aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

# Parallel lists: texts[k] is the raw review text, labels[k] is 0 for a review
# read from the 'neg' folder and 1 for a review read from the 'pos' folder.
labels = []
texts = []
for label_value, label_type in enumerate(['neg', 'pos']):
    dir_name = os.path.join(train_dir, label_type)
    # sorted() makes the read order deterministic (os.listdir order is
    # arbitrary) and matches how the test set is read later in the notebook.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # `with` guarantees the file handle is closed even if read() raises.
        with open(os.path.join(dir_name, fname), encoding='utf-8', errors='ignore') as f:
            texts.append(f.read())
        labels.append(label_value)

### Tokenization and parameter setup

In [4]:
# --- Tunable parameters ------------------------------------------------------
maxlen = 500               # cutoff length for reviews (400 is good too)
training_samples = 5000    # number of reviews used for training (raise to 20000)
validation_samples = 1000  # number of reviews used for validation (raise to 5000)
max_words = 10000          # vocabulary dimension passed to the Tokenizer
split_seed = 42            # fixes the shuffle so the train/validation split is reproducible

# --- Tokenize the training reviews -------------------------------------------
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pads sequences to the same length.
data = pad_sequences(sequences, maxlen=maxlen)
# Kept under a separate name on purpose: overwriting `labels` with a shuffled
# copy would misalign reviews and labels if this cell were run a second time
# (`data` is rebuilt from `sequences`, but `labels` would already be shuffled).
labels_array = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels_array.shape)

# --- Shuffled train/validation split -----------------------------------------
# The reviews were read class by class, so draw a random permutation before
# slicing. A dedicated, seeded RandomState keeps the split reproducible without
# touching NumPy's global random state. `data` and `labels` stay in read order.
indices = np.random.RandomState(split_seed).permutation(data.shape[0])
train_indices = indices[:training_samples]
val_indices = indices[training_samples: training_samples + validation_samples]

x_train = data[train_indices]
y_train = labels_array[train_indices]
x_val = data[val_indices]
y_val = labels_array[val_indices]


Found 88582 unique tokens.
Shape of data tensor: (25000, 500)
Shape of label tensor: (25000,)


### Parse GloVe embedding file

In [5]:
# Folder holding the GloVe 6B text files (machine-specific absolute path).
glove_dir = r'C:\Users\Enrico\Desktop\Projet Innovation\glove.6B'

# Maps each word to its embedding vector (float32 NumPy array).
embeddings_index = {}

# The 200d file is used here; the 300d file is an alternative. The vector size
# of the chosen file must match `embedding_dim` used for the embedding matrix.
# `with` guarantees the file handle is closed even if parsing raises.
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding='utf-8', errors='ignore') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # Each line is the word followed by its vector components.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


### Prepare GloVe embedding matrix

In [6]:
# Number of features per word vector (can be increased to 300).
embedding_dim = 200

# Row i holds the pretrained vector of the word whose tokenizer index is i.
# Rows stay all-zero for indices without a pretrained vector.
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, token_id in word_index.items():
    if token_id >= max_words:
        # Outside the vocabulary kept by the model.
        continue
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        # Word has no entry in the parsed embedding file.
        continue
    embedding_matrix[token_id] = pretrained

### Model setup

In [None]:
# Binary sentiment classifier:
# embedding -> dropout -> bidirectional LSTM -> single sigmoid unit.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    layers.Dropout(0.5),
    layers.Bidirectional(layers.LSTM(16)),  # 32, 100 would be best
    layers.Dense(1, activation='sigmoid'),
])
model.summary()


### Load pretrained word embeddings into the Embedding layer


In [None]:
# The first layer of the model is the Embedding layer: copy the prepared
# matrix into it and mark it non-trainable so fitting leaves it unchanged.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

### Model launch and visualization

In [None]:
# Compile and train, validating on the held-out validation split each epoch.
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['acc'])
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=6,
    batch_size=32,
)
# increase batch size, epochs should be 10-17
model.save_weights('pre_trained_glove_model.h5')

# Per-epoch metrics recorded by fit().
recorded = history.history
acc, val_acc = recorded['acc'], recorded['val_acc']
loss, val_loss = recorded['loss'], recorded['val_loss']
epochs = range(1, len(acc) + 1)

# One figure per metric: training values as dots, validation values as a line.
curves = [
    ('Training and validation accuracy', acc, 'Training acc', val_acc, 'Validation acc'),
    ('Training and validation loss', loss, 'Training loss', val_loss, 'Validation loss'),
]
for position, (title, train_values, train_label, val_values, val_label) in enumerate(curves):
    if position > 0:
        # Start a new figure for every curve pair after the first.
        plt.figure()
    plt.plot(epochs, train_values, 'bo', label=train_label)
    plt.plot(epochs, val_values, 'b', label=val_label)
    plt.title(title)
    plt.legend()
plt.show()

### Tokenize test data

In [None]:
test_dir = os.path.join(imdb_dir, 'test')

# NOTE: `labels`, `texts` and `sequences` are reused here and therefore
# overwrite the training-set values held under the same names.
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            # Read with the same explicit encoding as the training reviews.
            # Without it, open() uses the platform default encoding, which can
            # fail or mis-decode these files. `with` also guarantees the handle
            # is closed if read() raises.
            with open(os.path.join(dir_name, fname), encoding='utf-8', errors='ignore') as f:
                texts.append(f.read())
            # 0 for reviews from the 'neg' folder, 1 for reviews from 'pos'.
            labels.append(0 if label_type == 'neg' else 1)

# Reuse the tokenizer fitted on the training reviews so that word indices match
# the ones the model was trained on.
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)


### Model evaluation on the test set

In [None]:
# Restore the weights saved after training, then score the model on the test set.
weights_file = 'pre_trained_glove_model.h5'
model.load_weights(weights_file)
model.evaluate(x=x_test, y=y_test)