In [52]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells save and load files
# under '/content/drive/My Drive/Saved Models/'.
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.initializers import Constant 

from nltk.tokenize import TreebankWordTokenizer

from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [5]:
import os
!echo '{"username":"imrankhan1386","key":"8708dd3c36c4db8c0dc44b3191edfe49"}' > ~/.kaggle/kaggle.json
!kaggle datasets download -d iarunava/imdb-movie-reviews-dataset # api copied from kaggle

Downloading imdb-movie-reviews-dataset.zip to /content
 99% 222M/224M [00:05<00:00, 22.2MB/s]
100% 224M/224M [00:05<00:00, 40.9MB/s]


In [6]:
!unzip /content/imdb-movie-reviews-dataset.zip

Output hidden; open in https://colab.research.google.com to view.

In [0]:
import glob
import os

from random import shuffle

def pre_process_data(filepath):
  """Load labelled review texts from a directory and return them shuffled.

  Args:
    filepath: directory containing a 'pos' and a 'neg' sub-directory, each
      holding one review per '*.txt' file. Other file extensions are ignored.

  Returns:
    A list of (label, text) tuples, where label is 1 for files under 'pos'
    and 0 for files under 'neg'. The list is shuffled in place with
    random.shuffle before being returned, so the order differs between calls.
  """
  # (label, sub-directory) pairs; positives are read first, then negatives.
  labelled_dirs = ((1, 'pos'), (0, 'neg'))
  dataset = []

  for label, subdir in labelled_dirs:
    pattern = os.path.join(filepath, subdir, '*.txt')
    for filename in glob.glob(pattern):
      # Decode explicitly as UTF-8 instead of relying on the platform default
      # encoding, which would make the result depend on the machine's locale.
      with open(filename, 'r', encoding='utf-8') as f:
        dataset.append((label, f.read()))

  shuffle(dataset)
  return dataset

# Load the shuffled (label, review text) pairs from the 'train' directory of the extracted dataset.
dataset = pre_process_data('/content/aclimdb/aclImdb/train') 

In [0]:
def tokenize_data(dataset):
  """Tokenize the text of every sample with NLTK's TreebankWordTokenizer.

  Args:
    dataset: iterable of samples whose element at index 1 is the text to
      tokenize (index 0 is not read here).

  Returns:
    A list with one token list per sample, in the same order as `dataset`.
  """
  treebank = TreebankWordTokenizer()
  return [treebank.tokenize(sample[1]) for sample in dataset]

# One token list per review; used below as the training sentences for Word2Vec.
tokenized_data = tokenize_data(dataset)

In [0]:
# --- Text / embedding settings ---
maxlen = 400          # every review is padded to this many token ids
embedding_dims = 300  # size of each Word2Vec vector and of each embedding-matrix row

# --- Convolutional network settings ---
filters = 250      # number of Conv1D filters
kernel_size = 3    # width of each Conv1D filter, in tokens
hidden_dims = 250  # units in the Dense layer after pooling

# --- Training settings ---
batch_size = 32
# NOTE(review): the fit call visible in this notebook passes epochs=10 as a
# literal, so this constant is not what controls training length there.
epochs = 2

In [0]:
import gensim
# Train Word2Vec on the tokenized reviews: `sentences` expects one list of
# tokens per sentence/document.
model = gensim.models.Word2Vec(
  sentences = tokenized_data,
  size = embedding_dims,  # dimensionality of the learned vectors
  workers = 4,
  min_count = 1,
)

In [12]:
# Sanity-check the embeddings by listing the nearest neighbours of a word.
# The query goes through the KeyedVectors object (`model.wv`): calling
# most_similar directly on the Word2Vec model is deprecated.
model.wv.most_similar('horrible')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('terrible', 0.9068940877914429),
 ('awful', 0.8491345643997192),
 ('lame', 0.7565335631370544),
 ('laughable', 0.7483967542648315),
 ('dreadful', 0.7301239967346191),
 ('bad', 0.7247618436813354),
 ('atrocious', 0.7222329378128052),
 ('ridiculous', 0.7206465005874634),
 ('stupid', 0.7099730968475342),
 ('lousy', 0.6921871900558472)]

In [13]:
# Export the trained word vectors (model.wv) to Google Drive in the
# word2vec text format (binary=False) so they can be re-read as plain text.
filename = '/content/drive/My Drive/Saved Models/imdb_embeddings_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
import os

# Read the word2vec text export back into a {word: float32 vector} lookup.
embeddings_index = {}
embeddings_path = os.path.join('/content/drive/My Drive/Saved Models/', 'imdb_embeddings_word2vec.txt')

# `with` closes the file even if parsing a line raises.
with open(embeddings_path, encoding = 'utf-8') as f:
  for line_number, line in enumerate(f):
    values = line.split()
    if line_number == 0 and len(values) == 2:
      # The word2vec text format starts with a "<vocab_size> <vector_size>"
      # header; it is not a word vector and must not be stored as one.
      continue
    if len(values) < 2:
      continue  # blank or malformed line: no word/coefficients to store
    word = values[0]
    # Parse the coefficients as numbers rather than keeping them as strings.
    coef = np.asarray(values[1:], dtype = 'float32')
    embeddings_index[word] = coef

In [0]:
# Separate the (label, text) samples into two parallel lists.
review_texts = [text for (label, text) in dataset]
sentiment = [label for (label, text) in dataset]

# Build the vocabulary from the review texts, then encode each review as a
# list of integer word ids.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(review_texts)
sequences = tokenizer_obj.texts_to_sequences(review_texts)

In [0]:
# Vocabulary built by the tokenizer; iterated below as (word, integer id) pairs.
word_index = tokenizer_obj.word_index
# Pad the encoded reviews so every row has exactly `maxlen` ids.
review_pad = pad_sequences(sequences , maxlen = maxlen)

In [0]:
# One row per word id, plus one extra row so the largest id is a valid index.
num_words = len(word_index)+1 #Vocabulary
embedding_matrix = np.zeros((num_words, embedding_dims))

for word,i in word_index.items():
  # Valid row indices are 0 .. num_words-1, so an id equal to num_words is
  # already out of range and has to be skipped as well.
  if i >= num_words:
    continue
  embedding_vector = embeddings_index.get(word)
  # Words without a pretrained vector keep their all-zero row. Entries whose
  # length is not embedding_dims (e.g. a header line parsed as a word) are
  # skipped rather than broadcast into the row.
  if embedding_vector is None or len(embedding_vector) != embedding_dims:
    continue
  embedding_matrix[i] = embedding_vector

In [39]:
# Report the sizes of the prepared inputs as a quick consistency check.
size_report = (
  ('Length of word_index : {}', len(word_index)),
  ('Shape of review_pad : {}', review_pad.shape),
  ('Length of ground Truth: {}', len(sentiment)),
  ('Shape of embedding_matrix : {}', embedding_matrix.shape),
)
for template, value in size_report:
  print(template.format(value))

Length of word_index : 88582
Shape of review_pad : (25000, 400)
Length of ground Truth: 25000
Shape of embedding_matrix : (88583, 300)


In [0]:
# Hold out the last 20% of the samples: the first 80% are used for training,
# the remainder for evaluation.
split_point = int(len(review_pad)*.8)

X_train, X_test = review_pad[:split_point], review_pad[split_point:]
y_train, y_test = sentiment[:split_point], sentiment[split_point:]

In [42]:
print('Build model...')
model = Sequential()

# Embedding layer initialised from the Word2Vec-derived matrix and frozen
# (trainable = False) so training does not modify the vectors.
embedding_layer = Embedding(num_words,
                           embedding_dims,
                           embeddings_initializer = Constant(embedding_matrix),
                           input_length = maxlen,  # same constant used for padding, instead of a literal 400
                           trainable = False)
model.add(embedding_layer)

# 1-D convolution sliding over the token axis of the embedded review.
# NOTE(review): input_shape on a layer that is not the first in a Sequential
# model looks redundant — confirm before removing it.
model.add(Conv1D(
    filters,
    kernel_size,
    padding='valid',
    activation='relu',
    strides=1,
    input_shape=(maxlen, embedding_dims)))

# Collapse the sequence axis by keeping the maximum response of each filter.
model.add(GlobalMaxPooling1D())

# Fully connected head ending in a single sigmoid unit.
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

Build model...




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [43]:
# Configure training: binary cross-entropy loss, the Adam optimizer, and
# accuracy reported as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [44]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 300)          26574900  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 398, 250)          225250    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                

In [46]:
# Train the network, evaluating on the held-out split after every epoch.
# NOTE(review): the epoch count is given as a literal here; the `epochs`
# constant defined with the other hyperparameters is not used.
model.fit(
  X_train,
  y_train,
  batch_size=batch_size,
  epochs=10,
  validation_data=(X_test, y_test),
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0d14d572e8>

In [0]:
# Persist the network in two parts on Google Drive: the architecture as JSON
# and the learned weights as HDF5.
architecture_path = "/content/drive/My Drive/Saved Models/NLPUsingCNN_model.json"
weights_path = "/content/drive/My Drive/Saved Models/NLPUsingCNN_weights.h5"

model_structure = model.to_json()
with open(architecture_path, "w") as json_file:
  json_file.write(model_structure)

model.save_weights(weights_path)

In [5]:
# Loading a saved model

from keras.models import model_from_json
# Rebuild the network from the saved JSON architecture, then load the saved
# weights into it. This replaces whatever `model` referred to before.
with open("/content/drive/My Drive/Saved Models/NLPUsingCNN_model.json", "r") as json_file:
  json_string = json_file.read()
model = model_from_json(json_string)
model.load_weights('/content/drive/My Drive/Saved Models/NLPUsingCNN_weights.h5')





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



In [0]:
# A single hand-written text, wrapped in a list because the tokenizer methods
# below are called with a collection of texts.
sample_1 = ["I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."]

In [0]:
# Encode the sample with the tokenizer that was fitted on the training
# reviews. Fitting a fresh Tokenizer on the sample alone would assign word
# ids that have nothing to do with the ids the embedding layer was built for,
# making the prediction meaningless.
# This requires `tokenizer_obj` from the training section to be in memory;
# when predicting in a fresh session, persist and reload that tokenizer
# alongside the model instead of re-fitting one here.
sequences = tokenizer_obj.texts_to_sequences(sample_1)

In [0]:
# Pad sequences
word_index = tokenizer_obj.word_index
review_pad = pad_sequences(sequences , maxlen = maxlen)

In [41]:
# Confirm the padded sample is a single row of `maxlen` ids.
review_pad.shape

(1, 400)

In [42]:
# Classify the sample: threshold the sigmoid output at 0.5 to get a 0/1 label.
# This is the documented replacement for Sequential.predict_classes, which is
# deprecated and absent from newer Keras releases; the result has the same
# shape and int32 dtype.
(model.predict(review_pad) > 0.5).astype('int32')

array([[1]], dtype=int32)