In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
import pandas as pd
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import Constant

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.metrics import roc_auc_score

import gensim

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

In [3]:
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [4]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [5]:
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

In [6]:
train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    seed=42)

Found 25000 files belonging to 2 classes.


In [7]:
a = list(train_ds.as_numpy_iterator())

In [8]:
def get_datframe(dataset):
    full_reviews = []
    full_labels = []
    for entry in dataset:
        reviews = entry[0]
        labels = entry[1]
        for review in reviews:
            full_reviews.append(review)
        for label in labels:
            full_labels.append(label)
    dataframe = pd.DataFrame(data={'reviews': full_reviews, 'labels': full_labels})         
    return dataframe

In [9]:
b = get_datframe(a)

In [10]:
b

Unnamed: 0,reviews,labels
0,"b'""Pandemonium"" is a horror movie spoof that c...",0
1,"b""David Mamet is a very interesting and a very...",0
2,b'Great documentary about the lives of NY fire...,1
3,"b""It's boggles the mind how this movie was nom...",0
4,b'The concept of the legal gray area in Love C...,0
...,...,...
24995,b'An innocent man (Steve Guttenberg) has a one...,0
24996,"b'This is one fine movie, I can watch it any t...",1
24997,b'An ultra-modern house in an affluent neighbo...,1
24998,"b""<br /><br />This movie (not a film -- clearl...",0


In [11]:
b['reviews'] = b['reviews'].astype(str)

In [12]:
def process(data):
    data = data.lower()
    data = re.sub("b'|b\"", '', data)
    data = re.sub('<br />', ' ', data)
    return re.sub('[%s]' % re.escape(string.punctuation), '', data)

In [13]:
vfunc = np.vectorize(process)
b['reviews_proc'] = vfunc(b['reviews'])

In [14]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(b['reviews_proc'], b['labels'])

**tf-idf + LogisticRegression**

In [15]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
tfidf_vect.fit(b['reviews_proc'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [16]:
lr = linear_model.LogisticRegression(class_weight="balanced").fit(xtrain_tfidf, train_y)

In [17]:
predict_y = lr.predict(xvalid_tfidf)

In [18]:
roc_auc_score(valid_y, predict_y)

0.88976

**count vectorizer + LogisticRegression**

In [19]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(b['reviews_proc'])
xtrain_count_vect =  count_vect.transform(train_x)
xvalid_count_vect =  count_vect.transform(valid_x)

In [20]:
lr = linear_model.LogisticRegression(class_weight="balanced").fit(xtrain_count_vect, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
predict_y = lr.predict(xvalid_count_vect)

In [22]:
roc_auc_score(valid_y, predict_y)

0.8841600000000001

**NN**

In [23]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='training', 
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [24]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='validation', 
    seed=seed)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [25]:
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/test', 
    batch_size=batch_size)

Found 25000 files belonging to 2 classes.


In [26]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [27]:
max_features = 10000
sequence_length = 250

vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [28]:
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [29]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [30]:
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b"Having seen most of Ringo Lam's films, I can say that this is his best film to date, and the most unusual. It's a ancient china period piece cranked full of kick-ass martial arts, where the location of an underground lair full of traps and dungeons plays as big a part as any of the characters. The action is fantastic, the story is tense and entertaining, and the set design is truely memorable. Sadly, Burning Paradise has not been made available on DVD and vhs is next-to-impossible to get your mitts on, even if you near the second biggest china-town in North America (like I do). If you can find it, don't pass it up.", shape=(), dtype=string)
Label pos
Vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[ 253,  105,   88,    5,    1,    1,   94,   10,   68,  131,   12,
          11,    7,   24,  113,   19,    6, 1294,    3,    2,   88, 1603,
          29,    4, 2213, 2669,  840,  413,    1,  375,    5,    1, 1690,
        1741,  114,    2, 1650,  

In [31]:
print("1287 ---> ",vectorize_layer.get_vocabulary()[1287])
print(" 313 ---> ",vectorize_layer.get_vocabulary()[313])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

1287 --->  lovely
 313 --->  american
Vocabulary size: 10000


In [32]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [33]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [34]:
model = tf.keras.Sequential([
  layers.Embedding(len(vectorize_layer.get_vocabulary()), 128, mask_zero=True),
  layers.LSTM(128),  
  layers.Dense(32, activation='relu'),
  layers.Dropout(0.5),  
  layers.Dense(1)])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 1,415,745
Trainable params: 1,415,745
Non-trainable params: 0
_________________________________________________________________


In [35]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

In [36]:
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.7954304814338684
Accuracy:  0.8187599778175354


**NN + word2vec**

In [38]:
review_lines = []
lines = b['reviews_proc'].values.tolist()

In [39]:
for line in lines:
    tokens = word_tokenize(line)
    tokens = [w.lower() for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    review_lines.append(tokens)

In [40]:
model = gensim.models.Word2Vec(sentences=review_lines, size = 128, window = 5, workers = 32, min_count = 1)
words = list(model.wv.vocab)
len(words)

107433

In [41]:
model.wv.most_similar('best')

[('funniest', 0.7859445214271545),
 ('finest', 0.742426335811615),
 ('leastinspired', 0.73858642578125),
 ('worst', 0.734485387802124),
 ('greatest', 0.7211679220199585),
 ('favourite', 0.6892876625061035),
 ('weakest', 0.6729232668876648),
 ('favorite', 0.6621621251106262),
 ('scariest', 0.6555851697921753),
 ('winning', 0.6381126642227173)]

In [42]:
filename = 'imdb_w2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [43]:
embeddings_index = {}
f = open(os.path.join('', 'imdb_w2vec.txt'), encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

In [44]:
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(review_lines)
sequences = tokenizer_obj.texts_to_sequences(review_lines)

word_index = tokenizer_obj.word_index
revew_pad = pad_sequences(sequences)

sentiment = b['labels'].values

In [45]:
revew_pad.shape

(25000, 1424)

In [46]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 128))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [47]:
embedding_layer = Embedding(num_words, 128, embeddings_initializer=Constant(embedding_matrix), trainable=False)

In [48]:
model = tf.keras.Sequential([
  embedding_layer,
  layers.LSTM(128),  
  layers.Dense(32, activation='relu'),
  layers.Dropout(0.5),  
  layers.Dense(1)])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         13751552  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_2 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 13,887,297
Trainable params: 135,745
Non-trainable params: 13,751,552
_________________________________________________________________


In [49]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

In [50]:
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [51]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.47903972864151
Accuracy:  0.7763199806213379
