# Bag of Words meets Bag of Popcorn
A Kaggle Competition using Google's Word2Vec for movie reviews

# Data Set
The labeled data set consists of 50,000 IMDB movie reviews, specially selected for sentiment analysis. The sentiment of reviews is binary, meaning the IMDB rating < 5 results in a sentiment score of 0, and rating >=7 have a sentiment score of 1. No individual movie has more than 30 reviews. The 25,000 review labeled training set does not include any of the same movies as the 25,000 review test set. In addition, there are another 50,000 IMDB reviews provided without any rating labels.

To train the word2vec, we use all available data sets and combine to form a big enough corpus. Also, to increase our accuracy, we use an additional augmented dataset with a total of 49,912 reviews which also contains the original 25,000 train data set.

First, we should make sure that no entry in the test data set is in the augmented data set. This is confirmed by the code below: 

In [1]:
import pandas as pd
testdf = pd.read_csv('testData.tsv',sep="\t")
traindf = pd.read_csv('augmented_traindata_clean2.csv')
print(traindf.info())
mergedf = pd.merge(testdf,traindf,on=['review'],how='inner')
print(mergedf.info())
print(mergedf.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49912 entries, 0 to 49911
Data columns (total 3 columns):
sentiment    49912 non-null int64
review       49912 non-null object
type         49912 non-null object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 4 columns):
id           0 non-null object
review       0 non-null object
sentiment    0 non-null int64
type         0 non-null object
dtypes: int64(1), object(3)
memory usage: 0.0+ bytes
None
Empty DataFrame
Columns: [id, review, sentiment, type]
Index: []


Next, we combine all the datasets into one review corpus which we'll use only for the Word2vec model. 

In [2]:
import pandas as pd
pd.options.display.max_colwidth = 100
traindf=[]
traindf = pd.read_csv('augmented_traindata_clean2.csv')
#traindf = traindf.drop(['id'], axis=1)
traindf['type'] = 'train'
addtldf=[]
addtldf = pd.read_csv('unlabeledTrainData.tsv',error_bad_lines=False,delimiter="\t")
addtldf = addtldf.drop(['id'], axis=1)
addtldf['type'] = 'addtl'
test=[]
test = pd.read_csv('testData.tsv',delimiter="\t")
test['type'] = 'test'
revcorpus = pd.concat([traindf,addtldf,test],ignore_index=True,sort=False)
print("Number of reviews: ",len(revcorpus))


Number of reviews:  124912


# Data Preprocessing
We perform data cleaning on the reviews by removing tags, removing stopwords, lemmatization, and so on. 

In [3]:
import re
import string
# import the inflect library 
import inflect 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import PorterStemmer 
from sklearn.feature_extraction.text import CountVectorizer
from nltk.util import ngrams,bigrams
from nltk.tokenize import sent_tokenize
from nltk import download,set_proxy
#download('punkt')
#download('stopwords')
#download('wordnet')
stemmer = PorterStemmer() 
p = inflect.engine() 
stop_words = set(stopwords.words("english")) 
# extract pure text from html
def remove_tags(text):
    try:
        text = re.sub(r'<[^<>]+>', " ", text)
    except:
        print(text)
    return text
# remove stopwords function 
def remove_stopwords(text): 
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if word not in stop_words] 
    filtered_text = " ".join(filtered_text)
    return filtered_text 

def text_lowercase(text): 
    return text.lower() 
  
# convert number into words 
def convert_number(text): 
    # split string into list of words 
    temp_str = text.split() 
    # initialise empty list 
    new_string = [] 
  
    for word in temp_str: 
        # if word is a digit, convert the digit 
        # to numbers and append into the new_string list 
        if word.isdigit(): 
            temp = p.number_to_words(word) 
            new_string.append(temp) 
  
        # append the word as it is 
        else: 
            new_string.append(word) 
  
    # join the words of new_string to form a string 
    temp_str = ' '.join(new_string) 
    return temp_str

# remove punctuation 
def remove_punctuation(text): 
    translator = str.maketrans('', '', string.punctuation) 
    return text.translate(translator)

    
# remove whitespace from text 
def remove_whitespace(text): 
    return  " ".join(text.split()) 

lemmatizer = WordNetLemmatizer() 
# lemmatize string 
def lemmatize_word(text): 
    word_tokens = word_tokenize(text) 
    # provide context i.e. part-of-speech 
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens] 
    #lemmas = " ".join(lemmas)
    return lemmas 
# stem words in the list of tokenised words 
def stem_words(text): 
    word_tokens = word_tokenize(text) 
    stems = [stemmer.stem(word) for word in word_tokens] 
    stems = " ".join(stems)
    return stems 

def generate_ngram(text,n):
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    tokens = word_tokenize(text) 
    output = list(ngrams(tokens, n))
    #output = bigrams(tokens)
    return output

def preprocess_text(text):
    text = remove_tags(text)
    text = convert_number(text)
    text = text_lowercase(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text)
    text = remove_whitespace(text)
    #text = stem_words(text)
    #text = ngram_vector(text)
    #print(text)
    text = lemmatize_word(text)
    return text
#revcorpus = revcorpus[1:5]
revcorpus['processed_review'] = revcorpus.review.apply(lambda x: preprocess_text(x))  
#revcorpus['bigrams'] = revcorpus.processed_review.apply(lambda x:generate_ngram(x,2))
print("Number of reviews: ",len(revcorpus))
#print(revcorpus.head(3))
#traindf['processed_review'] = traindf.review.apply(lambda x: preprocess_text(x))  
#print("Number of reviews: ",len(traindf))
#traindf.head(3)

Number of reviews:  124912


In [4]:
%%time
from gensim.models import Phrases
reviews = revcorpus['processed_review']
reviews.head()
bigrams = Phrases(sentences=reviews)
trigrams = Phrases(sentences=bigrams[reviews])

Wall time: 2min 3s


In [5]:
print(bigrams[reviews][2])
print(trigrams[bigrams[reviews]][2])

['must', 'assume', 'praise', 'film', 'greatest', 'film', 'opera', 'ever', 'didnt', 'read_somewhere', 'either', 'dont_care', 'opera', 'dont_care', 'wagner', 'dont_care', 'anything', 'except', 'desire', 'appear', 'culture', 'either', 'representation', 'wagners', 'swansong', 'movie', 'strike', 'unmitigated_disaster', 'leaden', 'read', 'score', 'match', 'tricksy', 'lugubrious', 'realisation', 'text', 'questionable', 'people', 'ideas', 'opera', 'matter', 'play', 'especially', 'one', 'shakespeare', 'allow', 'anywhere_near', 'theatre', 'film', 'studio', 'syberberg', 'fashionably', 'without', 'smallest', 'justification', 'wagners', 'text', 'decide', 'parsifal', 'bisexual', 'integration', 'title', 'character', 'latter_stag', 'transmute', 'kind', 'beatnik', 'babe', 'though', 'one', 'continue', 'sing', 'high', 'tenor', 'actors', 'film', 'singers', 'get', 'double_dose', 'armin_jordan', 'conductor', 'see', 'face', 'hear', 'voice', 'amfortas', 'also', 'appear', 'monstrously', 'double_exposure', 'kin

In [6]:
%%time
#Word2Vec MODEL
from gensim.models import Word2Vec
embedding_vector_size = 300
#tokens = word_tokenize(bigrams[revcorpus['processed_review']])
wordvec_model = Word2Vec(
    sentences = reviews,
    size = embedding_vector_size,
    min_count=3, window=5, workers=4)

Wall time: 1min 50s


In [7]:
pd.options.display.max_colwidth = 100
print("Vocabulary size:", len(wordvec_model.wv.vocab))
print("Number of reviews: ",len(traindf))
#for word in wordvec_model.wv.vocab:
#    print((word, wordvec_model.wv.vocab[word].count))
wordvec_model.wv.most_similar("great")

Vocabulary size: 82653
Number of reviews:  49912


[('wonderful', 0.6779197454452515),
 ('fantastic', 0.6756469011306763),
 ('excellent', 0.6671639084815979),
 ('terrific', 0.6359677314758301),
 ('good', 0.6292046308517456),
 ('awesome', 0.5692603588104248),
 ('fine', 0.5664957165718079),
 ('outstanding', 0.5593339204788208),
 ('superb', 0.5544353127479553),
 ('fabulous', 0.5517513155937195)]

In [8]:
train_reviews = revcorpus[revcorpus['type']=="train"]["processed_review"]
#train_reviews = trigrams[bigrams[train_reviews]]
#train_reviews = [" ".join(text) for text in [review for review in train_reviews]]
print(len(train_reviews))

49912


In [9]:
 train_reviews[1]

['film',
 'start',
 'manager',
 'nicholas',
 'bell',
 'give',
 'welcome',
 'investors',
 'robert',
 'carradine',
 'primal',
 'park',
 'secret',
 'project',
 'mutate',
 'primal',
 'animal',
 'use',
 'fossilize',
 'dna',
 'like',
 '¨jurassik',
 'park¨',
 'scientists',
 'resurrect',
 'one',
 'natures',
 'fearsome',
 'predators',
 'sabretooth',
 'tiger',
 'smilodon',
 'scientific',
 'ambition',
 'turn',
 'deadly',
 'however',
 'high',
 'voltage',
 'fence',
 'open',
 'creature',
 'escape',
 'begin',
 'savagely',
 'stalk',
 'prey',
 'human',
 'visitors',
 'tourists',
 'scientificmeanwhile',
 'youngsters',
 'enter',
 'restrict',
 'area',
 'security',
 'center',
 'attack',
 'pack',
 'large',
 'prehistorical',
 'animals',
 'deadlier',
 'bigger',
 'addition',
 'security',
 'agent',
 'stacy',
 'haiduk',
 'mate',
 'brian',
 'wimmer',
 'fight',
 'hardly',
 'carnivorous',
 'smilodons',
 'sabretooths',
 'course',
 'real',
 'star',
 'star',
 'astound',
 'terrifyingly',
 'though',
 'convince',
 'giant'

### Vectorization 
Converting the words to vectors (Code grabbed from https://www.kaggle.com/alexcherniuk/imdb-review-word2vec-bilstm-99-acc)

In [10]:
%%time
from keras.preprocessing.sequence import pad_sequences
def vectorize_data(data, vocab: dict) -> list:
    print('Vectorize sentences...', end='\r')
    keys = list(vocab.keys())
    filter_unknown = lambda word: vocab.get(word, None) is not None
    encode = lambda review: list(map(keys.index, filter(filter_unknown, review)))
    vectorized = list(map(encode, data))
    print('Vectorize sentences... (done)')
    return vectorized
embed_size = 150
X_pad = pad_sequences(
    sequences=vectorize_data(train_reviews, vocab=wordvec_model.wv.vocab),
    maxlen=embed_size,
    padding='post')

Using TensorFlow backend.


Vectorize sentences... (done)
Wall time: 14min 31s


In [11]:
X_pad[1]

array([132, 133, 134, 135, 136, 136, 137, 138, 139, 140, 141, 120, 104,
       105, 106, 142, 143, 144, 130,  85,  86,  87,  88, 145, 146,  89,
       147, 148, 105, 149,  41, 150, 151, 152, 153, 154, 155, 156, 157,
       133, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       141, 120, 170,  59, 171, 172, 173, 174, 175, 176, 177, 139, 178,
       179, 180, 181,  68, 182, 183, 177, 184, 185, 186, 187, 188, 117,
       189, 190,  95, 191, 192, 193, 194, 116, 195,   6, 133,  91, 196,
       197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
       210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
       223, 224, 225, 226, 227, 228, 229, 230,   6, 225, 231, 232, 233,
       234, 235, 236, 237, 238, 115, 239, 228,  47, 240, 241, 242, 243,
       244, 245, 246, 247, 248, 249, 250])

In [12]:
y = revcorpus[revcorpus['type']=="train"]['sentiment']
#y = traindf['sentiment']

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten,CuDNNLSTM, CuDNNGRU
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

embed_size = 150
model = Sequential()
model.add(Embedding(input_dim=wordvec_model.wv.vectors.shape[0],output_dim=wordvec_model.wv.vectors.shape[1],weights=[wordvec_model.wv.vectors], input_length=embed_size,trainable=True))
#model.add(Embedding(max_features,embed_size))
model.add(Bidirectional(CuDNNLSTM(128,return_sequences = True)))
model.add(GlobalMaxPool1D())
#model.add(Dropout(0.1))
#model.add(Dense(20,activation='relu'))
#model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))
model.summary()



W1008 22:19:45.498079 22240 deprecation_wrapper.py:119] From C:\Users\Ellysah\Anaconda3\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1008 22:19:45.521021 22240 deprecation_wrapper.py:119] From C:\Users\Ellysah\Anaconda3\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1008 22:19:45.526208 22240 deprecation_wrapper.py:119] From C:\Users\Ellysah\Anaconda3\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1008 22:19:45.542958 22240 deprecation_wrapper.py:119] From C:\Users\Ellysah\Anaconda3\envs\keras-gpu\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_sessio

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 300)          24795900  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 150, 256)          440320    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 25,236,477
Trainable params: 25,236,477
Non-trainable params: 0
_________________________________________________________________


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, shuffle=False)
X_test

array([[  881,  1012,  3827, ...,     0,     0,     0],
       [ 4784,   949,  1439, ...,     0,     0,     0],
       [ 2990,   530,   136, ...,  1306,   160,    41],
       ...,
       [   79, 16673, 26434, ...,   194,   782, 13764],
       [  748,  1264,   805, ...,     0,     0,     0],
       [ 6619,  2264,   418, ...,     0,     0,     0]])

In [15]:
batch_size = 100
epochs = 20
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train,y_train, validation_data=(X_test,y_test),batch_size=batch_size, epochs=epochs)

W1008 22:19:49.101409 22240 deprecation_wrapper.py:119] From C:\Users\Ellysah\Anaconda3\envs\keras-gpu\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W1008 22:19:49.153273 22240 deprecation.py:323] From C:\Users\Ellysah\Anaconda3\envs\keras-gpu\lib\site-packages\tensorflow\python\ops\nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 39929 samples, validate on 9983 samples
Epoch 1/20


UnknownError: 2 root error(s) found.
  (0) Unknown: Fail to find the dnn implementation.
	 [[{{node bidirectional_1/CudnnRNN}}]]
	 [[loss/mul/_123]]
  (1) Unknown: Fail to find the dnn implementation.
	 [[{{node bidirectional_1/CudnnRNN}}]]
0 successful operations.
0 derived errors ignored.

In [None]:
# validate the model on test dataset to determine generalization
loss, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print("\nTest accuracy: %.1f%%" % (100.0 * acc))

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()

In [None]:
imdb=pd.read_csv("imdb_master.csv",encoding="latin-1")
imdb = imdb[imdb.type=="test"]
imdb['label'] = imdb['label'].map({'neg':0,'pos':1})
imdb['processed_review'] = imdb.review.apply(lambda x: preprocess_text(x))  

print(imdb.info())
imdb.head(3)

In [None]:
X_test = pad_sequences(
    sequences=vectorize_data(imdb['processed_review'], vocab=wordvec_model.wv.vocab),
    maxlen=embed_size,
    padding='post')

In [None]:
y_test = imdb['label']

In [None]:
loss, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print("\nTest accuracy: %.1f%%" % (100.0 * acc))