In [14]:
import numpy as np
import pandas as pd
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, LSTM
from keras.layers import Conv1D, GlobalMaxPooling1D

In [3]:
# Load the IMDB reviews CSV; the frame has a `review` text column and a `sentiment` label column.
dataset = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
# Encode the labels as integers. `.map` is used rather than `.replace` because
# replacing strings with ints depends on implicit downcasting, which newer
# pandas releases deprecate (FutureWarning).
dataset['sentiment'] = dataset['sentiment'].map({'positive': 1, 'negative': 0})
# `.map` turns any label outside the mapping into NaN, so fail loudly if that happens.
assert dataset['sentiment'].notna().all(), "unexpected sentiment label in the CSV"
# sample only 5000 rows randomly (full dataset leads to memory issues);
# random_state pins the sample so a rerun picks the same rows
dataset = dataset.sample(n=5000, random_state=42)
dataset

Unnamed: 0,review,sentiment
33553,I really liked this Summerslam due to the look...,1
9427,Not many television shows appeal to quite as m...,1
199,The film quickly gets to a major chase scene w...,0
12447,Jane Austen would definitely approve of this o...,1
39489,Expectations were somewhat high for me when I ...,0
...,...,...
39885,One of eastwood's best movies after he had sep...,1
17566,My blurred childhood memories have kept the ec...,0
16062,I love Zombie-Movies and I love amateur-produc...,0
48445,Chan is in New York and he gets involved with ...,1


In [4]:
# Convert the frame to a NumPy array of [review, sentiment] rows.
# The cell rebinds `dataset`, so the conversion is guarded: on a second run
# `dataset` is already an ndarray (which has no `.to_numpy`) and is left as is.
if isinstance(dataset, pd.DataFrame):
    dataset = dataset.to_numpy()
dataset[0]

array(["I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason. Anyways, this could have been one of the best Summerslam's ever if the WWF didn't have Lex Luger in the main event against Yokozuna, now for it's time it was ok to have a huge fat man vs a strong man but I'm glad times have changed. It was a terrible main event just like every match Luger is in is terrible. Other matches on the card were Razor Ramon vs Ted Dibiase, Steiner Brothers vs Heavenly Bodies, Shawn Michaels vs Curt Hening, this was the event where Shawn named his big monster of a body guard Diesel, IRS vs 1-2-3 Kid, Bret Hart first takes on Doink then takes on Jerry Lawler and stuff with the Harts and Lawler was always very interesting, then Ludvig Borga destroyed Marty Jannetty, Undertaker took on Giant Gonzalez in another terrible match, The Smoking Gunns and Tatanka took on Bam Bam Bigelow and the Headshrinkers, and Yokozuna defe

In [5]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors

# Load pretrained word2vec vectors from the binary GoogleNews file.
# `limit=1000` caps how many vectors are read, so any token outside that subset
# is missing from `word_vectors` and is dropped by tokenize_and_vectorize.
# NOTE(review): 1000 looks very small for review text — presumably a memory
# trade-off; confirm it is intentional.
word_vectors = KeyedVectors.load_word2vec_format('/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin', binary=True, limit=1000)

In [6]:
def tokenize_and_vectorize(dataset):
    """Tokenize each review and look up a word vector for every known token.

    Args:
        dataset: iterable of samples where ``sample[0]`` is the review text.
            Anything after index 0 (e.g. the label) is ignored here.

    Returns:
        A list with one entry per sample. Each entry is the list of vectors
        found in the module-level ``word_vectors`` for that sample's tokens,
        in token order. Tokens without a vector are skipped, so an entry may
        be shorter than the token count, or empty.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(sample[0]):
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                # no matching token in the Google w2v vocab
                continue
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [7]:
def collect_expected(dataset):
    """Return the target value (the element at index 1) of every sample, in order."""
    return [row[1] for row in dataset]

In [8]:
#pass data into the functions
# Each row of `dataset` becomes a list of word vectors (one per token found in `word_vectors`).
vectorized_data = tokenize_and_vectorize(dataset)

In [9]:
# Labels taken from index 1 of each row, in the same order as `vectorized_data`.
expected = collect_expected(dataset)

In [10]:
# train/test split: the first 80% of the samples train the model, the rest are held out
split_point = int(len(vectorized_data) * .8)

x_train, x_test = vectorized_data[:split_point], vectorized_data[split_point:]
y_train, y_test = expected[:split_point], expected[split_point:]

In [11]:
#truncate any review longer than 400 tokens
#pad the shorter samples out to 400 tokens with zero vectors

maxlen = 400  # number of token vectors kept per review
batch_size = 32  # passed to model.fit
embedding_dims = 300  # length of one word vector; presumably matches the 300-d GoogleNews file — confirm
filters = 250  # NOTE(review): not referenced in the cells shown here
kernel_size = 3  # NOTE(review): not referenced in the cells shown here
hidden_dims = 250  # NOTE(review): not referenced in the cells shown here
epochs = 2  # passed to model.fit

def pad_trunc(data, maxlen, vector_dim=None):
    """Pad with zero vectors or truncate every sample to exactly ``maxlen`` vectors.

    Args:
        data: list of samples; each sample is a list of equal-length vectors.
        maxlen: number of vectors every returned sample contains.
        vector_dim: length of the zero vector used for padding. When omitted
            it is taken from the first vector of the first non-empty sample,
            so an empty leading sample no longer breaks the call.

    Returns:
        A new list of new sample lists. The input samples are never modified.

    Raises:
        ValueError: if a sample needs padding but the vector length is unknown
            (every sample is empty and ``vector_dim`` was not given).
    """
    if vector_dim is None:
        # Infer the dimension from the first sample that actually holds a vector.
        vector_dim = next(
            (len(sample[0]) for sample in data if len(sample) > 0),
            None,
        )

    # a vector of 0s the length of our word vectors
    zero_vector = None if vector_dim is None else [0.0] * vector_dim

    new_data = []
    for sample in data:
        # Slicing copies, so padding below never appends to the caller's list.
        resized = list(sample[:maxlen])
        missing = maxlen - len(resized)
        if missing > 0:
            if zero_vector is None:
                raise ValueError(
                    "cannot pad: no non-empty sample to infer the vector size "
                    "from; pass vector_dim explicitly"
                )
            resized.extend([zero_vector] * missing)
        new_data.append(resized)
    return new_data

In [12]:
#pass the train and test data to padder/truncator
# After this step every sample holds exactly `maxlen` vectors.
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

In [13]:
# Stack the padded samples into dense (samples, maxlen, embedding_dims) arrays.
# float32 is requested explicitly: the zero-padding vectors are Python floats, so
# without a dtype NumPy would typically build a float64 array about twice as
# large. The reshape to the expected shape doubles as a sanity check: it raises
# if any sample does not hold exactly maxlen vectors of embedding_dims values.
x_train = np.asarray(x_train, dtype=np.float32).reshape(len(x_train), maxlen, embedding_dims)
y_train = np.array(y_train)

x_test = np.asarray(x_test, dtype=np.float32).reshape(len(x_test), maxlen, embedding_dims)
y_test = np.array(y_test)

In [16]:
num_neurons = 50

# LSTM emits an output at every time step (return_sequences=True); those outputs
# are flattened into one vector and fed to a single sigmoid unit.
model = Sequential([
    LSTM(num_neurons, return_sequences=True, input_shape=(maxlen, embedding_dims)),
    Dropout(.2),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 400, 50)           70200     
                                                                 
 dropout (Dropout)           (None, 400, 50)           0         
                                                                 
 flatten (Flatten)           (None, 20000)             0         
                                                                 
 dense (Dense)               (None, 1)                 20001     
                                                                 
Total params: 90,201
Trainable params: 90,201
Non-trainable params: 0
_________________________________________________________________


In [20]:
#fit model
# Trains for `epochs` passes in mini-batches of `batch_size`.
# NOTE(review): x_test/y_test are passed as validation_data, so the validation
# score reported during training is computed on the same split used as the test set.
model.fit(x_train, y_train,
         batch_size = batch_size,
         epochs=epochs,
         validation_data=(x_test, y_test))


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x790784d0da50>

### The scores are not impressive because of the dataset size: I only trained the model on around 4,000 samples

In [27]:
#use the model to predict on a sample
sample_1 = """I hate that the dismal weather had me down for so long, when will it break! Ugh, when does happiness return? The sun is blinding and the puffy clouds are too thin. I can't wait for the weekend."""
# tokenize_and_vectorize only reads the text at index 0 of each sample; the 1 is
# just a placeholder in the label slot.
vec_list = tokenize_and_vectorize([(sample_1, 1)])
# Pad/truncate to maxlen vectors, then shape as (1, maxlen, embedding_dims) to
# match the model's input shape.
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

In [29]:
# The model ends in a single sigmoid unit, so the printed value lies in (0, 1):
# closer to 1 leans positive, closer to 0 leans negative.
print(f"Sample's sentiment, 1- pos, 0- neg: {model.predict(test_vec)}")

Sample's sentiment, 1- pos, 0- neg: [[0.04737292]]
