# Import Dataset and Preprocess

## Download the IMDB Movie Review Dataset

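Source: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Download the archive and save it as `IMDB Dataset.csv.zip` in the same directory as this notebook; the loading cell below reads it from that relative path.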

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# read the Kaggle archive from the notebook's directory; the zipped CSV is
# passed to pandas directly, without unpacking it first
dataset_path = "./IMDB Dataset.csv.zip"
reviews = pd.read_csv(dataset_path)

In [4]:
# sanity check: (rows, columns) of the loaded frame
reviews.shape

(50000, 2)

In [5]:
# preview the first rows to confirm the 'review' and 'sentiment' columns loaded
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# encode the sentiment label as an integer: 1 for 'positive', 0 for anything else.
# A vectorized comparison replaces the per-row Python lambda; the resulting
# column holds the same values with the same int64 dtype.
reviews['sentiment_encoded'] = reviews['sentiment'].eq('positive').astype('int64')

In [7]:
# preview again to verify the new 'sentiment_encoded' column next to 'sentiment'
reviews.head()

Unnamed: 0,review,sentiment,sentiment_encoded
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [8]:
# extract the review texts and their integer labels from the frame as numpy arrays
sentences, labels = (
    reviews[column].to_numpy() for column in ('review', 'sentiment_encoded')
)

In [9]:
# make a train-test split of the data (75% train / 25% test)
# A fixed seed makes the shuffle reproducible across kernel restarts, so the
# tokenizer and arrays pickled at the end of the notebook always describe the
# same split. stratify=labels keeps the positive/negative ratio identical in
# both partitions.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.25,
    random_state=SEED,
    stratify=labels,
)
print("Training Data Input Shape: ", X_train.shape)
print("Training Data Output Shape: ", y_train.shape)
print("Testing Data Input Shape: ", X_test.shape)
print("Testing Data Output Shape: ", y_test.shape)

Training Data Input Shape:  (37500,)
Training Data Output Shape:  (37500,)
Testing Data Input Shape:  (12500,)
Testing Data Output Shape:  (12500,)


In [10]:
# tokenizer configuration: vocabulary cap and the placeholder token that
# stands in for out-of-vocabulary words
vocab_size, oov_tok = 10000, "<OOV>"

# build the (still unfitted) tokenizer from that configuration
tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)

In [11]:
# fit the tokenizer on the training sentences only; the test split is not used here
tokenizer.fit_on_texts(X_train)

# report how many documents were seen and the configured vocabulary cap
# (num_words is the value passed at construction, not a count of distinct words)
for caption, value in (
    ("Number of Documents: ", tokenizer.document_count),
    ("Number of Words: ", tokenizer.num_words),
):
    print(caption, value)

Number of Documents:  37500
Number of Words:  10000


In [12]:
# visualize the count of each word (optional)
# tokenizer.word_counts

# Preprocessing Training Data

In [13]:
# create training sequences: convert each training review into a sequence of
# integer word indices using the fitted tokenizer
train_sequences = tokenizer.texts_to_sequences(X_train)

In [14]:
# optional check: show a training review next to its token sequence.
# train_sequences is built from the shuffled X_train, so index X_train (not
# sentences) to get the matching text.
# print(X_train[0], '\n', train_sequences[0])

In [15]:
# bring every training sequence to one fixed length: shorter ones are padded
# at the end, longer ones are cut at the end
sequence_length = 200
train_padded = pad_sequences(
    train_sequences,
    maxlen=sequence_length,
    padding='post',
    truncating='post',
)

In [16]:
# print(train_padded[0])

# Preprocessing Test Data

In [17]:
# create test sequences with the tokenizer that was fitted on the training data
test_sequences = tokenizer.texts_to_sequences(X_test)

In [18]:
# pad/truncate the test sequences with the same settings as the training data
test_padded = pad_sequences(
    test_sequences,
    maxlen=sequence_length,
    padding='post',
    truncating='post',
)

# Save to Pickle

In [19]:
# output directory for the preprocessed artifacts, inside the current working directory
path_preprocessed = os.path.join(os.getcwd(), 'data-preprocessed')
# exist_ok=True creates the directory only when it is missing, replacing the
# separate check-then-create step
os.makedirs(path_preprocessed, exist_ok=True)

# file name -> object to serialize; one loop replaces five copy-pasted
# `with` blocks, so adding an artifact is a single new entry here
artifacts = {
    'tokenizer.pickle': tokenizer,
    'train_padded.pickle': train_padded,
    'y_train.pickle': y_train,
    'test_padded.pickle': test_padded,
    'y_test.pickle': y_test,
}

for filename, artifact in artifacts.items():
    # the context manager closes each file even if pickling raises
    with open(os.path.join(path_preprocessed, filename), 'wb') as f:
        pickle.dump(artifact, f)