In [24]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model

In [25]:
data = pd.read_csv('imdb_top_1000.csv')

print(data)



                                           Poster_Link  \
0    https://m.media-amazon.com/images/M/MV5BMDFkYT...   
1    https://m.media-amazon.com/images/M/MV5BM2MyNj...   
2    https://m.media-amazon.com/images/M/MV5BMTMxNT...   
3    https://m.media-amazon.com/images/M/MV5BMWMwMG...   
4    https://m.media-amazon.com/images/M/MV5BMWU4N2...   
..                                                 ...   
995  https://m.media-amazon.com/images/M/MV5BNGEwMT...   
996  https://m.media-amazon.com/images/M/MV5BODk3Yj...   
997  https://m.media-amazon.com/images/M/MV5BM2U3Yz...   
998  https://m.media-amazon.com/images/M/MV5BZTBmMj...   
999  https://m.media-amazon.com/images/M/MV5BMTY5OD...   

                 Series_Title Released_Year Certificate  Runtime  \
0    The Shawshank Redemption          1994           A  142 min   
1               The Godfather          1972           A  175 min   
2             The Dark Knight          2008          UA  152 min   
3      The Godfather: Part II  

In [26]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
print(stop_words)


{"shan't", "weren't", 'ours', "you'd", 'himself', 'doing', "they're", 'when', 'mustn', 'll', 'of', 'very', 'whom', "didn't", 'should', 'more', 'here', 'how', 'herself', 'are', 'have', 'as', 'most', 'which', 'after', 'or', 'wouldn', 'down', 'in', 'if', 'ain', "needn't", 'above', 'do', 'does', 'i', "it's", 'itself', "i've", 'me', 'shouldn', 'ma', "you'll", 's', 'what', 'm', 'both', 'same', 'our', "won't", 'y', 'was', 'myself', "they'll", 'theirs', 'you', 'such', 'my', "that'll", "it'll", 'ourselves', 'own', 'shan', 'so', "he's", 'having', 'and', 'did', 'why', "don't", 'all', 'about', 'other', 'will', "you're", 'below', 'them', 'won', 'then', "i'd", 'can', "hadn't", 'weren', 'each', 'up', 'under', 'on', 'too', 'only', "should've", "we've", "aren't", 'the', 'wasn', 'for', "hasn't", 'over', 'be', 'mightn', 'than', 'during', 'been', 'not', "haven't", 'while', 'further', 'against', 'some', 'were', 'his', 'd', 'themselves', 'to', 'its', 'needn', 'again', 'there', 'him', 'just', 'don', 'these',

In [28]:
english_stops = set(stopwords.words('english'))


In [29]:
def load_dataset():
    df = pd.read_csv('imdb_top_1000.csv')
    x_data = df['Overview']       # Reviews/Input
    y_data = df['IMDB_Rating']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Overview')
print(x_data, '\n')
print('IMDB_Rating')
print(y_data)


Overview
0      [two, imprisoned, men, bond, number, years, fi...
1      [an, organized, crime, dynasty, aging, patriar...
2      [when, menace, known, joker, wreaks, havoc, ch...
3      [the, early, life, career, vito, corleone, new...
4      [a, jury, holdout, attempts, prevent, miscarri...
                             ...                        
995    [a, young, new, york, socialite, becomes, inte...
996    [sprawling, epic, covering, life, texas, cattl...
997    [in, hawaii, private, cruelly, punished, boxin...
998    [several, survivors, torpedoed, merchant, ship...
999    [a, man, london, tries, help, counter, espiona...
Name: Overview, Length: 1000, dtype: object 

IMDB_Rating
0      9.3
1      9.2
2      9.0
3      9.0
4      9.0
      ... 
995    7.6
996    7.6
997    7.6
998    7.6
999    7.6
Name: IMDB_Rating, Length: 1000, dtype: float64


In [30]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)


Train Set
177    [after, family, murdered, notorious, ruthless,...
291    [in, fear, violence, escalate, people, algiers...
269    [travis, henderson, aimless, drifter, missing,...
929    [at, turning, point, life, former, tennis, pro...
474    [when, louis, bloom, con, man, desperate, work...
                             ...                        
639    [sixteen, year, old, lilja, friend, young, boy...
748    [as, harvard, student, mark, zuckerberg, creat...
751    [james, bond, loyalty, m, tested, past, comes,...
358    [a, precocious, outspoken, iranian, girl, grow...
470    [a, national, manhunt, ordered, rebellious, ki...
Name: Overview, Length: 800, dtype: object 

450    [two, men, attempt, prove, committed, perfect,...
440    [powerful, unethical, broadway, columnist, j, ...
367    [a, rat, cook, makes, unusual, alliance, young...
66     [in, distant, future, small, waste, collecting...
170    [when, two, girls, move, country, near, ailing...
                             ... 

In [31]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))



In [32]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[  20   11  384 ...  723  522    0]
 [   8  524  166 ...    0    0    0]
 [1817 1818 1819 ...   11    0    0]
 ...
 [ 346  164 1259 ...   18  200  210]
 [   1 1792 4781 ...    0    0    0]
 [   1  501 1805 ... 4784    0    0]] 

Encoded X Test
 [[   6   70  147 ...    0    0    0]
 [ 379 1247  607 ...    0    0    0]
 [   1  281 2510 ...    0    0    0]
 ...
 [1256 2706   30 ...    0    0    0]
 [   1  832   71 ...    0    0    0]
 [   1  657   28 ...    0    0    0]] 

Maximum review length:  16


In [33]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())




None


In [34]:

checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)



In [35]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])


Epoch 1/5
[1m6/7[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 25ms/step - accuracy: 0.0000e+00 - loss: 0.3642
Epoch 1: accuracy improved from -inf to 0.00000, saving model to models/LSTM.h5




[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 33ms/step - accuracy: 0.0000e+00 - loss: 0.2750
Epoch 2/5
[1m6/7[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 28ms/step - accuracy: 0.0000e+00 - loss: -2.3468
Epoch 2: accuracy did not improve from 0.00000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.0000e+00 - loss: -2.5830
Epoch 3/5
[1m5/7[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 30ms/step - accuracy: 0.0000e+00 - loss: -9.4972
Epoch 3: accuracy did not improve from 0.00000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.0000e+00 - loss: -10.9678
Epoch 4/5
[1m6/7[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 27ms/step - accuracy: 0.0000e+00 - loss: -34.1413
Epoch 4: accuracy did not improve from 0.00000
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.

<keras.src.callbacks.history.History at 0x7cce143275f0>

In [36]:
import numpy as np

y_pred = model.predict(x_test, batch_size=128)
y_pred = np.argmax(y_pred, axis=1)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 251ms/step


In [37]:

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 0
Wrong Prediction: 200
Accuracy: 0.0


In [38]:
loaded_model = load_model('models/LSTM.h5')



In [39]:
review = str(input('Movie Review: '))

Movie Review: good movie


In [41]:
import re   # ← add this line

# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned:', review)


Cleaned: good movie


In [42]:
words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Filtered:  ['good movie']


In [43]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[503 203   0   0   0   0   0   0   0   0   0   0   0   0   0   0]]


In [44]:
result = loaded_model.predict(tokenize_words)
print(result)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 223ms/step
[[0.6833575]]


In [45]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

negative
