In [71]:
import os
import pandas as pd

In [72]:
def load_review(base_path):
    data=[]
    labels=[]
    for path in ['pos', 'neg']:
        fold_path = os.path.join(base_path, path)
        for file in os.listdir(fold_path):
            if file.endswith('.txt'):
                file_path = os.path.join(fold_path, file)
                with open(file_path, 'r', encoding="utf-8") as f:
                    review = f.read()
                    data.append(review)
                    labels.append(1 if path=='pos' else 0)
    return pd.DataFrame({
        "reviews" : data,
        "sentiments" : labels
    })

In [109]:
train_path = "aclImdb/train"
test_path = "aclImdb/test"

train_df = load_review(train_path)
test_df = load_review(test_path)

print(train_df.head())

print(train_df['sentiments'].value_counts())

                                             reviews  sentiments
0  Bromwell High is a cartoon comedy. It ran at t...           1
1  Homelessness (or Houselessness as George Carli...           1
2  Brilliant over-acting by Lesley Ann Warren. Be...           1
3  This is easily the most underrated film inn th...           1
4  This is not the typical Mel Brooks film. It wa...           1
sentiments
1    12500
0    12500
Name: count, dtype: int64


In [142]:
import re

def clean_text(t):
    text = t.lower()
    text = re.sub(r'<.*?>', '', text)      # remove HTML
    text = re.sub(r'[^a-z\s]', '', text)   # remove numbers & punctuation
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [111]:
train_df = clean_text(train_df)
test_df = clean_text(test_df)

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2), min_df=5, max_df=0.8)

x_train = tfidf.fit_transform(train_df['reviews'])
x_test = tfidf.fit_transform(test_df['reviews'])

y_train = train_df['sentiments']
y_test = test_df['sentiments']

In [103]:
x_train.shape

(25000, 5000)

In [106]:
feature_names = tfidf.get_feature_names_out()
feature_names[:20]

array(['aaron', 'abandoned', 'abilities', 'ability', 'able', 'absence',
       'absent', 'absolute', 'absolutely', 'absurd', 'abuse', 'abysmal',
       'academy', 'academy award', 'accent', 'accents', 'accept',
       'acceptable', 'accepted', 'accident'], dtype=object)

In [108]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [112]:
x_train = train_df['reviews'].values
x_test = test_df['reviews'].values

y_train = train_df['sentiments'].values
y_test = test_df['sentiments'].values

In [114]:
x_train.shape

(25000,)

In [117]:
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")

tokenizer.fit_on_texts(x_train)
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

In [118]:
x_train_pad = pad_sequences(x_train_seq, maxlen=200, padding='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=200, padding='post')

In [122]:
x_train_pad

array([[   1,  320,    7, ...,    0,    0,    0],
       [ 173, 1544,  126, ...,    6,  327,  373],
       [ 518, 3743,   32, ...,    0,    0,    0],
       ...,
       [  57,  129,   12, ...,   17,   99,   81],
       [1121,    3,  311, ...,    1, 1154,  831],
       [  10,    7,   28, ...,    0,    0,    0]])

In [120]:
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=200),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(
    loss = "binary_crossentropy",
    optimizer = 'adam',
    metrics = ['accuracy']
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          2560000   
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


In [124]:
history = model.fit(
    x_train_pad,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [126]:
loss, accuracy = model.evaluate(x_test_pad, y_test)



In [127]:
accuracy

0.5563600063323975

In [129]:
from tensorflow.keras.layers import Bidirectional

model = Sequential([
    Embedding(20000, 128, input_length=200),
    Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)),
    Dense(1, activation='sigmoid')
])

from tensorflow.keras.optimizers import Adam

optimizer = Adam(learning_rate=1e-4)

model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 128)          2560000   
                                                                 
 bidirectional (Bidirectiona  (None, 256)              263168    
 l)                                                              
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 2,823,425
Trainable params: 2,823,425
Non-trainable params: 0
_________________________________________________________________


In [130]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)


In [132]:
history = model.fit(
    x_train_pad,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop]
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [134]:
loss, accuracy = model.evaluate(x_test_pad, y_test)



In [135]:
accuracy

0.8592000007629395

In [181]:
def predict(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    pad = pad_sequences(seq, maxlen=200, padding='post')
    pred = model.predict(pad)[0][0]
    sentiment ="Positive" if pred > 0.5 else "Negative"
    return pred, sentiment

In [192]:
predict("The movie was awesome and best and the actor performance is average")



(0.53477067, 'Positive')

In [151]:
train_df.iloc[0,0]

'bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my years in the teaching profession lead me to believe that bromwell highs satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector im here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isnt'

In [158]:
pred = model.predict(x_test_pad)

print(pred_probs[:10])


[[0.98475593]
 [0.9736065 ]
 [0.98890275]
 [0.98493165]
 [0.99577767]
 [0.9946323 ]
 [0.99248445]
 [0.7786323 ]
 [0.98659486]
 [0.9940926 ]]


In [183]:
model.save("artifacts/imdb_review_model.keras")

In [185]:
import pickle

with open("artifacts/tokenizer.pkl", 'wb') as f:
    pickle.dump(tokenizer, f)

In [191]:
config={"max_length" : 200,
    "max_words" : 20000
}

with open("artifacts/config.pkl", 'wb') as f:
    pickle.dump(config, f)