In [1]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-01-25 18:30:51--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-01-25 18:30:51--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-01-25 18:30:52--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.layers import GRU
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import html
import unicodedata
import nltk
import zipfile
import os
import pickle

In [3]:
nltk.data.path.append('/kaggle/working/nltk_data')

nltk.download('wordnet', download_dir='/kaggle/working/nltk_data')
nltk.download('omw-1.4', download_dir='/kaggle/working/nltk_data') 
nltk.download('punkt', download_dir='/kaggle/working/nltk_data')  
nltk.download('stopwords', download_dir='/kaggle/working/nltk_data')


[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data] Downloading package punkt to /kaggle/working/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /kaggle/working/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
wordnet_path = '/kaggle/working/nltk_data/corpora/wordnet.zip'

with zipfile.ZipFile(wordnet_path, 'r') as zip_ref:
    zip_ref.extractall('/kaggle/working/nltk_data/corpora/')


In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [6]:
def remove_special_chars(text):
    text = re.sub(r'  +', ' ', html.unescape(text.lower()))
    return text

In [7]:
def remove_non_ascii(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')


In [8]:
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)


In [9]:
def replace_numbers(text):
    return re.sub(r'\d+', '', text)


In [10]:
def text2words(text):
    return word_tokenize(text)


In [11]:
def remove_stopwords(words):
    return [word for word in words if word not in stop_words]


In [12]:
def lemmatize_words(words):
    return [lemmatizer.lemmatize(word) for word in words]


In [13]:
def normalize_text(text):
    text = remove_special_chars(text)
    text = remove_non_ascii(text)
    text = remove_punctuation(text)
    text = replace_numbers(text)
    words = text2words(text)
    words = remove_stopwords(words)
    words = lemmatize_words(words)
    return ' '.join(words)

In [14]:
raw_data = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
X = raw_data['review']
raw_data['label'] = raw_data['sentiment'].map({'positive': 1, 'negative': 0})
y = raw_data['label']

In [15]:
proc_X = X.apply(normalize_text)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(proc_X, y, test_size=0.2, random_state=42)


In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1
max_length = 100

In [18]:
encoded_train = tokenizer.texts_to_sequences(X_train)
padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')

encoded_test = tokenizer.texts_to_sequences(X_test)
padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')

In [19]:
embedding_index = {}
with open('glove.6B.100d.txt', mode='rt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

In [20]:
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [21]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    Bidirectional(LSTM(100, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(100)),
    Dropout(0.3),
    
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])



In [22]:
model.summary()

In [23]:
model.fit(padded_train, y_train, epochs=5, batch_size=32, verbose=1)


Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 29ms/step - accuracy: 0.7318 - loss: 0.5332
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 29ms/step - accuracy: 0.8281 - loss: 0.3917
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 29ms/step - accuracy: 0.8534 - loss: 0.3449
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 29ms/step - accuracy: 0.8692 - loss: 0.3110
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 28ms/step - accuracy: 0.8802 - loss: 0.2870


<keras.src.callbacks.history.History at 0x77fd97fad090>

In [24]:
loss, accuracy = model.evaluate(padded_test, y_test, verbose=1)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.8711 - loss: 0.3028
Test Accuracy: 87.46%


In [25]:
model.save('model.keras')

In [27]:
with open("tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
