In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords

In [2]:
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def clean_text(text):
  # Remove HTML tags
  text=re.sub(r'<.*?>', '', text)
  # Remove punctuation
  text=text.translate(str.maketrans('', '', string.punctuation))
  # Convert to lowercase
  text=text.lower()
  # Remove stopwords
  text=' '.join([word for word in text.split() if word not in stop_words])
  return text

In [4]:
!unzip '/content/drive/MyDrive/IMDB Dataset.csv (1).zip' -d '/content/drive/MyDrive/IMDB Dataset.csv (1)'

Archive:  /content/drive/MyDrive/IMDB Dataset.csv (1).zip
replace /content/drive/MyDrive/IMDB Dataset.csv (1)/IMDB Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: /content/drive/MyDrive/IMDB Dataset.csv (1)/IMDB Dataset.csv  


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
df = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv (1)/IMDB Dataset.csv')

In [7]:
# Clean the review texts
df['review']=df['review'].apply(clean_text)
# Encode labels: 'positive' -> 1, 'negative' -> 0
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
vocab_size = 10000  # Adjust based on your dataset
max_length = 100    # Adjust based on your dataset

# Initialize tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token='')
tokenizer.fit_on_texts(df['review'])

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(df['review'])

# Pad sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,GRU,Dense,Dropout
model=Sequential([
    Embedding(input_dim=vocab_size,output_dim=64,input_length=max_length),
    GRU(64,return_sequences=False),
    Dropout(0.5),
    Dense(1,activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

In [12]:
from sklearn.model_selection import train_test_split

# Features and labels
X = padded_sequences
y = df['label'].values

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
history = model.fit(X, y, epochs=5,  verbose=1, validation_split=0.2)

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 75ms/step - accuracy: 0.5650 - loss: 0.6634 - val_accuracy: 0.8073 - val_loss: 0.4225
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 73ms/step - accuracy: 0.8645 - loss: 0.3338 - val_accuracy: 0.8720 - val_loss: 0.3039
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 72ms/step - accuracy: 0.9223 - loss: 0.2118 - val_accuracy: 0.8721 - val_loss: 0.3096
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 69ms/step - accuracy: 0.9486 - loss: 0.1517 - val_accuracy: 0.8653 - val_loss: 0.3477
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 68ms/step - accuracy: 0.9703 - loss: 0.0978 - val_accuracy: 0.8587 - val_loss: 0.4300


In [15]:
model.save('/content/drive/MyDrive/gru_model.keras')

In [17]:
import pickle

# Save tokenizer
with open('/content/drive/MyDrive/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)