In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# IMDB Sentiment Analysis using CNN

## Importing libraries

In [2]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Reading the dataset

In [3]:
import pandas as pd

# Path of the reviews CSV under the Drive mount point (/content/drive).
file_path = '/content/drive/My Drive/movie_reviews.csv'
# Read with pandas defaults. The preview printed below shows an 'Unnamed: 0'
# column next to 'review' and 'sentiment'.
# NOTE(review): 'Unnamed: 0' looks like a saved row index - confirm before relying on it.
data = pd.read_csv(file_path)


# Print the first few rows as plain text to sanity-check the load.
print(data.head())


   Unnamed: 0                                             review  sentiment
0           0  i went and saw this movie last night after bei...          1
1           1  actor turned director bill paxton follows up h...          1
2           2  as a recreational golfer with some knowledge o...          1
3           3  i saw this film in a sneak preview and it is d...          1
4           4  bill paxton has taken the true story of the 19...          1


In [None]:
# (number of rows, number of columns) of the loaded DataFrame
data.shape

(50000, 3)

## Text Preprocessing

In [4]:
import string
# removing the html tags
def clean_html(text):
    """Return `text` with every `<...>` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept. Nothing is inserted where a tag used to be.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

# first round of cleaning
def clean_text1(text):
    """First cleaning pass: normalise case and strip noisy tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete anything enclosed in square brackets (brackets included);
      3. delete every ASCII punctuation character (``string.punctuation``);
      4. delete every word that contains at least one digit.

    Matched spans are deleted rather than replaced by a space, so the
    whitespace around them is left untouched.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Raw strings: `\[`, `\w` and `\d` are regex escapes, not Python string
    # escapes. Without the r-prefix they are invalid escape sequences, which
    # newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# second round of cleaning
def clean_text2(text):
    """Second cleaning pass: strip leftover quote marks, commas and newlines.

    Deletes typographic quotes (U+2018, U+2019, U+201C, U+201D), the
    ellipsis character (U+2026), ASCII double quotes and commas, then deletes
    newline characters. Characters are deleted outright, not replaced by a
    space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # The previous pattern '[''"",,,]' was three adjacent string literals that
    # Python concatenates to `["",,,]`, i.e. it only matched `"` and `,`.
    # The non-ASCII punctuation is spelled with \u escapes so it cannot be
    # silently flattened to ASCII again; `"` and `,` are kept so existing
    # behaviour is a subset of the new one.
    text = re.sub('[\u2018\u2019\u201c\u201d\u2026",]', '', text)
    text = re.sub('\n', '', text)
    return text

# Aliases for the three cleaning passes. The names are kept in case other
# cells refer to them; a `lambda x: f(x)` wrapper adds nothing over `f` itself.
cleaned_html = clean_html
cleaned1 = clean_text1
cleaned2 = clean_text2

# Run the passes in order (HTML tags -> first pass -> second pass) and write
# the result back to the 'review' column. Series.apply already returns a
# Series aligned with `data`, so no DataFrame wrapper is needed.
data['review'] = (
    data['review']
    .apply(cleaned_html)
    .apply(cleaned1)
    .apply(cleaned2)
)

## Tokenizing the text and defining the model

In [5]:
# Tokenizer configured with num_words=5000 (the same 5000 is used as the
# Embedding input size in the model cell) and splitting on single spaces.
tokenizer = Tokenizer(num_words=5000, split=' ')
review_texts = data['review'].values
tokenizer.fit_on_texts(review_texts)

# Turn every review into a sequence of word indices, then pad to maxlen=600
# so X is a 2-D array whose second dimension the model reads via X.shape[1].
X = pad_sequences(tokenizer.texts_to_sequences(review_texts), maxlen=600)

In [6]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, SpatialDropout1D, Embedding, Dense, Dropout, LSTM

# Define the model as an ordered stack of layers.
# NOTE(review): the summary recorded under this cell lists no LSTM layer, so
# that output appears to come from an earlier version of the architecture -
# re-run the cell and confirm the LSTM is intended in a notebook titled "CNN".
model_cnn = Sequential([
    # 5000 matches the tokenizer's num_words; each index maps to a 128-d vector.
    Embedding(5000, 128, input_length=X.shape[1]),
    # Spatial dropout for regularization.
    SpatialDropout1D(0.4),
    # return_sequences=True keeps one output per timestep for the Conv1D below.
    LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    # 'same' padding keeps the sequence length unchanged through the convolution.
    Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    # Flatten the (steps, channels) feature map into one vector per review.
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: one probability per review.
    Dense(1, activation='sigmoid'),
])

# Compile the model
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line after the table.
model_cnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 600, 128)          640000    
                                                                 
 spatial_dropout1d (Spatial  (None, 600, 128)          0         
 Dropout1D)                                                      
                                                                 
 conv1d (Conv1D)             (None, 600, 64)           24640     
                                                                 
 max_pooling1d (MaxPooling1  (None, 300, 64)           0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 19200)             0         
                                                                 
 dense (Dense)               (None, 256)               4

## Split the dataset

In [7]:
# The model ends in a single sigmoid unit trained with binary cross-entropy,
# so it needs ONE 0/1 label per review. pd.get_dummies would instead give a
# two-column one-hot matrix (boolean dtype on recent pandas), whose shape does
# not match the (batch, 1) model output.
labels = data['sentiment']
# Fail loudly if the column is not already encoded as 0/1.
assert labels.isin([0, 1]).all(), "sentiment must be encoded as 0/1 for a sigmoid output"
Y = labels.values.astype('float32')

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size = 0.2, random_state = 42)

## Training the model

In [8]:
batch_size = 64

# Keep the History object returned by fit() so the per-epoch loss/accuracy
# stay available after training, instead of only showing its repr.
# Validation runs on the held-out (X_test, Y_test) split after every epoch.
history = model_cnn.fit(
    X_train,
    Y_train,
    epochs=10,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
    verbose=1,
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d946745f520>