In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# IMDB Sentiment Analysis using CNN

## Importing libraries

In [13]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Reading the dataset

In [14]:
import pandas as pd

file_path = '/content/drive/My Drive/movie_reviews.csv'
data = pd.read_csv(file_path)


# Display the first few rows of the DataFrame
print(data.head())


   Unnamed: 0                                             review  sentiment
0           0  i went and saw this movie last night after bei...          1
1           1  actor turned director bill paxton follows up h...          1
2           2  as a recreational golfer with some knowledge o...          1
3           3  i saw this film in a sneak preview and it is d...          1
4           4  bill paxton has taken the true story of the 19...          1


In [None]:
# shape of the data
data.shape

(50000, 2)

## Text Preprocessing

In [15]:
import string
# removing the html tags
def clean_html(text):
    clean=re.compile('<.*?>')
    cleantext=re.sub(clean,'',text)
    return cleantext

# first round of cleaning
def clean_text1(text):
    text=text.lower()
    text=re.sub('\[.*?\]','',text)
    text=re.sub('[%s]'%re.escape(string.punctuation),'',text)
    text=re.sub('\w*\d\w*','',text)
    return text

# second round of cleaning
def clean_text2(text):
    text=re.sub('[''"",,,]','',text)
    text=re.sub('\n','',text)
    return text

cleaned_html=lambda x:clean_html(x)
cleaned1=lambda x:clean_text1(x)
cleaned2=lambda x:clean_text2(x)

data['review']=pd.DataFrame(data.review.apply(cleaned_html))
data['review']=pd.DataFrame(data.review.apply(cleaned1))
data['review']=pd.DataFrame(data.review.apply(cleaned2))

## Defining the model

In [16]:
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(data['review'].values)
X = tokenizer.texts_to_sequences(data['review'].values)
X = pad_sequences(X,maxlen=600)

In [17]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, SpatialDropout1D, Embedding, Dense, Dropout

# Define the model
model_cnn = Sequential()
model_cnn.add(Embedding(5000, 128, input_length=X.shape[1]))  # Embedding layer
model_cnn.add(SpatialDropout1D(0.4))  # Spatial Dropout for regularization
model_cnn.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))  # Convolutional Layer 1
model_cnn.add(MaxPooling1D(pool_size=2))  # Pooling Layer 1
model_cnn.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))  # Convolutional Layer 2
model_cnn.add(MaxPooling1D(pool_size=2))  # Pooling Layer 2
model_cnn.add(Flatten())  # Flatten layer
model_cnn.add(Dense(256, activation='relu'))  # Fully Connected Layer
model_cnn.add(Dropout(0.5))  # Dropout for regularization
model_cnn.add(Dense(128, activation='relu'))  # Hidden Layer
model_cnn.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid activation for binary classification

# Compile the model
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
print(model_cnn.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 600, 128)          640000    
                                                                 
 spatial_dropout1d_3 (Spati  (None, 600, 128)          0         
 alDropout1D)                                                    
                                                                 
 conv1d_6 (Conv1D)           (None, 600, 64)           24640     
                                                                 
 max_pooling1d_6 (MaxPoolin  (None, 300, 64)           0         
 g1D)                                                            
                                                                 
 flatten_3 (Flatten)         (None, 19200)             0         
                                                                 
 dense_9 (Dense)             (None, 256)              

## Split the dataset

In [18]:
Y=pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = tts(X,Y, test_size = 0.2, random_state = 42)

## Running the model

In [19]:
batch_size = 64
model_cnn.fit(X_train, Y_train, epochs = 10, batch_size=batch_size, validation_data=(X_test,Y_test), verbose = True)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78fc7dfa62f0>