<a href="https://www.kaggle.com/code/kenza1996/imdb-sentiment-analysis?scriptVersionId=206612840" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>


# My NLP project  using  1D  CNN

In [1]:
#import libraries
import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import models,layers
from tensorflow.keras.layers import Embedding, LSTM, Dense, Activation, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers

In [2]:
#load the DataFrame
data=pd.read_csv("/kaggle/input/imdb-reviews/IMDB Dataset.csv")
data.head(4)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative


In [3]:
# access the column labels of my DataFrame
data.columns

Index(['review', 'sentiment'], dtype='object')

In [4]:
## Separating features (X) and labels (y)
X = data['review'] 
X

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [5]:
## Separating features (X) and labels (y)
Y=data['sentiment']
Y

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object

In [6]:
# Converting categorical sentiment labels to numerical values for model training
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative':0})
Y = data.sentiment

In [7]:
# Splitting the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [8]:
# Tokenizing the text data to convert words into numerical sequences
tokenizer = Tokenizer(num_words=1000,
filters='!"#$%&()*+,-./:;<=>?@[\]^_`{"}~\t\n',lower=True, split=" ")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [9]:
# Padding sequences to ensure uniform length for model input
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=100)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=100)

In [10]:
# Building a convolutional neural network (CNN) model for text classification
model = models.Sequential()
model.add(layers.Embedding(len(tokenizer.index_word), 8, input_length=100))# Adding an embedding layer to convert numerical sequences into dense vectors
model.add(layers.Convolution1D(16,4,activation='relu',input_shape=(100, 1)))
model.add(layers.AveragePooling1D())
model.add(layers.Convolution1D(32,4,activation='relu'))
model.add(layers.AveragePooling1D())
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 8)            898456    
                                                                 
 conv1d (Conv1D)             (None, 97, 16)            528       
                                                                 
 average_pooling1d (Average  (None, 48, 16)            0         
 Pooling1D)                                                      
                                                                 
 conv1d_1 (Conv1D)           (None, 45, 32)            2080      
                                                                 
 average_pooling1d_1 (Avera  (None, 22, 32)            0         
 gePooling1D)                                                    
                                                                 
 flatten (Flatten)           (None, 704)               0

In [11]:
# Compiling the CNN model for text classification
model.compile(optimizer='adam',loss=tf.keras.losses.BinaryCrossentropy(),
                  metrics=[tf.keras.metrics.BinaryAccuracy()])

In [12]:
# Training the CNN model for text classification
model.fit(x=X_train_seq_trunc,y=Y_train,batch_size=128,
              epochs=15,validation_data=(X_test_seq_trunc,Y_test))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7d18f89f2e60>

# RNN LSTM

In [13]:
# Building a convolutional neural network (CNN) model for text classification
model2 = models.Sequential()
model2.add(layers.Embedding(len(tokenizer.index_word), 8, input_length=100))# Adding an embedding layer to convert numerical sequences into dense vectors
model2.add(layers.LSTM(64))
model2.add(layers.Dense(256))
model2.add(layers.Activation('relu'))
model2.add(layers.Dropout(0.5))
model2.add(layers.Dense(1, name='out_layer'))
model2.add(layers.Activation('sigmoid'))
model2.compile(loss = 'binary_crossentropy', optimizer='adam',\
               metrics = ['accuracy'])
print(model2.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 8)            898456    
                                                                 
 lstm (LSTM)                 (None, 64)                18688     
                                                                 
 dense_1 (Dense)             (None, 256)               16640     
                                                                 
 activation (Activation)     (None, 256)               0         
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257       
                                                                 
 activation_1 (Activation)   (None, 1)                

In [14]:
# Training the CNN model for text classification
model2.fit(x=X_train_seq_trunc,y=Y_train,batch_size=30,
              epochs=15,validation_data=(X_test_seq_trunc,Y_test))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x7d18d37cd240>

# pre-trained GloVe word+RNN LSTM

In [15]:
# Load pre-trained GloVe word embeddings
embedding_dim = 100  # Set this based on the dimensions of the GloVe file you downloaded
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))

glove_file = "/kaggle/input/glove-6b-100d/glove.6B.100d.txt"  # Replace with the actual path to your GloVe file
with open(glove_file, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        if word in tokenizer.word_index:
            embedding_matrix[tokenizer.word_index[word]] = np.array(values[1:], dtype="float32")

In [16]:
# Building a convolutional neural network (CNN) model for text classification with pre-trained word embeddings
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=embedding_dim, weights=[embedding_matrix],
                    input_length=100, trainable=False))  # Set trainable to True if you want to fine-tune embeddings
model.add(LSTM(64, kernel_regularizer=regularizers.l2(0.01)))  # Adjust regularization as needed
model.add(Dense(256))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1, name='out_layer'))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 100)          11230800  
                                                                 
 lstm_1 (LSTM)               (None, 64)                42240     
                                                                 
 dense_2 (Dense)             (None, 256)               16640     
                                                                 
 activation_2 (Activation)   (None, 256)               0         
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257       
                                                                 
 activation_3 (Activation)   (None, 1)                

In [17]:
# Training the CNN model for text classification
model.fit(x=X_train_seq_trunc, y=Y_train, batch_size=128, epochs=30, validation_data=(X_test_seq_trunc, Y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7d18d3447ee0>