## Sentiment Classification using LSTM

##### NLP Task for classifying sentiments in movie reviews sourced from IMDB

In [1]:
import pandas as pd

df = pd.read_csv('input/imdb_movrev.csv')

print(df.head(5))


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [2]:
print(df['sentiment'].unique())

['positive' 'negative' ' Jim Abrahams']


In [3]:
df = df[df['sentiment'] != ' Jim Abrahams']

print(df['sentiment'].unique())

['positive' 'negative']


In [4]:
df.dropna(subset=['sentiment'], inplace=True)

#### Convert Sentiment Label to Binary Encoding

In [5]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [6]:
import re
import string

def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['review'] = df['review'].apply(clean_text)

df.head(5)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

df['review'] = df['review'].apply(remove_stopwords)

df.head(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode yo...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['review']).toarray()
y = df['sentiment'].values

In [10]:
y = df['sentiment'].values


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
from tensorflow.keras.optimizers import Adam

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])



In [14]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32,
                    validation_data=(X_test, y_test))


Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 52ms/step - accuracy: 0.8121 - loss: 0.4048 - val_accuracy: 0.8900 - val_loss: 0.2649
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 58ms/step - accuracy: 0.9113 - loss: 0.2218 - val_accuracy: 0.8863 - val_loss: 0.2728
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 51ms/step - accuracy: 0.9413 - loss: 0.1595 - val_accuracy: 0.8819 - val_loss: 0.3169
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 62ms/step - accuracy: 0.9699 - loss: 0.0956 - val_accuracy: 0.8818 - val_loss: 0.3626
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 52ms/step - accuracy: 0.9854 - loss: 0.0523 - val_accuracy: 0.8809 - val_loss: 0.4455
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 53ms/step - accuracy: 0.9908 - loss: 0.0302 - val_accuracy: 0.8792 - val_loss: 0.5157
Epoc

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype("int32")

print("FeedForward Neural Network Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))



[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
FeedForward Neural Network Results
Accuracy: 0.8795

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88      5040
           1       0.88      0.88      0.88      4960

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000


Confusion Matrix:
 [[4443  597]
 [ 608 4352]]
