In [None]:
# 1. IMPORTS
import pandas as pd
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# 2. DOWNLOAD NLTK STUFF
# Fetch the NLTK resources one after another, in a fixed order.
for nltk_package in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_package)

# 3. LOAD DATA
# The CSV path is relative, so it is resolved against the current working directory.
dataset_path = 'Balanced_dataset.csv'
df = pd.read_csv(dataset_path)

# 4. TEXT PREPROCESSING
# Every stage is stored in its own column so intermediate results can be inspected.

# Missing or non-string entries would stay NaN after `.str.lower()` and make the
# regex step below raise a TypeError, so coerce every value to a string first.
df['lowercased'] = df['Text'].fillna('').astype(str).str.lower()  # Convert text to lowercase

# Compile the pattern once instead of looking it up for every row.
non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
df['clean'] = df['lowercased'].apply(lambda text: non_letter_pattern.sub('', text))  # Remove non-alphabetic characters

# Whitespace tokenization; an empty string yields an empty token list.
df['token'] = df['clean'].apply(str.split)

# Remove stopwords (set membership keeps the per-token check cheap).
# NOTE(review): the English stopword list presumably contains negations such as
# "not"/"no", which can carry sentiment — confirm that dropping them is intended.
stop_words = set(stopwords.words('english'))
df['no_stopwords'] = df['token'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# Lemmatize words
lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df['no_stopwords'].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)

# Join tokens back into a single space-separated string for the Keras tokenizer
df['final_text'] = df['lemmatized'].apply(' '.join)

# 5. LABEL ENCODING (-1, 0, 1 → 0, 1, 2)
# Learn the class-to-integer mapping from the Score column, then store the
# resulting integer ids alongside the original scores.
label_encoder = LabelEncoder()
label_encoder.fit(df['Score'])
df['encoded_label'] = label_encoder.transform(df['Score'])

# 6. TOKENIZE TEXT
texts = df['final_text'].values
labels = df['encoded_label'].values

# 7. SPLIT DATA INTO TRAINING AND TEST SETS
# Choose the train/test rows *before* fitting the tokenizer so the vocabulary is
# learned from training rows only; fitting on the full corpus would leak
# test-set words into the word index. Row positions are split with the same
# test_size and random_state that were previously applied to the padded matrix.
row_positions = np.arange(len(texts))
train_idx, test_idx = train_test_split(row_positions, test_size=0.2, random_state=42)

# Tokenizer to vectorize text (vocabulary built from the training rows only;
# words unseen during fitting map to the '<OOV>' token)
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(texts[train_idx])
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences to ensure uniform input length
max_len = 100
padded = pad_sequences(sequences, maxlen=max_len, padding='post')

# Materialize the split from the row positions chosen above.
X_train, X_test = padded[train_idx], padded[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

# 8. BUILD ENHANCED LSTM CLASSIFICATION MODEL
# `Input` declares the fixed sequence length explicitly; the Embedding layer's
# `input_length` argument is deprecated in Keras 3 and only triggers a warning.
from tensorflow.keras.layers import Input

# Derive sizes from the fitted preprocessing objects so the model cannot drift
# out of sync with the tokenizer vocabulary or the number of label classes.
vocab_size = tokenizer.num_words
num_classes = len(label_encoder.classes_)

model = Sequential([
    Input(shape=(max_len,)),
    # Embedding layer: token ids -> 128-dimensional vectors
    Embedding(input_dim=vocab_size, output_dim=128),
    # Bidirectional LSTM; returns the full sequence so the next LSTM can consume it
    Bidirectional(LSTM(128, return_sequences=True)),
    # Second LSTM collapses the sequence to its final state
    LSTM(64, return_sequences=False),
    # Dropout for regularization
    Dropout(0.5),
    # Fully connected layer
    Dense(32, activation='relu'),
    # Output layer: one softmax unit per class (negative, neutral, positive)
    Dense(num_classes, activation='softmax'),
])

# Compile the model; labels are integer class ids, hence the sparse loss
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer=Adam(learning_rate=0.0005),
              metrics=['accuracy'])

# 9. TRAIN THE MODEL (Changed epochs to 20)
from tensorflow.keras.callbacks import EarlyStopping

# In the recorded run the validation loss stops improving after epoch 4 while
# the training loss keeps falling, so epochs=20 is treated as an upper bound:
# training halts once val_loss has not improved for 3 consecutive epochs and
# the weights from the best epoch are restored before evaluation.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# 10. EVALUATE THE MODEL
# Class probabilities for every test row; the predicted class is the most probable one.
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Per-class precision/recall/F1, labelled with the original score values
class_names = label_encoder.classes_.astype(str)
report = classification_report(y_test, y_pred, target_names=class_names)
print("\nClassification Report:\n")
print(report)

# Overall accuracy on the held-out test set
acc = accuracy_score(y_test, y_pred)
print(f"\nEnhanced LSTM Classification Accuracy: {acc:.4f}")

# 11. PLOT TRAINING HISTORY
# Draw the per-epoch training and validation accuracy curves on one set of axes.
accuracy_curves = (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
)
for metric_key, curve_label in accuracy_curves:
    plt.plot(history.history[metric_key], label=curve_label)

plt.title("Enhanced LSTM Classification Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Epoch 1/20
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m456s[0m 329ms/step - accuracy: 0.3416 - loss: 1.0980 - val_accuracy: 0.3630 - val_loss: 1.0915
Epoch 2/20
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m397s[0m 293ms/step - accuracy: 0.3655 - loss: 1.0823 - val_accuracy: 0.3963 - val_loss: 1.0665
Epoch 3/20
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m452s[0m 301ms/step - accuracy: 0.4640 - loss: 1.0171 - val_accuracy: 0.6371 - val_loss: 0.8289
Epoch 4/20
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m429s[0m 322ms/step - accuracy: 0.6885 - loss: 0.7284 - val_accuracy: 0.6952 - val_loss: 0.7101
Epoch 5/20
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 292ms/step - accuracy: 0.7630 - loss: 0.5826 - val_accuracy: 0.7038 - val_loss: 0.7118
Epoch 6/20
[1m1332/1332[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m405s[0m 264ms/step - accuracy: 0.8138 - loss: 0.4822 - val_accuracy: 0.7084 - val_loss: