In [3]:
import os
import re

import kagglehub
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight # Import class_weight from sklearn.utils
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN,Bidirectional, LSTM, GRU, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")


nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

print("Path to dataset files:", path)
files = os.listdir(path)


print("Files in dataset:", files)
data_file = os.path.join(path, "twitter_training.csv")
train_df = pd.read_csv(data_file)
data_file = os.path.join(path, "twitter_validation.csv")
val_df = pd.read_csv(data_file)

train_df.columns = ['Column1', 'Column2', 'Column3', 'Column4']
val_df.columns = ['Column1', 'Column2', 'Column3', 'Column4']



Path to dataset files: /kaggle/input/twitter-entity-sentiment-analysis
Files in dataset: ['twitter_validation.csv', 'twitter_training.csv']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def clean_text(text):
    # Convert to string if not already
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Remove mentions
    text = re.sub(r'#', '', text)  # Remove hashtags
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special characters
    text = text.lower().strip()  # Convert to lowercase and strip whitespace
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

# Clean the datasets
train_df['cleaned_text'] = train_df['Column4'].apply(clean_text)
val_df['cleaned_text'] = val_df['Column4'].apply(clean_text)

# Encode labels
label_encoder = LabelEncoder()
train_df['label_encoded'] = label_encoder.fit_transform(train_df['Column3'])
val_df['label_encoded'] = label_encoder.transform(val_df['Column3'])

# Tokenize and pad sequences
max_vocab_size = 10000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['cleaned_text'])

X_train = tokenizer.texts_to_sequences(train_df['cleaned_text'])
X_train = pad_sequences(X_train, maxlen=max_sequence_length, padding='post')

y_train = train_df['label_encoded']

X_val = tokenizer.texts_to_sequences(val_df['cleaned_text'])
X_val = pad_sequences(X_val, maxlen=max_sequence_length, padding='post')

y_val = val_df['label_encoded']

# Handle class imbalance
class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))

# Define the model
model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=128, input_length=max_sequence_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(32)),
    Dense(128, activation='relu', kernel_regularizer='l2'),
    Dropout(0.3),
    Dense(len(label_encoder.classes_), activation='softmax')
])



model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Callbacks
lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
epochs = 10
batch_size = 32

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    class_weight=class_weights_dict,
    callbacks=[lr_reducer, early_stopping],
    verbose=1
)

# Evaluate the model
y_pred = np.argmax(model.predict(X_val), axis=-1)

precision = precision_score(y_val, y_pred, average='weighted')
recall = recall_score(y_val, y_pred, average='weighted')
f1 = f1_score(y_val, y_pred, average='weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(classification_report(y_val, y_pred, target_names=label_encoder.classes_))




Epoch 1/10
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 29ms/step - accuracy: 0.5111 - loss: 1.2443 - val_accuracy: 0.8378 - val_loss: 0.5246 - learning_rate: 0.0010
Epoch 2/10
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 28ms/step - accuracy: 0.7939 - loss: 0.5931 - val_accuracy: 0.8879 - val_loss: 0.3312 - learning_rate: 0.0010
Epoch 3/10
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 29ms/step - accuracy: 0.8529 - loss: 0.4125 - val_accuracy: 0.9239 - val_loss: 0.2394 - learning_rate: 0.0010
Epoch 4/10
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 29ms/step - accuracy: 0.8831 - loss: 0.3207 - val_accuracy: 0.9329 - val_loss: 0.2200 - learning_rate: 0.0010
Epoch 5/10
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 28ms/step - accuracy: 0.9018 - loss: 0.2690 - val_accuracy: 0.9419 - val_loss: 0.2086 - learning_rate: 0.0010
Epoch 6/10
[1m2334/2334[0m [32m━━━━━━━━━━━━━━━━