In [34]:
import os
import pandas as pd
import kagglehub
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# **download and read the data set**

In [35]:
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")
csv_file = os.path.join(path, "twitter_training.csv")
df = pd.read_csv(csv_file, delimiter=",", names=["ID", "game", "sentiment", "text"], encoding="utf-8")

#**Convert Sentiment labels to numbers and keep only Sentiment & Text**


In [36]:
df = df[["sentiment", "text"]].dropna()
label_mapping = {"Positive": 2, "Neutral": 1, "Negative": 0}
df["sentiment"] = df["sentiment"].map(label_mapping).fillna(1).astype(int)

# Tokenization & Padding

In [37]:
VOCAB_SIZE = 5000
MAX_LENGTH = 50

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df["text"])
sequences = tokenizer.texts_to_sequences(df["text"])
padded_sequences = pad_sequences(sequences, maxlen=MAX_LENGTH, padding="post")

# Split Data into Train & Test

In [38]:
y = np.array(df["sentiment"])
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

# Model Definition and Train

In [None]:
model = Sequential([
    Embedding(VOCAB_SIZE , 128),
    LSTM(64, return_sequences=True),
    LSTM(64),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])


model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 77ms/step - accuracy: 0.4270 - loss: 1.0762 - val_accuracy: 0.5617 - val_loss: 0.9382
Epoch 2/5
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 74ms/step - accuracy: 0.6253 - loss: 0.8335 - val_accuracy: 0.7194 - val_loss: 0.6666
Epoch 3/5
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 74ms/step - accuracy: 0.7741 - loss: 0.5609 - val_accuracy: 0.7633 - val_loss: 0.5889
Epoch 4/5
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 74ms/step - accuracy: 0.8370 - loss: 0.4226 - val_accuracy: 0.7996 - val_loss: 0.5198
Epoch 5/5
[1m1480/1480[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.8739 - loss: 0.3280

# Evaluate Model on Test Data

In [40]:
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step


#Calculate Accuracy

In [41]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 81.59%
