In [5]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [4]:
import kagglehub

path = kagglehub.dataset_download("abhi8923shriv/sentiment-analysis-dataset")
print(path)

Using Colab cache for faster access to the 'sentiment-analysis-dataset' dataset.
/kaggle/input/sentiment-analysis-dataset


In [6]:
import chardet

with open(f"{path}/test.csv", "rb") as f:
    print(chardet.detect(f.read()))

{'encoding': 'Windows-1252', 'confidence': 0.7296690307328605, 'language': ''}


In [7]:
train = pd.read_csv(
    f"{path}/train.csv",
    encoding="Windows-1252"
)

test = pd.read_csv(
    f"{path}/test.csv",
    encoding="Windows-1252"
)

In [8]:
data = pd.concat([train, test], ignore_index=True)
data.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797.0,27400.0,105.0
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265.0,470.0,164.0
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [9]:
data = data[['text', 'sentiment']]
data.isna().sum()

Unnamed: 0,0
text,1282
sentiment,1281


In [10]:
data = data.dropna()

In [12]:
X = data['text'].astype('string')

label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
y = data['sentiment'].map(label_map)

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
from keras.layers import TextVectorization

max_tokens = 10000

Vectorizer = TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=25,
    output_mode='int',
    standardize='lower_and_strip_punctuation'
)

Vectorizer.adapt(X_train)

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential([
    Vectorizer,
    Embedding(max_tokens, 256),
    LSTM(128),
    Dense(3, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [19]:
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=6,
    batch_size=32
)

Epoch 1/6
[1m776/776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 79ms/step - accuracy: 0.5299 - loss: 0.9345 - val_accuracy: 0.7069 - val_loss: 0.6954
Epoch 2/6
[1m776/776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 80ms/step - accuracy: 0.7559 - loss: 0.6059 - val_accuracy: 0.7221 - val_loss: 0.6649
Epoch 3/6
[1m776/776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 79ms/step - accuracy: 0.8267 - loss: 0.4634 - val_accuracy: 0.7118 - val_loss: 0.7240
Epoch 4/6
[1m776/776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 79ms/step - accuracy: 0.8708 - loss: 0.3597 - val_accuracy: 0.7018 - val_loss: 0.7722
Epoch 5/6
[1m776/776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 82ms/step - accuracy: 0.9063 - loss: 0.2666 - val_accuracy: 0.6940 - val_loss: 0.8934
Epoch 6/6
[1m776/776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 79ms/step - accuracy: 0.9310 - loss: 0.1985 - val_accuracy: 0.6740 - val_loss: 1.0536


In [21]:
print(f"Training accuracy:\t{history.history['accuracy'][-1]*100:.2f} %")
print(f"Validation accuracy:\t{history.history['val_accuracy'][-1]*100:.2f} %")

Training accuracy:	92.17 %
Validation accuracy:	67.40 %
