In [None]:
!pip install numpy pandas scikit-learn tensorflow



In [None]:
import html
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
!pip install kagglehub pandas



In [None]:
import kagglehub

# Kaggle handle ("owner/dataset") of the Sentiment140 dataset.
dataset_handle = "kazanova/sentiment140"

# Fetch the dataset; the returned location is reused below to find the CSV.
path = kagglehub.dataset_download(dataset_handle)
print("Downloaded to:", path)

Downloaded to: /kaggle/input/sentiment140


In [None]:
import pandas as pd
import os

# The CSV ships without a header row, so the column names are supplied by hand.
columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Single CSV file inside the downloaded dataset directory.
csv_file = os.path.join(path, "training.1600000.processed.noemoticon.csv")

# header=None is spelled out for the reader; it is what passing `names` implies anyway.
df = pd.read_csv(
    csv_file,
    header=None,
    names=columns,
    encoding='latin-1',
)

In [None]:
# NOTE: this cell rebinds `df` in place, so it is not idempotent. Re-run the
# loading cell before re-running this one, otherwise the already-converted
# labels (0/1) would all be mapped to 0.

# Keep only the 'text' and 'target' columns. The explicit copy makes the frame
# independent of the raw one, so the column assignment below is unambiguous.
df = df[['text', 'target']].copy()

# Convert target: 0 = negative, 4 = positive → convert to 0 and 1.
# Vectorised comparison; any value other than 4 becomes 0, as before.
df['target'] = (df['target'] == 4).astype('int64')

# Optional: sample for faster training. The seed is fixed so that every run
# trains and evaluates on the same rows; the size is capped at the frame
# length so a smaller input does not raise.
df = df.sample(n=min(10000, len(df)), random_state=42)

In [None]:
def clean_text(text):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order:
      1. Decode HTML entities (``&amp;``, ``&quot;``, ...) so that they are
         treated as punctuation instead of leaving stray letters such as
         ``amp`` or ``quot`` glued to neighbouring words.
      2. Drop URLs (``http...``), @mentions and #hashtags.
      3. Drop every character that is not an ASCII letter or whitespace.
      4. Lowercase and trim leading/trailing whitespace.

    Interior whitespace is left as-is (runs of spaces are not collapsed).

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned string; may be empty.
    """
    # Must happen first: after step 3 the '&' and ';' delimiters are gone and
    # the entity name would be indistinguishable from a real word.
    text = html.unescape(text)
    text = re.sub(r"http\S+|@\S+|#\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text.lower().strip()

# Add a cleaned copy of every tweet; the raw 'text' column is kept as-is.
df['clean_text'] = [clean_text(raw_tweet) for raw_tweet in df['text']]

In [None]:
# Word-level tokenizer limited to num_words=1000; other words map to the "<OOV>" token.
# NOTE(review): the tokenizer is fit on every row, including the rows that the
# later train/test split assigns to the test set — confirm this is acceptable.
tokenizer = Tokenizer(
    num_words=1000,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(df['clean_text'])

# Encode each cleaned tweet as integer ids, then pad at the end to a fixed length of 20.
sequences = tokenizer.texts_to_sequences(df['clean_text'])
padded = pad_sequences(
    sequences,
    maxlen=20,
    padding='post',
)

# Binary labels, in the same row order as `padded`.
labels = np.array(df['target'])

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    padded,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Binary sentiment classifier: embedding -> LSTM -> dropout -> sigmoid unit.
# The input shape is declared with an explicit Input instead of the deprecated
# `input_length` argument of Embedding, so the model is built immediately and
# `model.summary()` reports concrete output shapes and parameter counts.
model = Sequential([
    # 20 token ids per example, matching the `maxlen=20` used when padding.
    tf.keras.Input(shape=(20,)),
    # input_dim=1000 matches the tokenizer's `num_words=1000`.
    Embedding(input_dim=1000, output_dim=64),
    # Only the last LSTM output is passed on (return_sequences=False).
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    # Single sigmoid output, thresholded at 0.5 downstream.
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
# Train for 5 epochs. Validation uses a slice of the training data
# (validation_split) rather than (X_test, y_test), so the test set stays
# untouched until the evaluation cell and its metrics remain a true hold-out.
# The History object is kept so the loss/accuracy curves can be inspected
# later, and so its repr is not emitted as the cell's output.
# NOTE(review): batch_size=2 is unusually small and makes every epoch slow;
# consider a larger batch (e.g. 32) if training time matters.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=2,
    validation_split=0.1,
)

Epoch 1/5
[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 10ms/step - accuracy: 0.5807 - loss: 0.6654 - val_accuracy: 0.7165 - val_loss: 0.5605
Epoch 2/5
[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 10ms/step - accuracy: 0.7635 - loss: 0.5202 - val_accuracy: 0.7215 - val_loss: 0.5423
Epoch 3/5
[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 10ms/step - accuracy: 0.7933 - loss: 0.4619 - val_accuracy: 0.7200 - val_loss: 0.5524
Epoch 4/5
[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 11ms/step - accuracy: 0.8029 - loss: 0.4342 - val_accuracy: 0.7235 - val_loss: 0.5530
Epoch 5/5
[1m4000/4000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 10ms/step - accuracy: 0.8295 - loss: 0.3817 - val_accuracy: 0.7235 - val_loss: 0.5762


<keras.src.callbacks.history.History at 0x7ae14a12ca90>

In [None]:
# Threshold the sigmoid outputs into hard 0/1 labels. `>=` is used so that the
# decision rule is the same as in `predict_sentiment`. The model ends in a
# single-unit Dense layer, so predictions come back with one column per row;
# `.ravel()` flattens them to one dimension to line up with `y_test`.
y_pred = (model.predict(X_test) >= 0.5).astype("int32").ravel()

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"Precision: {precision_score(y_test, y_pred):.2f}")
print(f"Recall: {recall_score(y_test, y_pred):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")



[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step
Accuracy: 0.72
Precision: 0.71
Recall: 0.73
F1 Score: 0.72


In [None]:
def predict_sentiment(text, threshold=0.5):
    """Classify a single piece of text as "Positive" or "Negative".

    The text goes through the same steps as the training data: `clean_text`,
    the fitted `tokenizer`, and post-padding to 20 tokens, before being scored
    by `model`.

    Args:
        text: Raw input string.
        threshold: Minimum score for the "Positive" label. Defaults to 0.5,
            which reproduces the previous hard-coded behaviour.

    Returns:
        "Positive" if the model's score is >= `threshold`, else "Negative".
    """
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    pad = pad_sequences(seq, maxlen=20, padding='post')
    # verbose=0: without it Keras prints a progress bar for every single call,
    # which buries the actual result in the cell output.
    pred = float(model.predict(pad, verbose=0)[0][0])
    return "Positive" if pred >= threshold else "Negative"

# Smoke test: one clearly positive and one clearly negative phrase.
for example in ("I really love this app!", "This is terrible."):
    print(predict_sentiment(example))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
Negative
