In [1]:
# Step 1: Install Required Libraries
!pip install nltk
!pip install tensorflow
!pip install scikit-learn



In [47]:
# Imports
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.layers import Bidirectional

# 1. Load and prepare data
url = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv"
df = pd.read_csv(url)

# Select and rename columns
df = df[['label', 'tweet']]
df.columns = ['sentiment', 'text']

# Step 1: Create balanced dataset of 3 classes
min_count = df['sentiment'].value_counts().min()

# Extract balanced positive and negative
negative_df = df[df['sentiment'] == 0].sample(min_count, random_state=42)
positive_df = df[df['sentiment'] == 1].sample(min_count, random_state=42)

# Create synthetic neutral class from leftovers
remaining_df = df.drop(negative_df.index).drop(positive_df.index)
neutral_df = remaining_df.sample(min_count, random_state=99).copy()
neutral_df['sentiment'] = 2  # label for neutral

# Combine and shuffle
df = pd.concat([negative_df, positive_df, neutral_df], ignore_index=True)
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Optional: map sentiment to text labels for understanding
sentiment_map_text = {0: 'negative', 1: 'positive', 2: 'neutral'}
df['sentiment_label'] = df['sentiment'].map(sentiment_map_text)

# 2. Clean text
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower()
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].apply(clean_text)

# 3. Split data
X = df['clean_text'].values
y = df['sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Tokenize and pad sequences
vocab_size = 10000
max_len = 100

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

train_seq = tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(test_seq, maxlen=max_len, padding='post', truncating='post')

print("Shape of training data:", X_train_pad.shape)

from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Embedding, LayerNormalization

# 5. Build model with light improvements
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    Bidirectional(LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2)),
    LayerNormalization(),  # Helps stabilize learning
    Dense(64, activation='relu'),
    Dropout(0.4),  # Slightly increased
    Dense(3, activation='softmax')
])

model.build(input_shape=(None, max_len))
model.summary()

# 6. Compile model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Shape of training data: (5380, 100)


In [48]:
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.callbacks import EarlyStopping

# Compute class weights to handle class imbalance
class_weights_array = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(class_weights_array))

# Early stopping to prevent overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    class_weight=class_weights,
    callbacks=[early_stop]
)


Epoch 1/10
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 369ms/step - accuracy: 0.3900 - loss: 1.1101 - val_accuracy: 0.5576 - val_loss: 0.8147
Epoch 2/10
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 350ms/step - accuracy: 0.6126 - loss: 0.6978 - val_accuracy: 0.5613 - val_loss: 0.8184
Epoch 3/10
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 351ms/step - accuracy: 0.7233 - loss: 0.5081 - val_accuracy: 0.6041 - val_loss: 0.9210


In [49]:
# 8. Evaluate model on test data
y_pred_probs = model.predict(X_test_pad)
y_pred = np.argmax(y_pred_probs, axis=1)

print("Unique labels in y_test:", np.unique(y_test))
print("Unique labels in y_pred:", np.unique(y_pred))

print("Test Accuracy:", accuracy_score(y_test, y_pred))

# Match label index to proper class names
print("\nClassification Report:\n", classification_report(
    y_test, y_pred,
    labels=[0, 1, 2],  # 0: negative, 1: positive, 2: neutral
    target_names=['Negative', 'Positive', 'Neutral']
))


[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 67ms/step
Unique labels in y_test: [0 1 2]
Unique labels in y_pred: [0 1 2]
Test Accuracy: 0.5772659732540861

Classification Report:
               precision    recall  f1-score   support

    Negative       0.47      0.53      0.50       448
    Positive       0.83      0.78      0.80       449
     Neutral       0.46      0.43      0.44       449

    accuracy                           0.58      1346
   macro avg       0.58      0.58      0.58      1346
weighted avg       0.58      0.58      0.58      1346

