<a href="https://colab.research.google.com/github/jc890/python/blob/master/Assignment07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# 1. import libraries
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.utils import class_weight

In [16]:
# 2. Load dataset
data = pd.read_csv("/content/judge-1377884607_tweet_product_company.csv", encoding='latin-1')


In [17]:
# 3. Clean tweets
def clean_tweet(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower().strip()
    return text

data['tweet_text'] = data['tweet_text'].fillna('')
data['clean_text'] = data['tweet_text'].apply(clean_tweet)

In [18]:
# 4. Encode target labels
label_enc = LabelEncoder()
data['label'] = label_enc.fit_transform(data['is_there_an_emotion_directed_at_a_brand_or_product'])
num_classes = len(label_enc.classes_)
y = to_categorical(data['label'], num_classes=num_classes)

In [19]:
# 5. Tokenizer and pad sequence
tokenizer = Tokenizer(num_words=15000, oov_token='<OOV>')
tokenizer.fit_on_texts(data['clean_text'])
X = tokenizer.texts_to_sequences(data['clean_text'])
X = pad_sequences(X, maxlen=50, padding='post', truncating='post')

In [20]:
# 6. Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
# 7. Compute class weights
labels = np.argmax(y_train, axis=1)
cw = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(labels), y=labels)
class_weights = dict(enumerate(cw))

In [22]:
# 8. Build the model
model = Sequential([
    Embedding(input_dim=15000, output_dim=64, input_length=50),
    Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.4)),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(num_classes, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [23]:
# 9. Callbacks
early_stop = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5, verbose=1)


In [24]:
# 10. Train the model
history = model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

Epoch 1/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 263ms/step - accuracy: 0.2602 - loss: 1.4742 - val_accuracy: 0.3643 - val_loss: 1.3301 - learning_rate: 0.0010
Epoch 2/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 265ms/step - accuracy: 0.4403 - loss: 1.2760 - val_accuracy: 0.5058 - val_loss: 1.2014 - learning_rate: 0.0010
Epoch 3/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 258ms/step - accuracy: 0.5231 - loss: 1.1251 - val_accuracy: 0.4784 - val_loss: 1.1634 - learning_rate: 0.0010
Epoch 4/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 245ms/step - accuracy: 0.6251 - loss: 0.8906 - val_accuracy: 0.5746 - val_loss: 1.0297 - learning_rate: 0.0010
Epoch 5/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 261ms/step - accuracy: 0.6977 - loss: 0.7458 - val_accuracy: 0.5918 - val_loss: 1.0118 - learning_rate: 0.0010
Epoch 6/20
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [25]:

# 11. Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.6326 - loss: 1.2836
Test Accuracy: 0.6168


In [26]:


# 12. Test some tweets
test_tweets = [
    "Apple’s new MacBook is insanely good!",
    "Google Pixel camera still lags behind.",
    "Not sure about this update, seems weird.",
    "What even is this feature?"
]

cleaned = [clean_tweet(t) for t in test_tweets]
seqs = tokenizer.texts_to_sequences(cleaned)
pads = pad_sequences(seqs, maxlen=50, padding='post', truncating='post')
pred = model.predict(pads)

for t, p in zip(test_tweets, pred):
    print(f"{t} --> {label_enc.inverse_transform([np.argmax(p)])[0]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 843ms/step
Apple’s new MacBook is insanely good! --> Positive emotion
Google Pixel camera still lags behind. --> Positive emotion
Not sure about this update, seems weird. --> Negative emotion
What even is this feature? --> No emotion toward brand or product
