In [1]:
# !pip install emot


In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the preprocessing cell below:
#   - 'wordnet'   -> WordNetLemmatizer
#   - 'punkt'     -> word_tokenize (older NLTK releases)
#   - 'punkt_tab' -> word_tokenize (newer NLTK releases look this resource up
#                    instead of 'punkt'; downloading both keeps the notebook
#                    runnable on either)
#   - 'stopwords' -> stopwords.words('english')
# nltk.download() reports failures via its return value / log output rather
# than raising, so requesting a resource an old index lacks is harmless.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:

# Lazily-built NLTK resources shared by every preprocess_text() call.
# Building the stopword set and the lemmatizer once (instead of once per
# tweet) avoids repeating identical work for every row of the DataFrame.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


# Function to clean and preprocess text
def preprocess_text(text):
    """Normalize a raw tweet into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. Non-string input (e.g. NaN from a missing CSV cell) yields ``""``.
      2. The text is lower-cased.
      3. URLs, @-mentions and digit runs are replaced by the placeholder
         tokens ``URL``, ``@USER`` and ``NUMBER``; ``#`` characters are
         stripped so the hashtag word itself is kept.
      4. The text is tokenized with ``word_tokenize``; tokens that are not
         purely alphabetic (punctuation, mixed symbols) are dropped, then
         English stopwords are dropped.
      5. Remaining tokens are lemmatized with ``WordNetLemmatizer``.

    Note: the placeholders are inserted *after* lower-casing, so they are the
    only upper-case text in the result of a single pass. Running this
    function on its own output lower-cases them, i.e. it is not idempotent
    and should be applied exactly once per tweet.

    Args:
        text: The raw tweet; any non-``str`` value is treated as missing.

    Returns:
        str: The cleaned tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        return ""

    # Convert to lower case
    text = text.lower()
    # Replace URLs with a special token
    text = re.sub(r'http\S+', 'URL', text)
    # Replace user @ references with a special token
    text = re.sub(r'\@\w+', '@USER', text)
    # Replace hashtags by just words (remove # but keep the word)
    text = re.sub(r'#', '', text)
    # Replace numbers with a special token
    text = re.sub(r'\d+', 'NUMBER', text)

    stop_words, lemmatizer = _get_preprocess_resources()

    # Tokenize, keep purely alphabetic non-stopword tokens, and lemmatize them
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token.isalpha() and token not in stop_words
    ]

    # Join all tokens back to a single string
    return ' '.join(tokens)


file_path = '/content/judge-1377884607_tweet_product_company - judge-1377884607_tweet_product_company.csv'

df = pd.read_csv(file_path)

# Rename columns for easier reference
df = df.rename(columns={'tweet_text': 'tweet', 'is_there_an_emotion_directed_at_a_brand_or_product': 'label'})

# Apply preprocessing to the 'tweet' column.
# This must happen exactly once: preprocess_text() lower-cases its input and
# re-lemmatizes / re-filters tokens, so a second pass would alter the
# placeholder tokens and could drop further words.
df['tweet'] = df['tweet'].apply(preprocess_text)

print("Unique values in 'label':", df['label'].unique())
# Mapping from the raw annotation strings to integer class ids
class_mapping = {
    'Positive emotion': 0,
    'Negative emotion': 1,
    'No emotion toward brand or product': 2,  # neutral: no sentiment expressed toward the brand
    "I can't tell": 3  # annotator could not decide (the "no idea" class)
}

# Apply the mapping to the label column
df['label'] = df['label'].map(class_mapping)

# Series.map() silently turns any value missing from class_mapping into NaN;
# fail here with a clear message rather than later inside to_categorical().
assert df['label'].notna().all(), "Found labels that are not covered by class_mapping"

# Verify the new distribution of mapped labels
print("New label distribution:\n", df['label'].value_counts())


Unique values in 'label': ['Negative emotion' 'Positive emotion'
 'No emotion toward brand or product' "I can't tell"]
New label distribution:
 2    5389
0    2978
1     570
3     156
Name: label, dtype: int64


In [5]:
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

In [7]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Assuming 'df' is your preprocessed DataFrame

# Tokenizer creation and fitting
# NOTE(review): the vocabulary and max_sequence_length are derived from the
# full dataset before the train/test split below, so the test tweets
# influence them. Consider fitting on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['tweet'])

# Convert texts to sequences of integer word ids
sequences = tokenizer.texts_to_sequences(df['tweet'])

# Pad every sequence to the length of the longest one
max_sequence_length = max(len(x) for x in sequences)
X = pad_sequences(sequences, maxlen=max_sequence_length)

# Convert labels to one-hot encoding
y = to_categorical(df['label'].values)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model parameters
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
embedding_dim = 50  # You can choose different sizes for the embedding dimension
rnn_units = 64  # The number of units in the RNN layer

# Embedding -> two stacked SimpleRNN layers -> small Dense head -> softmax
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(SimpleRNN(units=rnn_units, return_sequences=True))  # Return sequences to stack another RNN layer
model.add(SimpleRNN(units=rnn_units))  # Additional RNN layer
# You can add more layers here if needed
model.add(Dense(8, activation='relu'))  # Additional Dense layer with ReLU activation
model.add(Dense(y_train.shape[1], activation='softmax'))  # Output layer: one unit per class

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Set up early stopping: stop after 3 epochs without val_loss improvement and
# roll back to the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with early stopping; 10% of the training data is held out
# for validation
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping]  # Add early stopping callback
)

# Evaluate on the held-out test set. evaluate() returns [loss, accuracy]
# because the model was compiled with metrics=['accuracy'].
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {test_accuracy}')


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Test Accuracy: 0.6030786037445068


In [None]:

from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from numpy import average
import numpy as np

# The LSTM below has no Embedding layer, so it needs an explicit feature axis:
# (num_samples, max_sequence_length) -> (num_samples, max_sequence_length, 1).
# These are stored under separate names on purpose:
#   * X_train / X_test stay 2-D, which is what the SimpleRNN model's Embedding
#     layer was built for, and
#   * re-running this cell does not keep appending axes to the same arrays.
X_train_lstm = np.expand_dims(X_train, axis=-1)
X_test_lstm = np.expand_dims(X_test, axis=-1)


# Define the LSTM model
def build_lstm_model(input_shape, output_units):
    """Build and compile a single-layer LSTM classifier.

    Architecture: Input -> LSTM(rnn_units) -> Dense(8, relu) ->
    Dense(output_units, softmax), compiled with Adam and categorical
    cross-entropy. The LSTM width is read from the notebook-level
    ``rnn_units`` variable.

    Args:
        input_shape: Shape of one sample, excluding the batch dimension,
            e.g. ``(max_sequence_length, 1)``.
        output_units: Number of classes (size of the softmax layer).

    Returns:
        The compiled Keras ``Model``.
    """
    lstm_input = Input(shape=input_shape)
    lstm_layer = LSTM(units=rnn_units, return_sequences=False)(lstm_input)
    dense_layer = Dense(8, activation='relu')(lstm_layer)
    output_layer = Dense(output_units, activation='softmax')(dense_layer)
    model = Model(inputs=lstm_input, outputs=output_layer)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


# Build LSTM model
lstm_model = build_lstm_model((max_sequence_length, 1), y_train.shape[1])

# Use a dedicated callback instance so this run does not share state with the
# callback that was attached to the SimpleRNN model.
lstm_early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the LSTM model with early stopping
lstm_history = lstm_model.fit(
    X_train_lstm, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks=[lstm_early_stopping]
)

# Make predictions with both models, giving each the input layout it was
# trained on: 2-D id sequences for the SimpleRNN, 3-D for the LSTM.
simple_rnn_predictions = model.predict(X_test)
lstm_predictions = lstm_model.predict(X_test_lstm)

# Average the class probabilities from both models (unweighted soft voting)
ensemble_predictions = average([simple_rnn_predictions, lstm_predictions], axis=0)

# Convert ensemble predictions to label indices
ensemble_label_indices = ensemble_predictions.argmax(axis=1)

# Convert true labels from one-hot encoding to label indices
true_label_indices = y_test.argmax(axis=1)




In [13]:
from sklearn.metrics import confusion_matrix, classification_report

# Evaluate ensemble model: fraction of test samples whose predicted class id
# matches the true class id
ensemble_accuracy = (ensemble_label_indices == true_label_indices).mean()
print(f'Ensemble Test Accuracy: {ensemble_accuracy}')

# Confusion matrix (rows = true class, columns = predicted class)
print(confusion_matrix(true_label_indices, ensemble_label_indices))

# Per-class precision / recall / F1. Classes that are never predicted have an
# undefined precision; zero_division=0 reports them as 0.0 explicitly instead
# of emitting an UndefinedMetricWarning for each averaged metric.
print(classification_report(true_label_indices, ensemble_label_indices, zero_division=0))



Ensemble Test Accuracy: 0.6091258933479934
[[  59    0  530    0]
 [  15    0  100    0]
 [  34    0 1049    0]
 [   2    0   30    0]]
              precision    recall  f1-score   support

           0       0.54      0.10      0.17       589
           1       0.00      0.00      0.00       115
           2       0.61      0.97      0.75      1083
           3       0.00      0.00      0.00        32

    accuracy                           0.61      1819
   macro avg       0.29      0.27      0.23      1819
weighted avg       0.54      0.61      0.50      1819



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
